Skip to content

Commit

Permalink
feat: 支持上传html格式的文档 #364 (#518)
Browse files Browse the repository at this point in the history
  • Loading branch information
shaohuzhang1 committed May 23, 2024
1 parent d9e171c commit 86f5002
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 4 deletions.
66 changes: 66 additions & 0 deletions apps/common/handle/impl/html_split_handle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: html_split_handle.py
@date:2024/5/23 10:58
@desc:
"""
import codecs
import logging
import re
from typing import List

from bs4 import BeautifulSoup
from charset_normalizer import detect
from html2text import html2text

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel

# Fallback split patterns, used when the caller supplies none.
# Entry N-1 matches a level-N Markdown heading: exactly N '#' characters
# followed by a space, located at the very start of the text or right after
# a newline. The lookarounds keep deeper headings from matching a shallower level.
default_pattern_list = [
    re.compile(heading_expression)
    for heading_expression in (
        '(?<=^)# .*|(?<=\\n)# .*',
        '(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*',
        "(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*",
        "(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*",
        "(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*",
        "(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*",
    )
]


def get_encoding(buffer):
    """
    Determine the text encoding of a raw HTML document.

    A declaration inside the document takes precedence over statistical
    detection. Both ``<meta charset="...">`` and
    ``<meta http-equiv="Content-Type" content="...; charset=...">`` are
    honoured. A declared name that is empty or that Python has no codec for
    is skipped, so a bad declaration cannot make the later decode fail.

    :param buffer: raw bytes of the HTML file
    :return: an encoding name usable with ``bytes.decode``; ``'utf-8'`` when
             nothing usable is declared and detection yields no result
    """
    soup = BeautifulSoup(buffer, "html.parser")
    for meta in soup.find_all('meta'):
        declared = meta.attrs.get('charset')
        if not declared and str(meta.attrs.get('http-equiv', '')).lower() == 'content-type':
            # Legacy form: the charset is embedded in the `content` attribute.
            match = re.search(r'charset\s*=\s*["\']?([\w.:-]+)',
                              str(meta.attrs.get('content', '')),
                              re.IGNORECASE)
            declared = match.group(1) if match else None
        if not declared:
            continue
        declared = str(declared).strip()
        try:
            # Only trust the declaration if a codec with that name exists.
            codecs.lookup(declared)
        except (LookupError, ValueError, TypeError):
            continue
        return declared
    # No usable declaration: guess from the bytes; detection may return None.
    return detect(buffer)['encoding'] or 'utf-8'


class HTMLSplitHandle(BaseSplitHandle):
    """Split handler that converts an HTML upload to Markdown-style text and splits it into paragraphs."""

    def support(self, file, get_buffer):
        """
        Tell whether this handler should process ``file``.

        The decision is based on the file extension only. Charset detection
        says nothing about whether content is HTML, so it must not be used
        here: doing so would claim unrelated non-ASCII text files.

        :param file: uploaded file object exposing a ``name`` attribute
        :param get_buffer: callable returning the file bytes (unused here)
        :return: True for ``.html`` / ``.htm`` files, False otherwise
        """
        file_name: str = file.name.lower()
        return file_name.endswith((".html", ".htm"))

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """
        Convert the HTML file to text and split it.

        :param file: uploaded file object exposing a ``name`` attribute
        :param pattern_list: split patterns; ``default_pattern_list`` is used when empty or None
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :param save_image: unused by this handler
        :return: dict with the file ``name`` and the split ``content``;
                 ``content`` is an empty list when the file cannot be decoded or converted
        """
        buffer = get_buffer(file)

        if pattern_list is not None and len(pattern_list) > 0:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            encoding = get_encoding(buffer)
            content = html2text(buffer.decode(encoding))
        except Exception:
            # Best effort: an unreadable file yields no paragraphs instead of
            # failing the whole upload, but the reason is recorded.
            logging.getLogger(__name__).exception("Failed to convert HTML file %s to text", file.name)
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }
3 changes: 2 additions & 1 deletion apps/dataset/serializers/document_serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs, UpdateEmbeddingDatasetIdArgs
from common.exception.app_exception import AppApiException
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.html_split_handle import HTMLSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.mixins.api_mixin import ApiMixin
Expand Down Expand Up @@ -772,7 +773,7 @@ def get_buffer(self, file):


default_split_handle = TextSplitHandle()
split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle]
split_handles = [HTMLSplitHandle(), DocSplitHandle(), PdfSplitHandle(), default_split_handle]


def save_image(image_list):
Expand Down
2 changes: 1 addition & 1 deletion ui/src/utils/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ export function getImgUrl(name: string) {
}
// 是否是白名单后缀
export function isRightType(name: string) {
const typeList = ['txt', 'pdf', 'docx', 'csv', 'md']
const typeList = ['txt', 'pdf', 'docx', 'csv', 'md', 'html']
return typeList.includes(fileType(name))
}

Expand Down
6 changes: 4 additions & 2 deletions ui/src/views/dataset/component/UploadComponent.vue
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
action="#"
:auto-upload="false"
:show-file-list="false"
accept=".txt, .md, .csv, .log, .docx, .pdf"
accept=".txt, .md, .csv, .log, .docx, .pdf, .html"
:limit="50"
:on-exceed="onExceed"
:on-change="fileHandleChange"
Expand All @@ -31,7 +31,9 @@
<em class="hover" @click.prevent="handlePreview(true)"> 选择文件夹 </em>
</p>
<div class="upload__decoration">
<p>支持格式:TXT、Markdown、PDF、DOCX,每次最多上传50个文件,每个文件不超过 100MB</p>
<p>
支持格式:TXT、Markdown、PDF、DOCX、HTML,每次最多上传50个文件,每个文件不超过 100MB
</p>
<p>若使用【高级分段】建议上传前规范文件的分段标识</p>
</div>
</div>
Expand Down

0 comments on commit 86f5002

Please sign in to comment.