class HTMLSplitHandle(BaseSplitHandle):
    """Split handler for uploaded HTML documents.

    The raw bytes are decoded, converted to text with ``html2text`` and then
    segmented into paragraphs by ``SplitModel``.
    """

    # File extensions (lower-case) that this handler accepts.
    supported_suffixes = ('.html', '.htm')

    def support(self, file, get_buffer):
        """Return True when ``file`` should be processed as HTML.

        The decision is based on the file extension only. Sniffing the
        character encoding of the content is not a valid HTML test: any
        non-ASCII text file (txt, md, csv, ...) has a detectable encoding, so
        a content-based check would make this handler claim files that belong
        to other handlers whenever it is consulted before them.

        :param file: uploaded file object exposing a ``name`` attribute
        :param get_buffer: callable returning the file bytes; kept for
                           interface compatibility, not used here
        :return: True if the file name ends with a supported HTML suffix
        """
        file_name: str = file.name.lower()
        return file_name.endswith(self.supported_suffixes)

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Convert the HTML file to text and split it into segments.

        :param file: uploaded file object exposing a ``name`` attribute
        :param pattern_list: title patterns used for splitting; when empty or
                             None the module-level ``default_pattern_list``
                             is used
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the file content as bytes
        :param save_image: unused by this handler; part of the handler interface
        :return: dict with the file ``name`` and the parsed ``content``;
                 ``content`` is an empty list when the file cannot be decoded
                 or converted
        """
        buffer = get_buffer(file)

        patterns = pattern_list if pattern_list else default_pattern_list
        split_model = SplitModel(patterns, with_filter=with_filter, limit=limit)
        try:
            # Encoding detection may yield None when nothing can be determined;
            # bytes.decode(None) would raise, so fall back to UTF-8.
            encoding = get_encoding(buffer) or 'utf-8'
            content = html2text(buffer.decode(encoding))
        except Exception:
            # Best effort: a broken file must not abort the whole upload, but
            # the failure is logged instead of being silently discarded.
            # ``Exception`` (not ``BaseException``) lets KeyboardInterrupt and
            # SystemExit propagate.
            import logging
            logging.getLogger(__name__).exception('Failed to parse html file: %s', file.name)
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }
b/apps/dataset/serializers/document_serializers.py @@ -25,6 +25,7 @@ from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs, UpdateEmbeddingDatasetIdArgs from common.exception.app_exception import AppApiException from common.handle.impl.doc_split_handle import DocSplitHandle +from common.handle.impl.html_split_handle import HTMLSplitHandle from common.handle.impl.pdf_split_handle import PdfSplitHandle from common.handle.impl.text_split_handle import TextSplitHandle from common.mixins.api_mixin import ApiMixin @@ -772,7 +773,7 @@ def get_buffer(self, file): default_split_handle = TextSplitHandle() -split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle] +split_handles = [HTMLSplitHandle(), DocSplitHandle(), PdfSplitHandle(), default_split_handle] def save_image(image_list): diff --git a/ui/src/utils/utils.ts b/ui/src/utils/utils.ts index 027b1a67..581a4ec5 100644 --- a/ui/src/utils/utils.ts +++ b/ui/src/utils/utils.ts @@ -43,7 +43,7 @@ export function getImgUrl(name: string) { } // 是否是白名单后缀 export function isRightType(name: string) { - const typeList = ['txt', 'pdf', 'docx', 'csv', 'md'] + const typeList = ['txt', 'pdf', 'docx', 'csv', 'md', 'html'] return typeList.includes(fileType(name)) } diff --git a/ui/src/views/dataset/component/UploadComponent.vue b/ui/src/views/dataset/component/UploadComponent.vue index 98a785a9..54181156 100644 --- a/ui/src/views/dataset/component/UploadComponent.vue +++ b/ui/src/views/dataset/component/UploadComponent.vue @@ -17,7 +17,7 @@ action="#" :auto-upload="false" :show-file-list="false" - accept=".txt, .md, .csv, .log, .docx, .pdf" + accept=".txt, .md, .csv, .log, .docx, .pdf, .html" :limit="50" :on-exceed="onExceed" :on-change="fileHandleChange" @@ -31,7 +31,9 @@ 选择文件夹

-

支持格式:TXT、Markdown、PDF、DOCX,每次最多上传50个文件,每个文件不超过 100MB

+

+ 支持格式:TXT、Markdown、PDF、DOCX、HTML 每次最多上传50个文件,每个文件不超过 100MB +

若使用【高级分段】建议上传前规范文件的分段标识