feat: 支持上传html格式的文档 #364 (#518)

1Panel-dev · May 23, 2024 · 86f5002 · 86f5002
1 parent d9e171c
commit 86f5002
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 4 deletions.
diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py
@@ -0,0 +1,66 @@
+# coding=utf-8
+"""
+    @project: maxkb
+    @Author：虎
+    @file： html_split_handle.py
+    @date：2024/5/23 10:58
+    @desc:
+"""
+import re
+from typing import List
+
+from bs4 import BeautifulSoup
+from charset_normalizer import detect
+from html2text import html2text
+
+from common.handle.base_split_handle import BaseSplitHandle
+from common.util.split_model import SplitModel
+
+# Default split patterns, ordered from heading level 1 ("# ") to level 6 ("###### ").
+# HTMLSplitHandle.handle passes this list to SplitModel when the caller supplies no
+# pattern_list of its own.
+# Every pattern matches a heading line located at the very start of the text or
+# directly after a newline. The level 2-6 patterns additionally carry (?<!#) / (?!#)
+# guards so that a '#' adjacent to the marker prevents the match.
+default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
+                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
+                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
+                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
+                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
+                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
+
+
+def get_encoding(buffer):
+    beautiful_soup = BeautifulSoup(buffer, "html.parser")
+    meta_list = beautiful_soup.find_all('meta')
+    charset_list = [meta.attrs.get('charset') for meta in meta_list if
+                    meta.attrs is not None and 'charset' in meta.attrs]
+    if len(charset_list) > 0:
+        charset = charset_list[0]
+        return charset
+    return detect(buffer)['encoding']
+
+
+class HTMLSplitHandle(BaseSplitHandle):
+    def support(self, file, get_buffer):
+        buffer = get_buffer(file)
+        file_name: str = file.name.lower()
+        if file_name.endswith(".html"):
+            return True
+        result = detect(buffer)
+        if result['encoding'] is not None and result['confidence'] is not None and result['encoding'] != 'ascii' and \
+                result['confidence'] > 0.5:
+            return True
+        return False
+
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
+        buffer = get_buffer(file)
+
+        if pattern_list is not None and len(pattern_list) > 0:
+            split_model = SplitModel(pattern_list, with_filter, limit)
+        else:
+            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
+        try:
+            encoding = get_encoding(buffer)
+            content = buffer.decode(encoding)
+            content = html2text(content)
+        except BaseException as e:
+            return {'name': file.name,
+                    'content': []}
+        return {'name': file.name,
+                'content': split_model.parse(content)
+                }
diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py
@@ -25,6 +25,7 @@
 from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs, UpdateEmbeddingDatasetIdArgs
 from common.exception.app_exception import AppApiException
 from common.handle.impl.doc_split_handle import DocSplitHandle
+from common.handle.impl.html_split_handle import HTMLSplitHandle
 from common.handle.impl.pdf_split_handle import PdfSplitHandle
 from common.handle.impl.text_split_handle import TextSplitHandle
 from common.mixins.api_mixin import ApiMixin
@@ -772,7 +773,7 @@ def get_buffer(self, file):
 
 
 default_split_handle = TextSplitHandle()
-split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle]
+split_handles = [HTMLSplitHandle(), DocSplitHandle(), PdfSplitHandle(), default_split_handle]
 
 
 def save_image(image_list):

diff --git a/ui/src/utils/utils.ts b/ui/src/utils/utils.ts
@@ -43,7 +43,7 @@ export function getImgUrl(name: string) {
 }
 // 是否是白名单后缀
 export function isRightType(name: string) {
-  const typeList = ['txt', 'pdf', 'docx', 'csv', 'md']
+  const typeList = ['txt', 'pdf', 'docx', 'csv', 'md', 'html']
   return typeList.includes(fileType(name))
 }
 

diff --git a/ui/src/views/dataset/component/UploadComponent.vue b/ui/src/views/dataset/component/UploadComponent.vue
@@ -17,7 +17,7 @@
         action="#"
         :auto-upload="false"
         :show-file-list="false"
-        accept=".txt, .md, .csv, .log, .docx, .pdf"
+        accept=".txt, .md, .csv, .log, .docx, .pdf, .html"
         :limit="50"
         :on-exceed="onExceed"
         :on-change="fileHandleChange"
@@ -31,7 +31,9 @@
             <em class="hover" @click.prevent="handlePreview(true)"> 选择文件夹 </em>
           </p>
           <div class="upload__decoration">
-            <p>支持格式：TXT、Markdown、PDF、DOCX，每次最多上传50个文件，每个文件不超过 100MB</p>
+            <p>
+              支持格式：TXT、Markdown、PDF、DOCX、HTML，每次最多上传50个文件，每个文件不超过 100MB
+            </p>
             <p>若使用【高级分段】建议上传前规范文件的分段标识</p>
           </div>
         </div>