-
Notifications
You must be signed in to change notification settings - Fork 988
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d9e171c
commit 86f5002
Showing
4 changed files
with
73 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# coding=utf-8 | ||
""" | ||
@project: maxkb | ||
@Author:虎 | ||
@file: html_split_handle.py | ||
@date:2024/5/23 10:58 | ||
@desc: | ||
""" | ||
import re | ||
from typing import List | ||
|
||
from bs4 import BeautifulSoup | ||
from charset_normalizer import detect | ||
from html2text import html2text | ||
|
||
from common.handle.base_split_handle import BaseSplitHandle | ||
from common.util.split_model import SplitModel | ||
|
||
# Fallback split patterns, ordered from heading level 1 ("# ") to level 6
# ("###### "). Each one matches a Markdown-style heading line that begins
# either at the very start of the text or directly after a newline; the
# negative look-arounds on levels 2-6 reject headings with more '#' marks.
default_pattern_list = list(map(re.compile, (
    r'(?<=^)# .*|(?<=\n)# .*',
    r'(?<=\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*',
    r'(?<=\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*',
    r'(?<=\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*',
    r'(?<=\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*',
    r'(?<=\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*',
)))
|
||
|
||
def get_encoding(buffer):
    """
    Work out which text encoding should be used to decode an HTML buffer.

    The encoding declared in the document's ``<meta>`` tags takes priority.
    Both the ``<meta charset="...">`` form and the legacy
    ``<meta http-equiv="Content-Type" content="...; charset=...">`` form are
    recognised. A declared label is only returned when Python has a codec for
    it; otherwise the next ``<meta>`` tag is tried. When no usable declaration
    exists, the result of ``detect(buffer)`` is used.

    :param buffer: raw bytes of the HTML document
    :return: an encoding name, or whatever ``detect`` reports in its
             ``'encoding'`` entry when no declaration is usable (callers
             should be prepared for that value to be ``None``)
    """
    import codecs

    beautiful_soup = BeautifulSoup(buffer, "html.parser")
    for meta in beautiful_soup.find_all('meta'):
        attrs = meta.attrs or {}
        charset = str(attrs.get('charset') or '').strip()
        if not charset:
            # Legacy declaration: the charset is embedded in the content
            # attribute, e.g. content="text/html; charset=gbk".
            declared = re.search(r'charset\s*=\s*["\']?([\w.:-]+)',
                                 str(attrs.get('content') or ''),
                                 re.IGNORECASE)
            charset = declared.group(1) if declared else ''
        if not charset:
            continue
        try:
            # An unknown label would make bytes.decode() raise LookupError in
            # the caller, so skip it and keep looking instead.
            codecs.lookup(charset)
        except LookupError:
            continue
        return charset
    return detect(buffer)['encoding']
|
||
|
||
class HTMLSplitHandle(BaseSplitHandle):
    """
    Split handler for HTML documents.

    The uploaded bytes are decoded, passed through ``html2text`` and the
    resulting text is split with ``SplitModel``.
    """

    def support(self, file, get_buffer):
        """
        Tell whether this handler accepts ``file``.

        :param file: uploaded file object; only its ``name`` is read here
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: ``True`` when the name ends with ``.html`` (case-insensitive),
                 or when ``detect`` reports a non-ASCII encoding with a
                 confidence above 0.5; ``False`` otherwise
        """
        file_name: str = file.name.lower()
        if file_name.endswith(".html"):
            # The extension alone decides; no need to read the content.
            return True
        result = detect(get_buffer(file))
        encoding = result['encoding']
        confidence = result['confidence']
        return (encoding is not None and confidence is not None
                and encoding != 'ascii' and confidence > 0.5)

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """
        Decode an HTML file, convert it with ``html2text`` and split it.

        :param file: uploaded file object; ``file.name`` is echoed in the result
        :param pattern_list: split patterns; ``default_pattern_list`` is used
                             when this is ``None`` or empty
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :param save_image: not used by this handler
        :return: ``{'name': ..., 'content': ...}`` where ``content`` is the
                 result of ``SplitModel.parse``, or an empty list when the
                 document could not be decoded or converted
        """
        import logging

        buffer = get_buffer(file)
        patterns = pattern_list if pattern_list else default_pattern_list
        split_model = SplitModel(patterns, with_filter=with_filter, limit=limit)
        try:
            # Detection may yield no encoding at all; try UTF-8 in that case
            # instead of failing on decode(None).
            encoding = get_encoding(buffer) or 'utf-8'
            content = html2text(buffer.decode(encoding))
        except Exception:
            # Best effort: an unreadable document produces no paragraphs.
            # Exception (not BaseException) is caught so that
            # KeyboardInterrupt / SystemExit still propagate.
            logging.getLogger(__name__).warning(
                "Failed to convert HTML file %s", file.name, exc_info=True)
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters