-
Notifications
You must be signed in to change notification settings - Fork 353
/
web.py
31 lines (28 loc) · 1.22 KB
/
web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from typing import List
import re
from langchain.docstore.document import Document
from langchain_community.document_loaders import WebBaseLoader
from .document_loading_base import DocumentLoadingBase
from ..common.source_document import SourceDocument
class WebDocumentLoading(DocumentLoadingBase):
    """Document-loading strategy that reads a URL via ``WebBaseLoader``.

    The loaded page content is normalised (see ``load``) and returned as
    ``SourceDocument`` objects.
    """

    # Runs of three or more consecutive newlines; replaced by exactly two.
    _EXCESS_NEWLINES = re.compile(r"\n{3,}")

    # Characters stripped from *anywhere* in the content: ASCII control
    # characters (U+0000-U+001F, U+007F), U+0080-U+00A0, U+2000-U+3000 and
    # U+FFF0-U+FFFF.
    # NOTE(review): the control range includes "\n" and "\t", so every line
    # break is removed by this pattern as well -- kept as-is because callers
    # may rely on the current output; confirm whether that is intended.
    _UNWANTED_CHARS = re.compile(
        r"[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]"
    )

    def __init__(self) -> None:
        super().__init__()

    def load(self, document_url: str) -> List[SourceDocument]:
        """Load ``document_url`` and return its cleaned, non-empty documents.

        Each document returned by ``WebBaseLoader(document_url).load()`` has
        its ``page_content`` cleaned in two steps: runs of three or more
        newlines are collapsed to two, then every character matched by
        ``_UNWANTED_CHARS`` is removed. Documents whose content is empty
        after cleaning are dropped.

        Args:
            document_url: Location passed unchanged to ``WebBaseLoader``.

        Returns:
            One ``SourceDocument`` per non-empty document, in loader order,
            with ``content`` set to the cleaned text and ``source`` taken
            from the document's ``metadata["source"]``.

        Raises:
            KeyError: If a retained document's metadata has no ``"source"``
                entry (assuming ``metadata`` is a plain mapping).
        """
        documents: List[Document] = WebBaseLoader(document_url).load()

        source_documents: List[SourceDocument] = []
        for document in documents:
            content = self._EXCESS_NEWLINES.sub("\n\n", document.page_content)
            content = self._UNWANTED_CHARS.sub("", content)
            document.page_content = content

            # Build a new list rather than calling ``documents.remove`` while
            # iterating: removing during iteration makes the loop skip the
            # element that follows, leaving it uncleaned and unfiltered.
            if content == "":
                continue

            source_documents.append(
                SourceDocument(
                    content=content,
                    source=document.metadata["source"],
                )
            )
        return source_documents