-
Notifications
You must be signed in to change notification settings - Fork 304
/
DocumentProcessorHelper.py
88 lines (79 loc) · 3.67 KB
/
DocumentProcessorHelper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import logging
from typing import List
from ..integrated_vectorization.AzureSearchIndex import AzureSearchIndex
from ..integrated_vectorization.AzureSearchIndexer import AzureSearchIndexer
from ..integrated_vectorization.AzureSearchDatasource import AzureSearchDatasource
from ..integrated_vectorization.AzureSearchSkillset import AzureSearchSkillset
from .AzureSearchHelper import AzureSearchHelper
from .DocumentLoadingHelper import DocumentLoading, LoadingSettings
from .DocumentChunkingHelper import DocumentChunking, ChunkingSettings
from ..common.SourceDocument import SourceDocument
from .EnvHelper import EnvHelper
from .LLMHelper import LLMHelper
logger = logging.getLogger(__name__)
class Processor(ChunkingSettings, LoadingSettings):
    """Per-document-type processing configuration.

    Bundles the chunking and loading settings for one document type, plus a
    flag selecting the (not yet supported) advanced image processing path.
    """

    def __init__(
        self,
        document_type: str,
        chunking: ChunkingSettings | None,
        loading: LoadingSettings | None,
        use_advanced_image_processing: bool,
    ):
        # document_type: file-type key this configuration applies to.
        self.document_type = document_type
        # chunking/loading may be None (e.g. when advanced image processing
        # is requested and neither step applies).
        self.chunking = chunking
        self.loading = loading
        self.use_advanced_image_processing = use_advanced_image_processing

    def __eq__(self, other):
        # Fix: the original tested isinstance(self, other.__class__), i.e. the
        # wrong operand — a Processor subclass on the right-hand side would
        # always compare unequal, breaking symmetry. Test `other` instead, and
        # return NotImplemented (not False) for foreign types so Python can
        # fall back to the reflected comparison, per the data-model protocol.
        if not isinstance(other, Processor):
            return NotImplemented
        return (
            self.document_type == other.document_type
            and self.chunking == other.chunking
            and self.loading == other.loading
            and self.use_advanced_image_processing
            == other.use_advanced_image_processing
        )
class DocumentProcessor:
    """Pushes documents into Azure AI Search.

    Two paths are supported: in-process loading/chunking/embedding via the
    vector store (``process``), and Azure's server-side integrated
    vectorization pipeline (``process_using_integrated_vectorisation``).
    """

    def __init__(self):
        pass

    def process(self, source_url: str, processors: List[Processor]):
        """Load, chunk and embed the document at ``source_url``.

        Returns the vector store's ``add_documents`` result.

        NOTE(review): the ``return`` inside the loop means only the FIRST
        processor without advanced image processing is applied; any remaining
        processors are skipped. Confirm whether multiple processors per
        document are ever expected here.
        """
        vector_store_helper = AzureSearchHelper()
        vector_store = vector_store_helper.get_vector_store()
        for processor in processors:
            if processor.use_advanced_image_processing:
                # Fix: logger.warn() is deprecated; warning() is the
                # supported spelling.
                logger.warning("Advanced image processing is not supported yet")
                continue
            try:
                documents: List[SourceDocument] = DocumentLoading().load(
                    source_url, processor.loading
                )
                documents = DocumentChunking().chunk(
                    documents, processor.chunking
                )
                # Capture ids before converting away from SourceDocument.
                keys = [document.id for document in documents]
                langchain_documents = [
                    document.convert_to_langchain_document()
                    for document in documents
                ]
                return vector_store.add_documents(
                    documents=langchain_documents, keys=keys
                )
            except Exception:
                # Fix: logger.exception records the stack trace and uses lazy
                # %-formatting; bare `raise` preserves the original traceback
                # (the original `raise e` re-raised from this frame).
                logger.exception("Error adding embeddings for %s", source_url)
                raise

    def process_using_integrated_vectorisation(self, source_url: str):
        """Provision the integrated-vectorization pipeline.

        Creates/updates the datasource, index, skillset and indexer in turn
        and returns the indexer creation result.
        """
        env_helper: EnvHelper = EnvHelper()
        llm_helper: LLMHelper = LLMHelper()
        try:
            search_datasource = AzureSearchDatasource(env_helper)
            search_datasource.create_or_update_datasource()
            search_index = AzureSearchIndex(env_helper, llm_helper)
            search_index.create_or_update_index()
            search_skillset = AzureSearchSkillset(env_helper)
            search_skillset_result = search_skillset.create_skillset()
            search_indexer = AzureSearchIndexer(env_helper)
            # The indexer ties datasource -> skillset -> index together.
            return search_indexer.create_or_update_indexer(
                env_helper.AZURE_SEARCH_INDEXER_NAME,
                skillset_name=search_skillset_result.name,
            )
        except Exception:
            # Fix: exception() over error(f"...") — stack trace + lazy args;
            # bare `raise` keeps the original traceback intact.
            logger.exception("Error processing %s", source_url)
            raise