In [1]:
import openai
from importlib import reload
from utils.structure_content import process_folder_concurrently,StructuredDocument
from llm_client import llm_client
from llm_client.agent import MultiTurnAgent
from llm_client.prompt_builder import  PromptBuilder
from llm_client.document_vector_store import (
    EmbeddingProcessor,
    Document,
    DocumentStore,
    SimpleDocument,
    VectorStore
)
from kme_doc import KMEDocument
from tqdm import tqdm
import pandas as pd
import os
from content_retrieval import query_document, llm,doc_store, summary_doc_store ,embed,embedding

2025-08-22 09:49:09,681 - INFO - Loading faiss with AVX512 support.
2025-08-22 09:49:09,682 - INFO - Could not load library with AVX512 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx512'")
2025-08-22 09:49:09,682 - INFO - Loading faiss with AVX2 support.
2025-08-22 09:49:09,693 - INFO - Successfully loaded faiss with AVX2 support.


Loading .env file from: .env


2025-08-22 09:49:11,317 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-03-01-preview "HTTP/1.1 200 OK"


Initialized new FAISS index with dimension 3072.
Syncing VectorStore with DocumentStore...
VectorStore is already in sync. No new documents to add.


In [2]:
# Load the semicolon-separated KME translation table and index it by KME_ID,
# so rows can be looked up directly by KM number.
kme_vertaaltabel = (
    pd.read_csv("data/kme_vertaaltabel.csv", sep=";")
    .set_index("KME_ID")
)

### Verwerken van nieuwe content in content map

In [3]:
# Distinct values of the two taxonomy columns of the translation table.
belastingsoort = pd.unique(kme_vertaaltabel["BELASTINGSOORT"]).tolist()
proces = pd.unique(kme_vertaaltabel["PROCES_ONDERWERP"]).tolist()

### Opbouwen van de doc store uit pdf-bestanden

In [None]:
def check_for_new_content(content_dir='content', store=None):
    """Return the PDF filenames in ``content_dir`` that are not yet in the document store.

    A file counts as already known when its name equals the ``'filename'``
    metadata entry of any stored document.

    Args:
        content_dir: Folder to scan. Defaults to ``'content'``.
        store: Object exposing a ``documents`` mapping whose values carry a
            ``metadata`` dict. Defaults to the notebook-level ``doc_store``.

    Returns:
        Sorted list of new ``.pdf`` filenames (names only, without the folder).
    """
    if store is None:
        store = doc_store
    # A set gives O(1) membership checks; .get() tolerates stored documents
    # that have no 'filename' entry instead of raising KeyError.
    known_filenames = {
        stored_doc.metadata.get('filename')
        for stored_doc in store.documents.values()
    }
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    return sorted(
        filename
        for filename in os.listdir(content_dir)
        if filename.endswith(".pdf") and filename not in known_filenames
    )

# Parse only the PDFs in the content folder that are not yet in the document store.
new_filenames = check_for_new_content()
new_content_doc = {}
if new_filenames:
    new_content_doc = process_folder_concurrently('content',max_workers=8,filenames_to_process=new_filenames)

docs = []
skipped_filenames = []  # parsed files whose KM number has no row in kme_vertaaltabel
for filename, doc in new_content_doc.items():
    if doc.km_number not in kme_vertaaltabel.index:
        # Without a row in the translation table the metadata cannot be built.
        # Skip this file instead of aborting the whole batch with a KeyError.
        skipped_filenames.append(filename)
        continue
    kme_tax = kme_vertaaltabel.loc[doc.km_number]
    metadata = {
        'filename': filename,
        'datum': doc.date,
        'BELASTINGSOORT': kme_tax['BELASTINGSOORT'],
        'PROCES_ONDERWERP': kme_tax['PROCES_ONDERWERP'],
        'PRODUCT_SUBONDERWERP': kme_tax['PRODUCT_SUBONDERWERP'],
        'VRAAG': kme_tax['VRAAG'],
    }
    # The KM number is passed as the first argument of Document
    # (presumably the document id -- confirm against Document's signature).
    new_doc = Document(doc.km_number,doc.title,doc.full_text,metadata)
    docs.append(new_doc)

if skipped_filenames:
    print(f"Skipped {len(skipped_filenames)} file(s) without a matching KME_ID: {skipped_filenames}")

# Only touch the store when there is something to add.
if docs:
    doc_store.add(docs)

In [20]:
# Filenames of all documents currently held in the document store.
[stored_doc.metadata['filename'] for stored_doc in doc_store.documents.values()]

['Hoe_kom_ik_aan_een_aangifte_schenkbelasting_KM1008494_v19-0_nl-NL.pdf',
 'KM1006633.pdf',
 'Kan_ik_Mijn_Belastingdienst_en_de_online_aangifte_gebruiken_als_ik_emi-immigreer_KM1014030_v5-0_nl-NL.pdf',
 'KM1004417.pdf',
 'Betaling_niet_of_te_laat_gedaan._KM1001164_v2-0_nl-NL.pdf',
 'Gevolgen_aangifte_niette_laat_ingeleverd._KM1010402_v2-0_nl-NL.pdf',
 '(papieren_starters)_Aangifte_niet_of_te_laat_ingeleverd._KM1004804_v5-0_nl-NL.pdf',
 'Ik_moet_eenmalig_aangifte_doen._KM1014418_v3-0_nl-NL.pdf',
 'KM1004137.pdf',
 'KM1002187.pdf',
 'Ik_wil_een_ander_aangiftebiljet._KM1045050_v1-0_nl-NL.pdf',
 'Kan_ik_digitaal_aangifte_doen_KM1008258_v9-0_nl-NL.pdf',
 'Kan_ik_de_aangifte_erfbelasting_op_papier_doen_KM1000937_v24-0_nl-NL.pdf',
 'Hoe_moet_ik_digitaal_aangifte_doen_KM1041400_v1-0_nl-NL.pdf',
 'Ik_woon_in_het_buitenland_ben_EU-ambtenaar._Welke_aangifte_gebruik_ik_KM1004695_v2-0_nl-NL.pdf',
 'Ik_heb_gezamenlijk_aangifte_gedaan._Wie_krijgt_de_aanslag_KM1014247_v1-0_nl-NL.pdf',
 'Ik_ben_te_laat

In [9]:
# Number of documents currently held in the document store.
len(doc_store.documents)

57

### Samenvatten en segmenteren content

In [None]:
# TODO: this should become asynchronous in the future.
# Build a summary for every stored document that has no entry in
# summary_doc_store yet, then add all new summaries in one call.
summary_processor = PromptBuilder(template_path='prompt_templates',name='summarize')
processed_docs = []
failed_docs = {}  # doc_id -> reason why no summary was produced

for doc_id, doc in tqdm(doc_store.documents.items()):
    # Documents that already have a summary are left untouched.
    if doc_id in summary_doc_store.documents:
        continue
    try:
        prompt = summary_processor.create_prompt(
            document=doc.content,
            question=doc.metadata['VRAAG'],
            taxonomy_path=[
                doc.metadata['BELASTINGSOORT'],
                doc.metadata['PROCES_ONDERWERP'],
                doc.metadata['PRODUCT_SUBONDERWERP'],
            ],
        )
        output = llm.process(prompt,reasoning_effort='low')
        # output.content is indexed by 'content' and 'metadata' below
        # (presumably an already-parsed dict -- confirm against llm.process).
        output_json = output.content
        if not summary_processor.verify_json(output_json):
            failed_docs[doc_id] = "LLM output failed JSON verification"
            continue
        new_summary = KMEDocument(id=doc_id, title=doc.title, content=output_json['content'], metadata=output_json['metadata'])
        # Source metadata overrides keys of the same name returned by the LLM.
        new_summary.metadata.update(doc.metadata)
        new_summary.metadata["full_text"] = doc.content
        processed_docs.append(new_summary)
    except Exception as e:
        # Best effort: one failing document must not stop the batch, but the
        # failure is recorded so it can be reported instead of vanishing.
        failed_docs[doc_id] = f"{type(e).__name__}: {e}"

if failed_docs:
    print(f"No summary produced for {len(failed_docs)} document(s):")
    for failed_id, reason in failed_docs.items():
        print(f"  {failed_id}: {reason}")

if processed_docs:
    summary_doc_store.add(processed_docs)

In [None]:
from content_retrieval import query_document, llm,doc_store, summary_doc_store ,embed,embedding
from llm_client.tools.vector_search_tool import VectorSearchTool
from llm_client.tools.document_shortlist_tool import DocumentShortlistTool # Import the specific tool class
# Tools the agent may call while answering a query.
vs_tool = VectorSearchTool(vector_store=embedding)
slt = DocumentShortlistTool()

# Prompt builder for the 'search' template in the prompt_templates folder.
prompt_processor = PromptBuilder(template_path='prompt_templates', name='search')

# Multi-turn agent wired to the shared LLM client and the instantiated tools.
agent = MultiTurnAgent(llm_processor=llm, prompt_processor=prompt_processor, tools=[vs_tool, slt])

In [None]:
# Run one search conversation with the agent; max_tool_turns=15 is passed as the tool-turn limit.
agent.chat(query="Zoek documenten die gaan over het doen van inkomstenbelasting",max_tool_turns=15)

In [None]:
# Look up the VRAAG metadata of a single stored document.
# NOTE(review): the key used here looks like a filename, while documents added in
# this notebook are created with doc.km_number as their first argument -- confirm
# that 'KM1002187.pdf' is really a key of doc_store.documents (else this raises KeyError).
doc_store.documents['KM1002187.pdf'].metadata['VRAAG']