In [1]:
import openai
from importlib import reload
from utils.structure_content import process_folder_concurrently,StructuredDocument
from llm_client import llm_client
from llm_client.agent import MultiTurnAgent
from llm_client.prompt_builder import  PromptBuilder
from llm_client.document_vector_store import (
    EmbeddingProcessor,
    Document,
    DocumentStore,
    SimpleDocument,
    VectorStore
)
from kme_doc import KMEDocument
from tqdm import tqdm
import pandas as pd
import os
from content_retrieval import query_document, llm,doc_store, summary_doc_store ,embed,embedding

2025-08-06 07:37:10,169 - INFO - Loading faiss with AVX512 support.
2025-08-06 07:37:10,209 - INFO - Successfully loaded faiss with AVX512 support.
2025-08-06 07:37:10,217 - INFO - Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.


Loaded FAISS index (60 vectors) and ID set from disk.
Syncing VectorStore with DocumentStore...
VectorStore is already in sync. No new documents to add.


In [2]:
# Load the semicolon-separated KME translation table and key it by KME_ID.
kme_vertaaltabel = (
    pd.read_csv("data/kme_vertaaltabel.csv", sep=';')
    .set_index("KME_ID")
)

### Verwerken van nieuwe content in de map `content`

In [3]:
# Distinct values of the two taxonomy columns, in order of first appearance.
belastingsoort, proces = (
    kme_vertaaltabel[column_name].unique().tolist()
    for column_name in ('BELASTINGSOORT', 'PROCES_ONDERWERP')
)

In [4]:
def check_for_new_content(content_dir='content', known_ids=None):
    """Return the PDF file names in ``content_dir`` that are not yet known.

    Args:
        content_dir: Directory to scan. Defaults to ``'content'``.
        known_ids: Container of already-ingested document ids (anything that
            supports ``in``). When omitted, the keys of ``doc_store.documents``
            are used, looked up at call time.

    Returns:
        Sorted list of entries in ``content_dir`` whose name ends in ``.pdf``
        and is not contained in ``known_ids``.

    Raises:
        FileNotFoundError: If ``content_dir`` does not exist (raised by
            ``os.listdir``).
    """
    if known_ids is None:
        known_ids = doc_store.documents.keys()
    # os.listdir yields entries in arbitrary order; sorting keeps repeated
    # runs deterministic.
    return sorted(
        file_name
        for file_name in os.listdir(content_dir)
        if file_name.endswith(".pdf") and file_name not in known_ids
    )

# Ingest every PDF in the content folder that the document store does not hold yet.
new_document_ids = check_for_new_content()
new_content_doc = {}
if new_document_ids:
    new_content_doc = process_folder_concurrently('content',max_workers=8,filenames_to_process=new_document_ids)

docs = []
skipped_ids = []
for k, doc in new_content_doc.items():
    # Without a row in the translation table the metadata cannot be built.
    # Skip and report the file instead of aborting the whole run with a KeyError.
    if doc.km_number not in kme_vertaaltabel.index:
        skipped_ids.append(k)
        continue
    kme_tax = kme_vertaaltabel.loc[doc.km_number]
    metadata = {
        'km_number': doc.km_number,
        'datum': doc.date,
        'BELASTINGSOORT': kme_tax['BELASTINGSOORT'],
        'PROCES_ONDERWERP': kme_tax['PROCES_ONDERWERP'],
        'PRODUCT_SUBONDERWERP': kme_tax['PRODUCT_SUBONDERWERP'],
        'VRAAG': kme_tax['VRAAG'],
    }
    docs.append(Document(k, doc.title, doc.full_text, metadata))

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} document(s) whose km_number is missing from kme_vertaaltabel: {skipped_ids}")

# Only call the store when there is something to add, matching the guard
# used for summary_doc_store in the summarization cell.
if docs:
    doc_store.add(docs)

### Samenvatten en segmenteren content

In [5]:
# TODO: make this async in the future.
# Build a summary for every stored document that has none yet. Processing is
# best effort: a failure on one document is reported and the loop moves on.
summary_processor = PromptBuilder(template_path='prompt_templates',name='summarize')
processed_docs = []
failed_ids = []

for k, doc in tqdm(doc_store.documents.items()):
    # Documents that already have a summary are left untouched.
    if k in summary_doc_store.documents.keys():
        continue
    try:
        prompt = summary_processor.create_prompt(
            document=doc.content,
            question=doc.metadata['VRAAG'],
            taxonomy_path=[
                doc.metadata['BELASTINGSOORT'],
                doc.metadata['PROCES_ONDERWERP'],
                doc.metadata['PRODUCT_SUBONDERWERP'],
            ],
        )
        output = llm.process(prompt)
        output_json = output.content
        if not summary_processor.verify_json(output_json):
            # Previously dropped without a trace; make the rejection visible.
            failed_ids.append(k)
            tqdm.write(f"Summary for {k} skipped: output rejected by verify_json.")
            continue
        new_summary = KMEDocument(id=k, title=doc.title, content=output_json['content'], metadata=output_json['metadata'])
        # Source-document metadata is merged on top of the generated metadata,
        # and the full text is kept alongside the summary.
        new_summary.metadata.update(doc.metadata)
        new_summary.metadata["full_text"] = doc.content
        processed_docs.append(new_summary)
    except Exception as e:
        # One bad document must not stop the batch, but the error has to be
        # visible so the document can be retried. tqdm.write prints without
        # corrupting the progress bar.
        failed_ids.append(k)
        tqdm.write(f"Summary for {k} failed: {type(e).__name__}: {e}")

if processed_docs:
    summary_doc_store.add(processed_docs)

if failed_ids:
    print(f"{len(failed_ids)} document(s) were not summarized: {failed_ids}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 407873.97it/s]


In [15]:
# NOTE(review): these imports repeat/extend the notebook's first import cell;
# they are kept here so this cell behaves exactly as it did before.
from content_retrieval import (
    query_document,
    llm,
    doc_store,
    summary_doc_store,
    embed,
    embedding,
)
from llm_client.tools.vector_search_tool import VectorSearchTool
from llm_client.tools.document_shortlist_tool import DocumentShortlistTool

# Instantiate the tools the agent may call.
vs_tool = VectorSearchTool(vector_store=embedding)
slt = DocumentShortlistTool()

# Multi-turn agent driven by the 'search' prompt template.
prompt_processor = PromptBuilder('prompt_templates', 'search')
agent = MultiTurnAgent(
    tools=[vs_tool, slt],
    prompt_processor=prompt_processor,
    llm_processor=llm,
)

In [16]:
# Run one search conversation, allowing at most 15 tool turns; the return
# value is displayed as the cell output.
agent.chat(
    query="Zoek documenten die gaan over het doen van inkomstenbelasting",
    max_tool_turns=15,
)

2025-08-06 08:04:14,770 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:15,789 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


--- DEBUG: Agent searching for 4 queries: ['het doen van inkomstenbelasting', 'voorbereiding inkomstenbelasting', 'indiening inkomstenbelasting', 'deadlines inkomstenbelasting'] ---


2025-08-06 08:04:16,166 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:16,305 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:16,438 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:16,584 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:18,960 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


[{'document_id': 'KM1006199', 'score': 5}, {'document_id': 'KM1014138', 'score': 5}, {'document_id': 'KM1010402', 'score': 5}, {'document_id': 'KM1012647', 'score': 5}, {'document_id': 'KM1010409', 'score': 4}, {'document_id': 'KM1000947', 'score': 4}, {'document_id': 'KM1010754', 'score': 4}, {'document_id': 'KM1004778', 'score': 4}, {'document_id': 'KM1001947', 'score': 4}, {'document_id': 'KM1002162', 'score': 3}]


2025-08-06 08:04:20,448 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:21,449 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:21,582 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


--- DEBUG: Agent searching for 3 queries: ['online aangifte inkomstenbelasting', 'gebruik online aangifte inkomstenbelasting', 'instructies online inkomstenbelasting doen'] ---


2025-08-06 08:04:21,718 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:21,854 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:23,408 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


[{'document_id': 'KM1004778', 'score': 5}, {'document_id': 'KM1014138', 'score': 5}, {'document_id': 'KM1014030', 'score': 5}, {'document_id': 'KM1014322', 'score': 5}, {'document_id': 'KM1010754', 'score': 5}]


2025-08-06 08:04:25,176 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:31,704 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


--- DEBUG: Agent searching for 3 queries: ['veelvoorkomende fouten aangifte inkomstenbelasting', 'tips inkomstenbelasting doen', 'problemen bij doen van aangifte inkomstenbelasting'] ---


2025-08-06 08:04:32,090 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:32,230 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:32,371 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:34,209 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


[{'document_id': 'KM1010402', 'score': 5}, {'document_id': 'KM1004544', 'score': 4}, {'document_id': 'KM1014481', 'score': 4}]


2025-08-06 08:04:36,381 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-08-06 08:04:41,581 - INFO - HTTP Request: POST https://ino-ai-foundry-kis.openai.azure.com/openai/deployments/gpt-4.1-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


"Ik heb relevante documenten gevonden over het doen van inkomstenbelasting met betrekking tot voorbereiding, indiening en deadlines. Hierin wordt onder andere uitgelegd dat de aangifte inkomstenbelasting meestal online kan worden ingediend vanaf 1 maart via 'Mijn Belastingdienst' of de app, met specifieke deadlines (bijvoorbeeld uiterlijk 1 mei voor P-aangifte). Er is ook informatie over de exacte wijzen van aangifte en gevolgen wanneer men te laat is met indienen, zoals verzuimboetes en belastingrente.\n\nDaarnaast zijn er specifieke documenten over het gebruik van de online aangifte, inclusief online aangifte voor mensen die in het buitenland wonen of emi-immigreren, met de vereiste DigiD en gebruik van speciale aangiftebiljetten (P-, M-biljetten).\n\nOok vond ik informatie over veelvoorkomende fouten en tips, zoals de gevolgen van het te laat indienen van de aangifte, waaronder het niet kunnen aanvragen van uitstel, het moeten betalen van belastingrente en het mogelijk niet tijdig o

In [20]:
# Spot-check the question ('VRAAG') stored in the metadata of one document.
spot_check_document = doc_store.documents['KM1002187.pdf']
spot_check_document.metadata['VRAAG']

'Burger/ondernemer heeft financiele problemen.'