# Install Requirements

Use:
```bash
pip install pdfplumber python-docx sentence-transformers langchain_huggingface langchain_community transformers numpy pandas tqdm chromadb
```

# Make dataset from documents

### Imports

In [None]:
import os
from tqdm import tqdm
import pandas as pd
import pdfplumber
from docx import Document

### Functions

In [None]:
def find_files_recursively(directory: str, extensions: list) -> list:
    """
    Recursively find all files in the given directory with specified extensions.
    Args:
        directory (str): Directory to search.
        extensions (list): List of file extensions (e.g., ['.pdf', '.docx']).
    Returns:
        List of file paths matching the extensions.
    """
    files = []
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            if any(filename.lower().endswith(ext) for ext in extensions):
                files.append(os.path.join(root, filename))
    return files

def extract_pdf_text(file_path: str) -> str:
    """
    Extract text from a PDF file using pdfplumber.
    Args:
        file_path (str): Path to the PDF file.
    Returns:
        Extracted text, or empty string if extraction fails.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
        return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

def extract_docx_text(file_path: str) -> str:
    """
    Extract text from a DOCX file using python-docx.
    Args:
        file_path (str): Path to the DOCX file.
    Returns:
        Extracted text, or empty string if extraction fails.
    """
    try:
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
        return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

def read_text_file(file_path: str) -> str:
    """
    Read content from a TXT or MD file.
    Args:
        file_path (str): Path to the TXT or MD file.
    Returns:
        File content, or empty string if reading fails.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        return content
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

def convert_file(file_path: str, output_format: str = "txt") -> str:
    """
    Convert a single file to the specified format using simpler tools.
    Args:
        file_path (str): Path to the file to convert.
        output_format (str): Desired output format ('txt' for plain text).
    Returns:
        Extracted content, or empty string if conversion fails.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        return extract_pdf_text(file_path)
    elif ext == ".docx":
        return extract_docx_text(file_path)
    elif ext in [".txt", ".md"]:
        return read_text_file(file_path)
    else:
        print(f"Unsupported file extension for {file_path}")
        return ""

def process_files(data_dir: str, output_pickle: str, extensions: list, output_format: str = "txt"):
    """
    Process all files in the data directory, extract text, and save to a pickled DataFrame.
    Args:
        data_dir (str): Directory containing the files to process.
        output_pickle (str): Path to the output pickled DataFrame file.
        extensions (list): List of file extensions to process (e.g., ['.pdf', '.docx', '.txt', '.md']).
        output_format (str): Output format ('txt' for plain text).
    """
    # Find all files recursively
    files = find_files_recursively(data_dir, extensions)
    print(f"Found {len(files)} files to process.")

    # Process files with a progress bar
    data = []
    for file in tqdm(files):
        content = convert_file(file, output_format)
        if content:
            data.append({"filename": os.path.basename(file), "content": content})

    # Save to pickled DataFrame
    df = pd.DataFrame(data)
    df['content'] = df['content'].str.strip()
    df['content'] = df['content'].str.replace(r'\n+', ' ', regex=True)

    return df


### Run it!

In [None]:
# Configuration
data_directory = "files"  # Adjust to your data folder
output_pickle_file = os.path.join(data_directory, "processed_files.pkl")
file_extensions = [".pdf", ".docx", ".txt", ".md"]  # Supported extensions

# Run the processing
df = process_files(data_directory, output_pickle_file, file_extensions, output_format="txt")


Found 30 files to process.


100%|██████████| 30/30 [00:01<00:00, 15.74it/s]


### Check the created dataset

In [2]:
print(df.head())


                                            filename  \
0  Uitstel beantwoording vragen van de leden Vedd...   
1                                   Sprekerslijst.md   
2  De uitsluiting van het Afghaanse vrouwen voetb...   
3  Het bericht dat de gemeente Haarlem ouders vra...   
4  Het beëindigen van de VWS-subsidie aan het IK...   

                                             content  
0  2 Tweede Kamer der Staten-Generaal Vergaderjaa...  
1  # Tweede Kamer ## DER STATEN-GENERAAL ### SPRE...  
2  2 Tweede Kamer der Staten-Generaal Vergaderjaa...  
3  2 Tweede Kamer der Staten-Generaal Vergaderjaa...  
4  2025Z06621 (ingezonden 7 april 2025) Vragen va...  


# Run the Vector store and RAG part (ChromaDB)

### Import

In [None]:
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline
from langchain.vectorstores import Chroma


### Functions

In [None]:
def split_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list:
    """
    Split the text into chunks for embedding.
    Args:
        text (str): The text to split.
        chunk_size (int): Size of each chunk in characters.
        chunk_overlap (int): Overlap between chunks in characters.
    Returns:
        list: List of text chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(text)

def create_vector_store(df: pd.DataFrame, embeddings: str, persist_directory: str = "./chroma_db") -> Chroma:
    """
    Create a ChromaDB vector store from text chunks with metadata.
    Args:
        df (pd.DataFrame): DataFrame containing the document content and filenames.
        embeddings (str): The embeddings model name (e.g., "sentence-transformers/all-MiniLM-L6-v2").
        persist_directory (str): Directory to persist the ChromaDB store.
    Returns:
        Chroma: The Chroma vector store instance.
    """
    # Create the embeddings model
    embeddings_model = HuggingFaceEmbeddings(model_name=embeddings)

    # Split text into chunks per document and collect metadata
    chunks = []
    metadatas = []
    for _, row in df.iterrows():
        doc_chunks = split_text(row["content"])
        chunks.extend(doc_chunks)
        metadatas.extend([{"filename": row["filename"]} for _ in doc_chunks])

    # Create Chroma vector store from texts and metadata
    vectorstore = Chroma.from_texts(
        texts=chunks,
        embedding=embeddings_model,
        metadatas=metadatas,
        persist_directory=persist_directory
    )

    print(f"ChromaDB vector store created and persisted to {persist_directory}.")
    return vectorstore

def setup_rag_chain(vectorstore) -> RetrievalQA:
    """
    Set up the RAG chain with a ChromaDB-based retriever.
    Args:
        vectorstore (Chroma): The Chroma vector store instance.
    Returns:
        RetrievalQA: The configured RAG chain.
    """
    # Set up the retriever from the Chroma vector store
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

    # Load a small local LLM
    llm_pipeline = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B", device="cpu")
    llm = HuggingFacePipeline(pipeline=llm_pipeline)

    # Create the RAG chain
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Simple concatenation of retrieved docs
        retriever=retriever,
        return_source_documents=True  # Return the source for educational purposes
    )
    return rag_chain

def run_example_query(rag_chain: RetrievalQA, query: str):
    """
    Run an example query on the RAG system.
    Args:
        rag_chain (RetrievalQA): The RAG chain.
        query (str): The query to ask.
    """
    result = rag_chain.invoke({"query": query})
    print(f"Query: {query}")
    print(f"Answer: {result['result']}")
    print("Source Documents:")
    for doc in result['source_documents']:
        print(f"- {doc.page_content[:100]}... (from {doc.metadata['filename']})")
€

### Run it!

In [None]:
# Create the vector store using ChromaDB
embeddings = "sentence-transformers/all-MiniLM-L6-v2"
vectorstore = create_vector_store(df, embeddings, persist_directory="./chroma_db")

# Set up the RAG chain
rag_chain = setup_rag_chain(vectorstore)
print("RAG system set up.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


ChromaDB vector store created and persisted to ./chroma_db.


Device set to use cpu


RAG system set up.


### Run test querys

In [None]:
example_query = "Kan je aangifte doen tegen een agent?"
run_example_query(rag_chain, example_query)


Query: Kan je aangifte doen tegen een agent?
Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

5 Is het waar dat burgers die aangifte willen doen tegen een politiemedewerker vaak worden doorverwezen naar een klachtenprocedure in plaats van dat hun aangifte wordt opgenomen? Zo ja, wordt deze procedure duidelijk gecommu- niceerd aan burgers? 1 BNN-VARA, 1 maart 2025, Kan je aangifte doen tegen een agent? (https://www.bnnvara.nl/ artikelen/kan-je-aangifte-doen-tegen-een-agent-radio-boos). ah-tk-20242025-1735 ISSN 0921 - 7398 ’s-Gravenhage 2025 Tweede Kamer, vergaderjaar 2024–2025,

5 Is het waar dat burgers die aangifte willen doen tegen een politiemedewerker vaak worden doorverwezen naar een klachtenprocedure in plaats van dat hun aangifte wordt opgenomen? Zo ja, wordt deze procedure duidelijk gecommu- niceerd aan burgers? 1 BNN-VARA, 1 maart 2025, Kan je aangifte doe

In [None]:
example_query = "Wat zijn de beperkingsrechten van burgers over hun gezondheidsgegevens volgens de European Health Data Space-Verordening (EHDS), en hoe plant de Nederlandse minister van VWS deze te implementeren in nationale wetgeving?"
run_example_query(rag_chain, example_query)


Query: Wat zijn de beperkingsrechten van burgers over hun gezondheidsgegevens volgens de European Health Data Space-Verordening (EHDS), en hoe plant de Nederlandse minister van VWS deze te implementeren in nationale wetgeving?
Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

brief ‘agenda van databeschikbaarheid in de zorg’ waarmee ik u hierover informeerde.  De European Health Data Space-Verordening (EHDS) draagt bij aan betere zorg door betere databeschikbaarheid. Die elektronische uitwisseling is essentieel, en daarbij ook de zeggenschap die burgers over hun zorg- en persoonsgegevens kunnen uitoefenen. De EHDS geeft burgers een aantal rechten waarmee zij die zeggenschap kunnen vormgeven. Heel belangrijk daarbij zijn beperkingsrechten, zoals de ‘opt-out’. In deze

brief ‘agenda van databeschikbaarheid in de zorg’ waarmee ik u hierover informeerde.  De European He

In [None]:
example_query = "Wat is het opt-out-recht in de EHDS en hoe wil de Nederlandse minister van VWS dit toepassen?"
run_example_query(rag_chain, example_query)


Query: Wat is het opt-out-recht in de EHDS en hoe wil de Nederlandse minister van VWS dit toepassen?
Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

de EHDS, dan ook de mogelijkheid krijgen om zijn beperkingswensen te registeren. Op welke wijze dit kan bespreek ik hieronder. Beperkingsrechten bij primair gebruik De EHDS biedt burgers twee beperkingsrechten: het recht op een opt-out en het recht op toegangsbeperking.  Recht op opt-out wordt in nationale wetgeving neergelegd Of EU-lidstaten het recht op opt-out voor hun eigen burgers mogelijk willen maken, mogen ze zelf bepalen. Doen ze dit, dan moet dit recht in nationale wetgeving worden

de EHDS, dan ook de mogelijkheid krijgen om zijn beperkingswensen te registeren. Op welke wijze dit kan bespreek ik hieronder. Beperkingsrechten bij primair gebruik De EHDS biedt burgers twee beperkingsrechten: het recht op een o

### Cleanup

In [None]:
try:
    del rag_chain
    del vectorstore
except NameError:
    pass
