# **Step 1: Install Required Library Packages**

In [None]:
pip install PyMuPDF transformers torch faiss-cpu

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF, faiss-cpu
Successfully installed PyMuPDF-1.24.11 faiss-cpu-1.9.0


# **Step 2: Mount Google Drive**

In [None]:
# Colab-only step: mount the user's Google Drive at /content/drive so that
# files stored under /content/drive/MyDrive can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Step 3: Import Libraries and Set Environment Variables**

In [None]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"  # Disable symlink warning

# Importing required libraries
import fitz  # PyMuPDF for PDF text extraction
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import faiss
import numpy as np

# **Step 4: Text Extraction from PDF**

In [None]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without any separator; an empty string if the document has no pages.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(pdf_path) as document:
        page_texts = [
            document.load_page(page_index).get_text()
            for page_index in range(document.page_count)
        ]
    return "".join(page_texts)

# Source documents: the five PDFs read from the mounted Google Drive folder
# (update these paths if your files live somewhere else).
pdf_paths = [
    '/content/drive/MyDrive/engAIge GmbH/BMF_2013_07_24.pdf',
    '/content/drive/MyDrive/engAIge GmbH/BMF_2017_12_06.pdf',
    '/content/drive/MyDrive/engAIge GmbH/BMF_2017_12_21.pdf',
    '/content/drive/MyDrive/engAIge GmbH/BMF_2021_08_12.pdf',
    '/content/drive/MyDrive/engAIge GmbH/BMF_2023_10_05.pdf'
]

# Read every PDF once; extracted_texts[i] holds the full text of pdf_paths[i].
extracted_texts = list(map(extract_text_from_pdf, pdf_paths))

# **Step 5: Sentence-Based Chunking**

In [None]:
import re

# Function to chunk text into sentences (based on sentence splitting)
def chunk_text_into_sentences(text):
    """Split ``text`` into sentence-like chunks.

    A boundary is any run of whitespace (spaces, tabs or newlines) that
    directly follows ``.``, ``!`` or ``?``. Each chunk is stripped of
    leading/trailing whitespace, and chunks that end up empty are dropped so
    that blank strings are never embedded or indexed downstream.

    Args:
        text: The raw text to split.

    Returns:
        A list of non-empty sentence strings in their original order; an
        empty list when ``text`` is empty or contains only whitespace.
    """
    # `\s+` instead of ` +`: text extracted from PDFs frequently ends a
    # sentence with a line break rather than a space.
    # NOTE(review): abbreviations such as "Nr." or "Abs." are still treated as
    # sentence ends; handling them would need a language-aware splitter.
    pieces = re.split(r'(?<=[.!?])\s+', text)
    stripped = (piece.strip() for piece in pieces)
    return [sentence for sentence in stripped if sentence]

# Flatten the per-document sentence lists into one corpus-wide list of chunks
chunks = []
for text in extracted_texts:
    chunks += chunk_text_into_sentences(text)

print(f"Total number of sentence chunks: {len(chunks)}")

Total number of sentence chunks: 12957


# **Step 6: Vectorization Using the DistilBERT Model and Batch Processing**

In [None]:
# Load a smaller, lighter model (distilbert); tokenizer and encoder must come
# from the same checkpoint, so the name is defined once and reused.
# NOTE(review): this checkpoint appears to be an English, uncased model while
# the example queries are German — confirm that this is intended.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Function to vectorize a text sentence using distilbert
def vectorize_text_chunk(chunk):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The text is tokenized (padded, truncated to at most 512 tokens), run
    through the model with gradients disabled, and the last hidden state is
    mean-pooled over the token axis. Positions masked out by the tokenizer's
    attention mask (i.e. padding) are excluded from the average, so passing a
    list of texts of different lengths yields the same vectors as embedding
    each text on its own.

    Args:
        chunk: A string, or a list of strings, accepted by ``tokenizer``.

    Returns:
        A NumPy array with singleton dimensions squeezed out: a 1-D vector
        for a single text, a 2-D array (one row per text) for several texts.
    """
    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain mean over all positions.
        pooled = hidden_states.mean(dim=1)
    else:
        # Broadcast the mask over the hidden dimension and average real tokens only.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled = (hidden_states * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()

# Function to process chunks in smaller batches to reduce memory load
def batch_vectorize(chunks, batch_size=50):
    """Embed every chunk, walking through the list in slices of ``batch_size``.

    Each chunk is embedded individually with ``vectorize_text_chunk``; after
    every slice a progress line is printed.

    Args:
        chunks: Sequence of text chunks to embed.
        batch_size: Number of chunks handled per progress step; must be > 0.

    Returns:
        ``np.array`` built from the per-chunk vectors, in the same order as
        ``chunks``.

    Raises:
        ValueError: If ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Ceiling division: an exact multiple of batch_size must not be reported
    # as having one extra (empty) batch.
    total_batches = (len(chunks) + batch_size - 1) // batch_size

    vectors = []
    for batch_number, start in enumerate(range(0, len(chunks), batch_size), start=1):
        batch = chunks[start:start + batch_size]
        vectors.extend(vectorize_text_chunk(chunk) for chunk in batch)
        print(f"Processed batch {batch_number} of {total_batches}")
    return np.array(vectors)

# Vectorize the chunks in batches
# (vectors are produced in list order, so row i of chunk_vectors belongs to chunks[i])
chunk_vectors = batch_vectorize(chunks, batch_size=50)

# Print the array shape as a quick sanity check of the result
print(f"Vectorization completed. Shape of vector array: {chunk_vectors.shape}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Processed batch 1 of 260
Processed batch 2 of 260
Processed batch 3 of 260
Processed batch 4 of 260
Processed batch 5 of 260
Processed batch 6 of 260
Processed batch 7 of 260
Processed batch 8 of 260
Processed batch 9 of 260
Processed batch 10 of 260
Processed batch 11 of 260
Processed batch 12 of 260
Processed batch 13 of 260
Processed batch 14 of 260
Processed batch 15 of 260
Processed batch 16 of 260
Processed batch 17 of 260
Processed batch 18 of 260
Processed batch 19 of 260
Processed batch 20 of 260
Processed batch 21 of 260
Processed batch 22 of 260
Processed batch 23 of 260
Processed batch 24 of 260
Processed batch 25 of 260
Processed batch 26 of 260
Processed batch 27 of 260
Processed batch 28 of 260
Processed batch 29 of 260
Processed batch 30 of 260
Processed batch 31 of 260
Processed batch 32 of 260
Processed batch 33 of 260
Processed batch 34 of 260
Processed batch 35 of 260
Processed batch 36 of 260
Processed batch 37 of 260
Processed batch 38 of 260
Processed batch 39 of

# **Step 7: Initialize FAISS Vector Index**

In [None]:
# Initialize FAISS index for L2 distance (Euclidean)
# The index width is read from the second axis of chunk_vectors (the embedding size).
dimension = chunk_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
# Vectors are added in the order of chunk_vectors; search_query later uses the
# ids returned by the index as positions into `chunks`.
index.add(chunk_vectors)

# Report how many vectors the index now holds
print(f"Total vectors added to FAISS index: {index.ntotal}")

Total vectors added to FAISS index: 12957


# **Step 8: Query Processing and Precise Answer Extraction**

In [None]:
# Function to vectorize a query and search the vector database
def search_query(query, index, k=5):
    """Return the text chunks whose vectors are closest to ``query``.

    The query is embedded with ``vectorize_text_chunk``, reshaped into a
    single-row matrix and passed to ``index.search``. The returned ids are
    used as positions into the module-level ``chunks`` list.

    Args:
        query: The query text.
        index: A search index exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        k: Maximum number of neighbours to retrieve.

    Returns:
        A list of at most ``k`` chunks, ordered as returned by the index.
    """
    query_vector = vectorize_text_chunk(query).reshape(1, -1)
    _distances, indices = index.search(query_vector, k)  # distances, indices of top k matches
    # FAISS pads the result with -1 when fewer than k neighbours exist; a
    # negative id must be skipped, otherwise chunks[-1] would silently return
    # the last chunk as if it were a match.
    return [chunks[int(i)] for i in indices[0] if i >= 0]

# Function to extract the most relevant sentences
def extract_precise_answer(relevant_sentences, query):
    """Pick the candidate sentence that shares the most words with ``query``.

    Both the query and each sentence are lower-cased and split into word
    tokens with punctuation removed, so that e.g. "Grundzulage?" in the query
    matches "Grundzulage" in a sentence. The score of a sentence is the number
    of distinct words it has in common with the query.

    Args:
        relevant_sentences: Candidate sentences, in ranking order.
        query: The query text.

    Returns:
        The sentence with the highest overlap; on a tie the earliest candidate
        wins. An empty string is returned when there are no candidates or no
        candidate shares a word with the query.
    """
    # Imported locally so the function does not depend on another cell's import.
    import re

    def _words(text):
        # \w+ is Unicode-aware for str, so umlauts and ß stay inside words.
        return set(re.findall(r"\w+", text.lower()))

    query_words = _words(query)
    best_sentence = ""
    max_overlap = 0

    for sentence in relevant_sentences:
        overlap = len(query_words & _words(sentence))
        # Strictly greater: earlier (higher-ranked) candidates win ties.
        if overlap > max_overlap:
            best_sentence = sentence
            max_overlap = overlap

    return best_sentence

# Example queries provided in the task
queries = [
    "Wie hoch ist die Grundzulage?",
    "Wie werden Versorgungsleistungen aus einer Direktzusage oder einer Unterstützungskasse steuerlich behandelt?",
    "Wie werden Leistungen aus einer Direktversicherung, Pensionskasse oder einem Pensionsfonds in der Auszahlungsphase besteuert?"
]

# Process and retrieve answers for the queries:
# fetch the 5 nearest chunks from the index, then keep the one with the
# largest word overlap with the query and print it.
for query in queries:
    print(f"Query: {query}")
    relevant_sentences = search_query(query, index, k=5)
    precise_answer = extract_precise_answer(relevant_sentences, query)
    print(f"Precise answer: {precise_answer}")
    # Visual separator between the results of consecutive queries
    print("\n" + "="*50 + "\n")


Query: Wie hoch ist die Grundzulage?
Precise answer: Die Zulage wird nicht 
gekürzt.


Query: Wie werden Versorgungsleistungen aus einer Direktzusage oder einer Unterstützungskasse steuerlich behandelt?
Precise answer: Bei der Versorgung über eine Direktzusage oder Unterstützungskasse fließt der 
Arbeitslohn erst im Zeitpunkt der Zahlung der Altersversorgungsleistungen an den 
Arbeitnehmer zu.


Query: Wie werden Leistungen aus einer Direktversicherung, Pensionskasse oder einem Pensionsfonds in der Auszahlungsphase besteuert?
Precise answer: Bei der 
ausgleichsberechtigten Person unterliegen Leistungen aus Altersvorsorgeverträgen, 
Pensionsfonds, Pensionskassen oder Direktversicherungen, die auf dem nach § 3 Nr.


