### Install dependencies 

In [20]:
!pip install sentence-transformers chromadb transformers torch PyPDF2


[notice] A new release of pip is available: 26.0 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




### Imports

In [21]:
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, T5Tokenizer, T5ForConditionalGeneration
from PyPDF2 import PdfReader
import chromadb
import uuid
import os

### Initialize models

In [22]:
# Model identifiers live in one place so the tokenizer and the model of a pair
# can never be loaded from different checkpoints by accident.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
GENERATOR_MODEL_NAME = "google/flan-t5-base"

# Sentence-embedding model (MiniLM) that turns text into vectors
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# FLAN-T5 tokenizer + seq2seq model used for answer generation (not GPT-2)
tokenizer = T5Tokenizer.from_pretrained(GENERATOR_MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(GENERATOR_MODEL_NAME)

print("Models loaded successfully")

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



Models loaded successfully


### Initialize ChromaDB (in-memory)

In [24]:
# Default Chroma client; per the section header this store is in-memory only.
client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection raises
# if "rag_docs_db" already exists from an earlier execution in the same kernel.
collection = client.get_or_create_collection(name="rag_docs_db")

print("ChromaDB collection created")

ChromaDB collection created


### Load and read document (PDF or TXT)

In [25]:
# Path of the document to index (.pdf or .txt). Set the RAG_DOC_PATH environment
# variable to point at another file without editing the notebook; the original
# hard-coded location is kept as the fallback.
FILE_PATH = os.environ.get(
    "RAG_DOC_PATH",
    r"C:\Users\Osama Haider\Downloads\Scholarship Recipients Questions.pdf",  # or sample.txt
)

def load_document(file_path):
    """Read a PDF or plain-text file and return its text as a list of strings.

    Args:
        file_path: Location of the document, as ``str`` or ``os.PathLike``.
            The loader is picked from the file extension, compared
            case-insensitively (``report.PDF`` is treated like ``report.pdf``).

    Returns:
        list[str]: For ``.pdf`` files, one entry per page whose extracted text
        is non-empty (pages yielding no text are skipped). For ``.txt`` files,
        a single entry holding the whole file decoded as UTF-8.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.txt``. An
            unsupported type used to return an empty list, which silently
            produced an empty index further down the pipeline.
    """
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        reader = PdfReader(path)
        page_texts = (page.extract_text() for page in reader.pages)
        # extract_text() may give back None or "" for a page; keep real text only.
        return [text for text in page_texts if text]

    if lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as f:
            return [f.read()]

    raise ValueError(
        f"Unsupported document type: {path!r} (expected a .pdf or .txt file)"
    )

# Read the configured file once; each list entry is one text segment
# returned by load_document.
documents = load_document(FILE_PATH)
print("Loaded {} document(s)".format(len(documents)))


Loaded 4 document(s)


### Chunk the document

In [26]:
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so the
    last ``overlap`` characters of one chunk are repeated at the start of the
    next. Splitting stops as soon as a chunk reaches the end of ``text``; no
    trailing chunk made up solely of already-covered characters is emitted.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks in order of appearance; empty for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check ``overlap >= chunk_size`` would never advance the window
            and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # This window already reaches the end of the text; another window
        # would only repeat the overlap region.
        if end >= len(text):
            break

    return chunks

# Flatten the per-document chunk lists into a single list, preserving order.
all_chunks = [piece for doc in documents for piece in chunk_text(doc)]

print("Total chunks created: {}".format(len(all_chunks)))

Total chunks created: 17


### Embed and store in ChromaDB

In [27]:
# Number of chunks encoded and written per call; batching avoids one model
# forward pass and one database write per chunk.
EMBED_BATCH_SIZE = 64

for batch_start in range(0, len(all_chunks), EMBED_BATCH_SIZE):
    batch = all_chunks[batch_start:batch_start + EMBED_BATCH_SIZE]

    # Deterministic ids (chunk position + content) combined with upsert make
    # this cell idempotent: re-running it overwrites the same records instead
    # of inserting duplicates under fresh random ids.
    batch_ids = [
        str(uuid.uuid5(uuid.NAMESPACE_OID, f"{batch_start + offset}:{chunk}"))
        for offset, chunk in enumerate(batch)
    ]

    collection.upsert(
        ids=batch_ids,
        documents=batch,
        embeddings=embedding_model.encode(batch).tolist()
    )

print("All chunks embedded and stored")

All chunks embedded and stored


### Retrieval function (RAG step)

In [28]:
def retrieve_docs(query, top_k=3):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: The user's question as plain text.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` document chunks as returned by the
        collection query. Empty when the collection holds no chunks or
        ``top_k`` is not positive.
    """
    # Never request more neighbours than the collection contains, and skip the
    # query entirely when there is nothing to search.
    n_results = min(top_k, collection.count())
    if n_results <= 0:
        return []

    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )
    # One query embedding was submitted, so its matches are in the first slot.
    return results["documents"][0]

### Prompt builder

In [29]:
def build_prompt(query, retrieved_chunks):
    """Assemble the grounded question-answering prompt.

    The prompt consists of fixed instructions, a ``Context:`` section holding
    the retrieved chunks separated by blank lines, a ``Question:`` section
    holding ``query``, and a trailing ``Answer:`` cue.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of context strings to place in the prompt.

    Returns:
        str: The complete prompt text.
    """
    instructions = (
        "Answer the question using ONLY the context below.\n"
        "If the answer is not in the context, say:\n"
        '"I cannot find this information in the provided documents."'
    )
    context_block = "\n\n".join(retrieved_chunks)

    sections = [
        instructions,
        f"Context:\n{context_block}",
        f"Question:\n{query}",
        "Answer:",
    ]
    return "\n\n".join(sections)


### Answer generation (FLAN-T5)

In [30]:
def generate_answer(query, retrieved_chunks, max_new_tokens=150):
    """Generate an answer to ``query`` from the retrieved context.

    Builds the prompt with ``build_prompt``, tokenizes it (truncated to at
    most 1024 tokens), runs the module-level ``model`` and decodes the first
    generated sequence.

    Args:
        query: The user's question.
        retrieved_chunks: Context strings to include in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The decoded answer with special tokens removed.
    """
    encoded = tokenizer(
        build_prompt(query, retrieved_chunks),
        max_length=1024,
        truncation=True,
        return_tensors="pt",
    )

    generated = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
    )

    # A single prompt was submitted, so only the first sequence is decoded.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


### Chat with your document

In [None]:
def is_valid_question(query):
    """Return True when ``query`` has at least three whitespace-separated words."""
    word_count = len(query.split())
    return word_count >= 3

# Interactive question/answer loop. It ends when the user types "exit"
# (case-insensitive, surrounding whitespace ignored), or when input is
# interrupted or closed, so stopping the cell does not leave a traceback.
while True:
    try:
        query = input("\nAsk a question (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        print("\nExiting chat.")
        break

    if query.lower() == "exit":
        break

    # Reject inputs that are too short before spending time on retrieval
    # and generation.
    if not is_valid_question(query):
        print("Please ask a meaningful question about the document.")
        continue

    retrieved_chunks = retrieve_docs(query)
    answer = generate_answer(query, retrieved_chunks)

    print("\nAnswer:")
    print(answer)



Ask a question (or type 'exit'):  Which university is he studying at?



Answer:
University of Milano -Bicocca



Ask a question (or type 'exit'):  Which technologies has he mainly worked with?



Answer:
backend development



Ask a question (or type 'exit'):  What was his CGPA in the bachelor’s degree?



Answer:
3.57/4
