In [None]:
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
import faiss

# 1. Load and split the PDF
def load_pdf_chunks(path, chunk_size=500):
    """Read a PDF and split its extracted text into fixed-size character chunks.

    Args:
        path: Location of the PDF; passed unchanged to ``PdfReader``.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of strings, each ``chunk_size`` characters long except possibly
        the last one. The list is empty when no page yields any text.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    # Validate up front: a zero step makes range() raise an opaque error and a
    # negative step would silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    reader = PdfReader(path)
    # Extract each page exactly once (extraction is the expensive step) and
    # skip pages that give back nothing (None or an empty string).
    page_texts = (page.extract_text() for page in reader.pages)
    text = "".join(page_text for page_text in page_texts if page_text)
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

chunks = load_pdf_chunks("DSA_Roadmap_After_Core_Topics.pdf")
# Fail fast: a PDF with no extractable text (e.g. scanned images) produces an
# empty list, and the embedding / index cells that follow cannot work with it.
if not chunks:
    raise ValueError("No extractable text found in DSA_Roadmap_After_Core_Topics.pdf")

In [3]:
# 2. Embed every chunk with a sentence-transformer encoder.
# The same `model` is reused later to embed the questions, so chunk and query
# vectors live in the same space.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
chunk_embeddings = model.encode(chunks)

In [4]:
# 3. Build FAISS index
# FAISS works on C-contiguous float32 matrices. Convert once, explicitly:
# ascontiguousarray returns the input unchanged when it already matches, so
# this avoids the unconditional copy np.array() would make.
chunk_matrix = np.ascontiguousarray(chunk_embeddings, dtype=np.float32)
# Exact (brute-force) L2 index sized to the embedding dimension.
index = faiss.IndexFlatL2(chunk_matrix.shape[1])
index.add(chunk_matrix)

In [5]:
# 4. Questions to ask about the PDF; they are answered in list order.
topics_question = "What topics should I study after core DSA?"
coverage_question = "Does it mention graph theory or dynamic programming?"
questions = [topics_question, coverage_question]

In [8]:
# 5. Load the answer generator: FLAN-T5 (base) behind a text2text pipeline.
QA_MODEL_NAME = "google/flan-t5-base"
qa_model = pipeline("text2text-generation", model=QA_MODEL_NAME)

Device set to use cpu


In [7]:
# 6. Answer each question from its single closest chunk
for question in questions:
    # Embed the question with the same encoder that produced the chunk vectors.
    query_vector = np.array(model.encode([question]))

    # Nearest-neighbour lookup; only the ids are needed, distances are discarded.
    _, neighbour_ids = index.search(query_vector, k=1)
    best_chunk_id = neighbour_ids[0][0]
    top_chunk = chunks[best_chunk_id]

    # Give the retrieved text to the generator as context for the question.
    prompt = f"Answer this based on the text: \"{top_chunk}\"\nQuestion: {question}"
    generations = qa_model(prompt, max_new_tokens=100)
    answer = generations[0]['generated_text']
    print(f"Q: {question}\nA: {answer}\n")

Q: What topics should I study after core DSA?
A: Phase 1: Complete Core DSA 1. Recursion & Backtracking Subset sum, permutations, N-Queens, Sudoku solver 2. Binary Search (on arrays & on answers) Search in rotated sorted array, Koko eating bananas, Aggressive cows 3. Divide and Conquer Merge Sort, Quick Sort, Count Inversions Phase 2: Greedy & Heap 4. Greed

Q: Does it mention graph theory or dynamic programming?
A: graph theory

