In [124]:
import google.generativeai as genai
import os
import re
import openai
from dotenv import load_dotenv, find_dotenv
from PyPDF2 import PdfMerger
import logging
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [125]:
# Function to extract text from pdf's

import pdfplumber

logging.getLogger("pdfminer").setLevel(logging.ERROR) # Only errors and critical issues will be shown, warnings will be hidden

def extract_text_from_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += (page.extract_text() or "") + "\n"
    return text.strip()

In [126]:
# Give multiple pdf's as input up to five

pdf_paths = ['/Users/ashutoshpatil/Desktop/genai/data_structures_&_algorithms_resources/L04. Sorting algorithms_Part2_Day5.pdf',
             '/Users/ashutoshpatil/Desktop/genai/data_structures_&_algorithms_resources/L04. Sorting algorithms_Part2_Day5.pdf',
             '/Users/ashutoshpatil/Desktop/genai/data_structures_&_algorithms_resources/L02. Search algorithms_Day3.pdf',
             '/Users/ashutoshpatil/Desktop/genai/data_structures_&_algorithms_resources/L03. Sorting algorithms_Part1_Day4.pdf',
             '/Users/ashutoshpatil/Desktop/genai/data_structures_&_algorithms_resources/L07. Linked List_Day8.pdf']

# sample paths
# /Users/ashutoshpatil/Desktop/genai/data_structures_&_algorithms_resources/L01. Introduction and DSA_Day2.pdf 
# /Users/ashutoshpatil/Desktop/genai/data_structures_&_algorithms_resources/L01. Introduction and DSA_Day1.pdf 
# /Users/ashutoshpatil/Desktop/genai/data_structures_&_algorithms_resources/L02. Search algorithms_Day3.pdf 

In [127]:
# Merge into single pdf

merged_pdf = "merged_output.pdf"
merger = PdfMerger()
for path in pdf_paths:
    merger.append(path)
merger.write(merged_pdf)
merger.close()

In [128]:
text = extract_text_from_pdf('merged_output.pdf')

In [129]:
# Text pre-proceessing

def flatten_text(text):   # used to remove \n
    lines = text.splitlines()
    clean_lines = [line.strip() for line in lines if line.strip()]
    return " ".join(clean_lines)

def remove_boilerplate(text):
    text = re.sub(r'Page\s*\d+', '', text)  # remove "Page 1", "Page 2", ©. All rights reserve etc.
    text = re.sub(r'©.*?All rights reserved\.', '', text, flags=re.IGNORECASE)
    return text

def normalize_bullets(text):
    # Convert bullet symbols to dashes
    text = re.sub(r'[•●▪‣·]', '-', text)
    text = re.sub(r'[\uf0a7\uf0b7\uf076\uf0d8\uf0a3]', '-', text)
    return text

def normalize_whitespace(text):
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Replaces two or more newline character with single newline  
    text = re.sub(r'\n{2,}', '\n', text)
    return text.strip()

def clean_pdf_text(text):
    text = flatten_text(text)
    text = remove_boilerplate(text)
    text = normalize_bullets(text)
    text = normalize_whitespace(text)
    return text

In [130]:
cleaned_text = clean_pdf_text(text)

In [131]:
# Chunking of data

splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)  # overlap is used to maintain context between chunks
chunks = splitter.split_text(cleaned_text)

In [132]:
model = SentenceTransformer("all-MiniLM-L6-v2")  # This model converts chunks into numerical embeddings (384) 
embeddings = model.encode(chunks)

index = faiss.IndexFlatL2(embeddings.shape[1]) # creates vector search index using Euclidean dist 
index.add(np.array(embeddings))

In [133]:

query = "Collect data about linked lists"
q_embed = model.encode([query])
D, I = index.search(q_embed, k=5)  # will find top 5 most similar chunks to query
retrieved_chunks = [chunks[i] for i in I[0]]
context = "\n\n".join(retrieved_chunks)

## Tried using local LLM model Mistral - 7b , but did not get desired results

In [134]:
# model_path = "/Users/ashutoshpatil/Desktop/genai/models/mistrial/mistral-7b-instruct-v0.1.Q4_K_M.gguf"
# llm = Llama(model_path=model_path, n_ctx=512, n_threads=4)

In [135]:
load_dotenv() # Loads variables from .env file
api_key = os.getenv("MY_API_KEY")

In [136]:
genai.configure(api_key=api_key)
model_gemini = genai.GenerativeModel('gemini-2.0-flash')

In [137]:
# To make the quiz interactive

def run_quiz_from_response(response_text):
    global wrong_que  
    questions = re.split(r'\nQ\d+\.', response_text)
    score = 0
    total = 0
    wrong_que = []

    for q in questions[1:]:  # skip the first empty split
        lines = q.strip().split('\n')
        question = lines[0].strip()
        options = lines[1:5]
        correct_line = next((line for line in lines if line.lower().startswith("correct answer")), "")
        correct_answer = correct_line.split(":")[-1].strip().upper()

        print(f"\n{question}")
        for opt in options:
            print(opt)

        user_ans = input("Your answer (A/B/C/D): ").strip().upper()
        if user_ans == correct_answer:
            print("✅ Correct!")
            score += 1
        else:
            print(f"❌ Incorrect. Correct answer: {correct_answer}")
            wrong_que.append(question)
        total += 1

    print(f"\n🎯 Final Score: {score}/{total}")
    print(f"Percentage : {round((score / total) * 100,2)}% ")

In [138]:
# If text is more than 10000 words then we summarize it under 10000 words to get optimum perofrmance
no_of_questions = 6

# print(response.text)
prompt = f"""
Generate a {no_of_questions} question multiple-choice quiz from the following content.
Each question should have exactly 4 options (A to D) and a separate line for the correct answer.

Format:
                                                 
Q1. <question text>
A. <option>
B. <option>
C. <option>
D. <option>
Correct Answer: <option letter>

Content:
{context}
"""
response = model_gemini.generate_content(prompt)
run_quiz_from_response(response.text)


What does the simplest form of a linked list typically include?
A. A queue
B. An array
C. A header pointer
D. A stack


Your answer (A/B/C/D):  a


❌ Incorrect. Correct answer: C

In a linked list, what is the purpose of the header?
A. To store the last data element
B. To point to the first node of the list
C. To calculate the list's size
D. To manage memory allocation


Your answer (A/B/C/D):  b


✅ Correct!

Which of the following is a key difference between a singly and doubly linked list?
A. Singly linked lists cannot store data.
B. Doubly linked lists contain two address fields.
C. Singly linked lists are always circular.
D. Doubly linked lists don't require a header.


Your answer (A/B/C/D):  b


✅ Correct!

When displaying the contents of a linked list, what is the key action to perform?
A. Reverse the order of the list
B. Follow the pointers
C. Sort the data alphabetically
D. Delete every other node


Your answer (A/B/C/D):  b


✅ Correct!

When traversing and displaying a linked list, when should you stop?
A. When the list is empty
B. When the data is corrupted
C. When the next pointer is NULL
D. After printing the first element


Your answer (A/B/C/D):  c


✅ Correct!

🎯 Final Score: 4/5
Percentage : 80.0% 


In [140]:
# Feedback 
if len(wrong_que) == 0:
    print('You are doing great!')
else:
    prompt = f""" The student got these questions wrong {wrong_que}. Identify and only provide 1 o 2 areas 
                   per question which student needs to work on in the bullet points format. Don't give 
                   question in output"""
    
    response = model_gemini.generate_content(prompt)
    print(response.text)

Here are the areas the student needs to work on:

*   **Basic Linked List Structure:** Understanding the fundamental components.

