# Install dependencies

In [None]:
!pip install faiss-cpu
!pip install sentence-transformers
!pip install transformers
!pip install paddlepaddle paddleocr
!pip install pdf2image
!pip install tabula-py
!pip install pdfplumber
!pip install reportlab

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from to

In [None]:
# Download the small English spaCy pipeline that a later cell loads with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm
# Refresh the apt package lists, then install poppler-utils.
# NOTE(review): presumably needed by pdf2image's convert_from_path — confirm.
!apt-get update
!apt-get install -y poppler-utils


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 http://archive.ubun

# Chunking setup (PDF to pickle)

In [None]:
import pickle
import faiss
import pdfplumber
from pdf2image import convert_from_path
import numpy as np
from paddleocr import PaddleOCR
from sentence_transformers import SentenceTransformer
import nltk
import spacy
import tabula  # for table extraction

# Initialize PaddleOCR with the English models and the text-angle classifier
# enabled. No use_gpu argument is passed here.
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # use_gpu omitted for CPU

# Initialize the Sentence-BERT model used to embed every text chunk.
# NOTE(review): no device is passed; the library may auto-select a GPU when one
# is available rather than always using the CPU — confirm.
sbert_model = SentenceTransformer('all-mpnet-base-v2')  # Default device is CPU

# Download and load NLP tools.
# NOTE(review): the punkt data is downloaded, but no nltk tokenizer is called in
# the cells visible here; sentence splitting below relies on the spaCy pipeline.
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")

#############################################
# Extraction Functions with Metadata
#############################################

def extract_text_and_images(pdf_path):
    """
    Walk through a PDF and collect its embedded text plus the pages holding images.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A ``(text_pages, image_pages)`` tuple where
        ``text_pages`` is a list of ``{"page": index, "text": str}`` dicts for
        every page that yielded non-empty text, and ``image_pages`` is the list
        of page indices (0-based, in document order) that contain images.
    """
    text_pages, image_pages = [], []
    with pdfplumber.open(pdf_path) as pdf:
        for page_index, page in enumerate(pdf.pages):
            page_text = page.extract_text()
            # Pages with no extractable text (None or "") are left out.
            if page_text:
                text_pages.append({"page": page_index, "text": page_text})
            if page.images:
                image_pages.append(page_index)
    return text_pages, image_pages

def extract_text_from_images(image_pages, pdf_path):
    """
    Run OCR on the PDF pages that contain images and return the recognised text.

    Args:
        image_pages: Iterable of 0-based page indices to OCR.
        pdf_path: Path of the PDF file to rasterise.

    Returns:
        text_from_images: List of dicts with keys 'page' and 'text', one per
        entry of ``image_pages`` (``text`` is "" when nothing was recognised).
    """
    # Nothing to OCR: skip rasterising the whole document.
    if not image_pages:
        return []

    images = convert_from_path(pdf_path)
    text_from_images = []
    for i in image_pages:
        image_cv = np.array(images[i])
        result = ocr.ocr(image_cv, cls=True)
        # The per-image entry can be None when no text is detected on the page,
        # so guard it before iterating instead of only checking the outer list.
        page_result = result[0] if result else None
        if page_result:
            text = "\n".join(line[1][0] for line in page_result)
        else:
            text = ""
        text_from_images.append({"page": i, "text": text})
    return text_from_images

def extract_tables_from_pdf(pdf_path):
    """
    Pull every table out of the PDF with tabula.

    Args:
        pdf_path: Path of the PDF file to scan.

    Returns:
        tables: Whatever ``tabula.read_pdf`` yields for all pages with
        ``multiple_tables=True`` (a list of tables; each table is a DataFrame
        or list of rows).
    """
    extracted_tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
    return extracted_tables

#############################################
# Adaptive Chunking Function
#############################################

def adaptive_chunking(text, max_chunk_size=1024, overlap=50):
    """
    Split text into coherent chunks along spaCy sentence boundaries.

    Sentences are accumulated until adding the next one would exceed
    ``max_chunk_size`` words; the accumulated chunk is then emitted (or merged
    into the previous chunk when it is shorter than half the limit) and the
    trailing sentences are carried over as context for the next chunk.

    Args:
        text: Raw text to split.
        max_chunk_size: Maximum chunk length, measured in whitespace-separated
            words. A single sentence longer than this still forms its own chunk.
        overlap: Maximum number of trailing words (whole sentences only) carried
            from one chunk into the next. It is measured in words, the same unit
            as ``max_chunk_size``, and capped at half of ``max_chunk_size``.

    Returns:
        List of non-empty chunk strings, in document order.
    """
    doc = nlp(text)
    chunks = []
    current_chunk = []   # sentences of the chunk being built
    current_length = 0   # word count of current_chunk
    carried_count = 0    # how many leading sentences are overlap from the last chunk

    # An overlap as large as the chunk itself would re-emit the same content forever.
    overlap = max(0, min(overlap, max_chunk_size // 2))

    for sent in doc.sents:
        sentence_text = sent.text.strip()
        if not sentence_text:
            continue  # whitespace-only sentences carry no content
        sentence_length = len(sentence_text.split())

        # Flush only when the chunk holds sentences beyond the carried overlap;
        # this avoids emitting empty chunks and re-emitting overlap on its own.
        has_new_content = len(current_chunk) > carried_count
        if has_new_content and current_length + sentence_length > max_chunk_size:
            merged_chunk = " ".join(current_chunk)
            if chunks and current_length < (max_chunk_size // 2):
                chunks[-1] += " " + merged_chunk
            else:
                chunks.append(merged_chunk)

            # Keep trailing whole sentences totalling at most `overlap` words.
            carried = []
            carried_length = 0
            for previous in reversed(current_chunk):
                previous_length = len(previous.split())
                if carried_length + previous_length > overlap:
                    break
                carried.append(previous)
                carried_length += previous_length
            carried.reverse()

            current_chunk = carried
            current_length = carried_length
            carried_count = len(carried)

        current_chunk.append(sentence_text)
        current_length += sentence_length

    # Emit the remainder only if it contains more than carried-over context.
    if len(current_chunk) > carried_count:
        chunks.append(" ".join(current_chunk))
    return chunks

#############################################
# Encode and Store in FAISS with Metadata
#############################################

def encode_and_store_in_faiss(documents):
    """
    Chunk, embed and index the documents, then persist index and metadata.

    Each document is split with ``adaptive_chunking``; every chunk is embedded
    with the module-level ``sbert_model``, L2-normalised and added to a FAISS
    HNSW index. The index is pickled to ``faiss_index.pkl`` and the chunk
    records (``{"text": ..., "page": ...}``) to ``documents.pkl``; row ``i`` of
    the index corresponds to entry ``i`` of the pickled list.

    Args:
        documents: Iterable of dicts with keys "page" and "text".

    Raises:
        ValueError: If no non-empty text chunk could be produced.
    """
    chunked_documents = []  # each item is a dict with keys: 'text' and 'page'
    for doc in documents:
        # Each doc is a dict with "page" and "text"
        for chunk in adaptive_chunking(doc['text']):
            if chunk.strip():  # blank chunks add nothing searchable
                chunked_documents.append({"text": chunk, "page": doc["page"]})

    if not chunked_documents:
        raise ValueError(
            "No text chunks to index: `documents` is empty or contains no text."
        )

    # Prepare texts for embedding
    chunk_texts = [d["text"] for d in chunked_documents]
    document_embeddings = sbert_model.encode(chunk_texts, convert_to_tensor=False)
    # FAISS expects a C-contiguous float32 matrix.
    document_embeddings = np.asarray(document_embeddings, dtype=np.float32)

    # Normalize embeddings for cosine similarity; leave all-zero rows untouched
    # instead of dividing by zero and filling the index with NaNs.
    norms = np.linalg.norm(document_embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    document_embeddings = np.ascontiguousarray(document_embeddings / norms)

    dimension = document_embeddings.shape[1]
    # Create a FAISS HNSW index (CPU-based)
    cpu_index = faiss.IndexHNSWFlat(dimension, 32)
    cpu_index.add(document_embeddings)

    # Save the FAISS index and the chunked documents with metadata.
    # NOTE: these pickles must only ever be loaded from a trusted location.
    with open('faiss_index.pkl', 'wb') as f:
        pickle.dump(cpu_index, f)
    with open('documents.pkl', 'wb') as f:
        pickle.dump(chunked_documents, f)

    print("HNSW FAISS (CPU-based) index and documents (with metadata) saved successfully!")

#############################################
# Example Usage
#############################################

pdf_path = 'test2.pdf'  # Change to your PDF file path

# Extract text from pages and identify image pages (page indices are 0-based)
text_pages, image_pages = extract_text_and_images(pdf_path)

# Extract text from images on the identified pages
ocr_text = extract_text_from_images(image_pages, pdf_path)

# Extract tables from the PDF
tables = extract_tables_from_pdf(pdf_path)

# Combine all documents as a list of dictionaries with metadata.
# For text and OCR-extracted pages, the metadata is the page number.
documents = text_pages + ocr_text

# For tables, we tag them with a page value of "table" (or you can customize as needed)
for table in tables:
    if hasattr(table, "to_csv"):
        # Iterating a DataFrame yields its column labels, not its rows, so
        # serialise the whole table (header + rows) instead.
        if table.empty:
            continue
        table_text = table.to_csv(index=False)
    else:
        # Plain list-of-rows tables: one row per line.
        table_text = "\n".join(str(row) for row in table)
    documents.append({"page": "table", "text": table_text})

# Encode the documents into embeddings and store in FAISS with metadata.
encode_and_store_in_faiss(documents)


download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:16<00:00, 242.93it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:17<00:00, 556.09it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:14<00:00, 148.90it/s]

[2025/02/15 05:53:31] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[2025/02/15 05:53:48] ppocr DEBUG: dt_boxes num : 27, elapsed : 0.37345361709594727
[2025/02/15 05:53:48] ppocr DEBUG: cls num  : 27, elapsed : 0.09805727005004883
[2025/02/15 05:53:54] ppocr DEBUG: rec_res num  : 27, elapsed : 5.324774265289307
[2025/02/15 05:53:54] ppocr DEBUG: dt_boxes num : 40, elapsed : 0.17875313758850098
[2025/02/15 05:53:54] ppocr DEBUG: cls num  : 40, elapsed : 0.09998631477355957
[2025/02/15 05:53:57] ppocr DEBUG: rec_res num  : 40, elapsed : 3.2013463973999023
[2025/02/15 05:53:57] ppocr DEBUG: dt_boxes num : 42, elapsed : 0.17933320999145508
[2025/02/15 05:53:58] ppocr DEBUG: cls num  : 42, elapsed : 0.08261847496032715
[2025/02/15 05:54:04] ppocr DEBUG: rec_res num  : 42, elapsed : 6.075507879257202
[2025/02/15 05:54:04] ppocr DEBUG: dt_boxes num : 30, elapsed : 0.24923443794250488
[2025/02/15 05:54:04] ppocr DEBUG: cls num  : 30, elapsed : 0.0876626968383789
[2025/02/15 05:54:08] ppocr DEBUG: rec_res num  : 30, elapsed : 3.9716992378234863
[2025/02/15 05:

Feb 15, 2025 5:54:21 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Feb 15, 2025 5:54:21 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>



HNSW FAISS (CPU-based) index and documents (with metadata) saved successfully!


# MCQ CSV Generator

In [None]:
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from google import genai
import time
import pandas as pd

# ----------------------------
# Load the FAISS index and document metadata
# ----------------------------
# NOTE: pickle.load can execute arbitrary code; only load the files written by
# the chunking cell of this notebook, never pickles from an untrusted source.
with open('faiss_index.pkl', 'rb') as f:
    faiss_index = pickle.load(f)
with open('documents.pkl', 'rb') as f:
    documents_metadata = pickle.load(f)  # Each entry is assumed to be a dict with keys like "text" and "page"

# ----------------------------
# Initialize the Sentence-BERT model for embedding queries (if needed)
# ----------------------------
sbert_model = SentenceTransformer('all-mpnet-base-v2')

# ----------------------------
# Initialize the Gemini Client with your API key
# ----------------------------
# Prefer the GEMINI_API_KEY environment variable so the key is not stored in
# the notebook; the placeholder is kept as a fallback for manual editing.
import os

gemini_api_key = os.environ.get("GEMINI_API_KEY", "Your_Gemini_API")
client = genai.Client(api_key=gemini_api_key)

# ----------------------------
# Function to generate MCQs (including a Topic field) for a given text chunk
# ----------------------------
def generate_mcqs_from_topic(context):
    """
    Ask Gemini for three multiple-choice questions (MCQs) based on ``context``.

    The prompt requests one easy, one medium and one hard question, each with a
    topic, four options labelled A-D and the correct option label, in the plain
    text layout that ``parse_mcqs`` consumes.

    Args:
        context: Text chunk the questions should be based on.

    Returns:
        The stripped model output, or "No questions generated." when the
        response carries no text.
    """
    # Adjacent string literals are concatenated verbatim, so every sentence
    # must end with its own separator before the next literal starts.
    prompt = (
        "You are an educator specialized in creating multiple-choice questions (MCQs). "
        "Generate three MCQs based solely on the following context. "
        "The first question should be easy, the second question should be of medium difficulty, "
        "and the third question should be hard. "
        "For each question, in addition to the difficulty, question text, and options, also provide a topic that best represents the key subject area of the question. "
        "Each question should include four answer options labeled A, B, C, and D. ONLY 1 option MUST BE correct, The other options should be decieving/plausible but should be incorrect. "
        "FOCUS on both The quality of the questions and the options that are being generated. "
        "Do not include any explanations. do NOT include or "
        "reference the context in the questions themselves. Create questions that are general and concept-based. "
        "Format the output as follows for each question:\n\n"
        "Difficulty: <Easy/Medium/Hard>\n"
        "Topic: <topic text>\n"
        "Question: <question text>\n"
        "A) <option text>\n"
        "B) <option text>\n"
        "C) <option text>\n"
        "D) <option text>\n"
        "Answer: <the option label that is correct>\n\n"
        "Context:\n" + context
    )

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt]
    )
    generated_text = response.text
    answer = generated_text.strip() if generated_text else "No questions generated."
    return answer

# ----------------------------
# Example: Parsing Gemini Output into Structured Data (for CSV export)
# ----------------------------
def parse_mcqs(gemini_output, chunk_id, page_info):
    """
    Parse Gemini's MCQ text into a list of dictionaries ready for CSV export.

    Questions are expected to be separated by blank lines and to follow the
    "Difficulty / Topic / Question / A) .. D) / Answer" layout. An option line
    prefixed with "*" marks that option as correct; an explicit "Answer:" line
    overrides such a mark.

    Args:
        gemini_output: Raw text returned by the model.
        chunk_id: Identifier of the source chunk, stored under "chunk".
        page_info: Page metadata of the source chunk, stored under "page".

    Returns:
        List of dicts with the keys chunk, page, difficulty, topic, question,
        option_A, option_B, option_C, option_D and correct_answer. Blocks with
        fewer than seven non-empty lines, or lacking a question, any of the four
        options or a correct answer, are skipped. ``correct_answer`` is reduced
        to its upper-case label (e.g. "b) text" -> "B") when it starts with one;
        otherwise the raw answer text is kept.
    """
    header_fields = (
        ("Difficulty:", "difficulty"),
        ("Topic:", "topic"),
        ("Question:", "question"),
    )
    option_labels = ("A", "B", "C", "D")
    required_keys = ("question", "option_A", "option_B", "option_C", "option_D", "correct_answer")

    def _after_prefix(line, prefix):
        # Slice instead of str.split so a repeated marker inside the text
        # (e.g. "A) see A) above") does not truncate the value.
        return line[len(prefix):].strip()

    def _normalize_answer(raw):
        # Reduce "B", "B)", "(b) text" or "**B**" to the bare label "B".
        cleaned = raw.strip().lstrip("(*").strip()
        if cleaned and cleaned[0].upper() in option_labels:
            if len(cleaned) == 1 or not cleaned[1].isalnum():
                return cleaned[0].upper()
        return raw.strip()

    mcq_list = []
    # Split output into individual questions by double newlines
    for block in gemini_output.strip().split("\n\n"):
        lines = [line.strip() for line in block.splitlines() if line.strip()]
        if len(lines) < 7:
            continue  # Skip if not enough lines for a full MCQ

        mcq = {"chunk": chunk_id, "page": page_info}

        for line in lines:
            # A leading asterisk marks the option as the correct one.
            star_present = line.startswith("*")
            if star_present:
                line = line[1:].strip()

            header_key = next(
                (key for prefix, key in header_fields if line.startswith(prefix)), None
            )
            if header_key is not None:
                prefix = next(p for p, key in header_fields if key == header_key)
                mcq[header_key] = _after_prefix(line, prefix)
                continue

            label = next((l for l in option_labels if line.startswith(l + ")")), None)
            if label is not None:
                mcq["option_" + label] = _after_prefix(line, label + ")")
                if star_present:
                    mcq["correct_answer"] = label
                continue

            if line.startswith("Answer:"):
                # This will override any previous setting if present.
                mcq["correct_answer"] = _normalize_answer(_after_prefix(line, "Answer:"))

        # Records without a question, options or answer cannot be asked or graded.
        if all(key in mcq for key in required_keys):
            mcq_list.append(mcq)
    return mcq_list

# ----------------------------
# Main loop: Generate MCQs for each document chunk and store in a CSV file
# ----------------------------
all_mcqs = []  # This will hold dictionaries for every MCQ generated

print("Generating MCQs from all topics present in the documents...\n")

# Iterate over each document chunk from the pickle file.
# (Each document in documents_metadata is assumed to be a dictionary with at least 'text' and 'page'.)
# One Gemini request is issued per chunk, so the run time grows with the number of chunks.
for idx, doc_entry in enumerate(documents_metadata):
    # Missing keys fall back to an empty context / "Unknown" page.
    context = doc_entry.get("text", "")
    page_info = doc_entry.get("page", "Unknown")

    print(f"--- Generating MCQs for Document Chunk {idx+1} (Page: {page_info}) ---\n")

    # Generate MCQs for this context using Gemini
    gemini_output = generate_mcqs_from_topic(context)

    # Parse the output into structured MCQ dictionaries (chunk ids are 1-based)
    mcqs_for_chunk = parse_mcqs(gemini_output, chunk_id=idx+1, page_info=page_info)

    # Append to the master list
    all_mcqs.extend(mcqs_for_chunk)

    # Optionally print the Gemini output for debugging
    print(gemini_output)
    print("\n" + "="*80 + "\n")

    # Sleep briefly to avoid rate-limiting (adjust as needed)
    time.sleep(2)

# ----------------------------
# Save the structured MCQs into a CSV file using pandas
# ----------------------------
# The CSV is written only once, after the loop; an exception raised inside the
# loop therefore leaves no mcqs.csv from this run.
df_mcqs = pd.DataFrame(all_mcqs)
df_mcqs.to_csv("mcqs.csv", index=False)
print("MCQs successfully saved to mcqs.csv")


Generating MCQs from all topics present in the documents...

--- Generating MCQs for Document Chunk 1 (Page: 0) ---

Difficulty: Easy
Topic: Definition of Organizational Behavior
Question: Which of the following BEST describes the primary focus of Organizational Behavior?
A) Optimizing financial performance through market analysis.
B) Understanding, predicting, and managing human behavior in organizations.
C) Developing efficient manufacturing processes using engineering principles.
D) Implementing technological advancements to automate routine tasks.
Answer: B

Difficulty: Medium
Topic: Scope of Organizational Behavior
Question: Organizational Behavior's scope includes the study of which of the following?
A) Macroeconomic trends and their impact on global trade.
B) Individual and group dynamics, as well as the influence of organizational structures.
C) Legal frameworks governing business operations and compliance standards.
D) Technological infrastructure and its role in data manageme

# Frontend for quiz

In [None]:
code = """import streamlit as st
import pandas as pd
import random
import time
import base64
from streamlit_autorefresh import st_autorefresh

df = pd.read_csv("mcqs.csv")

if "current_difficulty" not in st.session_state:
    st.session_state.current_difficulty = "Medium"
if "recent_topics" not in st.session_state:
    st.session_state.recent_topics = []
if "score" not in st.session_state:
    st.session_state.score = 0
if "total_questions" not in st.session_state:
    st.session_state.total_questions = 0
if "question_history" not in st.session_state:
    st.session_state.question_history = []
if "current_question" not in st.session_state:
    st.session_state.current_question = None
if "start_time" not in st.session_state:
    st.session_state.start_time = None

st_autorefresh(interval=1000, key="timer_autorefresh")

def update_difficulty():
    history = st.session_state.question_history[-3:]
    if len(history) > 0:
        avg_correct = sum(1 for q in history if q.get("was_correct", False)) / len(history)
        mapping = {"Easy": 1, "Medium": 2, "Hard": 3}
        reverse_mapping = {1: "Easy", 2: "Medium", 3: "Hard"}
        current_level = mapping[st.session_state.current_difficulty]
        if avg_correct > 0.8 and current_level < 3:
            new_level = current_level + 1
        elif avg_correct < 0.5 and current_level > 1:
            new_level = current_level - 1
        else:
            new_level = current_level
        st.session_state.current_difficulty = reverse_mapping[new_level]

def choose_next_question():
    candidates = df[
        (df["difficulty"] == st.session_state.current_difficulty) &
        (~df["topic"].isin(st.session_state.recent_topics))
    ]
    if candidates.empty:
        candidates = df[df["difficulty"] == st.session_state.current_difficulty]
    if candidates.empty:
        candidates = df.copy()
    chosen = candidates.sample(n=1).iloc[0]
    return chosen

if st.session_state.total_questions >= 10:
    st.header("Quiz Complete!")
    st.write(f"Your final score is: **{st.session_state.score} / {st.session_state.total_questions}**")

    history_df = pd.DataFrame(st.session_state.question_history)[["question", "user_response", "correct_answer"]]
    st.write("Review of your responses:")
    st.write(history_df)

    csv = history_df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    download_link = (
        f'<a id="download_csv" href="data:file/csv;base64,{b64}" download="quiz_responses.csv"></a>'
        f'<script>document.getElementById("download_csv").click();</script>'
    )
    st.markdown(download_link, unsafe_allow_html=True)
    st.stop()

if st.session_state.current_question is None:
    st.session_state.current_question = choose_next_question()
    st.session_state.start_time = time.time()

elapsed_time = time.time() - st.session_state.start_time
remaining_time = max(0, 60 - int(elapsed_time))

if remaining_time <= 0:
    st.warning("Time's up! Moving to the next question.")
    correct_label = st.session_state.current_question["correct_answer"]
    correct_option = st.session_state.current_question[f"option_{correct_label}"]

    st.session_state.question_history.append({
        "question": st.session_state.current_question["question"],
        "user_response": "No Answer (Timed Out)",
        "correct_answer": correct_option,
        "was_correct": False,
        "topic": st.session_state.current_question["topic"]
    })

    st.session_state.total_questions += 1
    update_difficulty()

    st.session_state.recent_topics.append(st.session_state.current_question["topic"])
    if len(st.session_state.recent_topics) > 3:
        st.session_state.recent_topics.pop(0)

    st.session_state.current_question = choose_next_question()
    st.session_state.start_time = time.time()
    st.experimental_rerun()

st.header("Dynamic Quiz")
st.write(f"**Question {st.session_state.total_questions + 1} of 10**")
st.write(f"**Topic:** {st.session_state.current_question['topic']}")
st.write(st.session_state.current_question["question"])
st.write(f"**Time Remaining:** {remaining_time} seconds")

options = [
    st.session_state.current_question["option_A"],
    st.session_state.current_question["option_B"],
    st.session_state.current_question["option_C"],
    st.session_state.current_question["option_D"],
]
selected = st.radio("Select your answer:", options)

if st.button("Submit Answer"):
    st.session_state.total_questions += 1
    correct_label = st.session_state.current_question["correct_answer"]
    correct_option = st.session_state.current_question[f"option_{correct_label}"]

    if selected.strip() == correct_option.strip():
        st.success("Correct!")
        st.session_state.score += 1
        was_correct = True
    else:
        st.error(f"Incorrect! The correct answer was: {correct_option}")
        was_correct = False

    st.session_state.question_history.append({
        "question": st.session_state.current_question["question"],
        "user_response": selected,
        "correct_answer": correct_option,
        "was_correct": was_correct,
        "topic": st.session_state.current_question["topic"]
    })

    st.session_state.recent_topics.append(st.session_state.current_question["topic"])
    if len(st.session_state.recent_topics) > 3:
        st.session_state.recent_topics.pop(0)

    update_difficulty()

    st.session_state.current_question = choose_next_question()
    st.session_state.start_time = time.time()
    st.experimental_rerun()
"""

with open("app.py", "w") as f:
    f.write(code)


# Content Generator

In [None]:
# Explanation-style settings passed to the content generator; each value is on
# a 1-5 scale.
user_preferences = dict(
    complexity=3,          # 1 = very simple, 5 = highly complex
    length=3,              # 1 = very short, 5 = very long
    real_life_examples=3,  # 1 = none, 5 = many detailed examples
)


In [None]:
import pickle
import faiss
import numpy as np
import time
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from google import genai
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
)
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib import colors

# ------------------------------
# Helper: Markdown to HTML Conversion
# ------------------------------
def convert_markdown_to_html(text):
    """
    Translate the small markdown subset used in explanations into HTML markup.

    ``**text**`` becomes ``<b>text</b>`` (bold spans do not cross line breaks),
    and every newline character becomes a ``<br />`` tag.
    """
    with_bold_tags = re.sub(r"\*\*(.*?)\*\*", r"<b>\1</b>", text)
    # Joining the newline-separated pieces inserts one <br /> per newline.
    return "<br />".join(with_bold_tags.split("\n"))


# ------------------------------
# Load Models, FAISS Index & Data
# ------------------------------

# Load the FAISS index and document metadata
# NOTE: pickle.load can execute arbitrary code; only load the files written by
# the chunking cell of this notebook, never pickles from an untrusted source.
with open('faiss_index.pkl', 'rb') as f:
    faiss_index = pickle.load(f)
with open('documents.pkl', 'rb') as f:
    documents_metadata = pickle.load(f)

# Initialize SBERT and Gemini
sbert_model = SentenceTransformer('all-mpnet-base-v2')

# Prefer the GEMINI_API_KEY environment variable so the key is not stored in
# the notebook; the placeholder is kept as a fallback for manual editing.
import os

gemini_api_key = os.environ.get("GEMINI_API_KEY", "Your_Gemini_API")
client = genai.Client(api_key=gemini_api_key)

# ------------------------------
# Core Functions
# ------------------------------

def get_context_for_question(question_text, documents_metadata):
    """
    Look up the stored chunk closest to a question.

    The question is embedded with the module-level SBERT model and the single
    nearest neighbour is fetched from the module-level FAISS index.

    Args:
        question_text: Question to find supporting material for.
        documents_metadata: Sequence of chunk dicts ("text", "page") indexed in
            the same order as the FAISS index rows.

    Returns:
        Tuple ``(text, page)`` of the best-matching chunk.
    """
    # FAISS takes a 2-D float32 batch, so wrap the single vector as one row.
    query_batch = np.float32(sbert_model.encode([question_text])[0])[np.newaxis, :]

    _distances, neighbour_ids = faiss_index.search(query_batch, k=1)
    best_match = documents_metadata[neighbour_ids[0][0]]
    return best_match['text'], best_match['page']

def analyze_student_response(question, student_response, correct_answer, context, preferences):
    """
    Generate a personalized explanation for one answered question.

    Args:
        question: Question text.
        student_response: The answer the student gave.
        correct_answer: The correct answer.
        context: Learning-material excerpt related to the question.
        preferences: Dict with optional integer keys "complexity", "length" and
            "real_life_examples" (1-5). Missing or unrecognised values fall back
            to the middle setting (3).

    Returns:
        The stripped explanation text from Gemini, or
        "No explanation generated." when the response carries no text.
    """
    # Define mappings for clarity in the prompt
    complexity_mapping = {
        1: "very simple",
        2: "simple",
        3: "moderately complex",
        4: "complex",
        5: "highly complex"
    }
    length_mapping = {
        1: "very short",
        2: "short",
        3: "moderate",
        4: "long",
        5: "very long"
    }
    examples_mapping = {
        1: "no real-life examples",
        2: "small real life example",
        3: "one real life example",
        4: "one real life example",
        5: "two real life examples"
    }

    def _describe(mapping, key):
        # An out-of-range preference would otherwise put the word "None" into
        # the prompt; use the middle setting instead.
        return mapping.get(preferences.get(key, 3), mapping[3])

    # Use preferences provided by the user
    complexity = _describe(complexity_mapping, "complexity")
    length = _describe(length_mapping, "length")
    examples = _describe(examples_mapping, "real_life_examples")

    # Build the prompt with user preferences
    prompt = f"""
    As an educational assistant, analyze this student's response and provide a clean,direct and concise explanation.
    Please ensure the explanation is {complexity} in complexity/depth of knowledge, {length} in length, and includes {examples}.

    Question: {question}
    Student's Response: {student_response}
    Correct Answer: {correct_answer}
    Context from Learning Material: {context}

    Explain the concept related to the question and context, balancing detail and brevity.
    Provide only the essential explanation without prefatory comments.
    Focus on the quality of the explanation.
    If there is an exceptionally good fun fact related to the concept, include it.
    Start the real-life example and fun fact on a new line for clarity.
    """

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt]
    )
    # The response may carry no text; mirror generate_mcqs_from_topic and
    # return a placeholder instead of failing on None.strip().
    explanation = response.text
    return explanation.strip() if explanation else "No explanation generated."

def process_student_responses(responses_df, user_preferences):
    """
    Build personalized feedback for every row of the quiz responses.

    Args:
        responses_df: DataFrame with the columns 'question', 'user_response'
            and 'correct_answer'.
        user_preferences: Preference dict forwarded to
            ``analyze_student_response``.

    Returns:
        DataFrame with one row per response holding the question, the student's
        response, the correct answer, the page of the matched context, the
        generated explanation and an ``is_correct`` flag.
    """
    feedback_records = []

    for _, response in responses_df.iterrows():
        question = response['question']
        given_answer = response['user_response']
        expected_answer = response['correct_answer']

        # Closest stored chunk (and its page) for this question.
        context, page_num = get_context_for_question(question, documents_metadata)

        explanation = analyze_student_response(
            question, given_answer, expected_answer, context, user_preferences
        )

        record = {
            'question': question,
            'student_response': given_answer,
            'correct_answer': expected_answer,
            'page_number': page_num,
            'personalized_explanation': explanation,
            # Answers are compared as text, ignoring surrounding whitespace.
            'is_correct': given_answer.strip() == expected_answer.strip(),
        }
        feedback_records.append(record)

        # Pause between requests to avoid rate limiting.
        time.sleep(2)

    return pd.DataFrame(feedback_records)

def _page_sort_key(page):
    """Sort key placing numeric pages first (by value), then other tags (by text)."""
    try:
        return (0, float(page), "")
    except (TypeError, ValueError):
        return (1, 0.0, str(page))


def generate_summary_report(analysis_df):
    """
    Generate a summary report of the student's performance.

    Args:
        analysis_df: DataFrame with at least the columns 'is_correct' and
            'page_number' (as produced by ``process_student_responses``).

    Returns:
        Dict with 'total_questions', 'correct_answers',
        'performance_percentage' (0.0 when there are no questions) and
        'pages_needing_review' (distinct pages of incorrectly answered
        questions, numeric pages first, then non-numeric tags such as "table").
    """
    total_questions = len(analysis_df)
    if total_questions == 0:
        # Nothing answered: avoid dividing by zero.
        return {
            'total_questions': 0,
            'correct_answers': 0,
            'performance_percentage': 0.0,
            'pages_needing_review': []
        }

    is_correct = analysis_df['is_correct'].astype(bool)
    correct_answers = int(is_correct.sum())
    performance = (correct_answers / total_questions) * 100

    topics_needing_review = analysis_df[~is_correct]['page_number'].tolist()

    return {
        'total_questions': total_questions,
        'correct_answers': correct_answers,
        'performance_percentage': performance,
        # Page values mix integers with string tags, which plain sorted()
        # cannot compare; order them with an explicit key.
        'pages_needing_review': sorted(set(topics_needing_review), key=_page_sort_key)
    }

# ------------------------------
# Beautiful PDF Generation
# ------------------------------

def generate_beautiful_pdf_report(analysis_df, summary, output_filename="refined_student_analysis.pdf"):
    """
    Generate a refined PDF report that is well-structured, beautiful, and easy to read.

    The report has a title, a summary table followed by a page break, and a
    "Detailed Analysis" section with one block of paragraphs per row of
    ``analysis_df``.

    Args:
        analysis_df: DataFrame with the columns ``question``,
            ``student_response``, ``correct_answer``, ``page_number``,
            ``personalized_explanation`` and ``is_correct``.
        summary: dict with the keys ``total_questions``, ``correct_answers``,
            ``performance_percentage`` and ``pages_needing_review``.
        output_filename: Path of the PDF file to write.

    Returns:
        None. The PDF is written to ``output_filename`` and a confirmation
        line is printed.
    """
    # Standard-library helper, imported locally so the block is self-contained.
    from xml.sax.saxutils import escape

    def _safe(value):
        # Paragraph interprets its text as XML-like markup, so literal '&',
        # '<' and '>' coming from the data (e.g. a question such as
        # "Is x < 5?") must be escaped before being placed next to real tags.
        return escape(str(value))

    doc = SimpleDocTemplate(output_filename,
                            pagesize=letter,
                            rightMargin=72, leftMargin=72,
                            topMargin=72, bottomMargin=72)
    styles = getSampleStyleSheet()

    # Define custom styles
    title_style = ParagraphStyle(
        'TitleStyle',
        parent=styles['Title'],
        fontSize=24,
        leading=28,
        alignment=1,  # center-aligned
        spaceAfter=24,
    )
    heading_style = ParagraphStyle(
        'HeadingStyle',
        parent=styles['Heading2'],
        fontSize=18,
        leading=22,
        spaceAfter=12,
    )
    normal_style = ParagraphStyle(
        'NormalStyle',
        parent=styles['BodyText'],
        fontSize=12,
        leading=16,
        spaceAfter=10,
    )

    story = []

    # Title Page
    story.append(Paragraph("Student Performance Analysis", title_style))
    story.append(Spacer(1, 0.5*inch))

    # Summary Section
    story.append(Paragraph("Summary", heading_style))
    summary_data = [
        ["Total Questions:", summary['total_questions']],
        ["Correct Answers:", summary['correct_answers']],
        ["Performance:", f"{summary['performance_percentage']:.2f}%"],
        ["Pages Needing Review:", ", ".join(map(str, summary['pages_needing_review'])) if summary['pages_needing_review'] else "None"]
    ]
    table = Table(summary_data, colWidths=[200, 250])
    # NOTE(review): the row-0 rules below give the "Total Questions" row the
    # header look (light blue background, whitesmoke bold text) because the
    # table has no separate header row — confirm this is intended.
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightblue),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('GRID', (0, 0), (-1, -1), 1, colors.grey)
    ]))
    story.append(table)
    story.append(PageBreak())

    # Detailed Analysis Section
    story.append(Paragraph("Detailed Analysis", heading_style))
    story.append(Spacer(1, 0.2*inch))

    # For each question, add a clearly formatted block
    for idx, row in analysis_df.iterrows():
        story.append(Paragraph(f"<b>Question:</b> {_safe(row['question'])}", normal_style))
        story.append(Paragraph(f"<b>Your Response:</b> {_safe(row['student_response'])}", normal_style))
        story.append(Paragraph(f"<b>Correct Answer:</b> {_safe(row['correct_answer'])}", normal_style))
        story.append(Paragraph(f"<b>Page Number:</b> {_safe(row['page_number'])}", normal_style))

        # Convert markdown-style bold markers in the personalized explanation to HTML bold tags.
        # The result is markup, so it is deliberately not escaped here.
        # NOTE(review): confirm convert_markdown_to_html escapes stray '<'/'&'
        # in the model output before adding its own tags.
        personalized_explanation_html = convert_markdown_to_html(row['personalized_explanation'])
        story.append(Paragraph(f"<b>Personalized Explanation:</b> {personalized_explanation_html}", normal_style))
        story.append(Paragraph(f"<b>Is Correct:</b> {_safe(row['is_correct'])}", normal_style))
        story.append(Spacer(1, 0.3*inch))

    doc.build(story)
    print(f"Beautiful PDF generated: {output_filename}")

# ------------------------------
# Function to Collect User Feedback for Preferences
# ------------------------------

def collect_user_feedback():
    """
    Ask the user, interactively, how explanations should be tailored.

    Three ratings are requested in order (complexity, length, number of
    real-life examples). Each prompt is repeated until an integer from 1 to 5
    is entered.

    Returns:
        dict mapping ``"complexity"``, ``"length"`` and
        ``"real_life_examples"`` to the chosen integer ratings.
    """
    print("\nPlease provide your preferences for the explanation content.")

    def get_valid_input(prompt_text):
        # Re-prompt until the answer parses as an integer within 1..5.
        while True:
            try:
                rating = int(input(prompt_text))
            except ValueError:
                print("Invalid input. Please enter an integer between 1 and 5.")
                continue
            if not (1 <= rating <= 5):
                print("Please enter a number between 1 and 5.")
                continue
            return rating

    # (preference key, prompt) pairs, asked in this order.
    questions = (
        ("complexity",
         "On a scale of 1-5, how complex should the explanation be? (1: very simple, 5: highly complex): "),
        ("length",
         "On a scale of 1-5, what length do you prefer for the explanation? (1: very short, 5: very long): "),
        ("real_life_examples",
         "On a scale of 1-5, how many real-life examples should be included? (1: none, 5: many detailed examples): "),
    )
    user_preferences = {key: get_valid_input(text) for key, text in questions}

    print("\nThank you for your feedback!")
    return user_preferences

# ------------------------------
# Main Execution
# ------------------------------

def main(responses_path='Responses.csv', output_filename="refined_student_analysis.pdf"):
    """
    Run the full pipeline: load responses, analyze them, and write the PDF report.

    Args:
        responses_path: Path of the CSV file holding the student responses.
        output_filename: Path of the PDF report to write.

    Returns:
        None. Progress and the summary are printed to the console.
    """
    # Read student responses from CSV
    responses_df = pd.read_csv(responses_path)

    # Collect user feedback for explanation preferences (interactive prompt)
    user_preferences = collect_user_feedback()

    # Process responses to generate personalized feedback using user preferences
    print("\nAnalyzing student responses and generating personalized feedback...")
    analysis_df = process_student_responses(responses_df, user_preferences)

    # Generate summary report
    summary = generate_summary_report(analysis_df)

    # Generate the final, beautiful PDF report
    generate_beautiful_pdf_report(analysis_df, summary, output_filename=output_filename)

    # Print summary to console
    print("\nAnalysis Summary:")
    print(f"Total Questions: {summary['total_questions']}")
    print(f"Correct Answers: {summary['correct_answers']}")
    print(f"Performance: {summary['performance_percentage']:.2f}%")
    print(f"Pages Needing Review: {summary['pages_needing_review']}")
    # Report the actual output path rather than a second hard-coded copy of it.
    print(f"\nDetailed analysis saved to '{output_filename}'")

# Run the pipeline only when this code is executed as the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



Please provide your preferences for the explanation content.
On a scale of 1-5, how complex should the explanation be? (1: very simple, 5: highly complex): 5
On a scale of 1-5, what length do you prefer for the explanation? (1: very short, 5: very long): 5
On a scale of 1-5, how many real-life examples should be included? (1: none, 5: many detailed examples): 2

Thank you for your feedback!

Analyzing student responses and generating personalized feedback...
Beautiful PDF generated: refined_student_analysis.pdf

Analysis Summary:
Total Questions: 10
Correct Answers: 3
Performance: 30.00%
Pages Needing Review: [3, 10, 11, 12, 13, 14]

Detailed analysis saved to 'refined_student_analysis.pdf'
