In [1]:
#!pip3 install google-generativeai PyMuPDF sentence-transformers faiss-cpu opencv-python reportlab pillow


In [2]:
import os, re, json, glob
import fitz  # PyMuPDF
import numpy as np
from pathlib import Path
from typing import List, Dict, Any
from PIL import Image

# RAG / Embeddings
from sentence_transformers import SentenceTransformer, util
import faiss

# PDF reports
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4, inch

# Gemini
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# -------------------------
# Configure Gemini
# -------------------------
# SECURITY: the API key must never be stored in the notebook. It is read from
# the GOOGLE_API_KEY environment variable; when that is unset the user is
# prompted for it without echoing. Any key that was previously committed in
# this cell must be treated as leaked and rotated.
from getpass import getpass

_gemini_api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if not _gemini_api_key:
    _gemini_api_key = getpass("GOOGLE_API_KEY: ").strip()
    if not _gemini_api_key:
        raise RuntimeError("GOOGLE_API_KEY is required to call Gemini.")
    # Keep the environment variable populated for the rest of the session,
    # as the previous version of this cell did.
    os.environ["GOOGLE_API_KEY"] = _gemini_api_key
genai.configure(api_key=_gemini_api_key)
GEMINI_MODEL = genai.GenerativeModel("gemini-2.5-flash")

In [4]:
# -------------------------
# Utility: PDF helpers
# -------------------------
def pdf_to_images(pdf_path: str, out_dir: str = "data/tmp_pages", dpi: int = 150) -> List[str]:
    """
    Render every page of a PDF to an image file and return the image paths.

    Args:
        pdf_path: Path of the PDF to render.
        out_dir: Directory that receives the images; created if missing.
        dpi: Resolution passed to the page renderer.

    Returns:
        Image paths in page order, named "<pdf stem>_page<N>.jpg" with N
        starting at 1.
    """
    os.makedirs(out_dir, exist_ok=True)
    base = Path(pdf_path).stem
    im_paths = []
    # Open the document in a context manager (as read_pdf_text does) so the
    # file handle is released even when rendering or saving a page fails.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=dpi)
            ipath = os.path.join(out_dir, f"{base}_page{i}.jpg")
            pix.save(ipath)
            im_paths.append(ipath)
    return im_paths

def read_pdf_text(pdf_path: str) -> str:
    """
    Return the plain text of a PDF (question paper / subject book).

    The text of each page is collected in page order and the pages are joined
    with a single newline.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "\n".join(page_texts)

In [5]:
# ============================================================
# Agent 1 — Document Intelligence (Gemini Vision, no OCR)
# Extracts per-page text + diagram descriptions as JSON.
# ============================================================
def extract_student_pages_json(student_pdf: str) -> List[Dict[str, Any]]:
    """
    Extract text and diagram descriptions from each page of a student's PDF.

    For each page of the student's PDF:
      - Convert to image
      - Ask Gemini to extract all text + describe diagrams
      - Collect the reply as a dict with keys page, text, diagrams

    Args:
        student_pdf: Path of the student's answer-script PDF.

    Returns:
        A list of page dicts. When a reply is a JSON object it is used as-is
        (with "page" defaulted to the 1-based page index); when it is a JSON
        list every dict element is kept; when it is not valid JSON the reply
        text itself becomes the page's "text".
    """
    # The prompt does not depend on the page, so build it once.
    prompt = (
        "Extract ALL meaningful information from this answer sheet page.\n"
        "Return JSON with keys: page, text, diagrams.\n"
        "text: the handwritten/printed content verbatim (best effort).\n"
        "diagrams: short descriptions of any diagrams/equations tables.\n"
        "If nothing present for a key, use empty string."
    )
    page_images = pdf_to_images(student_pdf)
    pages = []
    for i, img_path in enumerate(page_images, start=1):
        # Close the image file as soon as the request has been made.
        with Image.open(img_path) as img:
            resp = GEMINI_MODEL.generate_content([prompt, img])
        raw = resp.text.strip()
        # Replies are frequently wrapped in a Markdown code fence
        # (```json ... ```); unwrap it so json.loads sees the payload.
        fenced = re.match(r"^```[A-Za-z]*\s*(.*?)\s*```$", raw, flags=re.S)
        if fenced:
            raw = fenced.group(1)
        try:
            data = json.loads(raw)
        except ValueError:
            # Not JSON (json.JSONDecodeError is a ValueError): keep the raw
            # reply as the page text rather than losing the page.
            pages.append({"page": i, "text": raw, "diagrams": ""})
            continue
        if isinstance(data, dict):
            data.setdefault("page", i)
            pages.append(data)
        elif isinstance(data, list):
            # A list reply may hold several records; keep the dict ones and
            # make sure each carries a page number.
            for d in data:
                if isinstance(d, dict):
                    d.setdefault("page", i)
                    pages.append(d)
        else:
            pages.append({"page": i, "text": raw, "diagrams": ""})
    return pages

def merge_student_pages(pages_json: List[Dict[str, Any]]) -> Dict[str, str]:
    """
    Collapse the per-page records into one student-level blob.

    Every non-empty "text" / "diagrams" value is prefixed with its page tag
    ("[Page N]", or "[Page ?]" when the record has no page) and the tagged
    entries are joined with newlines.
    """
    records = list(pages_json)

    def tagged(field: str) -> str:
        # Build the newline-joined, page-tagged view of one field.
        entries = []
        for record in records:
            value = record.get(field, "")
            if not value:
                continue
            entries.append(f"[Page {record.get('page', '?')}] {value}")
        return "\n".join(entries).strip()

    merged = {}
    merged["text"] = tagged("text")
    merged["diagrams"] = tagged("diagrams")
    return merged


In [6]:
# ============================================================
# Agent 2 — Question & Structure Mapper
# Parse Question Paper → parts, qno, marks, question.
# Map student content → question numbers.
# ============================================================
def _parse_question_lines(qp_text: str) -> List[Dict[str, Any]]:
    """
    Regex fallback: build the question schema line by line from raw text.

    A line containing "Part X (N marks" starts a new part and sets the marks
    applied to the questions that follow; a line containing just "Part X"
    starts a new part and keeps the current marks. A line that begins with a
    number followed by one of ")", ".", ":" or "-" is a question. Questions
    seen before any marks heading get 1 mark.
    """
    items = []
    part = "Unknown"
    default_marks = 0
    for ln in qp_text.splitlines():
        ln = ln.strip()
        if not ln:
            continue
        # The word boundaries matter: without them (and with re.I) "Part"
        # followed by any letter matches inside ordinary words such as
        # "particle" or "department", and the question line is swallowed
        # as a part heading. Labels are one letter or a roman numeral.
        mpart = re.search(r"\bPart\s*([IVX]+|[A-Z])\s*\((\d+)\s*marks", ln, flags=re.I)
        if mpart:
            part = f"Part {mpart.group(1).upper()}"
            default_marks = int(mpart.group(2))
            continue
        mpart2 = re.search(r"\bPart\s*([IVX]+|[A-Z])\b", ln, flags=re.I)
        if mpart2:
            part = f"Part {mpart2.group(1).upper()}"
            continue
        mq = re.match(r"(\d+)[\).:-]\s*(.+)", ln)
        if mq:
            items.append({
                "part": part,
                "qno": int(mq.group(1)),
                "marks": default_marks or 1,
                "question": mq.group(2).strip(),
            })
    return items


def parse_question_paper_to_schema(qpaper_pdf: str) -> List[Dict[str, Any]]:
    """
    Parse a question-paper PDF into a list of question records.

    The PDF text is sent to Gemini, which is asked for a JSON array. If the
    reply cannot be converted into clean records, the text is parsed with a
    line-based regex fallback instead.

    Args:
        qpaper_pdf: Path of the question-paper PDF.

    Returns:
        A list of items: {part, qno, marks, question}. Items from the Gemini
        path are sorted by qno; fallback items are in document order.
    """
    qp_text = read_pdf_text(qpaper_pdf)
    prompt = (
        "Parse the following exam question paper text. "
        "Extract a JSON array where each element has keys: "
        'part (e.g., "Part A"), qno (integer), marks (integer), question (string). '
        "If marks per part are specified in headings (e.g., 'Part A (2 marks each)'), "
        "apply that to each question in that part. Ensure clean integers.\n\n"
        f"Question Paper Text:\n{qp_text}"
    )
    resp = GEMINI_MODEL.generate_content(prompt)
    raw = resp.text.strip()
    # Unwrap a Markdown code fence (```json ... ```) so json.loads sees the
    # payload instead of failing and discarding a perfectly good reply.
    fenced = re.match(r"^```[A-Za-z]*\s*(.*?)\s*```$", raw, flags=re.S)
    if fenced:
        raw = fenced.group(1)
    try:
        data = json.loads(raw)
        # minimal sanity check & cleanup
        cleaned = [
            {
                "part": str(d.get("part", "")).strip() or "Unknown",
                "qno": int(d.get("qno", 0)),
                "marks": int(d.get("marks", 0)),
                "question": str(d.get("question", "")).strip(),
            }
            for d in data
        ]
        cleaned.sort(key=lambda x: x["qno"])
        return cleaned
    except Exception:
        # Deliberate best effort: any malformed reply (invalid JSON, wrong
        # shape, non-numeric qno/marks) falls back to the regex parser.
        return _parse_question_lines(qp_text)

def map_student_answers_to_questions(student_blob: Dict[str, str],
                                     schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Use Gemini to align the student's merged text to each question number.

    Args:
        student_blob: Dict with the merged "text" and "diagrams" of the script.
        schema: Question records ({part, qno, marks, question}).

    Returns:
        List of {part,qno,marks,question,student_answer,student_diagrams},
        sorted by qno. Entries whose qno is not in the schema are dropped and
        schema questions missing from the reply are added with empty answers.
        If the reply cannot be used at all, every question receives the whole
        script text and diagram notes.
    """
    # Create a compact schema string
    schema_str = "\n".join([f"{it['part']} | Q{it['qno']} | {it['marks']} marks | {it['question']}"
                            for it in schema])

    prompt = (
        "You are aligning an answer script to a question paper. "
        "Given the list of questions and the student's entire answer text & diagram notes, "
        "split and map the student's content to each question number. "
        "Return a JSON array where each element has: "
        '{"part": "...", "qno": ..., "marks": ..., "question": "...", '
        '"student_answer": "...", "student_diagrams": "..."} '
        "If a question is unanswered, set student_answer to empty string."
        "\n\nQuestions:\n"
        f"{schema_str}\n\n"
        "Student Answer Text:\n"
        f"{student_blob.get('text','')}\n\n"
        "Student Diagram Notes:\n"
        f"{student_blob.get('diagrams','')}"
    )

    resp = GEMINI_MODEL.generate_content(prompt)
    raw = resp.text.strip()
    # Unwrap a Markdown code fence (```json ... ```) so json.loads sees the
    # payload; otherwise a valid mapping is thrown away for the fallback.
    fenced = re.match(r"^```[A-Za-z]*\s*(.*?)\s*```$", raw, flags=re.S)
    if fenced:
        raw = fenced.group(1)
    try:
        data = json.loads(raw)
        mapped = []
        for it in data:
            mapped.append({
                "part": it.get("part", "Unknown"),
                "qno": int(it.get("qno", 0)),
                "marks": int(it.get("marks", 0)),
                # `or ""` tolerates JSON null: a null answer must mean
                # "unanswered", not an exception that discards the mapping.
                "question": str(it.get("question") or "").strip(),
                "student_answer": str(it.get("student_answer") or "").strip(),
                "student_diagrams": str(it.get("student_diagrams") or "").strip(),
            })
        # keep only those present in schema qnos
        valid_qnos = {s["qno"] for s in schema}
        mapped = [m for m in mapped if m["qno"] in valid_qnos]
        # fill in missing qnos with blanks if needed
        present = {m["qno"] for m in mapped}
        for s in schema:
            if s["qno"] not in present:
                mapped.append({
                    "part": s["part"], "qno": s["qno"], "marks": s["marks"],
                    "question": s["question"], "student_answer": "", "student_diagrams": ""
                })
        mapped.sort(key=lambda x: x["qno"])
        return mapped
    except Exception:
        # Deliberate best effort: when the reply is unusable, hand the whole
        # script to every question so grading can still proceed.
        return [
            {
                "part": s["part"], "qno": s["qno"], "marks": s["marks"], "question": s["question"],
                "student_answer": student_blob.get("text", ""),
                "student_diagrams": student_blob.get("diagrams", ""),
            }
            for s in schema
        ]


In [7]:
# ============================================================
# Agent 3 — Concept Retriever (RAG) + Grading Brain (Gemini)
# Build FAISS index over subject book; grade per mapped Q.
# ============================================================
# Sentence-embedding model, loaded once when this cell runs. build_rag_index
# encodes the subject-book chunks with it and retrieve_context encodes the
# query with it, so both sides of the search share the same vector space.
EMBED = SentenceTransformer("all-MiniLM-L6-v2")

def chunk_text(text: str, words_per_chunk: int = 250) -> List[str]:
    """
    Split text into consecutive chunks of at most `words_per_chunk` words.

    Words are whatever `str.split()` yields; each chunk is its words joined
    by single spaces. Text without any words produces an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), words_per_chunk):
        window = tokens[start:start + words_per_chunk]
        pieces.append(" ".join(window))
    return pieces

def build_rag_index(subject_book_pdf: str):
    """
    Build the retrieval index over the subject book.

    The book text is split into 250-word chunks (or kept as a single chunk
    when splitting yields nothing), each chunk is embedded with EMBED, and the
    vectors are added to a FAISS flat L2 index.

    Returns:
        (chunks, index): the chunk strings and the index holding their vectors.
    """
    book_text = read_pdf_text(subject_book_pdf)
    # Fall back to the raw text so there is always at least one chunk.
    passages = chunk_text(book_text, 250) or [book_text]
    embeddings = EMBED.encode(passages)
    dimension = embeddings.shape[1]
    store = faiss.IndexFlatL2(dimension)
    store.add(np.array(embeddings))
    return passages, store

def retrieve_context(query: str, chunks: List[str], index, k: int = 3) -> str:
    """
    Return the text of the chunks nearest to `query`, joined by newlines.

    Args:
        query: Text to search for (embedded with EMBED).
        chunks: Chunk strings, in the same order their vectors were indexed.
        index: Search index exposing `search(vectors, k)`.
        k: Maximum number of chunks to return.

    Returns:
        Up to `k` chunk strings in the order the index returned them, or an
        empty string when there is nothing to search.
    """
    if not chunks or k <= 0:
        return ""
    # Never ask for more neighbours than there are chunks.
    k = min(k, len(chunks))
    qv = EMBED.encode([query])
    _, neighbours = index.search(np.array(qv), k)
    # FAISS reports missing neighbours as -1; indexing chunks with -1 would
    # silently return the last chunk, so out-of-range ids are skipped.
    selected = [chunks[i] for i in neighbours[0] if 0 <= i < len(chunks)]
    return "\n".join(selected)

def grade_one_question(item: Dict[str, Any], context: str) -> Dict[str, Any]:
    """
    Grade one mapped question with Gemini against textbook context.

    Args:
        item: {part,qno,marks,question,student_answer,student_diagrams}
        context: Textbook excerpts relevant to the question.

    Returns:
        A copy of `item` with two extra keys: "score" (float, clamped to the
        range 0..marks) and "feedback" (str). `item` itself is not modified.
    """
    prompt = (
        "You are an expert teacher. Grade the student's answer using the reference context from the textbook. "
        "Be concise and fair.\n\n"
        # item['part'] already reads like "Part A"; do not prefix "Part" again.
        f"Question ({item['part']}): Q{item['qno']} ({item['marks']} marks)\n"
        f"{item['question']}\n\n"
        "Textbook Context (relevant excerpts):\n"
        f"{context}\n\n"
        "Student Answer:\n"
        f"{item.get('student_answer','')}\n\n"
        "Student Diagram Description:\n"
        f"{item.get('student_diagrams','')}\n\n"
        f"Instructions:\n- Give a numeric score out of {item['marks']}.\n"
        "- Explain briefly: what is correct, what is missing, and how to improve.\n"
        "- Respond in this JSON format strictly:\n"
        '{"score": <number>, "feedback": "<one short paragraph>"}'
    )
    resp = GEMINI_MODEL.generate_content(prompt)
    raw = resp.text.strip()
    # Unwrap a Markdown code fence (```json ... ```) so the JSON reply is
    # parsed instead of being dumped verbatim into the feedback.
    fenced = re.match(r"^```[A-Za-z]*\s*(.*?)\s*```$", raw, flags=re.S)
    if fenced:
        raw = fenced.group(1)
    score = 0.0
    feedback = raw
    try:
        data = json.loads(raw)
        score = float(data.get("score", 0))
        feedback = str(data.get("feedback", "")).strip() or raw
    except Exception:
        # Heuristic for non-JSON replies: prefer a number labelled "score";
        # only then fall back to the first number in the text, which may be
        # something unrelated such as the question number.
        ms = (re.search(r'"?score"?\s*[:=]\s*(\d+(?:\.\d+)?)', raw, flags=re.I)
              or re.search(r"(\d+(?:\.\d+)?)", raw))
        if ms:
            score = float(ms.group(1))
    # Clamp score between 0 and marks
    score = max(0.0, min(score, float(item["marks"])))
    out = dict(item)
    out.update({"score": score, "feedback": feedback})
    return out

def grade_all(mapped_items: List[Dict[str, Any]], chunks: List[str], index) -> List[Dict[str, Any]]:
    """
    Grade every mapped question, in order.

    For each item the textbook context is retrieved with the question text
    (top 3 chunks) and the item is graded against that context.
    """
    return [
        grade_one_question(
            entry,
            retrieve_context(entry["question"], chunks, index, k=3),
        )
        for entry in mapped_items
    ]


In [8]:
# ============================================================
# Agent 4 — Report Generator (per-student PDF)
# ============================================================
def generate_student_report(student_name: str, graded_items: List[Dict[str, Any]], out_dir="reports"):
    """
    Write a per-student PDF report and return its path.

    The report lists the graded questions grouped by part (sorted by part,
    then qno). Each question shows its marks, its score and up to 8 wrapped
    lines of feedback; the last line is the total score over total marks.

    Args:
        student_name: Used in the title, page header and output file name.
        graded_items: Graded question dicts; each needs "part", "qno",
            "marks" and "score", and may carry "feedback".
        out_dir: Directory that receives "<student_name>.pdf"; created if
            missing.

    Returns:
        Path of the written PDF.
    """
    os.makedirs(out_dir, exist_ok=True)
    pdf_path = os.path.join(out_dir, f"{student_name}.pdf")
    pdf = canvas.Canvas(pdf_path, pagesize=A4)
    pdf.setTitle(f"Scribify AI — {student_name}")

    left = 1 * inch
    top_y = 10.6 * inch
    bottom_y = 1.0 * inch      # nothing is drawn below this margin
    wrap_width = 110           # rough character budget per feedback line
    max_feedback_lines = 8     # feedback beyond this is not printed

    def start_page() -> float:
        # Draw the page header and return the y position of the first line.
        pdf.setFont("Helvetica-Bold", 16)
        pdf.drawString(left, 11 * inch, f"Scribify AI Report — {student_name}")
        return top_y

    def draw_line(y: float, text: str, font: str, size: int, advance: float) -> float:
        # Draw one line, breaking to a new page first when the bottom margin
        # has been reached, and return the y position for the next line.
        # The font is set on every call because a page break happens here.
        if y < bottom_y:
            pdf.showPage()
            y = start_page()
        pdf.setFont(font, size)
        pdf.drawString(left, y, text)
        return y - advance

    y = start_page()
    total_scored = 0.0
    total_marks = 0.0
    current_part = None

    # Sort by part, then qno
    for gi in sorted(graded_items, key=lambda x: (x["part"], x["qno"])):
        # Start a question on a fresh page when little room is left, so its
        # heading is not stranded at the bottom of the previous page.
        if y < 1.5 * inch:
            pdf.showPage()
            y = start_page()

        # Section per Part
        if gi["part"] != current_part:
            current_part = gi["part"]
            y = draw_line(y, f"{current_part}", "Helvetica-Bold", 13, 0.25 * inch)

        # Question line
        y = draw_line(
            y,
            f"Q{gi['qno']}  [{gi['marks']} marks] — Score: {gi['score']:.1f}",
            "Helvetica-Bold", 11, 0.2 * inch,
        )

        # Feedback, wrapped roughly by character count
        fb = str(gi.get("feedback", "") or "").replace("\n", " ").strip()
        wrapped = []
        line = ""
        for word in fb.split():
            if len(line) + len(word) + 1 < wrap_width:
                line = f"{line} {word}".strip()
            else:
                # Guard against emitting an empty line when the very first
                # word is already longer than the wrap width.
                if line:
                    wrapped.append(line)
                line = word
        if line:
            wrapped.append(line)
        # Each line checks the bottom margin itself, so long feedback flows
        # onto the next page instead of running off the sheet.
        for w in wrapped[:max_feedback_lines]:
            y = draw_line(y, w, "Helvetica", 10, 0.18 * inch)
        y -= 0.08 * inch

        total_scored += float(gi["score"])
        total_marks += float(gi["marks"])

    # Totals
    draw_line(y, f"Total: {total_scored:.1f} / {total_marks:.1f}", "Helvetica-Bold", 12, 0)
    pdf.save()
    return pdf_path

In [9]:
# ============================================================
# End-to-End Driver
# ============================================================
def run_scribify_pipeline(
    question_paper_pdf: str = "data/question_paper.pdf",
    subject_book_pdf: str = "data/subject_book.pdf",
    students_glob: str = "data/students/*.pdf",
    limit_questions: int = None  # e.g., 5 to cap during demo
):
    """
    Run the whole grading pipeline and return the generated report paths.

    The subject-book index and the question schema are built once; then each
    student PDF matched by `students_glob` is extracted, mapped to the
    questions, graded and written out as a PDF report.

    Args:
        question_paper_pdf: Path of the question-paper PDF.
        subject_book_pdf: Path of the subject-book PDF used for retrieval.
        students_glob: Glob pattern matching the student answer PDFs.
        limit_questions: When truthy, only the first N schema questions are
            graded; None (or 0) grades all of them.

    Returns:
        Report paths, one per student, in sorted order of the student PDFs.
    """
    # Build RAG index for subject book once
    print("🔧 Building RAG index from subject book...")
    chunks, index = build_rag_index(subject_book_pdf)
    print(f"   -> Chunks in KB: {len(chunks)}")

    # Parse question paper to schema
    print("🧭 Parsing question paper...")
    schema = parse_question_paper_to_schema(question_paper_pdf)
    if limit_questions:
        schema = schema[:limit_questions]
    print(f"   -> Parsed {len(schema)} questions")

    # glob.glob returns matches in filesystem order, which is arbitrary;
    # sort so students are processed (and reports listed) reproducibly.
    student_pdfs = sorted(glob.glob(students_glob))
    if not student_pdfs:
        print(f"⚠️ No student PDFs matched: {students_glob}")

    # Iterate over students
    reports = []
    for student_pdf in student_pdfs:
        student_name = Path(student_pdf).stem
        print(f"\n📄 Student: {student_name}")

        # Agent 1: extract text + diagrams per page, then merge
        pages_json = extract_student_pages_json(student_pdf)
        merged = merge_student_pages(pages_json)

        # Agent 2: map student's merged content to question schema
        mapped = map_student_answers_to_questions(merged, schema)

        # Agent 3: grade all mapped questions using RAG + Gemini
        graded = grade_all(mapped, chunks, index)

        # Agent 4: generate PDF report
        pdf_path = generate_student_report(student_name, graded)
        print(f"✅ Report: {pdf_path}")
        reports.append(pdf_path)
    return reports

# ============================================================
# RUN — this cell executes the full pipeline when it is run.
# It needs the input PDFs under data/ and a configured Gemini API key,
# because the pipeline reads those files and calls GEMINI_MODEL.
# ============================================================
reports = run_scribify_pipeline(
    question_paper_pdf="data/question_paper.pdf",
    subject_book_pdf="data/subject_book.pdf",
    students_glob="data/students/*.pdf",
    limit_questions= None   # None grades every question; an int N grades only the first N
)
print("All reports:", reports)

🔧 Building RAG index from subject book...
   -> Chunks in KB: 540
🧭 Parsing question paper...


E0000 00:00:1761285249.857578 1396299 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


   -> Parsed 5 questions

📄 Student: Dharun355
✅ Report: reports/Dharun355.pdf
All reports: ['reports/Dharun355.pdf']
