In [None]:
!pip install gradio google-generativeai ddgs pypdf reportlab



In [None]:
import os
import time
import requests
import io
import re
import json
import gradio as gr
import google.generativeai as genai
from ddgs import DDGS
from pypdf import PdfReader

# ReportLab imports for PDF generation
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT
from reportlab.lib import colors

In [None]:
 #--- CONFIGURATION ---#
# NOTE: Set your API key here or use an input field in the UI
# os.environ["GEMINI_API_KEY"] = "YOUR_API_KEY_HERE"

def configure_gemini(api_key):
    """Hand the user's API key to the Gemini client library.

    Args:
        api_key: The Google Gemini API key. ``None``, an empty string, or a
            whitespace-only string is rejected.

    Returns:
        bool: ``True`` when a non-blank key was passed to ``genai.configure``,
        ``False`` when no usable key was supplied. The key itself is not
        validated against the API here.
    """
    # Keys pasted into a text box often carry stray spaces or a trailing
    # newline; strip them so a blank-looking value is not treated as a key.
    cleaned_key = api_key.strip() if isinstance(api_key, str) else api_key
    if not cleaned_key:
        return False
    genai.configure(api_key=cleaned_key)
    return True


In [None]:

# --- HELPER FUNCTIONS ---

def parse_pdf(file_obj_or_path):
    """Read the text of every page of a PDF.

    Args:
        file_obj_or_path: A path or file-like object accepted by ``PdfReader``,
            or ``None``.

    Returns:
        str: The extracted text of each page, each followed by a newline.
        Returns ``""`` when the argument is ``None``. If reading fails, returns
        a message of the form ``"Error reading PDF: <exception>"`` instead of
        raising.
    """
    if file_obj_or_path is None:
        return ""
    try:
        reader = PdfReader(file_obj_or_path)
        # `or ''` keeps one page without extractable text from failing the
        # whole document (assumes extract_text may return None on some
        # reader versions -- harmless if it never does).
        return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
    except Exception as e:
        return f"Error reading PDF: {e}"


In [None]:
def create_styled_pdf(content, filename, subject, university):
    """
    Render the compiled question bank as a formatted PDF using ReportLab.

    The document opens with a "Question Bank: <subject>" title and a
    "University: <university>" line, then ``content`` is laid out line by line:

    * lines starting with ``#`` become module headers,
    * lines starting with ``*`` or ``-`` become bulleted questions, shown in
      bold when they mention "important" or "repeated" (any letter case),
    * every other non-blank line becomes a normal paragraph.

    Args:
        content: Markdown-like text to render; ``None`` is treated as empty.
        filename: Output path passed to ``SimpleDocTemplate``.
        subject: Subject name shown in the title.
        university: University name shown beneath the title.

    Returns:
        The ``filename`` argument, once the document has been built.
    """
    # Local stdlib import so this block does not depend on the import cell.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(filename, pagesize=letter,
                            rightMargin=72, leftMargin=72,
                            topMargin=72, bottomMargin=18)

    styles = getSampleStyleSheet()

    # Title Style
    title_style = styles["Title"]
    title_style.textColor = colors.darkblue

    # Header Style (for Modules)
    header_style = ParagraphStyle(
        'ModuleHeader',
        parent=styles['Heading2'],
        fontSize=14,
        spaceAfter=12,
        spaceBefore=20,
        textColor=colors.black
    )
    # Question Style
    question_style = ParagraphStyle(
        'Question',
        parent=styles['BodyText'],
        fontSize=11,
        leading=14,
        spaceAfter=8,
        bulletIndent=10,
        leftIndent=20
    )

    # Paragraph interprets its text as markup (that is how the <b> tag below
    # works), so literal '<', '>' and '&' from user input or model output are
    # escaped before being placed in a Paragraph.
    story = [
        Paragraph(f"Question Bank: {escape(str(subject))}", title_style),
        Paragraph(f"University: {escape(str(university))}", styles["Normal"]),
        Spacer(1, 24),
    ]

    # Format the text content line by line.
    for raw_line in (content or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith('#'):
            # Drop only the leading header markers; a '#' inside the title
            # (e.g. "C#") is kept.
            heading = line.lstrip('#').strip()
            if heading:
                story.append(Paragraph(escape(heading), header_style))
        elif line.startswith(('*', '-')):
            # Strip the leading bullet marker and any Markdown emphasis
            # asterisks, but keep hyphens that are part of the question text.
            question = line.lstrip('*- \t').replace('*', '').strip()
            if not question:
                # e.g. a bare "---" separator line
                continue
            markup = escape(question)
            lowered = question.lower()
            # Highlight questions flagged as important or repeated.
            if "important" in lowered or "repeated" in lowered:
                markup = f"<b>{markup}</b>"
            # \u2022 is the bullet character.
            story.append(Paragraph(f"\u2022 {markup}", question_style))
        else:
            story.append(Paragraph(escape(line), styles['Normal']))

    doc.build(story)
    return filename


In [None]:
def search_question_papers(subject, university):
    """
    Search DuckDuckGo for previous-year question papers.

    Args:
        subject: Subject name to include in the query.
        university: University name to include in the query.

    Returns:
        list[str]: Up to five distinct result URLs whose path ends in ``.pdf``
        (case-insensitive, ignoring any query string or fragment), in the
        order the search returned them. Returns ``[]`` if the search raises.
    """
    query = f"{university} {subject} previous year question paper filetype:pdf"
    print(f"Agent searching for: {query}")
    pdf_links = []
    try:
        with DDGS() as ddgs:
            for result in ddgs.text(query, max_results=15):
                # .get() so one result without a link does not abort the
                # whole search via the except below.
                href = result.get('href') or ''
                # Compare on the part before '?'/'#' so "paper.PDF" and
                # "paper.pdf?download=1" are both recognised.
                path_part = href.split('#', 1)[0].split('?', 1)[0]
                if path_part.lower().endswith('.pdf') and href not in pdf_links:
                    pdf_links.append(href)
    except Exception as e:
        print(f"Search failed: {e}")
        return []
    return pdf_links[:5]

def download_and_parse_pdf_url(url):
    """Download a PDF from ``url`` and extract its text.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        str: The text of every page, each followed by a newline. Returns ``""``
        when the server does not answer with HTTP 200 or when downloading or
        parsing raises; the reason is printed in both cases.
    """
    try:
        print(f"Downloading: {url}")
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            # Report the status instead of failing silently.
            print(f"Failed to download {url}: HTTP {response.status_code}")
            return ""
        reader = PdfReader(io.BytesIO(response.content))
        # `or ''` keeps a page without extractable text from discarding the
        # whole file (assumes extract_text may return None on some versions).
        return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
    except Exception as e:
        print(f"Failed to read {url}: {e}")
    return ""

In [None]:
def identify_questions_gemini(text_chunk):
    """Ask Gemini to extract the exam questions from a chunk of raw text.

    Only the first 15000 characters of ``text_chunk`` are included in the
    prompt; anything beyond that is not sent.

    Args:
        text_chunk: Raw text taken from a question paper.

    Returns:
        str: The model's response text, or ``""`` when ``text_chunk`` is empty
        or ``None``, or when the model call fails (the error is printed).
    """
    # Nothing to parse: skip the API call entirely.
    if not text_chunk:
        return ""
    prompt = f"""
    You are an exam paper parser. Extract all exam questions from the following text.
    Ignore instructions, marks, headers, or watermarks. Just list the questions clearly.

    Text Chunk:
    {text_chunk[:15000]}
    """
    try:
        # Model construction is inside the try so that any failure to reach
        # the model yields the empty-string fallback rather than an exception.
        model = genai.GenerativeModel('gemini-2.5-flash')
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        # Best-effort extraction: report the failure, then fall back.
        print(f"Question extraction failed: {e}")
        return ""

def segregate_and_deduplicate(all_questions_text, syllabus_text):
    """
    Ask Gemini to organise the extracted questions by syllabus module.

    The prompt instructs the model to:
    1. Read the syllabus to identify the modules.
    2. Classify each question into a module.
    3. Merge semantically repeated questions and mark them as important.
    4. Return a Markdown-style document (``#`` headers, ``*`` bullets).

    Only the first 5000 characters of the syllabus are sent; the question
    text is sent in full.

    Args:
        all_questions_text: Questions extracted from all papers, as one string.
        syllabus_text: Text of the syllabus; ``None`` is treated as empty.

    Returns:
        str: The model's response text, or a message of the form
        ``"AI processing failed: <error>"`` if anything goes wrong.
    """
    try:
        # Everything that can fail is inside the try so callers always get
        # the documented failure string rather than an exception.
        syllabus_excerpt = (syllabus_text or "")[:5000]
        model = genai.GenerativeModel('gemini-2.5-flash')

        prompt = f"""
    I have a list of raw questions extracted from multiple papers and a syllabus text.
    Your goal is to organize these questions into a structured document.
    Task 1: Analyize the Syllabus text to understand the Modules.
    Task 2: Classify each extracted question into the correct Module.
    Task 3: If a question is repeated (semantically similar), merge them and mark it as 'IMPORTANT' (e.g., "Repeated 3 times").
    Task 4: Output the result as a clean, readable text document.
    Syllabus Text:
    {syllabus_excerpt}

    Raw Questions Extracted:
    {all_questions_text}
    Output Format:
    Use Markdown headers strictly.
    # [Module Name]
    * [Question Text] - (Status: Unique/Important/Repeated X times)
    * [Question Text]
    ...
    """
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"AI processing failed: {str(e)}"

In [None]:
# --- MAIN AGENT WORKFLOW ---
def run_research_agent(api_key, university, subject, syllabus_pdf):
    """Run the full research workflow, streaming progress to the UI.

    This is a generator: every step yields ``(status_log, file)`` where
    ``status_log`` is the cumulative log text and ``file`` is ``None`` until
    the final step, which yields the path of the generated PDF.

    Steps: configure Gemini, parse the syllabus PDF, search the web for
    question-paper PDFs, download each and extract its questions, have Gemini
    group them by module, and write the result to a PDF.

    Args:
        api_key: Google Gemini API key.
        university: University name used in the search and the PDF title.
        subject: Subject name used in the search and the PDF title.
        syllabus_pdf: The uploaded syllabus, as passed on to ``parse_pdf``.

    Yields:
        tuple: ``(status_log, pdf_path_or_None)``.
    """
    if not configure_gemini(api_key):
        # This function is a generator, so a plain `return value` would never
        # reach the UI; the message has to be yielded.
        yield "Please enter a valid API Key.", None
        return
    status_log = " Research Agent Started...\n"
    yield status_log, None

    # 1. Parse Syllabus
    if syllabus_pdf is None:
        status_log += " No Syllabus PDF provided.\n"
        yield status_log, None
        return

    # \U0001F4C4 is the "page facing up" emoji.
    status_log += "\U0001F4C4 Parsing Syllabus PDF...\n"
    yield status_log, None
    syllabus_text = parse_pdf(syllabus_pdf)

    # parse_pdf reports failures as an "Error reading PDF: ..." string, which
    # can be longer than 50 characters, so check for it explicitly.
    if syllabus_text.startswith("Error reading PDF:") or len(syllabus_text) < 50:
        status_log += " Could not read text from Syllabus PDF.\n"
        yield status_log, None
        return

    # 2. Search Online for Question Papers
    status_log += f" Searching web for '{university} {subject}' question papers...\n"
    yield status_log, None
    links = search_question_papers(subject, university)

    if not links:
        status_log += " No PDF links found online. Try specific keywords.\n"
        yield status_log, None
        return

    status_log += f" Found {len(links)} potential PDFs. Downloading & Scanning...\n"
    yield status_log, None
    combined_questions_text = ""
    valid_pdfs_count = 0

    for link in links:
        text = download_and_parse_pdf_url(link)
        # Very short extractions are treated as unreadable (e.g. scanned PDFs).
        if len(text) > 200:
            valid_pdfs_count += 1
            status_log += f"  - Extracted text from: {link}\n"
            yield status_log, None
            questions = identify_questions_gemini(text)
            combined_questions_text += questions + "\n"
        else:
            status_log += f"  - Skipped (unreadable/empty): {link}\n"
            yield status_log, None
    if valid_pdfs_count == 0:
        status_log += " Failed to download or read any Question Papers.\n"
        yield status_log, None
        return

    status_log += " Analyzing questions against Syllabus & Segregating (Gemini)...\n"
    yield status_log, None

    # 3. Final Segregation & PDF Creation
    final_text_content = segregate_and_deduplicate(combined_questions_text, syllabus_text)

    # segregate_and_deduplicate reports failures as an "AI processing failed: ..."
    # string; surface it in the log rather than printing it into the PDF.
    if final_text_content.startswith("AI processing failed:"):
        status_log += f" {final_text_content}\n"
        yield status_log, None
        return

    status_log += "Formatting PDF Document...\n"
    yield status_log, None

    # Create PDF
    pdf_filename = "Compiled_Question_Bank.pdf"
    try:
        create_styled_pdf(final_text_content, pdf_filename, subject, university)
    except Exception as e:
        # Report the failure in the log instead of ending the stream with a traceback.
        status_log += f" Failed to create PDF: {e}\n"
        yield status_log, None
        return

    # \U0001F389 is the "party popper" emoji.
    status_log += "\n\U0001F389 COMPLETED! Your PDF document is ready to download below."
    yield status_log, pdf_filename

# --- UI LAYOUT ---
# Builds the Gradio interface: API key, university, subject and syllabus
# inputs on the left; streaming status log and downloadable PDF on the right.

with gr.Blocks(title="AI Research Agent") as app:
    # \U0001F393 is the "graduation cap" emoji.
    gr.Markdown("# \U0001F393 AI Question Paper Research Agent")
    gr.Markdown("Upload your Syllabus, enter the subject details, and the agent will scour the web for past papers and compile a professional PDF Question Bank.")

    with gr.Row():
        api_input = gr.Textbox(label="Enter Google Gemini API Key", type="password")
    with gr.Row():
        with gr.Column():
            uni_input = gr.Textbox(label="University Name", placeholder="e.g., Pune University")
            sub_input = gr.Textbox(label="Subject", placeholder="e.g., Operating Systems")
            syllabus_file = gr.File(label="Upload Syllabus (PDF)", file_types=[".pdf"])
            research_btn = gr.Button(" Find & Compile Questions", variant="primary")

        with gr.Column():
            log_output = gr.Textbox(label="Agent Status Logs", lines=15)
            # File component through which the compiled PDF is offered for download.
            file_output = gr.File(label="Download Compiled PDF")

    # run_research_agent yields (log text, file path) pairs, which are routed
    # to the two output components in this order.
    research_btn.click(
        run_research_agent,
        inputs=[api_input, uni_input, sub_input, syllabus_file],
        outputs=[log_output, file_output]
    )

app.launch(debug=True)