In [3]:
!pip install -q google-generativeai


[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import os
import google.generativeai as genai
GENAI_API_KEY = os.getenv('GENAI_API_KEY')

genai.configure(api_key=GENAI_API_KEY)


In [None]:
# 1) Install libs
!apt-get update -qq
!apt-get install -y -qq tesseract-ocr
!pip install --quiet google-generativeai pymupdf pytesseract Pillow regex

# 2) Imports & config
import os, json, textwrap
# from google.colab import files
import google.generativeai as genai
from PIL import Image
import pytesseract
import fitz  # pymupdf
import io, re

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
# 2) Set your Gemini (Generative AI) API key
genai.configure(api_key=GENAI_API_KEY)

In [6]:
 # 3) helper: convert first page of PDF to PIL.Image (if PDF)
def pdf_first_page_to_pil(pdf_bytes, zoom=2.0):
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    page = doc.load_page(0)
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img

# 4) OCR helper (returns text)
def ocr_bytes_to_text(file_bytes, filename):
    # detect pdf by extension
    if filename.lower().endswith(".pdf"):
        img = pdf_first_page_to_pil(file_bytes)
    else:
        img = Image.open(io.BytesIO(file_bytes)).convert("RGB")
    # optional simple preprocessing could be added here
    text = pytesseract.image_to_string(img, lang='eng')
    return text


In [48]:
import json
import time

def call_gemini(system_prompt, user_prompt, model="gemini-2.5-flash",
                max_output_tokens=512, retries=1, truncate_to=3000):
    """
    Robust replacement for the Gemini call in Colab.
    - Returns a string: the model text on success, or a JSON-stringified error object on failure.
    - Keeps same simple call shape so you can drop it in place of your old function.
    """
    prompt_full = system_prompt + "\n\n" + user_prompt

    for attempt in range(retries + 1):
        try:
            model_instance = genai.GenerativeModel(model_name=model)
            # On retry, optionally send a truncated prompt to reduce safety/token issues
            send_prompt = prompt_full if attempt == 0 else prompt_full[:truncate_to]

            resp = model_instance.generate_content(
                send_prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=0.0,
                    max_output_tokens=max_output_tokens
                )
            )

            # 1) Preferred fast accessor (may raise if no Part present)
            try:
                text = resp.text
                if text is not None:
                    return text
            except Exception:
                # fall through to safer inspection
                pass

            # 2) If no resp.text, check candidates array (older/newer client shapes)
            raw = getattr(resp, "_raw_response", None) or getattr(resp, "to_dict", lambda: None)()
            # If resp has candidates-like structure, try to extract first candidate text
            try:
                # safe traversal for common shapes
                candidates = raw.get("candidates") if isinstance(raw, dict) else None
                if candidates and len(candidates) > 0:
                    c0 = candidates[0]
                    # common nesting: c0['content'][0]['text']
                    if isinstance(c0, dict):
                        cont = c0.get("content")
                        if cont and isinstance(cont, list) and len(cont) > 0 and isinstance(cont[0], dict):
                            txt = cont[0].get("text")
                            if txt:
                                return txt
                        # some clients put plain text in c0.get('text')
                        if c0.get("text"):
                            return c0.get("text")
            except Exception:
                pass

            # 3) If prompt_feedback/safety exists -> return clear error JSON
            pf = getattr(resp, "prompt_feedback", None)
            if pf and getattr(pf, "safety_ratings", None):
                try:
                    ratings = pf.safety_ratings
                    safety_reason = ", ".join(f"{r.category.name}:{r.probability.name}" for r in ratings)
                except Exception:
                    safety_reason = str(pf)
                err = {"error": "blocked_by_safety", "safety_reason": safety_reason}
                print(f"[call_gemini] Warning: blocked by safety -> {safety_reason}")
                return json.dumps(err)

            # 4) Try to find any 'text' anywhere in raw response as last fallback
            def _find_text(obj):
                if isinstance(obj, dict):
                    for k, v in obj.items():
                        if k == "text" and isinstance(v, str):
                            return v
                        res = _find_text(v)
                        if res:
                            return res
                elif isinstance(obj, list):
                    for e in obj:
                        res = _find_text(e)
                        if res:
                            return res
                return None

            found = _find_text(raw) if raw is not None else None
            if found:
                return found

            # 5) Nothing usable found — either retry or return diagnostic
            if attempt < retries:
                time.sleep(1 + attempt * 2)
                continue

            # return diagnostic raw for debugging as JSON
            return json.dumps({"error": "no_content_generated", "raw_response_preview": str(raw)[:2000]})

        except Exception as exc:
            # network / API error -> retry if possible, else return error JSON
            if attempt < retries:
                time.sleep(1 + attempt * 2)
                continue
            return json.dumps({"error": "exception_calling_api", "details": str(exc)})

    # fallback (shouldn't be reached)
    return json.dumps({"error": "unknown_failure"})


In [34]:
print("Upload up to 3 files (PDF or image).")
uploaded = files.upload()
forms = {}
for i, (fname, bytes_io) in enumerate(uploaded.items()):
    if i >= 3: break
    b = bytes_io
    try:
        text = ocr_bytes_to_text(b, fname)
    except Exception as e:
        text = ""
    forms[fname] = text
    print(f"[OCR] {fname}: extracted ~{len(text.split())} words")


Upload up to 3 files (PDF or image).


Saving Gemini_Generated_Image_pb5toopb5toopb5t.png to Gemini_Generated_Image_pb5toopb5toopb5t.png
Saving sample_job_application_form.pdf to sample_job_application_form (1).pdf
[OCR] Gemini_Generated_Image_pb5toopb5toopb5t.png: extracted ~57 words
[OCR] sample_job_application_form (1).pdf: extracted ~79 words


In [56]:
import json
import textwrap

# Assumes call_gemini(system_prompt, user_prompt, model=...) is defined and robust.
# If not, use the robust replacement we created earlier.

def _label_and_truncate_forms(forms_dict, per_file_char_limit=3000):
    """
    Build labeled block for prompt. Truncate each OCR text for token safety.
    """
    parts = []
    for fname, txt in forms_dict.items():
        snippet = txt.replace("\r\n", "\n")[:per_file_char_limit]
        parts.append(f"--- FILE: {fname} ---\n{snippet}\n")
    return "\n".join(parts)

UNIFIED_SYSTEM = textwrap.dedent("""
You are a document-understanding assistant for an assignment.
You will be given multiple labeled OCR texts from uploaded forms and a user QUESTION.
Your job:
  1) Determine whether the QUESTION targets a single form (e.g., "What is Alex's name in file X?")
     or is a multi-form/horizontal question (e.g., "Which forms request > 500000?").
  2) Use ONLY the provided OCR texts. Do NOT invent information.
  3) Answer concisely. Always include provenance (file name + short snippet).
  4) Always return EXACT JSON (no extra text). See output schema below.
  5) If uncertain about numeric comparisons, set confidence to "LOW" and include raw snippet.
  6) If question is ambiguous, include a top-level "note" with a short clarifying suggestion.

OUTPUT SCHEMA (must follow exactly):
If the answer is naturally a single answer (single-form question), return an object:
{
  "mode": "single",
  "file": "<filename_or_null>",
  "answer": "<short answer or null>",
  "evidence": [ {"file":"<filename>","snippet":"<short text>"} ],
  "confidence": "HIGH|MEDIUM|LOW",
  "note": "<optional short note>"
}

If the answer is multi-form (list/aggregation), return an array of objects (one per matching file):
[
  {
    "file":"<filename>",
    "extracted": { "<field>": <value or null>, ... },
    "evidence":[ {"snippet":"<short text>"} ],
    "confidence":"HIGH|MEDIUM|LOW"
  },
  ...
]

If you cannot answer anything, return an empty array [].
""").strip()

def unified_form_query(forms_dict, question, model="gemini-2.5-flash",
                       per_file_char_limit=3000, max_output_tokens=512):
    """
    Unified query: ask question over one or many forms.
    - forms_dict: {filename: ocr_text}
    - question: user question string
    Returns parsed JSON (python object) or raw string if parsing failed.
    """
    # 1) Build labeled files block (truncated)
    labeled_block = _label_and_truncate_forms(forms_dict, per_file_char_limit=per_file_char_limit)

    # 2) Build user prompt
    user_prompt = f"""FILES:
{labeled_block}
---QUESTION---
{question}

Return JSON according to the schema in system prompt. Return JSON only."""

    # 3) Call Gemini (uses your call_gemini wrapper)
    raw_out = call_gemini(UNIFIED_SYSTEM, user_prompt, model=model, max_output_tokens=max_output_tokens)

    # 4) Try to parse JSON safely
    try:
        parsed = json.loads(raw_out)
        return {"success": True, "result": parsed, "raw": raw_out}
    except Exception:
        # try to find first JSON object/array substring
        try:
            start = raw_out.index("{")
            end = raw_out.rfind("}") + 1
            maybe = raw_out[start:end]
            parsed = json.loads(maybe)
            return {"success": True, "result": parsed, "raw": raw_out}
        except Exception:
            # Not parseable: return raw and retrieved forms for debugging
            return {"success": False, "error": "Could not parse LLM output as JSON", "raw": raw_out}


In [54]:
# Example sample forms (replace with OCR outputs in your real pipeline)
forms = {
  "job_001.pdf": """Job Application Form
Full Name: Alex Johnson
DOB: 14-Mar-1996
Email: alex.johnson@example.com
Skills: Python, React, SQL
Signature: __________________""",

  "loan_001.pdf": """Loan Application Form
Name: Aisha Khan
DOB: 11-Feb-1994
Income: 600000
Loan Requested: 750000
Purpose: Start small retail business selling handicrafts.
Signature: [scanned missing]""",

  "admit_001.pdf": """Admission Form
Student Name: Meera Joshi
DOB: 02-Feb-2002
Program: MSc Computer Science
Marks: 85%
Signature: Present"""
}


In [None]:
q1 = "What is the applicant's name in loan_001.pdf?"
out1 = unified_form_query(forms, q1)
print(json.dumps(out1, indent=2))


In [58]:
q2 = "Summarize loan_001.pdf in one short sentence (include name, income, requested loan, any missing items)."
out2 = unified_form_query(forms, q2)
print(json.dumps(out2, indent=2))


{
  "success": true,
  "result": {
    "mode": "single",
    "file": "loan_001.pdf",
    "answer": "Aisha Khan, with an income of 600000, requested a loan of 750000, and her signature is missing.",
    "evidence": [
      {
        "file": "loan_001.pdf",
        "snippet": "Name: Aisha Khan"
      },
      {
        "file": "loan_001.pdf",
        "snippet": "Income: 600000"
      },
      {
        "file": "loan_001.pdf",
        "snippet": "Loan Requested: 750000"
      },
      {
        "file": "loan_001.pdf",
        "snippet": "Signature: [scanned missing]"
      }
    ],
    "confidence": "HIGH"
  },
  "raw": "```json\n{\n  \"mode\": \"single\",\n  \"file\": \"loan_001.pdf\",\n  \"answer\": \"Aisha Khan, with an income of 600000, requested a loan of 750000, and her signature is missing.\",\n  \"evidence\": [\n    {\n      \"file\": \"loan_001.pdf\",\n      \"snippet\": \"Name: Aisha Khan\"\n    },\n    {\n      \"file\": \"loan_001.pdf\",\n      \"snippet\": \"Income: 600000\"\

In [59]:
q3 = "List forms where loan requested is greater than 500000 and signature appears missing. For each, return filename, loan_requested, and evidence."
out3 = unified_form_query(forms, q3)
print(json.dumps(out3, indent=2))


{
  "success": true,
  "result": {
    "file": "loan_001.pdf",
    "extracted": {
      "loan_requested": 750000
    },
    "evidence": [
      {
        "snippet": "Loan Requested: 750000"
      },
      {
        "snippet": "Signature: [scanned missing]"
      }
    ],
    "confidence": "HIGH"
  },
  "raw": "```json\n[\n  {\n    \"file\": \"loan_001.pdf\",\n    \"extracted\": {\n      \"loan_requested\": 750000\n    },\n    \"evidence\": [\n      {\n        \"snippet\": \"Loan Requested: 750000\"\n      },\n      {\n        \"snippet\": \"Signature: [scanned missing]\"\n      }\n    ],\n    \"confidence\": \"HIGH\"\n  }\n]\n```"
}
