# Setup
This first cell imports all the necessary packages and sets up the environment, including loading the API key.

In [23]:
!pip -q install google-generativeai pdfplumber pandas tqdm tenacity

import os
import json
import re
from dataclasses import dataclass
from typing import Dict, List, Any, Optional, Tuple

import pandas as pd
import pdfplumber
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

import google.generativeai as genai

# ---- Config ----
# Model identifier and sampling temperature used by the grading call.
MODEL_NAME = "gemini-2.5-flash-lite"  # easy to swap later
# NOTE(review): a temperature of 1.0 means repeated runs can return different scores — confirm this is intended.
TEMPERATURE = 1.0
# NOTE(review): no code visible in this notebook reads MAX_OUTPUT_TOKENS; the grading call sets its own
# limit — confirm whether this constant is still needed.
MAX_OUTPUT_TOKENS = 256

from google.colab import userdata

# Read the key from Colab's user-data store instead of hard-coding it; stop immediately if it is absent.
api_key = userdata.get("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError("Missing GEMINI_API_KEY")

# Configure the client and create the module-level `model` object used by later cells.
genai.configure(api_key=api_key)
model = genai.GenerativeModel(MODEL_NAME)

print("‚úÖ Setup complete. Model:", MODEL_NAME)

‚úÖ Setup complete. Model: gemini-2.5-flash-lite


In [24]:
# Mount Google Drive at /content/drive.
# NOTE(review): the PDF paths configured in this notebook are bare file names resolved against the
# working directory — confirm whether the Drive mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading Text Samples
This cell loads all of the written text samples into the environment, including the graded examples used as anchors for few-shot testing.

In [25]:
# Graded anchor essays shown to the model as calibration examples in few-shot mode.
FEW_SHOT_PDFS = dict(
    good="good_example.pdf",  # 10/10
    mid="mid_example.pdf",    # 5/10
    bad="bad_example.pdf",    # 1/10
)

# Essays to grade: sample1.pdf .. sample8.pdf, followed by copies of the bad and mid anchors.
SAMPLE_PDFS = [f"sample{number}.pdf" for number in range(1, 9)] + [
    "bad_example_copy.pdf",
    "mid_example_copy.pdf",
]

def pdf_to_text(pdf_path: str, max_chars: int = 20000) -> str:
    """
    Return the text layer of a PDF, read page by page with pdfplumber (no OCR).

    Pages that yield no text are skipped; the remaining page texts are
    stripped and joined with a blank line between them. If the result is
    longer than max_chars it is cut at max_chars and a "[TRUNCATED]" marker
    is appended, which keeps prompts bounded in size.

    Raises FileNotFoundError when pdf_path does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path} (cwd={os.getcwd()})")

    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without a text layer.
        page_texts = [(page.extract_text() or "").strip() for page in pdf.pages]

    full = "\n\n".join(text for text in page_texts if text).strip()

    if len(full) <= max_chars:
        return full
    return full[:max_chars] + "\n\n[TRUNCATED]"

# Pre-flight check: report every configured PDF (anchors first, then samples) that is not on disk.
missing = [
    pdf_name
    for pdf_name in [*FEW_SHOT_PDFS.values(), *SAMPLE_PDFS]
    if not os.path.exists(pdf_name)
]
if not missing:
    print("‚úÖ All PDFs found.")
else:
    print("‚ö†Ô∏è Missing PDFs (upload these into Colab /content):")
    for missing_name in missing:
        print(" -", missing_name)


‚úÖ All PDFs found.


In [26]:
# Extract text once per PDF: anchors keyed by grade label, samples keyed by file name.
few_shot_text = {label: pdf_to_text(pdf) for label, pdf in FEW_SHOT_PDFS.items()}
sample_texts = {pdf: pdf_to_text(pdf) for pdf in SAMPLE_PDFS}

def summarize_lengths():
    """
    Build a table of extracted-text lengths.

    Returns a DataFrame with columns name / pdf / chars: one row per anchor
    (good, mid, bad, in that order) followed by one row per entry of
    SAMPLE_PDFS. Reads the module-level few_shot_text and sample_texts maps.
    """
    anchor_rows = [
        {"name": f"{label}_anchor", "pdf": FEW_SHOT_PDFS[label], "chars": len(few_shot_text[label])}
        for label in ("good", "mid", "bad")
    ]
    sample_rows = [
        {"name": pdf, "pdf": pdf, "chars": len(sample_texts[pdf])}
        for pdf in SAMPLE_PDFS
    ]
    return pd.DataFrame(anchor_rows + sample_rows)

display(summarize_lengths())


Unnamed: 0,name,pdf,chars
0,good_anchor,good_example.pdf,4105
1,mid_anchor,mid_example.pdf,1006
2,bad_anchor,bad_example.pdf,926
3,sample1.pdf,sample1.pdf,1813
4,sample2.pdf,sample2.pdf,3331
5,sample3.pdf,sample3.pdf,1936
6,sample4.pdf,sample4.pdf,1384
7,sample5.pdf,sample5.pdf,2000
8,sample6.pdf,sample6.pdf,2113
9,sample7.pdf,sample7.pdf,1956


In [27]:
# Grading rubric placed at the start of every prompt: six holistic dimensions plus the rules for the
# single 1-10 integer score.
# NOTE(review): some punctuation inside this string looks mis-encoded; the text is sent to the model
# verbatim, so confirm the file encoding.
RUBRIC = """You are a college-level writing instructor grading a student reflection essay.

Grade holistically using these dimensions:
1) Flow (clarity/readability across sentences)
2) Transitions (connections between ideas/paragraphs)
3) Content Quality & Focus (relevance, specificity, alignment to the week‚Äôs concepts)
4) Spelling & Grammar (correctness, professionalism)
5) Knowledge & Depth (understanding + thoughtful engagement)
6) Structure (organization, paragraphing, logical progression)

Score rules:
- Provide ONE overall score from 1 to 10 (integers only).
- 1 = fundamentally flawed; minimal understanding; very poor writing.
- 5 = mixed/adequate; some understanding; clear weaknesses.
- 10 = exemplary; polished, insightful, well-structured; no meaningful weaknesses.
- Avoid score inflation: 9‚Äì10 only if nearly flawless.
"""

# Output contract appended to the end of every prompt: a JSON array with one
# {id, pred_score, justification} object per essay, in presentation order.
JSON_CONTRACT = """Output STRICT JSON ONLY (no markdown, no extra text).

Return a JSON array with EXACTLY one object per essay, in the SAME ORDER as the essays are presented.

Each object must match:
{
  "id": "<string>",
  "pred_score": <integer 1-10>,
  "justification": "<2-4 sentences>"
}
"""


def build_fewshot_prompt_multi(anchor_good: str, anchor_mid: str, anchor_bad: str,
                               essays: List[Tuple[str, str]]) -> str:
    """
    Assemble the few-shot grading prompt for a batch of essays.

    The prompt contains, in order: the rubric, the three graded anchor texts
    (10/10, 5/10 and 1/10), the batch rules, every (id, text) pair from
    `essays` in the given order, and the JSON output contract.
    """
    rendered_essays = "\n".join(
        f"[ESSAY]\nID: {essay_id}\nTEXT:\n{essay_text}\n"
        for essay_id, essay_text in essays
    )
    essay_count = len(essays)

    return f"""{RUBRIC}

Calibration anchors (use these to calibrate your scoring scale):

[ANCHOR A ‚Äî 10/10 EXAMPLE]
{anchor_good}

[ANCHOR B ‚Äî 5/10 EXAMPLE]
{anchor_mid}

[ANCHOR C ‚Äî 1/10 EXAMPLE]
{anchor_bad}

Now grade EACH essay below independently.

IMPORTANT RULES:
- Return a JSON ARRAY only.
- The array must have EXACTLY {essay_count} items.
- Items must be in the SAME ORDER as the essays appear below.
- Use the essay ID exactly as provided.
- pred_score must be an integer 1‚Äì10.

Essays (in order):
{rendered_essays}

{JSON_CONTRACT}
"""

def build_zeroshot_prompt_multi(essays: List[Tuple[str, str]]) -> str:
    """
    Assemble the zero-shot grading prompt for a batch of essays.

    The prompt contains, in order: the rubric, the batch rules, every
    (id, text) pair from `essays` in the given order, and the JSON output
    contract. No graded anchor examples are included.
    """
    rendered_essays = "\n".join(
        f"[ESSAY]\nID: {essay_id}\nTEXT:\n{essay_text}\n"
        for essay_id, essay_text in essays
    )
    essay_count = len(essays)

    return f"""{RUBRIC}

Now grade EACH essay below independently.

IMPORTANT RULES:
- Return a JSON ARRAY only.
- The array must have EXACTLY {essay_count} items.
- Items must be in the SAME ORDER as the essays appear below.
- Use the essay ID exactly as provided.
- pred_score must be an integer 1‚Äì10.

Essays (in order):
{rendered_essays}

{JSON_CONTRACT}
"""

In [28]:
class GeminiJSONError(Exception):
    """Raised when model output cannot be parsed or validated as the expected JSON array."""

def _extract_json_payload(text: str) -> str:
    """
    Extract first JSON array payload from the model output.
    """
    text = text.strip()

    # If already pure JSON array
    if text.startswith("[") and text.endswith("]"):
        return text

    # Try to find a JSON array substring
    m = re.search(r"\[.*\]", text, flags=re.DOTALL)
    if m:
        return m.group(0).strip()

    raise GeminiJSONError("Could not find a JSON array in model output.")

def _validate_and_normalize(obj: Dict[str, Any], expected_id: str) -> Dict[str, Any]:
    if "id" not in obj or "pred_score" not in obj or "justification" not in obj:
        raise GeminiJSONError(f"Missing required keys in JSON: {obj.keys()}")

    # Normalize id to string
    obj["id"] = str(obj["id"])

    # Enforce expected id to avoid mismaps
    if obj["id"] != expected_id:
        # Don‚Äôt fail hard; just override to keep alignment
        obj["id_original"] = obj["id"]
        obj["id"] = expected_id

    # Score: int 1‚Äì10
    score = obj["pred_score"]
    if isinstance(score, str) and score.strip().isdigit():
        score = int(score.strip())
    elif isinstance(score, (float, int)):
        score = int(round(score))
    else:
        raise GeminiJSONError(f"pred_score is not numeric: {score}")

    # Clamp + record if clamped
    clamped = False
    if score < 1:
        score = 1
        clamped = True
    if score > 10:
        score = 10
        clamped = True

    obj["pred_score"] = score
    obj["clamped"] = clamped

    # Justification: short string
    obj["justification"] = str(obj["justification"]).strip()
    if len(obj["justification"]) == 0:
        obj["justification"] = "No justification provided."

    return obj

@retry(
    reraise=True,
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=2, max=20),
    retry=retry_if_exception_type((GeminiJSONError, TimeoutError, ConnectionError, RuntimeError))
)

def _validate_and_normalize_list(arr: Any, expected_ids: List[str]) -> List[Dict[str, Any]]:
    if not isinstance(arr, list):
        raise GeminiJSONError(f"Expected a JSON array, got: {type(arr)}")

    if len(arr) != len(expected_ids):
        raise GeminiJSONError(f"Expected {len(expected_ids)} items, got {len(arr)}")

    out = []
    for i, (obj, expected_id) in enumerate(zip(arr, expected_ids)):
        if not isinstance(obj, dict):
            raise GeminiJSONError(f"Item {i} is not an object: {obj}")
        out.append(_validate_and_normalize(obj, expected_id=expected_id))
    return out

@retry(
    reraise=True,
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=2, max=20),
    retry=retry_if_exception_type((GeminiJSONError, TimeoutError, ConnectionError, RuntimeError))
)
def grade_many_with_gemini(prompt: str, expected_ids: List[str],
                           max_output_tokens: int = 4096) -> List[Dict[str, Any]]:
    """
    Send one prompt to the model and return one validated result per expected id.

    Args:
        prompt: Full grading prompt covering every essay in the batch.
        expected_ids: Essay ids in the order they appear in the prompt.
        max_output_tokens: Output-token limit passed to the model. The default
            of 4096 leaves room for a multi-essay JSON array.

    Returns:
        A list of normalized result dicts (one per expected id, same order).
        Each dict also carries "raw_json" (the extracted JSON text) and
        "raw_text" (the full stripped response) for debugging.

    Raises:
        RuntimeError: any failure (API call, JSON extraction/decoding,
            validation) after the retry policy above is exhausted. The
            original exception is attached as __cause__.
    """
    try:
        resp = model.generate_content(
            prompt,
            generation_config={
                "temperature": TEMPERATURE,
                "max_output_tokens": max_output_tokens,
            },
        )
        raw = (resp.text or "").strip()

        json_str = _extract_json_payload(raw)
        payload = json.loads(json_str)

        arr = _validate_and_normalize_list(payload, expected_ids=expected_ids)

        # Attach raw output for debugging
        for obj in arr:
            obj["raw_json"] = json_str
            obj["raw_text"] = raw
        return arr

    except Exception as e:
        # Everything is surfaced as RuntimeError so the retry policy applies to it;
        # chaining keeps the real exception type and traceback available.
        raise RuntimeError(str(e)) from e

In [29]:
def run_grading(
    sample_names: List[str],
    sample_texts_map: Dict[str, str],
    mode: str,              # "fewshot" or "zeroshot"
    order: str,             # "original" or "reversed"
    calib: Optional[Dict[str, str]] = None,
) -> pd.DataFrame:
    """
    Grade every sample in one model call and return the results as a DataFrame.

    Args:
        sample_names: Ids of the essays to grade, in their original order.
        sample_texts_map: Essay text keyed by sample name.
        mode: "fewshot" (anchors included in the prompt) or "zeroshot".
        order: "original" keeps sample_names order; "reversed" presents the
            essays in reverse.
        calib: Anchor texts under the keys "good", "mid" and "bad".
            Required when mode is "fewshot"; ignored otherwise.

    Returns:
        One row per essay with columns id, sample_name, position_in_run,
        mode, order, pred_score, justification, clamped, raw_json.
        position_in_run is the index of the essay in the presented order.

    Raises:
        AssertionError: mode or order is not one of the accepted values.
        ValueError: mode is "fewshot" but calib is missing an anchor.
    """
    assert mode in ("fewshot", "zeroshot")
    assert order in ("original", "reversed")

    if mode == "fewshot":
        # Fail with a clear message before any prompt is built or API call is made.
        missing_anchors = [key for key in ("good", "mid", "bad") if not calib or key not in calib]
        if missing_anchors:
            raise ValueError(
                "mode='fewshot' requires calib with keys 'good', 'mid' and 'bad'; "
                f"missing: {missing_anchors}"
            )

    seq = list(sample_names)
    if order == "reversed":
        seq.reverse()

    # NOTE(review): a name absent from sample_texts_map is graded as an empty essay
    # rather than rejected — confirm this fallback is intended.
    essays = [(name, sample_texts_map.get(name, "")) for name in seq]
    expected_ids = list(seq)

    if mode == "fewshot":
        prompt = build_fewshot_prompt_multi(
            anchor_good=calib["good"],
            anchor_mid=calib["mid"],
            anchor_bad=calib["bad"],
            essays=essays,
        )
    else:
        prompt = build_zeroshot_prompt_multi(essays=essays)

    graded_list = grade_many_with_gemini(prompt, expected_ids=expected_ids)

    rows = [
        {
            "id": obj["id"],
            "sample_name": obj["id"],
            "position_in_run": idx,
            "mode": mode,
            "order": order,
            "pred_score": obj["pred_score"],
            "justification": obj["justification"],
            "clamped": obj.get("clamped", False),
            "raw_json": obj.get("raw_json", ""),
        }
        for idx, obj in enumerate(graded_list)
    ]

    return pd.DataFrame(rows)

In [30]:
# Alias for the full sample list (same list object, not a copy).
# NOTE(review): despite the name, SAMPLE_PDFS includes "bad_example_copy.pdf" and
# "mid_example_copy.pdf" — confirm that grading these anchor copies is intended.
NON_EXAMPLES = SAMPLE_PDFS

# Four conditions: {few-shot, zero-shot} x {original, reversed} presentation order.
# Each run_grading call grades every sample in a single prompt.

# Few-shot, samples in listed order.
df_few_orig = run_grading(
    sample_names=NON_EXAMPLES,
    sample_texts_map=sample_texts,
    mode="fewshot",
    order="original",
    calib=few_shot_text,
)

# Few-shot, samples in reverse order.
df_few_rev = run_grading(
    sample_names=NON_EXAMPLES,
    sample_texts_map=sample_texts,
    mode="fewshot",
    order="reversed",
    calib=few_shot_text,
)

# Zero-shot (no anchors), samples in listed order.
df_zero_orig = run_grading(
    sample_names=NON_EXAMPLES,
    sample_texts_map=sample_texts,
    mode="zeroshot",
    order="original",
    calib=None,
)

# Zero-shot (no anchors), samples in reverse order.
df_zero_rev = run_grading(
    sample_names=NON_EXAMPLES,
    sample_texts_map=sample_texts,
    mode="zeroshot",
    order="reversed",
    calib=None,
)

# Preview only the first few rows of one condition.
display(df_few_orig.head())


Unnamed: 0,id,sample_name,position_in_run,mode,order,pred_score,justification,clamped,raw_json
0,sample1.pdf,sample1.pdf,0,fewshot,original,8,The essay demonstrates a solid understanding o...,False,"[\n {\n ""id"": ""sample1.pdf"",\n ""pred_sc..."
1,sample2.pdf,sample2.pdf,1,fewshot,original,9,"This reflection is comprehensive, detailing mu...",False,"[\n {\n ""id"": ""sample1.pdf"",\n ""pred_sc..."
2,sample3.pdf,sample3.pdf,2,fewshot,original,8,The essay effectively summarizes and explains ...,False,"[\n {\n ""id"": ""sample1.pdf"",\n ""pred_sc..."
3,sample4.pdf,sample4.pdf,3,fewshot,original,7,This reflection clearly articulates the concep...,False,"[\n {\n ""id"": ""sample1.pdf"",\n ""pred_sc..."
4,sample5.pdf,sample5.pdf,4,fewshot,original,8,The essay provides a well-structured summary a...,False,"[\n {\n ""id"": ""sample1.pdf"",\n ""pred_sc..."


In [31]:
# ============================================================
# Cell 8 — Save outputs: JSON + CSV (scores by sample name)
# - Writes:
#   1) 4 per-condition CSVs
#   2) 1 combined CSV (plus a matching combined JSONL)
#   3) 4 per-condition JSONL files (one JSON per line)
# - Prints summary stats + order-effect deltas
# ============================================================

from pathlib import Path

OUT_DIR = Path("/content/outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def save_df(df: pd.DataFrame, stem: str):
    """
    Persist a results frame under OUT_DIR in two formats.

    Writes <stem>.csv (all columns, no index) and <stem>.jsonl (one JSON
    object per row, restricted to the nine result fields, with counters and
    scores cast to int and the clamped flag cast to bool so they serialize
    as plain JSON values).

    Returns a (csv_path, jsonl_path) tuple of strings.
    """
    csv_path = OUT_DIR / f"{stem}.csv"
    jsonl_path = OUT_DIR / f"{stem}.jsonl"

    # CSV: easy to inspect
    df.to_csv(csv_path, index=False)

    # Fields written to each JSONL record, in output order, and the casts applied to some of them.
    fields = (
        "id", "sample_name", "position_in_run", "mode", "order",
        "pred_score", "justification", "clamped", "raw_json",
    )
    casts = {"position_in_run": int, "pred_score": int, "clamped": bool}

    with open(jsonl_path, "w", encoding="utf-8") as out:
        for _, row in df.iterrows():
            record = {
                name: casts[name](row[name]) if name in casts else row[name]
                for name in fields
            }
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    return str(csv_path), str(jsonl_path)

# Save each condition to its own CSV/JSONL pair; `paths` collects the (csv, jsonl) tuples for the report below.
paths = []
paths.append(save_df(df_few_orig,  "fewshot_original"))
paths.append(save_df(df_few_rev,   "fewshot_reversed"))
paths.append(save_df(df_zero_orig, "zeroshot_original"))
paths.append(save_df(df_zero_rev,  "zeroshot_reversed"))

# Stack the four conditions into one frame (fresh 0..n-1 index) and save it as well.
df_all = pd.concat([df_few_orig, df_few_rev, df_zero_orig, df_zero_rev], ignore_index=True)
combined_csv, combined_jsonl = save_df(df_all, "combined_all_conditions")

# List every file that was written.
print("‚úÖ Saved files:")
for p in paths:
    print(" -", p[0])
    print(" -", p[1])
print(" -", combined_csv)
print(" -", combined_jsonl)

# ---- Summary stats ----
def summarize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize pred_score for each (mode, order) condition.

    Returns one row per condition with columns mode, order, n (row count),
    mean, std, min and max of pred_score.
    """
    scores_by_condition = df.groupby(["mode", "order"])["pred_score"]
    stats = scores_by_condition.agg(
        n="count",
        mean="mean",
        std="std",
        min="min",
        max="max",
    )
    return stats.reset_index()

display(summarize(df_all))

# ---- Order-effect comparison per id ----
def order_effect(mode: str, df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """
    Compare each sample's score between the original and reversed runs of one mode.

    Args:
        mode: Value of the "mode" column to analyse (e.g. "fewshot").
        df: Frame with columns sample_name, mode, order and pred_score.
            When omitted, the module-level df_all is used, so existing
            calls keep working.

    Returns:
        One row per sample present in BOTH orders (inner join), sorted by
        sample_name, with columns sample_name, score_original,
        score_reversed and delta_reversed_minus_original.
    """
    source = df_all if df is None else df
    in_mode = source[source["mode"] == mode]

    def _scores_for(order_name: str, score_column: str) -> pd.DataFrame:
        # Scores for one presentation order, with the score column renamed for the join.
        subset = in_mode[in_mode["order"] == order_name]
        return subset[["sample_name", "pred_score"]].rename(columns={"pred_score": score_column})

    a = _scores_for("original", "score_original")
    b = _scores_for("reversed", "score_reversed")
    m = a.merge(b, on="sample_name", how="inner")
    m["delta_reversed_minus_original"] = m["score_reversed"] - m["score_original"]
    return m.sort_values("sample_name")

print("üìå Few-shot order effect (reversed - original):")
display(order_effect("fewshot"))

print("üìå Zero-shot order effect (reversed - original):")
display(order_effect("zeroshot"))


‚úÖ Saved files:
 - /content/outputs/fewshot_original.csv
 - /content/outputs/fewshot_original.jsonl
 - /content/outputs/fewshot_reversed.csv
 - /content/outputs/fewshot_reversed.jsonl
 - /content/outputs/zeroshot_original.csv
 - /content/outputs/zeroshot_original.jsonl
 - /content/outputs/zeroshot_reversed.csv
 - /content/outputs/zeroshot_reversed.jsonl
 - /content/outputs/combined_all_conditions.csv
 - /content/outputs/combined_all_conditions.jsonl


Unnamed: 0,mode,order,n,mean,std,min,max
0,fewshot,original,10,7.0,2.357023,1,9
1,fewshot,reversed,10,6.9,2.330951,1,9
2,zeroshot,original,10,7.5,2.321398,2,9
3,zeroshot,reversed,10,7.8,2.149935,2,9


üìå Few-shot order effect (reversed - original):


Unnamed: 0,sample_name,score_original,score_reversed,delta_reversed_minus_original
8,bad_example_copy.pdf,1,1,0
9,mid_example_copy.pdf,5,5,0
0,sample1.pdf,8,7,-1
1,sample2.pdf,9,9,0
2,sample3.pdf,8,8,0
3,sample4.pdf,7,7,0
4,sample5.pdf,8,8,0
5,sample6.pdf,8,8,0
6,sample7.pdf,8,8,0
7,sample8.pdf,8,8,0


üìå Zero-shot order effect (reversed - original):


Unnamed: 0,sample_name,score_original,score_reversed,delta_reversed_minus_original
8,bad_example_copy.pdf,2,2,0
9,mid_example_copy.pdf,5,7,2
0,sample1.pdf,8,8,0
1,sample2.pdf,8,8,0
2,sample3.pdf,9,9,0
3,sample4.pdf,7,8,1
4,sample5.pdf,9,9,0
5,sample6.pdf,9,9,0
6,sample7.pdf,9,9,0
7,sample8.pdf,9,9,0


In [32]:
# ============================================================
# Cell 9 — Convenience: "scores by sample name" CSV only
# - A compact table with one row per sample, 4 columns for scores
# ============================================================

# Wide table: one row per sample, one column per (mode, order) condition.
# aggfunc="first" keeps the first score found for each sample/condition cell.
pivot = df_all.pivot_table(
    index="sample_name",
    columns=["mode", "order"],
    values="pred_score",
    aggfunc="first",
).reset_index()

# Flatten multiindex columns
# The first column is the sample name; the rest become "<mode>_<order>".
pivot.columns = ["sample_name"] + [f"{c[0]}_{c[1]}" for c in pivot.columns[1:]]
display(pivot)

# Persist the compact score table next to the other outputs.
score_csv_path = OUT_DIR / "scores_by_sample_name.csv"
pivot.to_csv(score_csv_path, index=False)
print("‚úÖ Saved:", score_csv_path)


Unnamed: 0,sample_name,fewshot_original,fewshot_reversed,zeroshot_original,zeroshot_reversed
0,bad_example_copy.pdf,1,1,2,2
1,mid_example_copy.pdf,5,5,5,7
2,sample1.pdf,8,7,8,8
3,sample2.pdf,9,9,8,8
4,sample3.pdf,8,8,9,9
5,sample4.pdf,7,7,7,8
6,sample5.pdf,8,8,9,9
7,sample6.pdf,8,8,9,9
8,sample7.pdf,8,8,9,9
9,sample8.pdf,8,8,9,9


‚úÖ Saved: /content/outputs/scores_by_sample_name.csv


In [33]:
# ============================================================
# Cell 10 — Optional: Main wrapper (does not auto-run)
# - Helpful if you want to re-run everything in one call later
# ============================================================

def main():
    """
    Re-run the whole experiment in one call.

    Re-extracts the anchor and sample texts, grades the samples under all
    four conditions (few-shot/zero-shot x original/reversed), saves the
    per-condition and combined CSV/JSONL files plus the compact
    scores_by_sample_name.csv, and returns the combined DataFrame.
    """
    # (Re)load texts. The anchor PDFs are configured in FEW_SHOT_PDFS.
    calib = {k: pdf_to_text(v) for k, v in FEW_SHOT_PDFS.items()}
    samples_map = {p: pdf_to_text(p) for p in SAMPLE_PDFS}

    # Run 4 conditions
    df1 = run_grading(SAMPLE_PDFS, samples_map, "fewshot", "original", calib=calib)
    df2 = run_grading(SAMPLE_PDFS, samples_map, "fewshot", "reversed", calib=calib)
    df3 = run_grading(SAMPLE_PDFS, samples_map, "zeroshot", "original")
    df4 = run_grading(SAMPLE_PDFS, samples_map, "zeroshot", "reversed")

    df_all_local = pd.concat([df1, df2, df3, df4], ignore_index=True)

    # Save
    save_df(df1, "fewshot_original")
    save_df(df2, "fewshot_reversed")
    save_df(df3, "zeroshot_original")
    save_df(df4, "zeroshot_reversed")
    save_df(df_all_local, "combined_all_conditions")

    # Compact scores CSV: one row per sample, one "<mode>_<order>" column per condition.
    pivot_local = df_all_local.pivot_table(
        index="sample_name",
        columns=["mode", "order"],
        values="pred_score",
        aggfunc="first",
    ).reset_index()
    pivot_local.columns = ["sample_name"] + [f"{c[0]}_{c[1]}" for c in pivot_local.columns[1:]]
    pivot_local.to_csv(OUT_DIR / "scores_by_sample_name.csv", index=False)

    return df_all_local

# To run:
# df_all = main()
