In [1]:
import csv, re, math
from collections import Counter, defaultdict
import pandas as pd
from statistics import mean

In [2]:
RESULTS_IN = "/content/results_uf.csv"          # from run_eval2(...)
ROW_SCORED_NOISY_OUT = "/content/results_scored.csv"
SUMMARY_OUT = "/content/noise_metrics_with_degradation.csv"
HEATMAP_ACC_OUT = "/content/heatmap_accuracy.csv"
HEATMAP_F1_OUT = "/content/heatmap_token_f1.csv"

In [3]:
BASELINE_INPUT_CSV = "/content/clean_prompts.csv"
CLEAN_RESULTS_CSV   = "/content/clean_results.csv"
NOISY_RESULTS_CSV  = "/content/results_uf.csv"

In [4]:
def s_norm(s: str) -> str:
    if s is None: return ""
    s = s.strip().lower()
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[\"\'`]', '', s)
    return s

def s_tokens(s: str):
    x = s_norm(s)
    return [] if not x else x.split()

In [5]:
def s_token_prf1(gold: str, pred: str):
    g, p = s_tokens(gold), s_tokens(pred)
    if not g and not p: return (1.0, 1.0, 1.0)
    if not g or not p:  return (0.0, 0.0, 0.0)
    g_c, p_c = Counter(g), Counter(p)
    overlap = sum((g_c & p_c).values())
    prec = overlap / max(1, sum(p_c.values()))
    rec  = overlap / max(1, sum(g_c.values()))
    #print(prec)
    #print(rec)
    f1   = 0.0 if (prec + rec) == 0 else 2*prec*rec/(prec+rec)
    #print(f1)
    return (prec, rec, f1)

In [6]:
def eval_enr(df: pd.DataFrame) -> pd.DataFrame:
    """Recompute clean scalar metrics (EM, accuracy, token_f1) from gold/pred."""
    df = df.copy()
    # unify cols
    if "gold" not in df.columns and "gold_answer" in df.columns:
        df["gold"] = df["gold_answer"]
    for col in ["gold", "pred"]:
        if col not in df.columns:
            raise RuntimeError(f"Missing required column '{col}' in clean_results.csv")
        df[col] = df[col].fillna("").astype(str)

    # normalized strings
    df["gold_norm"] = df["gold"].apply(s_norm)
    df["pred_norm"] = df["pred"].apply(s_norm)

    # strict EM / accuracy (scalars)
    df["em"] = (df["gold_norm"] == df["pred_norm"]).astype(int)
    df["accuracy"] = df["em"].astype(float)

    # token-level F1 (scalar)
    results = [s_token_prf1(g, p) for g, p in zip(df["gold"], df["pred"])]
    f_1_scores = [r[2] for r in results]
    df["token_f1"] = f_1_scores

    # ensure numeric dtype
    for col in ["em", "accuracy", "token_f1"]:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)

    return df

def pct_drop(baseline: float, val: float) -> float:
    if baseline == 0 or baseline is None:
        return float("nan")
    return 100.0 * (baseline - val) / baseline

In [7]:
clean_df_raw = pd.read_csv(CLEAN_RESULTS_CSV, dtype=str).fillna("")
clean_df = eval_enr(clean_df_raw)

baseline_em  = clean_df["em"].mean()
baseline_acc = clean_df["accuracy"].mean()
baseline_f1  = clean_df["token_f1"].mean()
baseline_n   = len(clean_df)

print(f"[CLEAN] n={baseline_n}  EM/Acc={baseline_em*100:.2f}%  TokenF1={baseline_f1*100:.2f}%")


[CLEAN] n=50  EM/Acc=60.00%  TokenF1=70.45%


In [8]:
#Load & score NOISY results
noisy_df_raw = pd.read_csv(NOISY_RESULTS_CSV, dtype=str).fillna("")
noisy_df = eval_enr(noisy_df_raw)

# Save enriched noisy rows (deliverable)
noisy_df.to_csv(ROW_SCORED_NOISY_OUT, index=False)
print(f"Enriched noisy rows → {ROW_SCORED_NOISY_OUT}")

Enriched noisy rows → /content/results_scored.csv


In [9]:
group_cols = ["noise_type", "noise_level"]
agg = noisy_df.groupby(group_cols).agg(
    n=("id", "count"),
    em=("em", "mean"),
    accuracy=("accuracy", "mean"),
    token_f1=("token_f1", "mean"),
).reset_index()


In [10]:
#Degradation vs. CLEAN baseline
agg["drop_abs_em_pts"]  = (baseline_em  - agg["em"]) * 100.0
agg["drop_pct_em"]      = agg["em"].apply(lambda v: pct_drop(baseline_em, v))

agg["drop_abs_acc_pts"] = (baseline_acc - agg["accuracy"]) * 100.0
agg["drop_pct_acc"]     = agg["accuracy"].apply(lambda v: pct_drop(baseline_acc, v))

agg["drop_abs_f1_pts"]  = (baseline_f1  - agg["token_f1"]) * 100.0
agg["drop_pct_f1"]      = agg["token_f1"].apply(lambda v: pct_drop(baseline_f1, v))

agg_sorted = agg.sort_values(group_cols)
agg_sorted.to_csv(SUMMARY_OUT, index=False)
print(f"Noise summary with degradation → {SUMMARY_OUT}")

Noise summary with degradation → /content/noise_metrics_with_degradation.csv


In [11]:
heat_acc = agg_sorted.pivot(index="noise_type", columns="noise_level", values="accuracy")
heat_f1  = agg_sorted.pivot(index="noise_type", columns="noise_level", values="token_f1")
heat_acc.to_csv(HEATMAP_ACC_OUT)
heat_f1.to_csv(HEATMAP_F1_OUT)
print(f"Heatmap tables → {HEATMAP_ACC_OUT}, {HEATMAP_F1_OUT}")

Heatmap tables → /content/heatmap_accuracy.csv, /content/heatmap_token_f1.csv


In [12]:
print("\n=== Baseline (clean) ===")
print(f"EM/Accuracy: {baseline_em*100:.2f}%   Token F1: {baseline_f1*100:.2f}%   n={baseline_n}")
print("\n=== Per-noise degradation vs. clean (abs pts / %) ===")
for _, r in agg_sorted.iterrows():
    nt, nl = str(r["noise_type"]), str(r["noise_level"])
    print(f"- {nt} / {nl}: "
          f"EM {r['drop_abs_em_pts']:.2f} pts ({r['drop_pct_em']:.1f}%), "
          f"F1 {r['drop_abs_f1_pts']:.2f} pts ({r['drop_pct_f1']:.1f}%)  [n={int(r['n'])}]")


=== Baseline (clean) ===
EM/Accuracy: 60.00%   Token F1: 70.45%   n=50

=== Per-noise degradation vs. clean (abs pts / %) ===
- emoji / heavy: EM 60.00 pts (100.0%), F1 52.14 pts (74.0%)  [n=50]
- emoji / light: EM 60.00 pts (100.0%), F1 48.77 pts (69.2%)  [n=50]
- spacing_punct / heavy: EM 56.00 pts (93.3%), F1 52.53 pts (74.6%)  [n=50]
- spacing_punct / light: EM 60.00 pts (100.0%), F1 50.09 pts (71.1%)  [n=50]
- typo / heavy: EM 60.00 pts (100.0%), F1 57.05 pts (81.0%)  [n=50]
- typo / light: EM 60.00 pts (100.0%), F1 52.81 pts (75.0%)  [n=50]


In [13]:
def normalize(x):
    if pd.isna(x):
        return ""
    s = str(x).strip()
    # Allow case-insensitive match and normalize commas/spaces
    return " ".join(s.replace("，",",").replace("’","'").replace("–","-").split()).lower()

In [59]:
# If you only have mini_intervention_items.csv, build a blank eval sheet from it:
items = pd.read_csv("/content/mini_intervention_items.csv")
eval_df = items[["item_id","condition","answer_key"]].drop_duplicates().copy()
for col in ["baseline_response","intervention_response",
            "baseline_fluency_1to5","intervention_fluency_1to5"]:
    eval_df[col] = ""
eval_df.to_csv("/content/mini_intervention_eval_template_filled.csv", index=False)
eval_df.head()


Unnamed: 0,item_id,condition,answer_key,baseline_response,intervention_response,baseline_fluency_1to5,intervention_fluency_1to5
0,Q1,L1,Mumbai,,,,
1,Q1,L2,Mumbai,,,,
2,Q1,L3,Mumbai,,,,
3,Q1,CS,Mumbai,,,,
4,Q2,L1,150,,,,


In [18]:
df = pd.read_csv("mini_intervention_eval_template_filled.csv")

df["baseline_correct"] = (df["answer_key"].map(normalize) == df["baseline_response"].map(normalize)).astype(int)
df["intervention_correct"] = (df["answer_key"].map(normalize) == df["intervention_response"].map(normalize)).astype(int)

acc_base = df["baseline_correct"].mean()
acc_int = df["intervention_correct"].mean()

flu_base = pd.to_numeric(df["baseline_fluency_1to5"], errors="coerce").mean()
flu_int = pd.to_numeric(df["intervention_fluency_1to5"], errors="coerce").mean()

print("Exact-Match Accuracy: baseline={:.3f}, intervention={:.3f}, delta={:+.3f}".format(acc_base, acc_int, acc_int-acc_base))
print("Mean Fluency (1-5):   baseline={:.2f}, intervention={:.2f}, delta={:+.2f}".format(flu_base, flu_int, flu_int-flu_base))

# Per-condition breakdown
by = df.groupby("condition").agg(
    base_acc=("baseline_correct","mean"),
    int_acc=("intervention_correct","mean"),
    base_flu=("baseline_fluency_1to5", lambda x: pd.to_numeric(x, errors='coerce').mean()),
    int_flu=("intervention_fluency_1to5", lambda x: pd.to_numeric(x, errors='coerce').mean()),
)

by["acc_delta"] = by["int_acc"] - by["base_acc"]
by["flu_delta"] = by["int_flu"] - by["base_flu"]
print("\nPer-condition breakdown:\n", by.reset_index())

FileNotFoundError: [Errno 2] No such file or directory: 'mini_intervention_eval_template_filled.csv'

In [19]:
# 1) Make sure your Ollama server cell is running (pull + serve).
# 2) Ensure the two CSVs exist (items + you'll create the filled one):
#    mini_intervention_items.csv (already provided earlier)

from patch_initial_eval_fill_intervention_v2 import run_language_intervention_eval

# Optional: override defaults via env if needed
import os
os.environ["OLLAMA_HOST"] = os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434")
os.environ["MODEL_NAME"]  = os.environ.get("MODEL_NAME",  "llama3")
os.environ["TEMPERATURE"] = os.environ.get("TEMPERATURE", "0.2")
os.environ["MAX_TOKENS"]  = os.environ.get("MAX_TOKENS", "128")

items_csv = "/content/mini_intervention_items.csv"
out_csv   = "/content/mini_intervention_eval_template_filled.csv"

run_language_intervention_eval(items_csv, out_csv, sleep_s=0.2)
print("Filled eval written to:", out_csv)

[done] Wrote /content/mini_intervention_eval_template_filled.csv with 24 rows.
Filled eval written to: /content/mini_intervention_eval_template_filled.csv


In [24]:
import pandas as pd

path = "/content/Fluency_scores__auto-rated_.csv"
df = pd.read_csv(path)

key_map = {
    "Q1": "Mumbai",
    "Q2": "150",
    "Q3": "True",
    "Q4": "Yes",
    "Q5": "26.9",
    "Q6": "apple, banana, mango",
}

if "answer_key" not in df.columns:
    df["answer_key"] = ""

df["answer_key"] = df.apply(
    lambda r: r["answer_key"] if str(r.get("answer_key","")).strip()
    else key_map.get(str(r.get("item_id","")).upper(), ""),
    axis=1
)

df.to_csv(path, index=False)
print("answer_key added without rerunning.")

answer_key added without rerunning.


In [26]:
import pandas as pd

def normalize(x):
    if pd.isna(x):
        return ""
    s = str(x).strip()
    # normalize punctuation/spacing and lowercase
    return " ".join(s.replace("，",",").replace("’","'").replace("–","-").split()).lower()

# READ THE **FILLED** EVAL FILE, NOT the items file
df = pd.read_csv("/content/Fluency_scores__auto-rated_.csv")

# Safety: ensure required columns exist
required = {"item_id","condition","answer_key","baseline_response","intervention_response",
            "baseline_fluency_1to5","intervention_fluency_1to5"}
missing = required - set(df.columns)
assert not missing, f"Missing columns in eval CSV: {missing}. Did you load the *items* file by mistake?"

df["baseline_correct"] = (df["answer_key"].map(normalize) == df["baseline_response"].map(normalize)).astype(int)
df["intervention_correct"] = (df["answer_key"].map(normalize) == df["intervention_response"].map(normalize)).astype(int)

acc_base = df["baseline_correct"].mean()
acc_int  = df["intervention_correct"].mean()
flu_base = pd.to_numeric(df["baseline_fluency_1to5"], errors="coerce").mean()
flu_int  = pd.to_numeric(df["intervention_fluency_1to5"], errors="coerce").mean()

print("Exact-Match Accuracy: baseline={:.3f}, intervention={:.3f}, delta={:+.3f}"
      .format(acc_base, acc_int, acc_int-acc_base))
print("Mean Fluency (1–5):   baseline={:.2f}, intervention={:.2f}, delta={:+.2f}"
      .format(flu_base, flu_int,  flu_int-flu_base))

by = df.groupby("condition").agg(
    base_acc=("baseline_correct","mean"),
    int_acc =("intervention_correct","mean"),
    base_flu=("baseline_fluency_1to5", lambda x: pd.to_numeric(x, errors="coerce").mean()),
    int_flu =("intervention_fluency_1to5", lambda x: pd.to_numeric(x, errors="coerce").mean()),
)
by["acc_delta"] = by["int_acc"] - by["base_acc"]
by["flu_delta"] = by["int_flu"] - by["base_flu"]
print("\nPer-condition breakdown:\n", by.reset_index())


Exact-Match Accuracy: baseline=0.083, intervention=0.458, delta=+0.375
Mean Fluency (1–5):   baseline=2.67, intervention=4.54, delta=+1.88

Per-condition breakdown:
   condition  base_acc   int_acc  base_flu   int_flu  acc_delta  flu_delta
0        CS  0.000000  0.500000  2.500000  4.666667   0.500000   2.166667
1        L1  0.333333  0.666667  3.666667  4.666667   0.333333   1.000000
2        L2  0.000000  0.166667  2.000000  3.833333   0.166667   1.833333
3        L3  0.000000  0.500000  2.500000  5.000000   0.500000   2.500000


# ***OLLAMA***

In [14]:
# Install Ollama (Linux)
!curl -fsSL https://ollama.com/install.sh | sh

# Make a persistent models dir (inside Colab VM)
!mkdir -p /content/.ollama

# Start the server in the background and tail logs
import os, time, subprocess, textwrap, pathlib

os.environ["OLLAMA_HOST"] = "http://127.0.0.1:11434"
os.environ["OLLAMA_MODELS"] = "/content/.ollama"

# Launch server
server_cmd = "OLLAMA_MODELS=/content/.ollama nohup ollama serve > /content/ollama.log 2>&1 &"
_ = subprocess.run(server_cmd, shell=True, check=False)

# Small wait so it boots
time.sleep(2)

# Show last lines from the log (if any)
!tail -n 50 /content/ollama.log || true

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Couldn't find '/root/.ollama/id_ed25519'. Generating new private key.
Your new public key is: 

ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKR6Q8OMfTnthTJaGJ9wEyLkyHizI+XBQeUOml9cw0WF

time=2025-09-28T22:13:12.802Z level=INFO source=routes.go:1475 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:INFO OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CAC

In [15]:
# Pull a model (adjust as needed, e.g., 'llama3', 'qwen2:7b-instruct', etc.)
!OLLAMA_MODELS=/content/.ollama ollama pull llama3

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l

In [16]:
import os, time, requests, csv
from tqdm import tqdm

# Reuse the same host/env as above
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434")
MODEL_NAME  = os.environ.get("MODEL_NAME",  "llama3")

session = requests.Session()

def wait_for_ollama(timeout=60):
    t0 = time.time()
    while time.time() - t0 < timeout:
        try:
            r = session.get(f"{OLLAMA_HOST}/api/version", timeout=5)
            if r.ok:
                return True
        except requests.RequestException:
            pass
        time.sleep(1)
    return False

def query_llm(prompt: str):
    """Simple generate call with model defaults."""
    r = session.post(f"{OLLAMA_HOST}/api/generate",
                     json={"model": MODEL_NAME, "prompt": prompt, "stream": False},
                     timeout=120)
    r.raise_for_status()
    return r.json().get("response","").strip()

def query2_llm(prompt: str, temperature: float=0.2, num_predict: int=128, seed: int=7):
    """Generate call with options you can tune."""
    r = session.post(
        f"{OLLAMA_HOST}/api/generate",
        json={
            "model": MODEL_NAME,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": temperature,
                "num_predict": num_predict,
                "seed": seed
            }
        },
        timeout=180
    )
    r.raise_for_status()
    return r.json().get("response","").strip()

In [17]:
import json, textwrap

assert wait_for_ollama(60), "Ollama server not reachable. Check install/logs."

# Version check
ver = session.get(f"{OLLAMA_HOST}/api/version", timeout=10).json()
print("Ollama version:", ver)

# Quick test generation
out = query2_llm("Say 'hello' in one word only.", temperature=0.0, num_predict=10)
print("Test output:", out)

Ollama version: {'version': '0.12.3'}
Test output: Hello!
