## Interpolated PR-Curve + AUC (CSV & plots for each model in 2 scenarios + subplot for all models)

In [31]:
import json
from pathlib import Path
import matplotlib.pyplot as plt

# --- CONFIG ---
# Gold-standard JSON read by load_gold().
GOLD_PATH = Path("gold_data") / "gold_standard_nl.json"
# Folders whose *.jsonl files are scored as the two ranking scenarios.
LISTWISE_DIR = Path("rankings") / "sorted" / "json"
POINTWISE_DIR = Path("rankings") / "scored" / "json"
# Destination for the per-model figures/CSVs; created on demand.
OUT_DIR = Path("rankings") / "analysis" / "curves"
OUT_DIR.mkdir(exist_ok=True, parents=True)

# --- Helpers ---
def normalize(x):
    """Return ``x`` converted to ``str`` with leading/trailing whitespace removed."""
    text = str(x)
    return text.strip()

def pretty_model(stem: str) -> str:
    """Map a ranking-file stem to a human-readable model label.

    The stem is lower-cased, double underscores are collapsed and hyphens are
    turned into underscores before matching, so all patterns below are written
    in that normalized form.

    Args:
        stem: File name without extension.

    Returns:
        One of the known display names, or ``stem`` unchanged when no pattern
        matches.
    """
    s = stem.lower().replace("__", "_").replace("-", "_")
    # Fixed: the original tested the bare literal "2_5" (always truthy), so
    # every "gemini ... flash" stem was labelled 2.5 regardless of version.
    if "gemini" in s and ("2.5" in s or "2_5" in s) and "flash" in s:
        return "Gemini-2.5-flash"
    # "3_235" / "3-235" both contain "235", so one membership test suffices.
    if "qwen" in s and "235" in s:
        return "Qwen-3-235B"
    # Hyphens were already replaced, so "gpt-4o" appears here as "gpt_4o".
    if ("gpt4o" in s or "gpt_4o" in s) and "mini" in s:
        return "GPT-4o-mini"
    # Fixed: "gpt-4.1" is normalized to "gpt_4.1", which the original never
    # matched (its "gpt-4.1" alternative was unreachable).
    if ("gpt4.1" in s or "gpt_4_1" in s or "gpt_4.1" in s) and "mini" in s:
        return "GPT-4.1-mini"
    if "llama3" in s and ("70b" in s or "3.3" in s or "3_3" in s):
        return "LLaMA-3-70B"
    if ("llama4" in s or "llama_4" in s) and "scout" in s:
        return "LLaMA-4-Scout"
    return stem

def safe_name(name: str) -> str:
    """Return ``name`` with every character that is not alphanumeric, ``-``,
    ``_`` or ``.`` replaced by an underscore (used to build file names)."""
    allowed_punct = "-_."
    pieces = []
    for ch in name:
        keep = ch.isalnum() or ch in allowed_punct
        pieces.append(ch if keep else "_")
    return "".join(pieces)

def load_gold(path: Path):
    """Load the gold standard from a JSON file.

    The file is parsed as a JSON object whose items are (query id, iterable of
    document ids). Both ids are passed through ``normalize``.

    Returns:
        dict mapping normalized query id -> set of normalized document ids.
    """
    with open(path, "r", encoding="utf-8") as fh:
        raw = json.load(fh)
    gold_sets = {}
    for query_id, doc_ids in raw.items():
        gold_sets[normalize(query_id)] = {normalize(doc_id) for doc_id in doc_ids}
    return gold_sets

def load_predictions_jsonl(path: Path):
    """Read one model's rankings from a JSONL file.

    Each non-blank line is parsed as a JSON object with a ``"query_id"`` and a
    ``"ranks"`` entry (the ranked document ids). Ids are passed through
    ``normalize``. If a query id occurs more than once, the last line wins.

    Returns:
        dict mapping normalized query id -> list of normalized document ids in
        ranked order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``"query_id"`` or ``"ranks"``.
    """
    preds = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. extra trailing newlines) are not valid JSON
            # documents; skip them instead of crashing the whole load.
            if not line.strip():
                continue
            obj = json.loads(line)
            preds[normalize(obj["query_id"])] = [normalize(x) for x in obj["ranks"]]
    return preds

def pr_points_for_query(ranked, gold_set):
    """Precision and recall after each position of one query's ranked list.

    Args:
        ranked: Document ids in ranked order.
        gold_set: Relevant document ids for the query.

    Returns:
        ``(precisions, recalls)``, two lists with one entry per ranked
        position. Both are empty when ``gold_set`` is empty or ``ranked`` has
        no entries.
    """
    if not gold_set:
        return [], []
    n_relevant = len(gold_set)
    hits = 0
    precisions = []
    recalls = []
    for rank, doc_id in enumerate(ranked, start=1):
        hits += doc_id in gold_set  # bool counts as 0/1
        precisions.append(hits / rank)
        recalls.append(hits / n_relevant)
    return precisions, recalls

def interpolated_11_for_model(preds, gold):
    """11-point interpolated precision curve, macro-averaged over queries.

    For each query, the interpolated precision at recall level ``r0`` is the
    highest precision observed at any rank whose recall is ``>= r0`` (0.0 if
    that recall is never reached).

    NOTE: a query contributes only when ``pr_points_for_query`` returns at
    least one point, so queries with an empty gold set *and* queries for which
    ``preds`` has no (or an empty) ranking are left out of the average rather
    than counted as zeros.

    Args:
        preds: dict query id -> ranked list of document ids.
        gold: dict query id -> set of relevant document ids.

    Returns:
        ``(R_levels, mean_pts, auc_11pt)``: the 11 recall levels 0.0..1.0, the
        mean interpolated precision at each level, and the mean of those 11
        values. With no contributing query the curve is all zeros.
    """
    R_levels = [step / 10 for step in range(11)]
    curves = []
    for qid, relevant in gold.items():
        precisions, recalls = pr_points_for_query(preds.get(qid, []), relevant)
        if not precisions:
            continue
        curve = [
            max((p for r, p in zip(recalls, precisions) if r >= level), default=0.0)
            for level in R_levels
        ]
        curves.append(curve)
    if not curves:
        return R_levels, [0.0] * 11, 0.0
    mean_pts = [sum(column) / len(column) for column in zip(*curves)]
    auc_11pt = sum(mean_pts) / len(mean_pts)
    return R_levels, mean_pts, auc_11pt

def plot_model_compare(model_name, curves_by_scenario, out_path):
    """Draw one model's PR curves (one line per scenario) and save the figure.

    Args:
        model_name: Label used in the figure title.
        curves_by_scenario: dict scenario label -> ``(R, P, auc)`` tuple.
        out_path: Destination image path (saved at 300 dpi).
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    for label, (recall_levels, precisions, auc) in curves_by_scenario.items():
        ax.plot(
            recall_levels,
            precisions,
            marker="o",
            linewidth=2,
            markersize=4,
            label=f"{label} (AUC {auc:.2f})",
        )
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision (interpolated)")
    ax.set_title(f"Precision–Recall (11-pt) — {model_name}")
    ax.grid(True, linestyle="--", alpha=0.4)
    ax.legend(loc="upper right")
    fig.tight_layout()
    fig.savefig(out_path, dpi=300)
    # Release the figure so repeated calls in a loop do not accumulate memory.
    plt.close(fig)

def write_model_csv(model_name, curves_by_scenario, out_csv: Path):
    """Write one model's 11-point curves to a CSV file, one row per scenario.

    Columns: ``model, scenario, r=0.0 .. r=1.0, AUC_11pt``. Precisions and AUC
    are formatted with four decimals.

    Args:
        model_name: Value for the ``model`` column.
        curves_by_scenario: dict scenario label -> ``(R, P, auc)`` tuple.
        out_csv: Destination file (overwritten).
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import lines.
    import csv

    header = ["model", "scenario"] + [f"r={i/10:.1f}" for i in range(11)] + ["AUC_11pt"]
    # csv.writer quotes fields that contain commas/quotes; the previous manual
    # ",".join() produced misaligned rows for such model or scenario names.
    # newline="" + lineterminator="\n" keeps plain "\n" row endings.
    with open(out_csv, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(header)
        for scen, (_, P, auc) in curves_by_scenario.items():
            writer.writerow([model_name, scen, *(f"{p:.4f}" for p in P), f"{auc:.4f}"])

# --- Build curves for both scenarios ---
gold = load_gold(GOLD_PATH)

# by_model[display name][scenario label] = (R, P, auc).
# Files whose stems map to the same display name overwrite each other.
by_model = {}
scenario_dirs = (("Listwise", LISTWISE_DIR), ("Pseudo-pointwise", POINTWISE_DIR))
for scenario, folder in scenario_dirs:
    for fp in sorted(folder.glob("*.jsonl")):
        model = pretty_model(fp.stem)
        preds = load_predictions_jsonl(fp)
        R, P, auc = interpolated_11_for_model(preds, gold)
        if model not in by_model:
            by_model[model] = {}
        by_model[model][scenario] = (R, P, auc)

# --- Per-model plots ---
# One PNG and one CSV per model, named after the sanitized display name.
for model in sorted(by_model):
    scen_curves = by_model[model]
    if scen_curves:
        fname = f"pr_interpolated_compare_{safe_name(model)}.png"
        png_path = OUT_DIR / fname
        plot_model_compare(model, scen_curves, png_path)
        write_model_csv(model, scen_curves, png_path.with_suffix(".csv"))

# --- Combined subplot figure ---
# One panel per model (3 per row), each showing every scenario's curve.
n_models = len(by_model)
ncols = 3
nrows = (n_models + ncols - 1) // ncols  # ceil(n_models / ncols)
combined_path = OUT_DIR / "pr_interpolated_compare_all_models.png"

if n_models == 0:
    # plt.subplots() rejects a grid with zero rows, so there is nothing to
    # draw when no ranking files were found.
    print(f"No models found in {LISTWISE_DIR} or {POINTWISE_DIR}; skipping combined figure.")
else:
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*5, nrows*4), squeeze=False)

    for idx, (model, scen_curves) in enumerate(sorted(by_model.items())):
        ax = axes.flat[idx]
        for label, (R, P, auc) in scen_curves.items():
            ax.plot(R, P, marker="o", linewidth=2, markersize=3, label=f"{label} (AUC {auc:.2f})")
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.set_title(model, fontsize=10)
        ax.grid(True, linestyle="--", alpha=0.4)

        if idx % ncols == 0:
            ax.set_ylabel("Precision")
        # Label the bottom-most *visible* panel of each column: when the last
        # row is only partly filled, the panel above a hidden cell gets the
        # x-label too.
        if idx + ncols >= n_models:
            ax.set_xlabel("Recall")
        ax.legend(loc="upper right", fontsize=8)

    # Hide empty subplots
    for ax in axes.flat[n_models:]:
        ax.axis("off")

    fig.tight_layout()
    fig.savefig(combined_path, dpi=300)
    plt.close(fig)
    print(f"Saved combined subplot figure: {combined_path}")

Saved combined subplot figure: rankings/analysis/curves/pr_interpolated_compare_all_models.png


## Plotting average PR + AUC for all models together in two scenarios for scenario comparison

In [4]:
import json
from pathlib import Path
import matplotlib.pyplot as plt

# ---------- CONFIG ----------
GOLD_PATH = Path("gold_data/gold_standard_nl.json")
LISTWISE_DIR = Path("rankings/sorted/json")    # listwise JSONL files
POINTWISE_DIR = Path("rankings/scored/json")   # pseudo-pointwise JSONL files
OUT_DIR = Path("rankings/analysis/")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# When True, files whose name contains "me5" or "jina" are left out of the
# averaging; set False to include them.
EXCLUDE_BASELINES = True

# ---------- HELPERS ----------
def normalize(x):
    """Stringify ``x`` and trim surrounding whitespace."""
    return str.strip(str(x))

def load_gold(path: Path):
    """Parse the gold-standard JSON file at ``path``.

    Returns:
        dict mapping each normalized query id to the set of its normalized
        relevant document ids.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    gold_sets = {}
    for query_id, doc_ids in json.loads(raw_text).items():
        gold_sets[normalize(query_id)] = set(map(normalize, doc_ids))
    return gold_sets

def load_predictions_jsonl(path: Path):
    """Read one model's rankings from a JSONL file.

    Each non-blank line is parsed as a JSON object with a ``"query_id"`` and a
    ``"ranks"`` entry (the ranked document ids). Ids are passed through
    ``normalize``. If a query id occurs more than once, the last line wins.

    Returns:
        dict mapping normalized query id -> list of normalized document ids in
        ranked order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``"query_id"`` or ``"ranks"``.
    """
    preds = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. extra trailing newlines) are not valid JSON
            # documents; skip them instead of crashing the whole load.
            if not line.strip():
                continue
            obj = json.loads(line)
            preds[normalize(obj["query_id"])] = [normalize(x) for x in obj["ranks"]]
    return preds

def pr_points_for_query(ranked, gold_set):
    """Raw precision/recall values while scanning one query's ranked list.

    Returns ``(precisions, recalls)`` with one entry per ranked position, or
    two empty lists when ``gold_set`` is empty.
    """
    if not gold_set:
        return [], []
    total_relevant = len(gold_set)
    found = 0
    points = []
    for position, doc in enumerate(ranked, 1):
        if doc in gold_set:
            found = found + 1
        points.append((found / position, found / total_relevant))
    precisions = [p for p, _ in points]
    recalls = [r for _, r in points]
    return precisions, recalls

def interpolated_11_for_model(preds, gold):
    """Return (R_levels, mean_interpolated_precision, AUC_11pt), macro-averaged.

    Interpolated precision at recall ``r0`` is the maximum precision at any
    rank whose recall is ``>= r0`` (0.0 if no rank reaches it).

    NOTE: only queries for which ``pr_points_for_query`` yields at least one
    point are averaged; queries with an empty gold set or an empty/missing
    ranking in ``preds`` are skipped, not scored as zero.
    """
    R = [i / 10 for i in range(11)]  # 0.0 .. 1.0

    def _best_precision_from(points, level):
        # Highest precision among points whose recall reaches `level`.
        best = 0.0
        for recall, precision in points:
            if recall >= level and precision > best:
                best = precision
        return best

    per_query = []
    for qid, gold_set in gold.items():
        precisions, recalls = pr_points_for_query(preds.get(qid, []), gold_set)
        if not precisions:
            continue
        points = list(zip(recalls, precisions))
        per_query.append([_best_precision_from(points, level) for level in R])

    if not per_query:
        return R, [0.0] * 11, 0.0
    mean_pts = [sum(col) / len(col) for col in zip(*per_query)]
    auc = sum(mean_pts) / len(mean_pts)  # mean of the 11 interpolated points
    return R, mean_pts, auc

def collect_model_curves(folder: Path, gold, exclude_baselines=True):
    """Compute the 11-point curve for every ``*.jsonl`` file in ``folder``.

    Args:
        folder: Directory holding one JSONL ranking file per model.
        gold: Gold standard as returned by ``load_gold``.
        exclude_baselines: When truthy, skip files whose lower-cased name
            contains ``"me5"`` or ``"jina"``.

    Returns:
        List of mean-precision lists, in sorted file order.
    """
    baseline_markers = ("me5", "jina")
    curves = []
    for jsonl_path in sorted(folder.glob("*.jsonl")):
        lowered = jsonl_path.name.lower()
        is_baseline = any(marker in lowered for marker in baseline_markers)
        if is_baseline and exclude_baselines:
            continue
        model_preds = load_predictions_jsonl(jsonl_path)
        mean_precisions = interpolated_11_for_model(model_preds, gold)[1]
        curves.append(mean_precisions)
    return curves

def average_curves(curve_list):
    """Position-wise mean of several precision lists.

    Returns eleven zeros when ``curve_list`` is empty; otherwise one mean per
    position (``zip`` semantics, so the shortest list sets the length).
    """
    if not curve_list:
        return [0.0] * 11
    averaged = []
    for column in zip(*curve_list):
        averaged.append(sum(column) / len(column))
    return averaged

# ---------- MAIN ----------
gold = load_gold(GOLD_PATH)

# One list of per-model curves per scenario folder.
listwise_curves, pointwise_curves = [
    collect_model_curves(folder, gold, exclude_baselines=EXCLUDE_BASELINES)
    for folder in (LISTWISE_DIR, POINTWISE_DIR)
]

R = [step / 10 for step in range(11)]
avg_listwise = average_curves(listwise_curves)
avg_pointwise = average_curves(pointwise_curves)

# AUC here is the mean of the averaged curve's points.
auc_listwise = sum(avg_listwise) / len(avg_listwise)
auc_pointwise = sum(avg_pointwise) / len(avg_pointwise)

# ---------- PLOT ----------
fig, ax = plt.subplots(figsize=(9, 6))
for avg_curve, legend_label in (
    (avg_listwise, f"Listwise (AUC {auc_listwise:.2f})"),
    (avg_pointwise, f"Pseudo-pointwise (AUC {auc_pointwise:.2f})"),
):
    ax.plot(R, avg_curve, marker="o", label=legend_label)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision (11-pt interpolated)")
ax.set_title("Average Precision–Recall (11-pt) across models")
ax.grid(True, linestyle="--", alpha=0.4)
ax.legend(loc="upper right")
fig.tight_layout()

avg_png = OUT_DIR / "pr_interpolated_average_listwise_vs_pointwise.png"
fig.savefig(avg_png, dpi=300)
plt.close(fig)

# ---------- CSV ----------
# One header row plus one row per scenario: averaged precisions and their AUC.
avg_csv = OUT_DIR / "pr_interpolated_average_listwise_vs_pointwise.csv"
header = ["scenario"] + [f"r={r:.1f}" for r in R] + ["AUC_11pt"]
csv_rows = [header]
for scenario_label, avg_precisions, auc_value in (
    ("Listwise", avg_listwise, auc_listwise),
    ("Pseudo-pointwise", avg_pointwise, auc_pointwise),
):
    csv_rows.append(
        [scenario_label] + [f"{p:.4f}" for p in avg_precisions] + [f"{auc_value:.4f}"]
    )
with open(avg_csv, "w", encoding="utf-8") as f:
    f.writelines(",".join(row) + "\n" for row in csv_rows)

print("Saved:", f" - {avg_png}", f" - {avg_csv}", sep="\n")

Saved:
 - rankings/analysis/pr_interpolated_average_listwise_vs_pointwise.png
 - rankings/analysis/pr_interpolated_average_listwise_vs_pointwise.csv


## Extra analysis experimentation

In [5]:
import json, math
from pathlib import Path
import matplotlib.pyplot as plt

# --- CONFIG ---
GOLD_PATH = Path("gold_data/gold_standard_nl.json")
# Folders scanned for *.jsonl ranking files, one per scenario.
LISTWISE_DIR = Path("rankings/sorted/json")
POINTWISE_DIR = Path("rankings/scored/json")
# Output folder for the per-scenario figures and CSVs; created on demand.
OUT_DIR = Path("rankings/analysis/pr_curves")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# --- Helpers ---
def normalize(x):
    """Canonical id form: ``str(x)`` without surrounding whitespace."""
    as_string = str(x)
    return as_string.strip()

def pretty_model(stem: str) -> str:
    """Map a ranking-file stem to a human-readable model label.

    The stem is lower-cased, double underscores are collapsed and hyphens are
    turned into underscores before matching, so all patterns below are written
    in that normalized form. The accepted spellings mirror the matcher used by
    the per-model PR-curve cell so both cells label files identically.

    Args:
        stem: File name without extension.

    Returns:
        One of the known display names, or ``stem`` unchanged when no pattern
        matches.
    """
    s = stem.lower().replace("__", "_").replace("-", "_")
    # Fixed: the original tested the bare literal "2_5" (always truthy), so
    # every "gemini ... flash" stem was labelled 2.5 regardless of version.
    if "gemini" in s and ("2.5" in s or "2_5" in s) and "flash" in s:
        return "Gemini-2.5-flash"
    if "qwen" in s and "235" in s:
        return "Qwen-3-235B"
    # Hyphenated spellings arrive here with "_" (e.g. "gpt-4o" -> "gpt_4o").
    if ("gpt4o" in s or "gpt_4o" in s) and "mini" in s:
        return "GPT-4o-mini"
    if ("gpt4.1" in s or "gpt_4_1" in s or "gpt_4.1" in s) and "mini" in s:
        return "GPT-4.1-mini"
    if "llama3" in s and ("70b" in s or "3.3" in s or "3_3" in s):
        return "LLaMA-3-70B"
    if ("llama4" in s or "llama_4" in s) and "scout" in s:
        return "LLaMA-4-Scout"
    return stem

def load_gold(path: Path):
    """Read the gold-standard JSON file and normalize all ids.

    Returns:
        dict: normalized query id -> set of normalized relevant document ids.
    """
    with open(path, "r", encoding="utf-8") as f:
        gold = json.load(f)
    return dict(
        (normalize(query), {normalize(doc) for doc in docs})
        for query, docs in gold.items()
    )

def load_predictions_jsonl(path: Path):
    """Read one model's rankings from a JSONL file.

    Each non-blank line is parsed as a JSON object with a ``"query_id"`` and a
    ``"ranks"`` entry (the ranked document ids). Ids are passed through
    ``normalize``. If a query id occurs more than once, the last line wins.

    Returns:
        dict mapping normalized query id -> list of normalized document ids in
        ranked order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``"query_id"`` or ``"ranks"``.
    """
    preds = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. extra trailing newlines) are not valid JSON
            # documents; skip them instead of crashing the whole load.
            if not line.strip():
                continue
            obj = json.loads(line)
            preds[normalize(obj["query_id"])] = [normalize(x) for x in obj["ranks"]]
    return preds

def pr_points_for_query(ranked, gold_set):
    """Return raw precision/recall points as we scan down the ranked list.

    Both returned lists have one entry per ranked document; they are empty
    when ``gold_set`` has no elements.
    """
    if len(gold_set) == 0:  # skip empty-gold queries
        return [], []
    precisions, recalls = [], []
    true_positives = 0
    for cutoff, doc in enumerate(ranked, 1):
        true_positives += 1 if doc in gold_set else 0
        precisions.append(true_positives / cutoff)
        recalls.append(true_positives / len(gold_set))
    return precisions, recalls

def interpolated_11_for_model(preds, gold):
    """11-point interpolated precision (macro-averaged across queries).

    For each recall level ``r0`` in 0.0, 0.1, ..., 1.0 the per-query value is
    the maximum precision at any rank with recall ``>= r0`` (0.0 if none).

    NOTE: queries for which ``pr_points_for_query`` returns no points (empty
    gold set, or no/empty ranking in ``preds``) are excluded from the average.

    Returns:
        ``(R, mean_pts, auc)``: recall levels, mean interpolated precision per
        level, and the mean of those 11 values.
    """
    R = [i / 10 for i in range(11)]  # 0.0 ... 1.0
    per_query = []
    for qid, gold_set in gold.items():
        P, r = pr_points_for_query(preds.get(qid, []), gold_set)
        if not P:
            continue
        # Seed each max() with 0.0 so an unreached recall level scores zero.
        per_query.append(
            [max([0.0] + [p for rec, p in zip(r, P) if rec >= r0]) for r0 in R]
        )
    if not per_query:
        return R, [0.0] * 11, 0.0
    # macro average across queries, one recall level at a time
    n_queries = len(per_query)
    mean_pts = []
    for k in range(len(R)):
        mean_pts.append(sum(q[k] for q in per_query) / n_queries)
    auc = sum(mean_pts) / len(mean_pts)  # mean of 11 points
    return R, mean_pts, auc

def plot_curves(curves, title, out_path):
    """Plot every model's PR curve on one set of axes and save the figure.

    Args:
        curves: dict model label -> ``(R, P, auc)`` tuple.
        title: Figure title.
        out_path: Destination image path (saved at 300 dpi).
    """
    fig, ax = plt.subplots(figsize=(9, 6))
    for model, (recall_levels, precisions, auc) in curves.items():
        ax.plot(recall_levels, precisions, label=f"{model} (AUC {auc:.2f})")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision (interpolated)")
    ax.set_title(title)
    ax.grid(True, linestyle="--", alpha=0.4)
    ax.legend(title="Models")
    fig.tight_layout()
    fig.savefig(out_path, dpi=300)
    plt.close(fig)

def write_csv(curves, out_csv: Path):
    """Write one row per model: ``model, r=0.0 .. r=1.0, AUC_11pt``.

    Precisions and AUC are formatted with four decimals.

    Args:
        curves: dict model label -> ``(R, P, auc)`` tuple.
        out_csv: Destination file (overwritten).
    """
    # Function-scope import: the cell's driver code binds a variable named
    # `csv` at module level, which would shadow a top-level module import.
    import csv

    header = ["model"] + [f"r={i/10:.1f}" for i in range(11)] + ["AUC_11pt"]
    # csv.writer quotes fields containing commas/quotes; the previous manual
    # ",".join() produced misaligned rows for such model names.
    # newline="" + lineterminator="\n" keeps plain "\n" row endings.
    with open(out_csv, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(header)
        for model, (_, P, auc) in curves.items():
            writer.writerow([model, *(f"{p:.4f}" for p in P), f"{auc:.4f}"])

# --- Run for both scenarios ---
gold = load_gold(GOLD_PATH)

scenario_folders = {"Listwise": LISTWISE_DIR, "Pseudo-pointwise": POINTWISE_DIR}
for scenario_name, folder in scenario_folders.items():
    # curves[display name] = (R, P, auc) for every JSONL file in the folder.
    curves = {}
    for fp in sorted(folder.glob("*.jsonl")):
        preds = load_predictions_jsonl(fp)
        model = pretty_model(fp.stem)
        R, P, auc = interpolated_11_for_model(preds, gold)
        curves[model] = (R, P, auc)

    if curves:
        # plot & csv share one file-name slug derived from the scenario label
        slug = scenario_name.lower().replace(' ', '_')
        png = OUT_DIR / f"pr_interpolated_{slug}.png"
        csv = OUT_DIR / f"pr_interpolated_{slug}.csv"
        plot_curves(curves, f"Precision–Recall (11‑point interpolated) — {scenario_name}", png)
        write_csv(curves, csv)
        print(f"[{scenario_name}] saved: {png.name}, {csv.name} in {OUT_DIR}")
    else:
        print(f"No JSONL files found in {folder}")

[Listwise] saved: pr_interpolated_listwise.png, pr_interpolated_listwise.csv in rankings/analysis/pr_curves
[Pseudo-pointwise] saved: pr_interpolated_pseudo-pointwise.png, pr_interpolated_pseudo-pointwise.csv in rankings/analysis/pr_curves


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re

# --- CONFIG ---
# Evaluation table read below; output folder for the delta plot.
INPUT_CSV = Path("rankings") / "eval_all_sorted_models.csv"
OUT_DIR = Path("rankings") / "analysis" / "delta_plots_seaborn"
OUT_DIR.mkdir(exist_ok=True, parents=True)

# --- LOAD DATA ---
df = pd.read_csv(INPUT_CSV)

# --- Extract base model name ---
def extract_base_model(name):
    """Reduce a model/run name to a scenario-independent key.

    The name is lower-cased, underscores become dots, the first matching
    ``.sorted.ranks.`` / ``.ranks.`` marker is collapsed to a single dot and a
    trailing ``.nl`` is dropped, e.g. ``"GPT4o_mini_sorted_ranks_nl"`` and
    ``"gpt4o_mini_ranks_nl"`` both yield ``"gpt4o.mini"``.

    Args:
        name: Model name string.

    Returns:
        The normalized base-model key.
    """
    name = name.lower().replace("_", ".")
    # The longer marker must be stripped first, otherwise ".ranks." would
    # match inside ".sorted.ranks." and leave ".sorted." behind.
    name = name.replace(".sorted.ranks.", ".")
    name = name.replace(".ranks.", ".")
    # The former "_sorted_ranks_" / "_ranks_" replacements were removed: no
    # underscore survives the first step, so they could never match.
    return re.sub(r"\.nl$", "", name)

# Key used later to pair each model's listwise and pointwise rows.
df["BaseModel"] = df["Model"].map(extract_base_model)

# --- Detect scenario ---
def detect_scenario(name):
    """Classify a model/run name by the marker it contains.

    ``"sorted"`` (checked first) -> ``"Listwise"``; otherwise ``"ranks"`` ->
    ``"Pointwise"``; anything else -> ``"Unknown"``. Matching is
    case-insensitive.
    """
    lowered = name.lower()
    for marker, scenario in (("sorted", "Listwise"), ("ranks", "Pointwise")):
        if marker in lowered:
            return scenario
    return "Unknown"

df["Scenario"] = df["Model"].map(detect_scenario)

# --- Split ---
listwise_df = df.loc[df["Scenario"].eq("Listwise")].copy()
pointwise_df = df.loc[df["Scenario"].eq("Pointwise")].copy()

# --- Merge ---
# Default (inner) join: only base models present in both scenarios survive.
merged = listwise_df.merge(
    pointwise_df,
    on="BaseModel",
    suffixes=("_listwise", "_pointwise"),
)

# --- Metrics ---
metrics = ["R@5", "R@10", "MAP@10", "MRR@10", "nDCG@10", "nDCG@100"]

# Compute deltas (positive = pointwise scored higher than listwise)
delta_cols = []
for m in metrics:
    delta_name = f"delta_{m}"
    merged[delta_name] = merged[f"{m}_pointwise"] - merged[f"{m}_listwise"]
    delta_cols.append(delta_name)

# --- Melt for one combined plot ---
plot_df = pd.melt(
    merged,
    id_vars=["BaseModel"],
    value_vars=delta_cols,
    var_name="Metric",
    value_name="Delta",
)
plot_df["Metric"] = plot_df["Metric"].str.replace("delta_", "")

# --- Plot ---
delta_png = OUT_DIR / "delta_all_metrics.png"
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=plot_df, x="BaseModel", y="Delta", hue="Metric", ax=ax)
ax.axhline(0, color="black", linewidth=1)
ax.set_ylabel("Δ (Pointwise - Listwise)")
ax.set_title("Delta across all metrics between Pointwise and Listwise Ranking")
plt.xticks(rotation=45, ha="right")
ax.legend(title="Metric", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
fig.savefig(delta_png, dpi=300)
plt.close(fig)

print(f"Saved combined delta plot to {delta_png}")

Saved combined delta plot to rankings/analysis/delta_plots_seaborn/delta_all_metrics.png


## Scatterplot 

In [30]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Prepare data from by_model dictionary
# NOTE: `by_model` is not defined in this cell; it must already exist in the
# kernel (it is built by the per-model PR-curve cell).
# Only models that have both scenarios are plotted; index 2 of each
# (R, P, auc) tuple is the AUC.
auc_data = [
    (model, scen_curves["Listwise"][2], scen_curves["Pseudo-pointwise"][2])
    for model, scen_curves in by_model.items()
    if "Listwise" in scen_curves and "Pseudo-pointwise" in scen_curves
]
df_auc = pd.DataFrame(auc_data, columns=["Model", "Listwise_AUC", "Pseudo_AUC"])

# Create scatter plot with seaborn
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df_auc,
    x="Listwise_AUC",
    y="Pseudo_AUC",
    hue="Model",
    s=80,
    palette="tab10",
    ax=ax,
)

# Diagonal y = x: points above it have the higher pseudo-pointwise AUC.
ax.plot([0, 1], [0, 1], color="red", linestyle="--", linewidth=1)

# Annotate each point with model name, nudged off the marker
for point in df_auc.itertuples(index=False):
    ax.text(
        point.Listwise_AUC + 0.005,
        point.Pseudo_AUC + 0.005,
        point.Model,
        fontsize=8,
    )

ax.set_xlabel("Listwise AUC (11-pt Interpolated PR)")
ax.set_ylabel("Pseudo-pointwise AUC (11-pt Interpolated PR)")
ax.set_title("Listwise vs. Pseudo-pointwise Ranking — AUC Comparison")
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.grid(True, linestyle="--", alpha=0.4)
fig.tight_layout()

# Save figure
fig.savefig("rankings/ranking_comparison_scatterplot.png", dpi=300)
plt.close(fig)
print("Scatter plot saved as ranking_comparison_scatterplot.png")

Scatter plot saved as ranking_comparison_scatterplot.png


In [22]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- Prepare AUC data ---
# NOTE: `by_model` is not defined in this cell; it must already exist in the
# kernel (it is built by the per-model PR-curve cell).
dense_models = ["mE5large-instruct", "jina-embeddings-v3"]  # excluded by exact name
kept_scenarios = ["Listwise", "Pseudo-pointwise"]
auc_rows = [
    {
        "Model": model,
        "Ranking Method": scenario + "-Ranking",
        "AUC": round(auc * 100, 2),  # percentage for readability
    }
    for model, scen_curves in by_model.items()
    if model not in dense_models
    for scenario, (R, P, auc) in scen_curves.items()
    if scenario in kept_scenarios
]
df_auc = pd.DataFrame(auc_rows)

# Row order of the heatmap. reindex() keeps only these labels: models missing
# from the list are dropped, listed models without data become NaN rows.
model_order = [
    "Gemini-2.5-flash",
    "Qwen-3-235B",
    "GPT-4.1-mini",
    "GPT-4o-mini",
    "LLaMA-4-Scout",
    "LLaMA-3-70B"
]

# Pivot for heatmap: one row per model, one column per ranking method
heatmap_data = df_auc.pivot(index="Model", columns="Ranking Method", values="AUC")
heatmap_data = heatmap_data.reindex(model_order)

# Plot
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    cbar_kws={'label': 'AUC (11-pt Interpolated PR)'},
    ax=ax,
)
ax.set_title("AUC Heatmap by Model and Ranking Method")
ax.set_xlabel("Ranking Method")
ax.set_ylabel("Model")
fig.tight_layout()

# Save
fig.savefig("rankings/ranking_comparison_auc_heatmap.png", dpi=300)
plt.close(fig)
print("Saved heatmap as ranking_comparison_auc_heatmap.png")

Saved heatmap as ranking_comparison_auc_heatmap.png
