# Model Comparison (ID-Retrieval vs Relevance-Classification)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Draw one grouped bar chart per averaging scheme (macro, micro) showing the
# F1-Score of every model, with one bar per method, and save each as a PNG.
output_folder = "retrievals/analysis"
os.makedirs(output_folder, exist_ok=True)

macro_df = pd.read_csv("retrievals/evaluation/macro_scores.csv")
micro_df = pd.read_csv("retrievals/evaluation/micro_scores.csv")

# Wide -> long: one row per (Model, Method, Metric) carrying its Score.
melt_options = {
    "id_vars": ["Model", "Method"],
    "value_vars": ["Precision", "Recall", "F1-Score"],
    "var_name": "Metric",
    "value_name": "Score",
}
macro_long = macro_df.melt(**melt_options)
micro_long = micro_df.melt(**melt_options)

# Only the F1 rows are plotted in this cell.
macro_f1 = macro_long.loc[macro_long["Metric"] == "F1-Score"]
micro_f1 = micro_long.loc[micro_long["Metric"] == "F1-Score"]

# NOTE(review): the macro file name mentions "precision" although the chart
# shows F1 only; it is kept as-is so existing references to the PNG still work.
f1_charts = (
    (macro_f1, "Macro-Averaged F1-Score per Model", "macro_metrics_f1_precision.png"),
    (micro_f1, "Micro-Averaged F1-Score per Model", "micro_metrics_comparison_f1.png"),
)
for f1_scores, chart_title, png_name in f1_charts:
    plt.figure(figsize=(12, 6))
    sns.barplot(
        data=f1_scores,
        x="Model",
        y="Score",
        hue="Method",
        palette="Set2",
        errorbar=None,
        dodge=True,
    )
    plt.title(chart_title)
    plt.ylim(0, 1)
    plt.ylabel("F1-Score")
    plt.xlabel("Model")
    plt.xticks(rotation=45, ha="right")
    plt.legend(title="Method", loc="upper right")
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, png_name), dpi=300)
    plt.show()

In [None]:
import pandas as pd

# Score tables whose model identifiers get rewritten to display names.
macro_path = "retrievals/evaluation/macro_scores.csv"
micro_path = "retrievals/evaluation/micro_scores.csv"

# Raw model identifier (lowercase, as found in the CSVs) -> display name.
name_map = {
    "gemini2.5.flash": "Gemini-2.5-flash",
    # Fixed typo: was "Qwen-3-235N"; the statistics cell labels it "Qwen-3-235B".
    "qwen3.235b": "Qwen-3-235B",
    "llama4.scout": "Llama-4-Scout",
    "llama3.3.70b": "Llama-3.3-70B",
    "gpt4o.mini": "GPT-4o-mini",
    "gpt4.1.mini": "GPT-4.1-mini",
}

def replace_model_names(file_path, mapping=None):
    """Rewrite a CSV in place, swapping raw model identifiers for display names.

    The column to rewrite is ``model`` when present; otherwise the first
    column that contains at least one known identifier. Identifiers are
    matched case-insensitively both when locating the column and when
    replacing values, so a value such as ``"GPT4o.Mini"`` is renamed rather
    than merely detected. Values without a mapping entry are left untouched.

    Args:
        file_path: Path of the CSV file to read and overwrite.
        mapping: Optional ``{raw identifier: display name}`` dict. Defaults
            to the module-level ``name_map``.
    """
    if mapping is None:
        mapping = name_map
    # Lowercased keys make detection and replacement agree on what matches.
    lookup = {str(raw).lower(): pretty for raw, pretty in mapping.items()}

    def to_display(value):
        return lookup.get(str(value).lower(), value)

    df = pd.read_csv(file_path)
    if 'model' in df.columns:
        target = 'model'
    else:
        known_ids = list(lookup)
        target = next(
            (
                col
                for col in df.columns
                if df[col].astype(str).str.lower().isin(known_ids).any()
            ),
            None,
        )
    if target is not None:
        df[target] = df[target].map(to_display)
    # The file is rewritten even when nothing matched, as before.
    df.to_csv(file_path, index=False)

# Rename the model identifiers in both score tables (the files are overwritten).
for scores_csv in (macro_path, micro_path):
    replace_model_names(scores_csv)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Compare the methods by their mean Precision over all models, once for the
# macro-averaged and once for the micro-averaged scores; one PNG per chart.
output_folder = "retrievals/analysis"
os.makedirs(output_folder, exist_ok=True)

macro_df = pd.read_csv("retrievals/evaluation/macro_scores.csv")
micro_df = pd.read_csv("retrievals/evaluation/micro_scores.csv")

# Wide -> long: one row per (Model, Method, Metric) carrying its Score.
long_format = {
    "id_vars": ["Model", "Method"],
    "value_vars": ["Precision", "Recall", "F1-Score"],
    "var_name": "Metric",
    "value_name": "Score",
}
macro_long = macro_df.melt(**long_format)
micro_long = micro_df.melt(**long_format)

# NOTE: the "*_f1" names are kept from the original cell, but these frames
# hold the Precision rows.
macro_f1 = macro_long.loc[macro_long["Metric"] == "Precision"]
micro_f1 = micro_long.loc[micro_long["Metric"] == "Precision"]

# Mean precision per method across all models.
macro_avg_f1 = macro_f1.groupby("Method", as_index=False)["Score"].mean()
micro_avg_f1 = micro_f1.groupby("Method", as_index=False)["Score"].mean()

per_method_charts = (
    (macro_avg_f1, "Average Macro Precision per Method", "macro_avg_precision_per_method.png"),
    (micro_avg_f1, "Average Micro Precision per Method", "micro_avg_precision_per_method.png"),
)
for method_means, chart_title, png_name in per_method_charts:
    plt.figure(figsize=(6, 6))
    sns.barplot(data=method_means, x="Method", y="Score", palette="Set2", errorbar=None)
    plt.title(chart_title)
    plt.ylim(0, 1)
    plt.ylabel("Precision")
    plt.xlabel("Method")
    plt.grid(axis="y", linestyle="--", alpha=0.5)
    # An empty legend keeps the chart free of a legend box.
    plt.legend([], [], frameon=False)
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, png_name), dpi=300)
    plt.show()

## Retrieval Statistics

In [None]:
import json
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Print floats with two decimals whenever pandas displays a frame.
pd.set_option("display.float_format", '{:.2f}'.format)

# Inputs: one prediction JSON per model/scenario, plus the gold standard.
INPUT_DIR = Path("retrievals/json")
GOLD_PATH = Path("gold_data/gold_standard_nl.json")

# Outputs: the aggregated and the per-query count tables share one folder.
OUTPUT_DIR = Path("retrievals/analysis")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_AGG_CSV = OUTPUT_DIR.joinpath("retrieval_counts_agg.csv")
OUTPUT_DETAILED_CSV = OUTPUT_DIR.joinpath("retrieval_counts_detailed.csv")

def parse_model_and_scenario(stem: str):
    """Classify a prediction file stem by the retrieval scenario it names.

    The stem is searched case-insensitively for ``_id_retrieval_`` and then
    for ``_bin_class_retrieval_``; the first marker found decides the
    scenario, and ``"Unknown"`` is used when neither occurs.

    Returns:
        A ``(stem, scenario)`` tuple; the stem is passed through unchanged.
    """
    lowered = stem.lower()
    # Checked in order, so the ID marker wins when both are present.
    markers = (
        ("_id_retrieval_", "ID-Retrieval"),
        ("_bin_class_retrieval_", "Relevance-Classification"),
    )
    scenario = next(
        (label for marker, label in markers if marker in lowered),
        "Unknown",
    )
    return stem, scenario

def pretty_model_label(stem: str) -> str:
    """Map a prediction file stem to a human-readable model label.

    The stem is lowercased and ``-`` is rewritten to ``_`` before matching,
    so every pattern below is spelled with ``_`` (a pattern containing ``-``
    could never match). Model families are tested in a fixed order and the
    first match wins.

    Args:
        stem: File name without extension, e.g. ``"gpt-4.1-mini_id_retrieval_1"``.

    Returns:
        The display label of the recognized model, or ``stem`` unchanged when
        no family matches.
    """
    s = stem.lower()
    # normalize separators
    s = s.replace("__", "_").replace("-", "_")

    def has_any(*needles: str) -> bool:
        return any(needle in s for needle in needles)

    if "qwen" in s:
        return "Qwen-3-235B"
    if "gemini" in s and has_any("2.5", "2_5") and "flash" in s:
        return "Gemini-2.5-flash"
    if has_any("gpt4o", "gpt_4o") and "mini" in s:
        return "GPT-4o-mini"
    # "gpt-4.1" arrives here as "gpt_4.1", so that spelling must be listed.
    if has_any("gpt4.1", "gpt4_1", "gpt_4.1", "gpt_4_1") and "mini" in s:
        return "GPT-4.1-mini"
    # Accept "llama_3" too, mirroring the "llama_4" spelling accepted below.
    if has_any("llama3", "llama_3") and has_any("70b", "70_b", "3.3", "3_3"):
        return "LLaMA-3-70B"
    if has_any("llama4", "llama_4") and "scout" in s:
        return "LLaMA-4-Scout"
    return stem

def normalize_str(x):
    """Return ``x`` converted to ``str`` with surrounding whitespace removed."""
    as_text = str(x)
    return as_text.strip()

# Load the gold standard as {query id -> set of relevant document ids}.
# Ids pass through normalize_str so they compare as stripped strings.
with open(GOLD_PATH, "r", encoding="utf-8") as f:
    gold = json.load(f)
gold_norm = {normalize_str(q): {normalize_str(doc) for doc in docs} for q, docs in gold.items()}

files = sorted(INPUT_DIR.glob("*.json"))
if not files:
    print(f"No JSON files found in {INPUT_DIR}")
    raise SystemExit

# One row per (prediction file, gold query) with raw and de-duplicated counts.
detailed_rows = []

for fp in files:
    stem = fp.stem
    raw_model, scenario = parse_model_and_scenario(stem)
    model_label = pretty_model_label(raw_model)

    with open(fp, "r", encoding="utf-8") as f:
        data = json.load(f)

    preds = {normalize_str(q): [normalize_str(x) for x in ids] for q, ids in data.items()}

    # Iterate the gold queries so a query the model skipped counts as 0 retrieved.
    for qid, gold_set in gold_norm.items():
        ranks = preds.get(qid, [])

        retrieved_count = len(ranks)
        # dict.fromkeys de-duplicates while keeping the first-seen order.
        unique_ranks = list(dict.fromkeys(ranks))
        retrieved_unique_count = len(unique_ranks)

        # The raw count includes repeated hits; the unique count does not.
        tp_raw = sum(1 for rid in ranks if rid in gold_set)
        tp_unique = len(set(unique_ranks) & gold_set)

        detailed_rows.append({
            "model": model_label,
            "scenario": scenario,
            "query_id": qid,
            "retrieved_count": retrieved_count,
            "retrieved_unique_count": retrieved_unique_count,
            "gold_count": len(gold_set),
            "true_pos_count": tp_raw,
            "true_pos_unique_count": tp_unique,
            "file": fp.name,
        })

df_detail = pd.DataFrame(detailed_rows)

# Per (model, scenario): number of queries and the mean of every count column.
agg = (
    df_detail
    .groupby(["model", "scenario"], as_index=False)
    .agg(
        queries=("query_id", "nunique"),
        avg_retrieved=("retrieved_count", "mean"),
        avg_retrieved_unique=("retrieved_unique_count", "mean"),
        avg_gold=("gold_count", "mean"),
        avg_true_pos=("true_pos_count", "mean"),
        avg_true_pos_unique=("true_pos_unique_count", "mean"),
    )
    .sort_values(["model", "scenario"])
)

df_detail.to_csv(OUTPUT_DETAILED_CSV, index=False, float_format="%.2f")
agg.to_csv(OUTPUT_AGG_CSV, index=False, float_format="%.2f")

sns.set_theme(style="whitegrid")

plot_long = agg.melt(
    id_vars=["model", "scenario", "avg_gold"],
    value_vars=["avg_retrieved", "avg_true_pos"],
    var_name="Metric",
    value_name="value"
)

# Build the legend label from explicit lookups. Previously every scenario other
# than "ID-Retrieval" (including "Unknown") was labelled "RC", which merged
# unrecognized files into the Relevance-Classification bars.
metric_labels = {"avg_retrieved": "Retrieved", "avg_true_pos": "True Pos"}
scenario_tags = {"ID-Retrieval": "ID", "Relevance-Classification": "RC"}
plot_long["Series"] = [
    f"{metric_labels[metric]} ({scenario_tags.get(scn, scn)})"
    for metric, scn in zip(plot_long["Metric"], plot_long["scenario"])
]

series_order = ["Retrieved (ID)", "Retrieved (RC)", "True Pos (ID)", "True Pos (RC)"]

# Rows with an unrecognized scenario stay in the CSVs but are left off the chart.
known_series = plot_long["Series"].isin(series_order)
if not known_series.all():
    skipped = sorted(plot_long.loc[~known_series, "Series"].unique())
    print(f"Not plotting series with unrecognized scenario: {skipped}")
plot_long = plot_long[known_series].copy()

models_order = sorted(plot_long["model"].unique().tolist())
plot_long["model"] = pd.Categorical(plot_long["model"], categories=models_order, ordered=True)

plt.figure(figsize=(12, 6))
ax = sns.barplot(
    data=plot_long,
    x="model", y="value", hue="Series",
    hue_order=series_order, dodge=True
)

# Reference line: mean gold-set size, averaged over the (model, scenario) groups.
avg_gold = agg["avg_gold"].mean()
ax.axhline(avg_gold, color="gray", linestyle="--", linewidth=1, alpha=0.7)
ax.text(len(models_order) - 0.5, avg_gold + 0.1,
        f"Avg. gold = {avg_gold:.2f}",
        color="gray", fontsize=9, ha="right", va="bottom")

ax.set_title("Avg. Retrieved vs True Positives per Model and Scenario")
ax.set_xlabel("")
ax.set_ylabel("Average count")
plt.xticks(rotation=45, ha="right")
plt.grid(True, axis="y", linestyle="--", alpha=0.4)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "retrieved_vs_truepos_grouped_bar.png", dpi=300)
plt.close()

# F1-Score Barplots with Error Bars or Per-Query Variance

### Creating JSON files with per-query metrics

In [None]:
import os
import json

# For every prediction file, compute set-based TP/FP/FN and
# precision/recall/F1 per gold query, and write them to one JSON per model.
# NOTE(review): this gold path differs from the one used by the retrieval
# statistics cell ("gold_data/gold_standard_nl.json") — confirm both point at
# the same gold standard.
gold_path = "../sampling_hard_negatives/gold_standard_nl.json"
retrievals_folder = "retrievals/json"
output_folder = "retrievals/analysis/per_query_scores"

os.makedirs(output_folder, exist_ok=True)


def _as_id_set(ids):
    """Return ``ids`` as a set of stripped strings (``None`` counts as empty).

    Mirrors the id normalization of the retrieval statistics cell, so an id
    given as ``12`` in one file and ``"12"`` in the other still compares equal.
    """
    return {str(doc_id).strip() for doc_id in (ids or [])}


with open(gold_path, encoding="utf-8") as f:
    gold_data = json.load(f)

# Sorted so files are processed in a reproducible order.
model_files = sorted(name for name in os.listdir(retrievals_folder) if name.endswith(".json"))

for filename in model_files:
    with open(os.path.join(retrievals_folder, filename), encoding="utf-8") as f:
        raw_predictions = json.load(f)
    # Look predictions up by stripped query id.
    model_data = {str(q).strip(): ids for q, ids in raw_predictions.items()}

    per_query_scores = {}

    # Iterate the gold queries: a query missing from the predictions scores 0.
    for query_id, gold_docs in gold_data.items():
        gold_set = _as_id_set(gold_docs)
        pred_set = _as_id_set(model_data.get(str(query_id).strip()))

        tp = len(gold_set & pred_set)
        fp = len(pred_set - gold_set)
        fn = len(gold_set - pred_set)

        # Each ratio falls back to 0.0 when its denominator is zero.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        per_query_scores[query_id] = {
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1": round(f1, 4)
        }

    # Drop only the trailing ".json"; "evaluation_" is removed wherever it occurs.
    model_name = os.path.splitext(filename)[0].replace("evaluation_", "")
    output_path = os.path.join(output_folder, f"{model_name}.json")
    with open(output_path, "w", encoding="utf-8") as out_f:
        json.dump(per_query_scores, out_f, indent=2)