# Analysis

In [None]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

# Standard imports
import os
import json

# External imports
import hydra
import rootutils
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from wandb.sdk.wandb_run import Run

import wandb

In [None]:
# Reinitialize hydra on every run: the global Hydra instance is cleared first,
# presumably so that re-executing this cell does not trip over a previous
# initialization.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# The returned object is only bound to `h`; nothing else in this section reads it.
h = hydra.initialize(config_path="../conf", job_name="eda", version_base=None)

# Setup root environment: resolve the project root starting from ".".
# `root_path` is reused below to build all output directories.
root_path = rootutils.setup_root(".")
# NOTE(review): setup_root may already export the project-root env var;
# confirm whether this explicit set_root call is still required.
rootutils.set_root(
    path=root_path,
    project_root_env_var=True,
)

In [None]:
# Global paths, all anchored at the project root resolved above
ROOT_DIR = root_path
ARTIFACT_DIR = os.path.join(ROOT_DIR, "artifacts")
FIGURE_DIR, TABLE_DIR = (
    os.path.join(ROOT_DIR, "report", leaf) for leaf in ("figures", "tables")
)

# Make sure the report output folders exist before anything is saved
for _report_dir in (FIGURE_DIR, TABLE_DIR):
    os.makedirs(_report_dir, exist_ok=True)

In [None]:
# Global labeling: raw identifiers -> human-readable labels used in figures.

# Display labels for the labelers (keys match the labeler names stored in run configs)
_LABELER_LABELS = {
    "human": "Human",
    "gpt3.5-zeroshot-context1": "GPT-3.5 +\n0-shot +\nContext 1",
    "gpt3.5-zeroshot-context2": "GPT-3.5 +\n0-shot +\nContext 2",
    "gpt3.5-zeroshot-context3": "GPT-3.5 +\n0-shot +\nContext 3",
    "gpt3.5-oneshot-context1": "GPT-3.5 +\n1-shot +\nContext 1",
    "gpt3.5-oneshot-context2": "GPT-3.5 +\n1-shot +\nContext 2",
    "gpt3.5-oneshot-context3": "GPT-3.5 +\n1-shot +\nContext 3",
    "gpt4-zeroshot-context2": "GPT-4 +\n0-shot +\nContext 2",
    "gpt4-oneshot-context2": "GPT-4 +\n1-shot +\nContext 2",
}

# Display labels for metric names (used as axis labels)
_METRIC_LABELS = {
    "f1": "Macro F1 (%)",
    "acc": "Acc. (%)",
    "test/f1": "Test Macro F1 (%)",
    "test/acc": "Test Acc. (%)",
    "test/precision": "Test Precision (%)",
    "test/recall": "Test Recall (%)",
    "test/lpp": "Labels Per Page",
}

# Single lookup table: labelers first, metrics second
rename_dict = {**_LABELER_LABELS, **_METRIC_LABELS}

# Load the category metadata file (a JSON object; presumably category name ->
# description, judging by the variable names).
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open(os.path.join("..", "data", "meta", "categories.json"), encoding="utf-8") as f:
    categories_and_desc = json.load(f)

# Split the mapping into parallel tuples of keys and values, and build an
# index -> category lookup in file order.
categories, categories_desc = zip(*categories_and_desc.items())
idx2categories = dict(enumerate(categories))

def rename(x):
    """
    Return the display label registered for `x` in `rename_dict`,
    or `x` itself when no label is registered.
    """
    if x in rename_dict:
        return rename_dict[x]
    return x

In [None]:
# Style and colors: use seaborn's "whitegrid" style and "colorblind" palette
# for the figures produced in this notebook.
sns.set_style("whitegrid")
sns.set_palette("colorblind")

In [None]:
# W&B project coordinates (entity/project) that all runs are read from
WANDB_ENTITY = "ml-project-2-mlp"
WANDB_PROJECT = "homepage2vec"

# Client for the W&B API
api = wandb.Api()

# Get all runs of the project; experiment-specific subsets are filtered later
runs = api.runs(f"{WANDB_ENTITY}/{WANDB_PROJECT}")
print(f"✅ Loaded {len(runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT})")

In [None]:
# Helpers
def _prefixed_subset(section, prefix: str, keep: list[str]) -> dict:
    """Return the entries of `section` whose key is in `keep`, keyed as `{prefix}_{key}`."""
    # `section` may be missing or explicitly null in the run config.
    return {f"{prefix}_{k}": v for k, v in (section or {}).items() if k in keep}


def extract_config(run: Run) -> dict:
    """
    Extracts the config from a run.

    Args:
        run (Run): Object exposing `id`, `name` and a dict-like `config`.

    Returns:
        dict: Flat dictionary with, in order: `id`, `name`, `finetune`,
            `train_ratio`, `val_ratio`, `test_ratio`, followed by the selected
            `train_data_*`, `test_data_*`, `train_labeler_*` and
            `test_labeler_*` attributes. Entries missing from the run config
            come back as None (scalars) or are omitted (prefixed attributes).

    Raises:
        ValueError: If `test_datamodule.data_split` is present but does not
            have exactly three entries.
    """
    cfg = run.config
    data_attr = ["name"]
    labeler_attr = ["name", "model", "fewshot", "features"]

    # The train ratio is read from the train datamodule, the val/test ratios
    # from the test datamodule; each falls back to None when not configured.
    train_split = (cfg.get("train_datamodule") or {}).get("data_split") or [None]
    test_split = (cfg.get("test_datamodule") or {}).get("data_split") or [None, None, None]
    train_ratio = train_split[0]
    val_ratio, test_ratio = test_split[1:]

    return {
        "id": run.id,
        "name": run.name,
        # `.get` keeps a run without a `finetune` entry from aborting the whole
        # extraction, in line with how every other field is read.
        "finetune": cfg.get("finetune"),
        "train_ratio": train_ratio,
        "val_ratio": val_ratio,
        "test_ratio": test_ratio,
        **_prefixed_subset(cfg.get("train_data"), "train_data", data_attr),
        **_prefixed_subset(cfg.get("test_data"), "test_data", data_attr),
        **_prefixed_subset(cfg.get("train_labeler"), "train_labeler", labeler_attr),
        **_prefixed_subset(cfg.get("test_labeler"), "test_labeler", labeler_attr),
    }

def extract_hparams(run: Run) -> dict:
    """
    Extracts the hparams from a run.

    Reads the optimizer learning rate and weight decay, the scheduler factor
    and the batch size from the nested run config. Any entry that is not
    configured is returned as None.
    """
    run_cfg = run.config
    model_cfg = run_cfg.get("model", {})
    optimizer_cfg = model_cfg.get("optimizer", {})
    scheduler_cfg = model_cfg.get("scheduler", {})
    # NOTE(review): the batch size is read from "datamodule", whereas the split
    # ratios elsewhere come from "train_datamodule"/"test_datamodule" - confirm
    # this key exists in the logged configs.
    datamodule_cfg = run_cfg.get("datamodule", {})

    return {
        "lr": optimizer_cfg.get("lr", None),
        "weight_decay": optimizer_cfg.get("weight_decay", None),
        "scheduler_factor": scheduler_cfg.get("factor", None),
        "batch_size": datamodule_cfg.get("batch_size", None),
    }

def extract_summary(run: Run, exclude: "list[str] | None" = None) -> dict:
    """
    Extracts the summary from a run.

    Args:
        run (Run): Object exposing a dict-like `summary`.
        exclude (list[str] | None): Summary keys to leave out. Defaults to
            `["test/cm", "test/report"]` when None; pass an empty list to keep
            every non-private key.

    Returns:
        dict: Summary entries whose key does not start with "_" and is not
            listed in `exclude`.
    """
    # A None sentinel replaces the former mutable list default, which would be
    # shared between calls.
    excluded = {"test/cm", "test/report"} if exclude is None else set(exclude)
    return {
        k: v
        for k, v in run.summary.items()
        if not k.startswith("_") and k not in excluded
    }

def runs_to_df(runs: list[Run]) -> pd.DataFrame:
    """
    Convert a list of W&B runs to a dataframe.

    Each run becomes one row. Columns are a two-level MultiIndex whose first
    level is "config", "hparams" or "summary" and whose second level is the
    extracted key. The columns are the union of the keys seen over all runs
    (in first-seen order within each section), so a value logged by only some
    runs is kept and filled with NaN for the others. Runs are processed in
    reverse input order.

    Args:
        runs (list[Run]): Runs to convert.

    Returns:
        pd.DataFrame: One row per run, indexed by `("config", "id")`.

    Raises:
        ValueError: If `runs` is empty.
    """
    if not runs:
        raise ValueError("Cannot build a dataframe from an empty list of runs.")

    sections = ("config", "hparams", "summary")
    # Dicts are used as insertion-ordered sets of the keys seen per section.
    keys_by_section = {section: {} for section in sections}

    # Extract information from runs. Rows are keyed by (section, key) tuples so
    # that identically named entries of different sections cannot overwrite
    # each other.
    rows = []
    for run in runs[::-1]:
        extracted = {
            "config": extract_config(run),
            "hparams": extract_hparams(run),
            "summary": extract_summary(run),
        }
        row = {}
        for section in sections:
            for key, value in extracted[section].items():
                row[(section, key)] = value
                keys_by_section[section][key] = None
        rows.append(row)

    # Column order: all config keys, then hparams, then summary
    columns = [(section, key) for section in sections for key in keys_by_section[section]]

    # Create dataframe; entries a run does not provide become NaN
    missing = float("nan")
    run_df = pd.DataFrame(
        [[row.get(column, missing) for column in columns] for row in rows],
        columns=pd.MultiIndex.from_tuples(columns),
    )
    run_df.set_index(("config", "id"), inplace=True)

    return run_df

In [None]:
def best_runs(df_runs: pd.DataFrame, split: str = "val", metric: str = "f1") -> pd.DataFrame:
    """
    Get the best run for each experiment, where an experiment is a unique
    combination of `("config", "train_labeler_name")` and
    `("config", "finetune")`.

    Runs are ranked by the summary column `"{split}/{metric}"` in descending
    order; rows where that value is missing sort last.

    Args:
        df_runs (pd.DataFrame): Dataframe of runs with two-level columns.
        split (str): Split whose metric is used for ranking.
        metric (str): Metric to sort on.

    Returns:
        pd.DataFrame: One row per experiment (its best run), in order of first
            appearance of the experiment in `df_runs`.
    """
    experiment_cols = [("config", "train_labeler_name"), ("config", "finetune")]
    metric_col = ("summary", f"{split}/{metric}")

    # `sort=False` keeps first-appearance order; `dropna=False` keeps runs whose
    # labeler/finetune entry is missing as a group of their own instead of
    # failing on them.
    winners = [
        group.sort_values(metric_col, ascending=False).iloc[0]
        for _, group in df_runs.groupby(experiment_cols, sort=False, dropna=False)
    ]

    return pd.DataFrame(winners)

## Experiment 1: Fine-tuning on `original`

---

In this experiment we verify the annotation quality of the various `labelers` (specified in `conf/labelers`) by fine-tuning models on the `original` dataset with the labels provided by each labeler. The training parameters are kept constant across all runs of this experiment.

In [None]:
# Filter runs for Experiment 1: keep only the runs whose W&B group is "exp1".
# NOTE: `GROUP` is a notebook-level name that is reassigned for Experiment 2.
GROUP = "exp1"

exp1_runs = [run for run in runs if run.group == GROUP]

print(f"✅ Loaded {len(exp1_runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT} - {GROUP})")

In [None]:
# Convert to dataframe (one row per Experiment 1 run, indexed by run id)
exp1_runs_df = runs_to_df(exp1_runs)

# Display the full table
exp1_runs_df

In [None]:
# Select the best run per (labeler, finetune) combination, using the default
# ranking of best_runs (validation F1).
exp1_best_runs_df = best_runs(exp1_runs_df)

# Show best runs by F1 (ordered by test F1, highest first)
exp1_best_runs_df.sort_values(("summary", "test/f1"), ascending=False)

In [None]:
# Show only the "summary" column group (logged metrics) of the best runs
exp1_best_runs_df.summary

In [None]:
# Visualise best runs by metric: one subplot row per test metric, one bar per
# best run, coloured by whether the model was fine-tuned.
metrics = ["test/f1", "test/precision", "test/recall", "test/acc", "test/lpp"]
# One row per metric, so the grid follows the list above
fig, axs = plt.subplots(nrows=len(metrics), figsize=(20, 20))
fig.tight_layout(pad=3.0)

for ax, metric in zip(axs, metrics):
    sns.barplot(
        data=exp1_best_runs_df,
        x=exp1_best_runs_df[("config", "name")],
        y=("summary", metric),
        hue=("config", "finetune"),
        ax=ax
    )

    # Replace the run names on the x-axis by the display label of the labeler
    # each run was trained with.
    run_names = [x.get_text() for x in ax.get_xticklabels()]
    rows = [exp1_best_runs_df[exp1_best_runs_df[("config", "name")] == run_name].iloc[0] for run_name in run_names]
    xtick_labels = [rename(row[("config", "train_labeler_name")]) for row in rows]
    # Pin the tick positions before assigning custom labels
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(xtick_labels)
    ax.set_xlabel("")
    ax.set_ylabel(rename(metric), fontsize=14)
    # Every legend is titled but hidden; only the top one is shown again below
    ax.get_legend().set_title("Is Finetuned?")
    ax.get_legend().set_visible(False)

axs[0].get_legend().set_visible(True)
axs[-1].set_xlabel("Labelers", fontsize=14)

# Save figure and report the exact file that was written
path = os.path.join(FIGURE_DIR, "exp1-mf1.pdf")
fig.savefig(path, dpi=300, bbox_inches="tight")
print(f"✅ Saved figure to {path}")

In [None]:
# Show performance in splits for labelers
def pivot_df(runs):
    """
    Reshape a runs dataframe from one row per run to one row per (run, split).

    Every input row yields three output rows ("train", "val", "test"). Each
    output row carries the run's "config" and "hparams" columns unchanged,
    plus these "summary" columns: `split`, `f1`, `acc`, and `precision`,
    `recall`, `lpp`. The last three are only filled for the "test" split and
    are None otherwise.
    """
    always_metrics = ("f1", "acc")
    test_only_metrics = ("precision", "recall", "lpp")

    records = []
    for _, run_row in runs.iterrows():
        # Config and hparams are identical for all splits of a run
        shared = dict(run_row[["config", "hparams"]])
        for split in ("train", "val", "test"):
            record = dict(shared)
            record[("summary", "split")] = split
            for metric in always_metrics:
                record[("summary", metric)] = run_row[("summary", f"{split}/{metric}")]
            for metric in test_only_metrics:
                if split == "test":
                    record[("summary", metric)] = run_row[("summary", f"{split}/{metric}")]
                else:
                    record[("summary", metric)] = None
            records.append(record)

    long_df = pd.DataFrame(records)
    # Restore the two-level column index from the tuple keys
    long_df.columns = pd.MultiIndex.from_tuples(long_df.columns)
    return long_df

In [None]:
# Pivot dataframe: one row per (best run, split) so the splits can be used as hue
pivoted_exp1_best_runs_df = pivot_df(exp1_best_runs_df)
fig, axs = plt.subplots(nrows=2, figsize=(20, 10))
fig.tight_layout(pad=3.0)

# One subplot per metric; bars are grouped by run and coloured by split
metrics = ["f1", "acc"]
for ax, metric in zip(axs, metrics):
    sns.barplot(
        data=pivoted_exp1_best_runs_df,
        x=("config", "name"),
        y=("summary", metric),
        hue=("summary", "split"),
        ax=ax
    )

    # Replace the run names on the x-axis by the display label of the labeler
    # each run was trained with.
    run_names = [x.get_text() for x in ax.get_xticklabels()]
    rows = [pivoted_exp1_best_runs_df[pivoted_exp1_best_runs_df[("config", "name")] == run_name].iloc[0] for run_name in run_names]
    xtick_labels = [rename(row[("config", "train_labeler_name")]) for row in rows]
    # Pin the tick positions before assigning custom labels
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(xtick_labels)
    ax.set_xlabel("")
    ax.set_ylabel(rename(metric), fontsize=14)

# Save the figure for the report
path = os.path.join(FIGURE_DIR, "exp1-splits.pdf")
fig.savefig(path, dpi=300, bbox_inches="tight")
print(f"✅ Saved figure to {path}")

### Category-wise performance

In [None]:
def get_test_cm(run: Run) -> "pd.DataFrame | None":
    """
    Extracts the test confusion matrix from a run.

    The matrix is read from the JSON string stored under the summary key
    "test/cm". Values in its "category" column are replaced via
    `idx2categories`, and that column becomes the index.

    Args:
        run (Run): Object exposing a dict-like `summary`.

    Returns:
        pd.DataFrame | None: The confusion matrix indexed by category, or None
            when the run has no "test/cm" entry (or it decodes to null).
    """
    # Check for a missing entry *before* decoding: json.loads(None) raises.
    raw_cm = run.summary.get("test/cm", None)
    if raw_cm is None:
        return None
    test_cm = json.loads(raw_cm)
    if test_cm is None:
        return None
    test_cm = pd.DataFrame.from_dict(test_cm)
    # Assign the result back instead of replacing in place on a column
    # selection, which pandas does not guarantee to write through.
    test_cm["category"] = test_cm["category"].replace(idx2categories)
    test_cm.set_index("category", inplace=True)
    return test_cm

def get_test_report(run: Run) -> "pd.DataFrame | None":
    """
    Extracts the test report from a run.

    The report is read from the JSON string stored under the summary key
    "test/report". Values in its "category" column that are stringified
    category indices are replaced by the category name from `idx2categories`;
    all other values are kept as they are.

    Args:
        run (Run): Object exposing a dict-like `summary`.

    Returns:
        pd.DataFrame | None: The report, or None when the run has no
            "test/report" entry (or it decodes to null).
    """
    # Check for a missing entry *before* decoding: json.loads(None) raises.
    raw_report = run.summary.get("test/report", None)
    if raw_report is None:
        return None
    test_report = json.loads(raw_report)
    if test_report is None:
        return None
    test_report = pd.DataFrame.from_dict(test_report)
    # Lookup is keyed by the string form of the index, matching the str() keys
    # built here.
    mapper_ = {str(idx): category for idx, category in idx2categories.items()}
    test_report["category"] = test_report["category"].map(lambda x: mapper_.get(x, x))
    return test_report

def get_test_reports_df(runs: list[Run]) -> pd.DataFrame:
    """
    Stack the test reports of several runs into one dataframe.

    Every report row is annotated with the run's extracted config values (one
    extra column per config key). Runs for which no test report is available
    are skipped.

    Args:
        runs (list[Run]): Runs to collect the reports from.

    Returns:
        pd.DataFrame: Concatenation of all annotated reports; an empty
            dataframe when there is nothing to concatenate.
    """
    reports = []
    for run in runs:
        test_report = get_test_report(run)
        if test_report is None:
            # Nothing to annotate for this run
            continue

        for k, v in extract_config(run).items():
            test_report[k] = v
        reports.append(test_report)

    if not reports:
        return pd.DataFrame()

    # Concatenate once at the end instead of growing a dataframe per iteration
    return pd.concat(reports)

In [None]:
# Get class-wise test-reports for all Experiment 1 runs, each row annotated
# with the config of the run it came from
exp1_test_reports_df = get_test_reports_df(exp1_runs)

In [None]:
# Category-wise precision, recall and F1 on the test set for the runs trained
# on the "human" labels, coloured by whether the model was fine-tuned.
fig, axs = plt.subplots(nrows=3, figsize=(20, 10))
subset = exp1_test_reports_df[exp1_test_reports_df["train_labeler_name"] == "human"]
# One subplot per report metric
metrics = ["precision", "recall", "f1-score"]
for ax, metric in zip(axs, metrics):
    sns.barplot(
        data=subset,
        x="category",
        y=metric,
        hue="finetune",
        ax=ax
    )

## Experiment 2: Fine-tuning on `curlie-10000`

In [None]:
# Filter runs for Experiment 2: keep only the runs whose W&B group is "exp2".
# NOTE: this reassigns the notebook-level `GROUP` set for Experiment 1.
GROUP = "exp2"

exp2_runs = [run for run in runs if run.group == GROUP]

print(f"✅ Loaded {len(exp2_runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT} - {GROUP})")

In [None]:
# Convert to dataframe (one row per Experiment 2 run, indexed by run id)
exp2_runs_df = runs_to_df(exp2_runs)

In [None]:
# Show the single Experiment 2 run with the highest test F1
exp2_runs_df.sort_values(("summary", "test/f1"), ascending=False).iloc[0]

In [None]:
# Select the best run per (labeler, finetune) combination, ranked by the
# summary column "val/f1_best" (Experiment 1 used the default "val/f1").
exp2_best_runs_df = best_runs(exp2_runs_df, split="val", metric="f1_best")

# Display the selected runs
exp2_best_runs_df

In [None]:
# Pivot dataframe: one row per (best run, split) so the splits can be used as hue
pivoted_exp2_best_runs_df = pivot_df(exp2_best_runs_df)
fig, axs = plt.subplots(nrows=2, figsize=(20, 10))
fig.tight_layout(pad=3.0)

# One subplot per metric; bars are grouped by run and coloured by split
metrics = ["f1", "acc"]
for ax, metric in zip(axs, metrics):
    sns.barplot(
        data=pivoted_exp2_best_runs_df,
        x=("config", "name"),
        y=("summary", metric),
        hue=("summary", "split"),
        ax=ax
    )

    # Replace the run names on the x-axis by the display label of the labeler
    # each run was trained with.
    run_names = [x.get_text() for x in ax.get_xticklabels()]
    rows = [pivoted_exp2_best_runs_df[pivoted_exp2_best_runs_df[("config", "name")] == run_name].iloc[0] for run_name in run_names]
    xtick_labels = [rename(row[("config", "train_labeler_name")]) for row in rows]
    # Pin the tick positions before assigning custom labels
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(xtick_labels)
    ax.set_xlabel("")
    ax.set_ylabel(rename(metric), fontsize=14)

# Save under an Experiment 2 file name so the Experiment 1 splits figure
# ("exp1-splits.pdf") is not overwritten.
path = os.path.join(FIGURE_DIR, "exp2-splits.pdf")
fig.savefig(path, dpi=300, bbox_inches="tight")
print(f"✅ Saved figure to {path}")

In [None]:
# Visualise performance of pre-trained against fine-tuned models
# The plotted metric is named explicitly so the axis label always matches the
# data, instead of depending on a `metric` value left over from an earlier cell.
metric = "test/f1"

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=exp2_best_runs_df,
    x=("config", "name"),
    y=("summary", metric),
    hue=("config", "finetune"),
    ax=ax
)

ax.set_xlabel("")
ax.set_ylabel(rename(metric), fontsize=14)
# This figure has a single axes, so its legend stays visible; otherwise the
# hue colours could not be interpreted.
ax.get_legend().set_title("Is Finetuned?")

In [None]:
# Get class-wise test-reports for all Experiment 2 runs, each row annotated
# with the config of the run it came from
exp2_test_reports_df = get_test_reports_df(exp2_runs)

# Display the stacked reports
exp2_test_reports_df