# Analysis

In [None]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

# Standard imports
import os
import json

# External imports
import hydra
import rootutils
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_curve
from wandb.sdk.wandb_run import Run

import wandb

In [None]:
# Reinitialize hydra on every run: clear the global Hydra instance first, then
# initialise it again with the project's config directory.
# NOTE(review): presumably `hydra.initialize` fails when Hydra is already
# initialised, which is why the instance is cleared first — confirm.
hydra.core.global_hydra.GlobalHydra.instance().clear()
h = hydra.initialize(config_path="../conf", job_name="eda", version_base=None)

# Setup root environment: resolve the project root starting from "." and register it.
# NOTE(review): `project_root_env_var=True` presumably exports the root as an
# environment variable — confirm against the rootutils documentation.
root_path = rootutils.setup_root(".")
rootutils.set_root(
    path=root_path,
    project_root_env_var=True,
)

In [None]:
# Global paths, all derived from the project root resolved above
ROOT_DIR = root_path
ARTIFACT_DIR = os.path.join(ROOT_DIR, "artifacts")
FIGURE_DIR = os.path.join(ROOT_DIR, "report", "figures")
TABLE_DIR = os.path.join(ROOT_DIR, "report", "tables")

# Create the output directories up front so later cells can save into them
for _output_dir in (FIGURE_DIR, TABLE_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Global labeling: maps raw identifiers to the display labels used in figures and tables.
# The lookup helpers below fall back to the raw identifier when a key is missing.
rename_dict = {
    # Labeler name components (labeler names are split on "-" by `get_labeler_name`)
    "human": "Human",
    "gpt3.5": "GPT-3.5",
    "gpt4": "GPT-4",
    "context1": "Context 1",
    "context2": "Context 2",
    "context3": "Context 3",
    "zeroshot": "0-shot",
    "oneshot": "1-shot",
    # Metric names
    "f1": "Macro F1",
    "acc": "Acc",
    "precision": "Precision",
    "recall": "Recall",
    "lpp": "Labels Per Page",
    # Category names
    "Kids_and_Teens": "Kids & Teens",
}

def rename(x):
    """Return the display label registered for ``x`` in ``rename_dict``.

    Values without an entry are returned unchanged.
    """
    if x in rename_dict:
        return rename_dict[x]
    return x

def get_labeler_name(name: str):
    """Build a display name for a labeler identifier.

    The identifier is split on "-", every part is looked up in
    ``rename_dict`` (unknown parts are kept as-is) and the parts are
    joined with " + ".
    """
    parts = name.split("-")
    pretty_parts = (rename_dict.get(part, part) for part in parts)
    return " + ".join(pretty_parts)

def get_metric_name(name: str):
    """Build a display name for a metric identifier.

    Identifiers of the form "<split>/<metric>" are rendered as
    "<split label> <metric label>"; identifiers without a "/" are looked
    up directly. Unknown parts are kept as-is.

    Only the first "/" separates the split from the metric, so an
    identifier containing further slashes is rendered instead of raising
    ``ValueError``.
    """
    split, separator, metric = name.partition("/")
    if not separator:
        return rename_dict.get(name, name)
    return f"{rename_dict.get(split, split)} {rename_dict.get(metric, metric)}"

In [None]:
# Load categories: the JSON file maps each category name to its description.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding.
with open(os.path.join("..", "data", "meta", "categories.json"), "r", encoding="utf-8") as f:
    categories_and_desc = json.load(f)

# Parallel tuples of names and descriptions, plus a position -> name lookup
categories, categories_desc = zip(*categories_and_desc.items())
idx2categories = dict(enumerate(categories))

In [None]:
# Style and colors: applied globally, so every figure below uses the
# "whitegrid" style and the "colorblind" palette.
sns.set_style("whitegrid")
sns.set_palette("colorblind")

In [None]:
# Initialise W&B: project and entity that the analysed runs belong to
WANDB_PROJECT = "homepage2vec"
WANDB_ENTITY = "ml-project-2-mlp"

# Initialize W&B API
api = wandb.Api()

# Get all runs of the project; later cells filter them by run group
runs = api.runs(f"{WANDB_ENTITY}/{WANDB_PROJECT}")
print(f"✅ Loaded {len(runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT})")

In [None]:
# Helpers
def extract_config(run: Run) -> dict:
    """
    Flatten the relevant parts of a run's config into a single dict.

    The result holds the run id, name and ``finetune`` flag, the
    train/val/test split ratios and the whitelisted attributes of the
    train/test data and labeler sections, each prefixed with its section
    name (e.g. ``train_labeler_name``). Missing sections contribute nothing;
    missing split ratios are ``None``. A missing ``finetune`` entry raises
    ``KeyError``.
    """
    cfg = run.config
    data_attr = ["name"]
    labeler_attr = ["name", "model", "fewshot", "features"]

    def _prefixed(section: str, allowed: list) -> dict:
        # Keep only the whitelisted keys of a config section, prefixed with its name
        picked = {}
        for key, value in cfg.get(section, {}).items():
            if key in allowed:
                picked[f"{section}_{key}"] = value
        return picked

    train_data = _prefixed("train_data", data_attr)
    test_data = _prefixed("test_data", data_attr)
    train_labeler = _prefixed("train_labeler", labeler_attr)
    test_labeler = _prefixed("test_labeler", labeler_attr)

    # The train ratio is read from the train datamodule, val/test from the test datamodule
    train_ratio = cfg.get("train_datamodule", {}).get("data_split", [None])[0]
    val_and_test = cfg.get("test_datamodule", {}).get("data_split", [None, None, None])[1:]
    val_ratio, test_ratio = val_and_test

    config = {
        "id": run.id,
        "name": run.name,
        "finetune": cfg["finetune"],
        "train_ratio": train_ratio,
        "val_ratio": val_ratio,
        "test_ratio": test_ratio,
    }
    for section_values in (train_data, test_data, train_labeler, test_labeler):
        config.update(section_values)

    return config

def extract_hparams(run: Run) -> dict:
    """
    Collect the tuned hyperparameters of a run.

    Reads the optimizer learning rate and weight decay, the scheduler factor
    and the training batch size from the run config. Any value that is not
    present in the config is reported as ``None``.
    """
    cfg = run.config
    model_cfg = cfg.get("model", {})
    optimizer_cfg = model_cfg.get("optimizer", {})
    scheduler_cfg = model_cfg.get("scheduler", {})
    datamodule_cfg = cfg.get("train_datamodule", {})

    return {
        "lr": optimizer_cfg.get("lr", None),
        "weight_decay": optimizer_cfg.get("weight_decay", None),
        "scheduler_factor": scheduler_cfg.get("factor", None),
        "batch_size": datamodule_cfg.get("batch_size", None),
    }

def extract_summary(run: Run, exclude: tuple[str, ...] = ("test/cm", "test/report")) -> dict:
    """
    Extracts the summary from a run.

    Args:
        run (Run): Run whose ``summary`` mapping is read.
        exclude: Summary keys to leave out. Any container supporting ``in``
            (tuple, list, set) is accepted; the default is an immutable
            tuple so it cannot be shared and mutated across calls.

    Returns:
        dict: Summary entries whose key does not start with "_" and is not
            listed in ``exclude``.
    """
    return {
        key: value
        for key, value in run.summary.items()
        if not key.startswith("_") and key not in exclude
    }

def runs_to_df(runs: list[Run]) -> pd.DataFrame:
    """
    Convert a list of W&B runs to a dataframe.

    Each run becomes one row. Columns form a two-level index whose first
    level is the section ("config", "hparams" or "summary") and whose second
    level is the key within that section. The frame is indexed by
    ``("config", "id")``.

    The columns are the union of the keys seen across all runs, so a key
    that only some runs report is kept and filled with NaN elsewhere.
    Sections are kept apart while building the rows, so a key that occurs in
    two sections does not overwrite itself.

    Args:
        runs (list[Run]): Runs to convert; they are processed in reverse order.

    Returns:
        pd.DataFrame: One row per run, or an empty dataframe when ``runs``
            is empty.
    """
    sections = ("config", "hparams", "summary")
    # Dicts double as insertion-ordered sets of the keys seen per section
    seen_keys = {section: {} for section in sections}

    rows = []
    for run in runs[::-1]:
        extracted = {
            "config": extract_config(run),
            "hparams": extract_hparams(run),
            "summary": extract_summary(run),
        }
        row = {}
        for section in sections:
            for key, value in extracted[section].items():
                row[(section, key)] = value
                seen_keys[section][key] = None
        rows.append(row)

    # Nothing to tabulate: there is no id column to index on either
    if not rows:
        return pd.DataFrame()

    # Group the columns by section, keeping first-seen order within a section
    columns = pd.MultiIndex.from_tuples(
        [(section, key) for section in sections for key in seen_keys[section]]
    )
    run_df = pd.DataFrame(rows, columns=columns)

    return run_df.set_index(("config", "id"))

In [None]:
def best_runs(df_runs: pd.DataFrame, split: str="val", metric: str="f1") -> pd.DataFrame:
    """
    Select the best run of every experiment.

    An experiment is a unique combination of the train labeler name and the
    finetune flag. Within each experiment the run with the highest
    ``{split}/{metric}`` summary value is kept.

    Args:
        df_runs (pd.DataFrame): Dataframe of runs.
        split (str): Split whose metric is used for the ranking.
        metric (str): Metric to sort on.

    Returns:
        pd.DataFrame: Dataframe of best runs, one row per experiment.
    """
    group_cols = [("config", "train_labeler_name"), ("config", "finetune")]
    ranking_col = ("summary", f"{split}/{metric}")

    winners = []
    for combination in df_runs[group_cols].drop_duplicates().values:
        # Rows whose experiment columns all match this combination
        in_experiment = (df_runs[group_cols] == combination).all(axis=1)
        ranked = df_runs[in_experiment].sort_values(ranking_col, ascending=False)
        winners.append(ranked.iloc[0])

    return pd.DataFrame(winners)

## Experiment 1: Finetuning on `original`

---

In this experiment we verify the annotation quality of the various `labelers` (specified in `conf/labelers`) by finetuning models on the `original` dataset, using the labels provided by each labeler. The training parameters are kept constant across all experiments.

In [None]:
# Filter runs for Experiment 1: keep the runs whose W&B group equals GROUP
GROUP = "exp1-2"

exp1_runs = [run for run in runs if run.group == GROUP]

print(f"✅ Loaded {len(exp1_runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT} - {GROUP})")

In [None]:
# Convert to dataframe (one row per run, indexed by the run id)
exp1_runs_df = runs_to_df(exp1_runs)

# Display the full table of Experiment 1 runs
exp1_runs_df

In [None]:
# Best run per experiment, selected with the defaults of `best_runs` (split "val", metric "f1")
exp1_best_runs_df = best_runs(exp1_runs_df)

# Show best runs by F1
exp1_best_runs_df.sort_values(("summary", "test/f1"), ascending=False)

In [None]:
# Visualise best runs by metric: one bar chart per test metric, one bar per best run
metrics = ["test/f1", "test/precision", "test/recall", "test/acc", "test/lpp"]
fig, axs = plt.subplots(nrows=len(metrics), figsize=(20, 20))
fig.tight_layout(pad=3.0)

for ax, metric in zip(axs, metrics):
    sns.barplot(
        data=exp1_best_runs_df,
        x=exp1_best_runs_df[("config", "name")],
        y=("summary", metric),
        hue=("config", "finetune"),
        ax=ax
    )
    # Fix the tick positions once, before the tick labels are replaced below
    ax.set_xticks(ax.get_xticks())

    # Bars are keyed by run name; relabel them with the display name of the train labeler
    run_names = [x.get_text() for x in ax.get_xticklabels()]
    rows = [exp1_best_runs_df[exp1_best_runs_df[("config", "name")] == run_name].iloc[0] for run_name in run_names]
    xtick_labels = [rename(row[("config", "train_labeler_name")]) for row in rows]
    ax.set_xticklabels(xtick_labels)
    ax.set_xlabel("")
    ax.set_ylabel(rename(metric), fontsize=14)
    ax.get_legend().set_title("Is Finetuned?")
    ax.get_legend().set_visible(False)

# Only the top chart shows the legend and only the bottom chart the x-axis label
axs[0].get_legend().set_visible(True)
axs[-1].set_xlabel("Labelers", fontsize=14)

# Save figure
path = os.path.join(FIGURE_DIR, "exp1-mf1.pdf")
fig.savefig(path, dpi=300, bbox_inches="tight")
print(f"✅ Saved figure to {path}")

In [None]:
# Show performance in splits for labelers
def pivot_df(runs):
    """
    Reshape a dataframe of runs into one row per run and split.

    Every input row is expanded into three rows ("train", "val", "test").
    Each keeps the run's "config" and "hparams" columns and adds summary
    columns for the split name and its f1 and acc values. Precision, recall
    and lpp are only filled for the "test" split and are ``None`` otherwise.

    Returns a dataframe with two-level columns.
    """
    split_metrics = ("f1", "acc")
    test_only_metrics = ("precision", "recall", "lpp")

    records = []
    for _, run_row in runs.iterrows():
        shared = dict(run_row[["config", "hparams"]])
        for split in ("train", "val", "test"):
            record = dict(shared)
            record[("summary", "split")] = split
            for metric in split_metrics:
                record[("summary", metric)] = run_row[("summary", f"{split}/{metric}")]
            for metric in test_only_metrics:
                value = None
                if split == "test":
                    value = run_row[("summary", f"{split}/{metric}")]
                record[("summary", metric)] = value
            records.append(record)

    pivoted = pd.DataFrame(records)
    pivoted.columns = pd.MultiIndex.from_tuples(pivoted.columns)
    return pivoted

In [None]:
# Pivot dataframe so each best run has one row per split, then plot
# F1 and accuracy per run with one bar per split.
pivoted_exp1_best_runs_df = pivot_df(exp1_best_runs_df)
fig, axs = plt.subplots(nrows=2, figsize=(20, 10))
fig.tight_layout(pad=3.0)

metrics = ["f1", "acc"]
for ax, metric in zip(axs, metrics):
    sns.barplot(
        data=pivoted_exp1_best_runs_df,
        x=("config", "name"),
        y=("summary", metric),
        hue=("summary", "split"),
        ax=ax
    )

    # Bars are keyed by run name; relabel them with the display name of the train labeler
    run_names = [x.get_text() for x in ax.get_xticklabels()]
    rows = [pivoted_exp1_best_runs_df[pivoted_exp1_best_runs_df[("config", "name")] == run_name].iloc[0] for run_name in run_names]
    xtick_labels = [rename(row[("config", "train_labeler_name")]) for row in rows]
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(xtick_labels)
    ax.set_xlabel("")
    ax.set_ylabel(rename(metric), fontsize=14)

# Save figure
path = os.path.join(FIGURE_DIR, "exp1-splits.pdf")
fig.savefig(path, dpi=300, bbox_inches="tight")
print(f"✅ Saved figure to {path}")

### Category-wise performance

In [None]:
from sklearn.metrics import classification_report


def get_test_cm(run: Run) -> "pd.DataFrame | None":
    """
    Extracts the test confusion matrix from a run.

    The matrix is stored as a JSON string under the ``test/cm`` summary key.
    Category values are translated through ``idx2categories`` and used as
    the index of the returned dataframe.

    Returns:
        The confusion-matrix dataframe, or ``None`` when the run has no
        ``test/cm`` entry (or the entry decodes to JSON ``null``).
    """
    raw_cm = run.summary.get("test/cm", None)
    # Check before decoding: json.loads(None) raises TypeError
    if raw_cm is None:
        return None
    test_cm = json.loads(raw_cm)
    if test_cm is None:
        return None
    test_cm = pd.DataFrame.from_dict(test_cm)
    # Assign the result back; an in-place replace on a selected column is not
    # guaranteed to update the parent frame
    test_cm["category"] = test_cm["category"].replace(idx2categories)
    return test_cm.set_index("category")

def get_test_report(run: Run) -> "pd.DataFrame | None":
    """
    Extracts the test report from a run.

    The report is stored as a JSON string under the ``test/report`` summary
    key. Category values that are stringified positions in
    ``idx2categories`` are replaced by the category name; all other values
    are kept unchanged.

    Returns:
        The report dataframe, or ``None`` when the run has no
        ``test/report`` entry (or the entry decodes to JSON ``null``).
    """
    raw_report = run.summary.get("test/report", None)
    # Check before decoding: json.loads(None) raises TypeError
    if raw_report is None:
        return None
    test_report = json.loads(raw_report)
    if test_report is None:
        return None
    test_report = pd.DataFrame.from_dict(test_report)
    mapper_ = {str(idx): category for idx, category in idx2categories.items()}
    test_report["category"] = test_report["category"].map(lambda x: mapper_.get(x, x))
    return test_report

def get_test_targets(run: Run) -> pd.DataFrame:
    """
    Extracts the test targets from a run.

    Decodes the JSON stored under the ``test/targets`` summary key into a
    dataframe with one column per entry of ``categories``.
    """
    raw_targets = run.summary.get("test/targets")
    targets = json.loads(raw_targets)
    return pd.DataFrame(targets, columns=categories)

def get_test_preds(run: Run) -> pd.DataFrame:
    """
    Extracts the test predictions from a run.

    Decodes the JSON stored under the ``test/preds`` summary key into a
    dataframe with one column per entry of ``categories``.
    """
    raw_preds = run.summary.get("test/preds")
    preds = json.loads(raw_preds)
    return pd.DataFrame(preds, columns=categories)

def get_test_probs(run: Run) -> pd.DataFrame:
    """
    Extracts the test probabilities from a run.

    Decodes the JSON stored under the ``test/probs`` summary key into a
    dataframe with one column per entry of ``categories``.
    """
    raw_probs = run.summary.get("test/probs")
    probs = json.loads(raw_probs)
    return pd.DataFrame(probs, columns=categories)

In [None]:
def get_test_reports_df(runs: list[Run]) -> pd.DataFrame:
    """
    Stack the test reports of several runs into one dataframe.

    Every report row is tagged with the values returned by
    ``extract_config`` for its run (one extra column per config key). Runs
    for which ``get_test_report`` returns ``None`` are skipped.

    Returns:
        pd.DataFrame: The concatenated reports, or an empty dataframe when
            no run provides a report.
    """
    reports = []
    for run in runs:
        test_report = get_test_report(run)
        if test_report is None:
            continue

        for key, value in extract_config(run).items():
            test_report[key] = value
        reports.append(test_report)

    # pd.concat rejects an empty list, so keep returning an empty frame in that case
    if not reports:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per run
    return pd.concat(reports)

In [None]:
# fig, axs = plt.subplots(nrows=3, figsize=(20, 10))
# subset = exp1_test_reports_df[exp1_test_reports_df["train_labeler_name"] == "human"]
# metrics = ["precision", "recall", "f1-score"]
# for ax, metric in zip(axs, metrics):
#     sns.barplot(
#         data=subset,
#         x="category",
#         y=metric,
#         hue="finetune",
#         ax=ax
#     )

## Experiment 2: Finetuning on `curlie-10000`

In [None]:
# Filter runs for Experiment 2: keep the runs whose W&B group equals GROUP
GROUP = "exp2-2"

exp2_runs = [run for run in runs if run.group == GROUP]

print(f"✅ Loaded {len(exp2_runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT} - {GROUP})")

In [None]:
# Convert to dataframe and keep the best run per experiment
# (selected with the defaults of `best_runs`: split "val", metric "f1")
exp2_runs_df = runs_to_df(exp2_runs)
exp2_best_runs_df = best_runs(exp2_runs_df)

# Show best runs by F1
exp2_best_runs_df.sort_values(("summary", "test/f1"), ascending=False)

In [None]:
# Show only the test F1 of the best runs
exp2_best_runs_df.summary[["test/f1"]]

In [None]:
# Table to LaTeX: export the test metrics of the best Experiment 2 runs
df = exp2_best_runs_df.copy()

# Row labels: the display name of the train labeler for finetuned runs, "Pretrained" otherwise
index = [x.config["train_labeler_name"] if x.config["finetune"] else "Pretrained" for _, x in df.iterrows()]
index = [get_labeler_name(x) for x in index]
# hparams = df.hparams.where(df.config["finetune"], "N/A")
df = df.summary[["test/precision", "test/recall", "test/f1", "test/lpp"]]

df = df.set_index(pd.Index(index))

# Short column headers for the table
cols = {"test/precision": "Pr.", "test/recall": "Re.", "test/f1": "M.-F1", "test/lpp": "LPP"}
df = df[cols.keys()].rename(columns=cols)

# Scale precision, recall and F1 by 100; LPP is left as-is
df["Pr."] = df["Pr."] * 100
df["Re."] = df["Re."] * 100
df["M.-F1"] = df["M.-F1"] * 100

# Save the dataframe to a latex table
position = "!ht"
save_path = os.path.join(TABLE_DIR, "finetune-results.tex")
latex = df.to_latex(
    caption="TODO", 
    label="tab:finetune-results",
    escape=True,
    position=position,
    multirow=True,
    float_format="%.2f",
    multicolumn=True,
    multicolumn_format="c",
)

# Add \centering right after \begin{table}
latex = latex.replace("\\begin{table}" + f"[{position}]", "\\begin{table}" + f"[{position}]" + "\n\\centering")
# Replaces every literal "[t]" with "[c]".
# NOTE(review): presumably targets the vertical alignment of multirow cells — confirm.
latex = latex.replace("[t]", "[c]")
with open(save_path, "w") as f:
    f.write(latex)
print(f"✅ Saved table to {save_path}")

In [None]:
# Extract best runs: split the best-run table by the finetune flag
pretrained_runs = exp2_best_runs_df[exp2_best_runs_df[("config", "finetune")] == False]
finetuned_runs = exp2_best_runs_df[exp2_best_runs_df[("config", "finetune")] == True]

# The dataframe index holds the run ids
pretrained_run_ids = pretrained_runs.index.values
finetuned_run_ids = finetuned_runs.index.values

# Map the ids back to run objects; note that both names are rebound
# from dataframes to lists of runs here
pretrained_runs = [run for run in exp2_runs if run.id in pretrained_run_ids]
finetuned_runs = [run for run in exp2_runs if run.id in finetuned_run_ids]

print(f"Got {len(pretrained_runs)} pretrained run(s) and {len(finetuned_runs)} finetuned run(s) for {GROUP}")

In [None]:
# Get best runs: use the first pretrained and the first finetuned run
# (raises IndexError when either list is empty)
pretrained_run = pretrained_runs[0]
finetuned_run = finetuned_runs[0]

# Test confusion-matrix tables of both runs, indexed by category
pretrained_cms = get_test_cm(pretrained_run)
finetuned_cms = get_test_cm(finetuned_run)

In [None]:
# Visualise the per-category test F1 of the pretrained and the finetuned model
pretrained_report = get_test_report(pretrained_run)
finetuned_report = get_test_report(finetuned_run)

pretrained_report["model"] = "Pretrained"
# NOTE(review): the legend label is hard-coded — confirm that the selected
# finetuned run was trained on GPT-3.5 labels.
finetuned_report["model"] = "GPT-3.5"

test_reports = pd.concat([pretrained_report, finetuned_report])

# Keep only the per-category rows. This has to happen on the raw names:
# `rename` rewrites some of them (e.g. "Kids_and_Teens" -> "Kids & Teens"),
# and a renamed value no longer matches an entry of `categories`, so
# filtering after the renaming would drop those categories from the plot.
test_reports = test_reports[test_reports["category"].isin(categories)].copy()
test_reports["category"] = test_reports["category"].map(rename)
test_reports["f1-score"] = test_reports["f1-score"] * 100

fig, ax = plt.subplots(figsize=(16, 4))
fig.tight_layout(pad=3.0)
sns.barplot(
    data=test_reports,
    x="category",
    y="f1-score",
    hue="model",
    ax=ax
)
ax.set_xlabel("")
ax.set_ylabel("F1 (%)", fontsize=14)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, horizontalalignment='right', fontsize=14)
ax.get_legend().set_title("")

# Add values to bars (zero-height bars are left unlabelled)
for p in ax.patches:
    height = p.get_height()
    if height == 0:
        continue
    ax.text(p.get_x()+p.get_width()/2.,
            height + 1,
            f"{height:.1f}",
            ha="center", fontsize=12)

ax.set_ylim(0, 70)

# Save figure
path = os.path.join(FIGURE_DIR, "exp2-mf1.pdf")
fig.savefig(path, bbox_inches="tight")
print(f"✅ Saved figure to {path}")

In [None]:
# Visualise the precision-recall curve of the finetuned run, once for the
# stored probabilities (left) and once for re-weighted probabilities (right)
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
y_test, y_prob = get_test_targets(finetuned_run), get_test_probs(finetuned_run)
y_test, y_prob = y_test.values, y_prob.values
# One hard-coded value per category (14 in total).
# NOTE(review): presumably the fraction of positive training labels per
# category — confirm where these numbers come from.
train_ratio = np.array(
    [
        0.093,
        0.276,
        0.062,
        0.017,
        0.059,
        0.015,
        0.011,
        0.011,
        0.084,
        0.043,
        0.048,
        0.074,
        0.139,
        0.068,
    ]
)
num_samples = len(y_test)
pos_samples = num_samples * train_ratio
neg_samples = num_samples - pos_samples
# NOTE(review): despite its name this is negatives / positives per category
pos_ratio = neg_samples / pos_samples
# Re-weight each probability p to p / (p + r * (1 - p)) with r = pos_ratio
# NOTE(review): looks like a class-prior correction — confirm the intended direction
y_prob2 = y_prob / (y_prob + pos_ratio * (1-y_prob))
# One curve per category; the thresholds (`th`) are not used
for k in range(14):
    pr, re, th = precision_recall_curve(y_test[:, k], y_prob[:, k])
    pr2, re2, th = precision_recall_curve(y_test[:, k], y_prob2[:, k])
    ax[0].plot(re, pr, label=idx2categories[k])
    ax[1].plot(re2, pr2, label=idx2categories[k])

In [None]:
# Visualise the confusion matrices for the best runs: one 2x2 heatmap per
# category, pretrained model in the top row and finetuned model in the bottom row.
# The grid is sized for 14 categories.
fig, axs = plt.subplots(nrows=2, ncols=14, figsize=(4*14, 3*2))
for i, (_, pretrained_cm) in enumerate(pretrained_cms.iterrows()):
    sns.heatmap(
        [[pretrained_cm["tn"], pretrained_cm["fp"]], [pretrained_cm["fn"], pretrained_cm["tp"]]],
        ax=axs[0][i]
    )
    # The row name is the category; only the top row carries titles
    axs[0][i].set_title(pretrained_cm.name)
    axs[0][i].set_xlabel("Predicted")
    if i > 0:
        axs[0][i].set_yticks([])
        axs[0][i].set_ylabel("")

for i, (_, finetuned_cm) in enumerate(finetuned_cms.iterrows()):
    sns.heatmap(
        [[finetuned_cm["tn"], finetuned_cm["fp"]], [finetuned_cm["fn"], finetuned_cm["tp"]]],
        ax=axs[1][i]
    )
    axs[1][i].set_xlabel("Predicted")
    if i > 0:
        axs[1][i].set_yticks([])
        axs[1][i].set_ylabel("")

# Label the y-axis on the first heatmap of each row
axs[0][0].set_ylabel("Actual", fontsize=14)
axs[1][0].set_ylabel("Actual", fontsize=14)

# Save figure
path = os.path.join(FIGURE_DIR, "exp2-cms.pdf")
fig.savefig(path, bbox_inches="tight")
print(f"✅ Saved figure to {path}")

In [None]:
# Pivot dataframe so each best run has one row per split, then plot
# F1 and accuracy per run with one bar per split.
pivoted_exp2_best_runs_df = pivot_df(exp2_best_runs_df)
fig, axs = plt.subplots(nrows=2, figsize=(20, 10))
fig.tight_layout(pad=3.0)

metrics = ["f1", "acc"]
for ax, metric in zip(axs, metrics):
    sns.barplot(
        data=pivoted_exp2_best_runs_df,
        x=("config", "name"),
        y=("summary", metric),
        hue=("summary", "split"),
        ax=ax
    )

    # Bars are keyed by run name; relabel them with the display name of the train labeler
    run_names = [x.get_text() for x in ax.get_xticklabels()]
    rows = [pivoted_exp2_best_runs_df[pivoted_exp2_best_runs_df[("config", "name")] == run_name].iloc[0] for run_name in run_names]
    xtick_labels = [rename(row[("config", "train_labeler_name")]) for row in rows]
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(xtick_labels)
    ax.set_xlabel("")
    ax.set_ylabel(rename(metric), fontsize=14)

# Save under an Experiment 2 file name so the Experiment 1 figure
# ("exp1-splits.pdf") is not overwritten
path = os.path.join(FIGURE_DIR, "exp2-splits.pdf")
fig.savefig(path, dpi=300, bbox_inches="tight")
print(f"✅ Saved figure to {path}")