# Analysis

In [None]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

# Standard imports
import os

# External imports
import hydra
import rootutils
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from wandb.sdk.wandb_run import Run

import wandb

In [None]:
# Reinitialize hydra on every run: clear the global Hydra instance first so that
# executing this cell again starts from a clean state before `initialize` is called.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Config files are looked up in "../conf" (presumably relative to this notebook's
# location — TODO confirm). The returned handle `h` is not used in the visible cells.
h = hydra.initialize(config_path="../conf", job_name="eda", version_base=None)

# Setup root environment
# `root_path` is reused below as ROOT_DIR, the base of all output directories.
root_path = rootutils.setup_root(".")
# NOTE(review): `project_root_env_var=True` presumably exports the project root as
# an environment variable — confirm against the rootutils documentation.
rootutils.set_root(
    path=root_path,
    project_root_env_var=True,
)

In [None]:
# Project directories, all resolved relative to the project root
ROOT_DIR = root_path
ARTIFACT_DIR = os.path.join(ROOT_DIR, "artifacts")

# Report outputs (figures and tables) live side by side under "report/"
REPORT_DIR = os.path.join(ROOT_DIR, "report")
FIGURE_DIR = os.path.join(REPORT_DIR, "figures")
TABLE_DIR = os.path.join(REPORT_DIR, "tables")

# Only the report output folders are created here; ARTIFACT_DIR is not
for report_subdir in (FIGURE_DIR, TABLE_DIR):
    os.makedirs(report_subdir, exist_ok=True)

In [None]:
# W&B entity and project that hold the logged runs
WANDB_ENTITY = "ml-project-2-mlp"
WANDB_PROJECT = "homepage2vec"

# Client for the W&B API
api = wandb.Api()

# Get all runs of the project, addressed as "<entity>/<project>"
runs = api.runs("/".join((WANDB_ENTITY, WANDB_PROJECT)))
print(f"✅ Loaded {len(runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT})")

In [None]:
# Helpers
def runs_to_df(runs: list[Run]) -> pd.DataFrame:
    """
    Convert a list of W&B runs to a dataframe.

    Runs are processed in reverse order of ``runs``. Each run becomes one row,
    indexed by the run's name, with two-level ``(group, key)`` columns:

    - ``config``: the selected ``data``, ``labeler`` and ``model`` attributes,
    - ``hparams``: learning rate, weight decay, scheduler factor and batch size,
    - ``summary``: every summary entry whose key does not start with ``_``.

    The columns are the union of the keys seen across all runs, so a run that
    lacks a key (for example a summary metric) gets NaN in that cell instead of
    removing the column for every run.

    Args:
        runs: Runs to convert. Each run must expose ``name``, ``config`` and
            ``summary``.

    Returns:
        Dataframe with one row per run and multi-index columns. If ``runs`` is
        empty, an empty dataframe with two-level columns is returned.

    Raises:
        KeyError: If a run's config lacks one of the sections or
            hyperparameters read below.
    """
    data_attr = ["name"]
    labeler_attr = ["name", "model", "fewshot", "features"]
    model_attr = ["name", "model", "fewshot", "features"]
    # NOTE(review): the three sections share attribute names (e.g. "name") and are
    # merged into one flat "config" mapping, so a later section overwrites an
    # earlier one. Kept as-is so existing column names stay stable — confirm
    # that this is intended.
    sections = (("data", data_attr), ("labeler", labeler_attr), ("model", model_attr))

    ordered_runs = list(runs)[::-1]
    records = []
    for run in ordered_runs:
        config = {}
        for section, wanted in sections:
            config.update(
                {k: v for k, v in run.config[section].items() if k in wanted}
            )
        hparams = {
            "lr": run.config["model"]["optimizer"]["lr"],
            "weight_decay": run.config["model"]["optimizer"]["weight_decay"],
            "factor": run.config["model"]["scheduler"]["factor"],
            "batch_size": run.config["datamodule"]["batch_size"],
        }
        summary = {k: v for k, v in run.summary.items() if not k.startswith("_")}

        # Key every value by (group, key) so that equal key names in different
        # groups cannot overwrite each other within a row.
        record = {}
        for group, values in (
            ("config", config),
            ("hparams", hparams),
            ("summary", summary),
        ):
            for key, value in values.items():
                record[(group, key)] = value
        records.append(record)

    # Nothing to tabulate: return an empty frame instead of failing on names
    # that are only bound inside the loop.
    if not records:
        return pd.DataFrame(columns=pd.MultiIndex.from_arrays([[], []]))

    # Collect the union of keys per group. Starting from the last processed run
    # keeps the column order unchanged whenever that run has every key.
    keys_by_group = {"config": {}, "hparams": {}, "summary": {}}
    for record in reversed(records):
        for group, key in record:
            keys_by_group[group].setdefault(key)
    columns = [(group, key) for group, keys in keys_by_group.items() for key in keys]

    # Create dataframe; cells a run does not provide are filled with NaN
    missing = float("nan")
    table = [[record.get(column, missing) for column in columns] for record in records]
    run_df = pd.DataFrame(
        table,
        index=[run.name for run in ordered_runs],
        columns=pd.MultiIndex.from_tuples(columns),
    )

    return run_df

## Experiment 1: Fine-tuning on `original`

---

In this experiment, we assess the annotation quality of the various `labelers` (specified in `conf/labelers`) by fine-tuning models on the `original` dataset with the labels provided by each labeler. The training parameters are kept constant across all runs of this experiment.

In [None]:
# Keep only the runs that were logged under the Experiment 1 group
GROUP = "exp1"

exp1_runs = [
    candidate
    for candidate in runs
    if candidate.group == GROUP
]

print(f"✅ Loaded {len(exp1_runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT} - {GROUP})")

In [None]:
# Convert the Experiment 1 runs to a dataframe: one row per run (indexed by run
# name) with columns grouped under "config", "hparams" and "summary"
exp1_runs_df = runs_to_df(exp1_runs)

In [None]:
# Summary metrics of the best runs, ranked by test F1 (highest first)
top_k = 5
(
    exp1_runs_df
    .sort_values(by=("summary", "test/f1"), ascending=False)
    .head(top_k)["summary"]
)

In [None]:
# Visualise the test F1 of every Experiment 1 run as a bar chart
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=exp1_runs_df,
    x=exp1_runs_df.index,
    y=("summary", "test/f1"),
    ax=ax,
)

# Label the figure explicitly so it can be read on its own in the report.
# NOTE(review): without this the y-axis label is presumably derived from the
# ("summary", "test/f1") column key rather than a readable name — confirm.
ax.set_xlabel("Run")
ax.set_ylabel("test/f1")
ax.set_title("Experiment 1: test/f1 per run")

# Rotate the run names through the tick parameters, which does not require
# reading back and re-assigning the tick label texts
ax.tick_params(axis="x", labelrotation=90)

# Save figure
fig.savefig(os.path.join(FIGURE_DIR, "exp1_runs.pdf"), dpi=300, bbox_inches="tight")
print(f"✅ Saved figure to {FIGURE_DIR}")

## Experiment 2: Fine-tuning on `curlie-10000`

In [None]:
# TODO