### Setup

---

In [13]:
%load_ext autoreload
%autoreload 2

import pickle
import os

from ml_project_2_mlp import metrics, gpt

import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load the data

---

In [15]:
# Ground truth crowdsourced data, indexed by website id ("Input.uid" renamed to "wid")
labels_path = os.path.join("..", "data", "crowdsourced", "processed", "websites.csv")
labels = pd.read_csv(labels_path).rename(columns={"Input.uid": "wid"}).set_index("wid")


def load_gpt_labels(context_dir):
    """Return the unpickled content of ``../data/<context_dir>/labeled_data.pkl``.

    Args:
        context_dir: Name of the folder under ``../data`` holding the GPT labels
            for one context.

    Returns:
        Whatever object was pickled into ``labeled_data.pkl``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # NOTE: pickle.load can execute arbitrary code while deserialising; only
    # point this at files produced by this project, never at untrusted data.
    with open(os.path.join("..", "data", context_dir, "labeled_data.pkl"), "rb") as f:
        return pickle.load(f)


# GPT labeled data, one set of predictions per context
c1_preds = load_gpt_labels("tld_domain_meta")
c2_preds = load_gpt_labels("tld_domain_meta_title_desc_kws")
# FIXME(review): c3 is read from the *same* folder as c2, so c3_preds is a copy
# of c2_preds (the recorded c2 and c3 metrics are identical). This looks like a
# copy-paste slip; the folder holding the c3 labels is not visible from this
# notebook, so the path is left unchanged -- replace it with the c3 folder.
c3_preds = load_gpt_labels("tld_domain_meta_title_desc_kws")

# Categories that GPT had to predict
categories = gpt.GPTLabeler.categories

### Analyse the performance on the individual contexts

---

In [43]:
# Scores read from the dict returned by metrics.compute_metrics, in print order.
score_names = ["accuracy", "precision", "recall", "f1"]

# (label, predictions) pairs, one per context.
# NOTE(review): "meata" is a typo for "meta"; kept as-is so the printed labels
# stay identical to the recorded output.
predictions = [
    ("c1: tld, domain, meata", c1_preds),
    ("c2: c1 + title, desc, kws", c2_preds),
    ("c3: c2 + links, text", c3_preds),
]

for context, preds in predictions:
    # Score this context's predictions against the crowdsourced labels
    performance = metrics.compute_metrics(labels, preds, categories)

    # Report: context label first, then one "- name: value" line per score,
    # followed by a blank separator line
    score_lines = [f"- {name}: {performance[name]:.3f}" for name in score_names]
    print(context, *score_lines, sep="\n")
    print()

c1: tld, domain, meata
- accuracy: 0.825
- precision: 0.540
- recall: 0.142
- f1: 0.225

c2: c1 + title, desc, kws
- accuracy: 0.824
- precision: 0.509
- recall: 0.390
- f1: 0.441

c3: c2 + links, text
- accuracy: 0.824
- precision: 0.509
- recall: 0.390
- f1: 0.441

