In [None]:
# noqa: E402
%reload_ext autoreload
%autoreload 2

import os 
import sys
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd


import rootutils
import hydra

In [None]:
# Reinitialize hydra on every run: drop any global Hydra state left over from a
# previous execution of this cell before initializing again.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Configs are read from the ../conf directory.
h = hydra.initialize(config_path="../conf", job_name="eda", version_base=None)

# Setup root environment
root_path = rootutils.setup_root(".")
# NOTE(review): project_root_env_var=True presumably exports the project root
# as an environment variable for the configs to use — confirm in rootutils docs.
rootutils.set_root(
    path=root_path,
    project_root_env_var=True,
)

In [None]:
# Costs per token based on https://openai.com/pricing
# INP = input (prompt) tokens, OUT = output (completion) tokens.
# NOTE(review): values are presumably USD per single token as listed on the
# pricing page when this notebook was written — confirm before reusing.
GPT4_COST_PER_INP_TOKEN = 0.00001
GPT4_COST_PER_OUT_TOKEN = 0.00003
GPT3_5_COST_PER_INP_TOKEN = 0.000001
GPT3_5_COST_PER_OUT_TOKEN = 0.000002

## Websites

---

There are three corpora of websites in this dataset:

* `original`: 770 websites from the crowdsourced dataset from the [Homepage2Vec paper](https://arxiv.org/abs/1905.09786)
* `gpt`: 250 common websites obtained by prompting GPT-4 (see [Prompts](https://chat.openai.com/share/a76c8b9b-a659-4b15-9ab0-d94af4733d58))
* `curlie`: A filtered version of the [curlie](https://curlie.org) dataset, containing ~1M websites

For each corpus, the repository contains a CSV file at the path `data/raw/<corpus>.csv` with two columns - `wid` and `url`. The `wid` is a unique identifier for the website, and the `url` is the URL of the website.

In [None]:
# Initialise Config: compose the "eda" config once per corpus, selecting the
# corpus through an override of the `data` config group.
original_cfg = hydra.compose(config_name="eda", overrides=["data=original"])
gpt_cfg = hydra.compose(config_name="eda", overrides=["data=gpt"])
curlie_cfg = hydra.compose(config_name="eda", overrides=["data=curlie"])

In [None]:
# Load classes: the category names are stored one per line in
# <root>/data/meta/categories.txt and are kept in file order.
path = os.path.join(root_path, "data", "meta", "categories.txt")
# Explicit encoding so the result does not depend on the platform's default locale.
with open(path, encoding="utf-8") as f:
    classes = f.read().splitlines()

print(classes)

### Original Data

This is the data that was used to test the model in the original paper. 

In [None]:
# Raw data: build the dataset object for the `original` corpus from its config
original_data = hydra.utils.instantiate(original_cfg.data)

# Three views of the corpus: raw data, processed per-website records and
# embeddings. The names raw_data / processed_data are reassigned by the GPT and
# Curlie sections below, so the cells that follow always see the latest corpus.
raw_data = original_data.get_raw_data()
processed_data = original_data.get_processed_data()
embedded_data = original_data.get_embeddings()

print(f"Total number of samples: {len(raw_data)}")
# Preview of the first five entries (raw_data is presumably a DataFrame).
raw_data.head(5)

In [None]:
# Example of processed website: take the first website id lazily instead of
# building a list of every key just to read its first element.
wid = next(iter(processed_data.keys()))
data = processed_data[wid]

# Names of all features that were collected for this website
print(f"Collected data on {list(data.keys())}")

# Show some examples
print(f"\nTitle: {data['title']}")
print(f"Description: {data['description']}")
print(f"Keywords: {data['keywords']}")
print(f"Tags: {data['metatags']}")
print(f"Domain: {data['domain']}")
print(f"TLD: {data['tld']}")

In [None]:
def get_num_featues(feat, data=None):
    """Count the website records in which feature `feat` is present.

    A feature counts as present when its value is neither ``None`` nor an
    empty list.

    Args:
        feat: Key of the feature to look up in every website record.
        data: Optional mapping of website id -> record. When omitted, the
            notebook-level ``processed_data`` is used, so existing
            one-argument calls keep working.

    Returns:
        Number of records whose ``feat`` value is present.
    """
    if data is None:
        # Looked up at call time, i.e. the most recently loaded corpus is used.
        data = processed_data
    return sum(1 for w in data.values() if w[feat] is not None and w[feat] != [])

# Total number of processed websites; denominator for the percentages below
n = len(processed_data)

# Number of websites for which each feature is present
n_tld = get_num_featues("tld")
n_domain = get_num_featues("domain")
n_tags = get_num_featues("metatags")
n_titles = get_num_featues("title")
n_descriptions = get_num_featues("description")
n_keywords = get_num_featues("keywords")
n_links = get_num_featues("links")

# Print results: one line per feature, as a percentage of all processed websites
coverage = [
    ("TLD", n_tld),
    ("domain", n_domain),
    ("tags", n_tags),
    ("title", n_titles),
    ("description", n_descriptions),
    ("keywords", n_keywords),
    ("links", n_links),
]
for feature_label, n_present in coverage:
    print(f"ℹ️ Number of sites with {feature_label}: {n_present/n*100:.2f}%")

### GPT

In [None]:
# Raw data: build the dataset object for the `gpt` corpus from its config
gpt_data = hydra.utils.instantiate(gpt_cfg.data)

# Same three views as for the original corpus; the shared names are rebound so
# the cells below operate on the GPT corpus.
raw_data = gpt_data.get_raw_data()
processed_data = gpt_data.get_processed_data()
embedded_data = gpt_data.get_embeddings()

print(f"Total number of samples: {len(raw_data)}")
# Preview of the first five entries
raw_data.head(5)

In [None]:
# Example of processed website: take the first website id lazily instead of
# building a list of every key just to read its first element.
wid = next(iter(processed_data.keys()))
data = processed_data[wid]

# Names of all features that were collected for this website
print(f"Collected data on {list(data.keys())}")

# Show some examples
print(f"\nTitle: {data['title']}")
print(f"Description: {data['description']}")
print(f"Keywords: {data['keywords']}")
print(f"Tags: {data['metatags']}")
print(f"Domain: {data['domain']}")
print(f"TLD: {data['tld']}")

In [None]:
def get_num_featues(feat, data=None):
    """Count the website records in which feature `feat` is present.

    A feature counts as present when its value is neither ``None`` nor an
    empty list.

    Args:
        feat: Key of the feature to look up in every website record.
        data: Optional mapping of website id -> record. When omitted, the
            notebook-level ``processed_data`` is used, so existing
            one-argument calls keep working.

    Returns:
        Number of records whose ``feat`` value is present.
    """
    if data is None:
        # Looked up at call time, i.e. the most recently loaded corpus is used.
        data = processed_data
    return sum(1 for w in data.values() if w[feat] is not None and w[feat] != [])

# Total number of processed websites; denominator for the percentages below
n = len(processed_data)

# Number of websites for which each feature is present
n_tld = get_num_featues("tld")
n_domain = get_num_featues("domain")
n_tags = get_num_featues("metatags")
n_titles = get_num_featues("title")
n_descriptions = get_num_featues("description")
n_keywords = get_num_featues("keywords")
n_links = get_num_featues("links")

# Print results: one line per feature, as a percentage of all processed websites
coverage = [
    ("TLD", n_tld),
    ("domain", n_domain),
    ("tags", n_tags),
    ("title", n_titles),
    ("description", n_descriptions),
    ("keywords", n_keywords),
    ("links", n_links),
]
for feature_label, n_present in coverage:
    print(f"ℹ️ Number of sites with {feature_label}: {n_present/n*100:.2f}%")

### Curlie

In [None]:
# Raw data: build the dataset object for the `curlie` corpus from its config
curlie_data = hydra.utils.instantiate(curlie_cfg.data)

# Same three views as for the other corpora; the shared names are rebound so
# the cells below operate on the Curlie corpus.
raw_data = curlie_data.get_raw_data()
processed_data = curlie_data.get_processed_data()
embedded_data = curlie_data.get_embeddings()

print(f"Total number of samples: {len(raw_data)}")
# Preview of the first five entries
raw_data.head(5)

In [None]:
# Example of processed website: take the first website id lazily instead of
# building a list of every key just to read its first element.
wid = next(iter(processed_data.keys()))
data = processed_data[wid]

# Names of all features that were collected for this website
print(f"Collected data on {list(data.keys())}")

# Show some examples
print(f"\nTitle: {data['title']}")
print(f"Description: {data['description']}")
print(f"Keywords: {data['keywords']}")
print(f"Tags: {data['metatags']}")
print(f"Domain: {data['domain']}")
print(f"TLD: {data['tld']}")

In [None]:
def get_num_featues(feat, data=None):
    """Count the website records in which feature `feat` is present.

    A feature counts as present when its value is neither ``None`` nor an
    empty list.

    Args:
        feat: Key of the feature to look up in every website record.
        data: Optional mapping of website id -> record. When omitted, the
            notebook-level ``processed_data`` is used, so existing
            one-argument calls keep working.

    Returns:
        Number of records whose ``feat`` value is present.
    """
    if data is None:
        # Looked up at call time, i.e. the most recently loaded corpus is used.
        data = processed_data
    return sum(1 for w in data.values() if w[feat] is not None and w[feat] != [])

# Total number of processed websites; denominator for the percentages below
n = len(processed_data)

# Number of websites for which each feature is present
n_tld = get_num_featues("tld")
n_domain = get_num_featues("domain")
n_tags = get_num_featues("metatags")
n_titles = get_num_featues("title")
n_descriptions = get_num_featues("description")
n_keywords = get_num_featues("keywords")
n_links = get_num_featues("links")

# Print results: one line per feature, as a percentage of all processed websites
coverage = [
    ("TLD", n_tld),
    ("domain", n_domain),
    ("tags", n_tags),
    ("title", n_titles),
    ("description", n_descriptions),
    ("keywords", n_keywords),
    ("links", n_links),
]
for feature_label, n_present in coverage:
    print(f"ℹ️ Number of sites with {feature_label}: {n_present/n*100:.2f}%")

## Labelers

---

There are multiple GPT labeler instances that can be used to label the data. The labelers are defined in the labelers module. These are:

* `human`: Uses human labels from the [Homepage2Vec paper](https://arxiv.org/abs/1905.09786) -- only works for the `original` corpus
* `gpt-labeler1`: Uses information on the `tld`, `domain`, `metatags`
* `gpt-labeler2`: Uses information like `gpt-labeler1` +  `title`, `description`, `keywords`
* `gpt-labeler3`: Uses information like `gpt-labeler2` +  `links` and `text`

### Labeling Quality

The goal of all GPT labelers is to replicate the ground truth labels provided by the human annotators as closely as possible. As we only have human annotations for the original dataset, we can only evaluate the labelers on this dataset.

In [None]:
# Initialise configuration for all labelers
# Names of the `labeler` config-group options to compare; "human" is the
# reference the GPT variants are evaluated against.
labeler_names = ["human", "gpt3.5-context0", "gpt3.5-context1", "gpt3.5-context2", "gpt3.5-context3"] 

# One composed "eda" config per labeler, keyed by labeler name. Only the
# `labeler` group is overridden here.
# NOTE(review): the `data` group falls back to the eda default — confirm it is `original`.
original_labeler_cfg = {labeler: hydra.compose(config_name="eda", overrides=[f"labeler={labeler}"]) for labeler in labeler_names}

In [None]:
# Instantiate data: re-create the dataset object for the `original` corpus so
# this section does not depend on state left by the corpus cells above.
original_data = hydra.utils.instantiate(original_cfg.data)

In [None]:
# Instantiate labelers: one labeler object per config, keyed by labeler name.
# Every labeler receives the same `original` dataset object.
labelers = {labeler: hydra.utils.instantiate(cfg.labeler, data=original_data) for labeler, cfg in original_labeler_cfg.items()}

Let's verify that the labelers are working as expected.

In [None]:
# Sanity check: compare the number of processed websites with the number of
# labels each labeler returns.
num_processed_websites = len(original_data.get_processed_data())

print(f"ℹ️ Number of processed websites: {num_processed_websites}")
for name, labeler in labelers.items():
    num_labels = len(labeler.get_labels())
    print(f"ℹ️ Number of {name} labels: {num_labels}")

This makes sense - the GPT labelers only provide labels for the websites that could be scraped, which is 761, while the ground truth labels are provided for all 840 websites. We will have to match the labels by the website ID accordingly when we evaluate the labelers. Next, we will process the returned GPT labels - we will iterate over all the websites in the original dataset, and collect all valid annotations and give an overview of the reasons for invalid labels - if there are any.

### Labeling statistics

In [None]:
def process_labels(labeler):
    """Split a labeler's output into valid labels, invalid reasons and timing.

    Args:
        labeler: Object exposing ``name`` and ``get_labels()``; the latter
            returns a mapping of website id -> annotation record.

    Returns:
        Tuple ``(valid_labels, invalid_labels, avg_duration)``:

        * ``valid_labels``: dict of website id -> ``record["labels"]``. For the
          ``"human"`` labeler every record is included; otherwise only records
          whose ``"is_valid"`` entry is truthy.
        * ``invalid_labels``: list with the ``"reason_invalid"`` entry of every
          invalid record (always empty for the ``"human"`` labeler).
        * ``avg_duration``: mean of ``record["duration"]`` over all records;
          ``0`` for the ``"human"`` labeler or when there are no records.
    """
    # Fetch once and reuse for both the iteration and the average.
    all_labels = labeler.get_labels()

    if labeler.name == "human":
        # Human records are taken as-is: no validity flag or duration is read.
        return {wid: website["labels"] for wid, website in all_labels.items()}, [], 0

    valid_labels = {}
    invalid_labels = []
    total_duration = 0
    for wid, website in all_labels.items():
        if website["is_valid"]:
            valid_labels[wid] = website["labels"]
        else:
            invalid_labels.append(website["reason_invalid"])
        # Duration is accumulated for valid and invalid records alike.
        total_duration += website["duration"]

    # Avoid a ZeroDivisionError when the labeler has not produced any labels.
    avg_duration = total_duration / len(all_labels) if all_labels else 0
    return valid_labels, invalid_labels, avg_duration

In [None]:
# Collect the valid labels of every labeler, keyed by labeler name, and print a
# one-line summary (valid / invalid counts and average duration) for each.
labels = {}
for name, labeler in labelers.items():
    valid_labels, reasons_invalid, avg_duration = process_labels(labeler)
    print(f"ℹ️ {name}: Valid {len(valid_labels)}, Invalid: {len(reasons_invalid)}. Avg. Duration: {avg_duration:.2f}s")
    labels[name] = valid_labels

Cool, most labels are valid. Let's now match the labels of the GPT labelers with the human labels by the website id (wid).

In [None]:
def match_labels(labels1, labels2, subset=None):
    """Align two label mappings on the website ids they have in common.

    Args:
        labels1: Mapping of website id -> labels.
        labels2: Mapping of website id -> labels.
        subset: Optional iterable of website ids to restrict the match to.
            ``None`` (default) means no restriction; an empty collection
            yields an empty result.

    Returns:
        Tuple ``(matched1, matched2)`` of equally long lists in which position
        ``i`` of both lists belongs to the same website id. Ids follow the
        iteration order of ``labels1``, so the result is reproducible.
    """
    # `is None` rather than truthiness: an empty subset must filter everything out.
    allowed = None if subset is None else set(subset)

    matched_wid = [
        wid
        for wid in labels1
        if wid in labels2 and (allowed is None or wid in allowed)
    ]

    return (
        [labels1[wid] for wid in matched_wid],
        [labels2[wid] for wid in matched_wid],
    )

In [None]:
# Classification report of the `gpt3.5-context1` labeler, using the human labels
# as ground truth (first argument) on the websites both have labeled.
labels1, labels2 = match_labels(labels["human"], labels["gpt3.5-context1"])
print(classification_report(labels1, labels2, zero_division=0, target_names=classes))

In [None]:
# Classification report of the `gpt3.5-context2` labeler, using the human labels
# as ground truth (first argument) on the websites both have labeled.
labels1, labels2 = match_labels(labels["human"], labels["gpt3.5-context2"])
print(classification_report(labels1, labels2, zero_division=0, target_names=classes))

In [None]:
# Classification report of the `gpt3.5-context3` labeler, using the human labels
# as ground truth (first argument) on the websites both have labeled.
labels1, labels2 = match_labels(labels["human"], labels["gpt3.5-context3"])
print(classification_report(labels1, labels2, zero_division=0, target_names=classes))

In [None]:
# Let's investigate the accuracy for websites that contained all the features for context2
# Website ids (as strings) whose title, description and keywords are all not None.
# NOTE(review): unlike get_num_featues, an empty list is not treated as missing
# here — confirm this is intended.
wids = set(map(str, [wid for wid, website in original_data.get_processed_data().items() if website["title"] is not None and website["description"] is not None and website["keywords"] is not None]))

# Classification report of `gpt3.5-context2` against the human labels, restricted to those websites
labels1, labels2 = match_labels(labels["human"], labels["gpt3.5-context2"], subset=wids)
print(classification_report(labels1, labels2, zero_division=0, target_names=classes))

### Class Distribution

In [None]:
# Class distribution
import numpy as np

def get_class_dist(labeler, classes=None):
    """Sum a labeler's label vectors per class.

    Args:
        labeler: Object whose ``get_labels()`` returns a mapping of website id
            -> record with a ``"labels"`` entry (presumably one 0/1 indicator
            per class — verify against the labeler implementation).
        classes: Optional class names used as keys of the result. When empty
            or ``None``, the column indices ``0..n_classes-1`` are used.

    Returns:
        Dict mapping each class name (or column index) to the column-wise sum
        of all label vectors.
    """
    label_matrix = np.array([entry["labels"] for entry in labeler.get_labels().values()])
    per_class_totals = label_matrix.sum(0)

    # Fall back to positional keys when no class names are supplied.
    keys = classes if classes else range(label_matrix.shape[1])
    return dict(zip(keys, per_class_totals))


# Per-labeler class totals: labeler name -> {category name -> summed labels}
class_dists = {name: get_class_dist(labeler, classes=classes) for name, labeler in labelers.items()}

In [None]:
# Long-format table with one row per (labeler, category) pair. A comprehension
# keeps the loop variables local, so the notebook-level `labeler` is not
# overwritten with a plain string.
rows = [
    {"labeler": labeler_name, "category": c, "count": class_dists[labeler_name][c]}
    for labeler_name in labeler_names
    for c in classes
]

df = pd.DataFrame(rows)

fig, ax = plt.subplots(figsize=(20, 5))
# Pass the frame by keyword: a positional DataFrame is only accepted as `data`
# by newer seaborn releases.
sns.barplot(
    data=df,
    x="category",
    y="count",
    hue="labeler",
    ax=ax,
)
# Make the figure readable on its own and keep the category names legible.
ax.set_title("Class distribution per labeler")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
ax.tick_params(axis="x", rotation=45)
plt.show()