In [None]:
# noqa: E402
%reload_ext autoreload
%autoreload 2

import os 
import sys

import rootutils
import hydra

In [None]:
# Reinitialize hydra on every run
# The global hydra instance is cleared first so that the initialize call below
# can be executed again when this cell is re-run in the same kernel.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# config_path is a relative path; the session is registered under job name "eda".
h = hydra.initialize(config_path="../conf", job_name="eda", version_base=None)

# Setup root environment
# setup_root(".") starts from the current directory and returns the project
# root; set_root is then called with that path and project_root_env_var=True.
# NOTE(review): set_root's other parameters are left at their defaults. If one
# of them changes the working directory, the relative "../conf" above may
# resolve differently when this cell is re-run — TODO confirm.
root_path = rootutils.setup_root(".")
rootutils.set_root(
    path=root_path,
    project_root_env_var=True,
)

## Websites

---

There are three corpora of websites in this dataset:

* `original`: 770 websites from the crowdsourced dataset from the [Homepage2Vec paper](https://arxiv.org/abs/1905.09786)
* `gpt`: 250 common websites obtained by prompting GPT-4 (see [Prompts](https://chat.openai.com/share/a76c8b9b-a659-4b15-9ab0-d94af4733d58))
* `curlie`: A filtered version of the [curlie](https://curlie.org) dataset, containing ~1M websites

For each corpus, the repository contains a CSV file at the path `data/raw/<corpus>.csv` with two columns: `wid` and `url`. The `wid` is a unique identifier for the website, and the `url` is the URL of the website.

In [None]:
# Initialise Config
# Compose the "eda" config once per corpus; only the `data` override differs.
original_cfg, gpt_cfg, curlie_cfg = (
    hydra.compose(config_name="eda", overrides=[f"data={corpus}"])
    for corpus in ("original", "gpt", "curlie")
)

### Original Data

This is the data that was used to test the model in the original paper. 

In [None]:
# Raw data
# Instantiate the object described by the `data` node of the "original" config.
original_data = hydra.utils.instantiate(original_cfg.data)

# Fetch the raw data, the processed records and the embeddings from it.
# processed_data is a notebook-level name that the following cells read.
raw_data = original_data.get_raw_data()
processed_data = original_data.get_processed_data()
embedded_data = original_data.get_embeddings()

print(f"Total number of samples: {len(raw_data)}")
# Last expression of the cell, so its value is displayed as the cell output.
raw_data.head(5)

In [None]:
# Example of processed website
# Use the first site id in the processed data and look up its record.
wid = list(processed_data.keys())[0]
data = processed_data[wid]

print(f"Collected data on {list(data.keys())}")

# Show some examples
# (label, record key) pairs; the newline in the first label leaves a blank
# line between the key listing above and the field values.
for label, key in (
    ("\nTitle", "title"),
    ("Description", "description"),
    ("Keywords", "keywords"),
    ("Tags", "metatags"),
    ("Domain", "domain"),
    ("TLD", "tld"),
):
    print(f"{label}: {data[key]}")

In [None]:
def get_num_featues(feat, data=None):
    """Count the sites that have a usable value for one feature.

    A value counts as present unless it is ``None`` or an empty list.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; consider keeping a single definition.

    Args:
        feat: Key to look up in each site's record (e.g. ``"title"``).
        data: Mapping from site id to that site's record. When omitted, the
            notebook-level ``processed_data`` is used, so the result depends
            on which corpus was loaded most recently.

    Returns:
        The number of records in ``data`` whose ``feat`` value is present.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook global.
        data = processed_data
    return sum(1 for w in data.values() if w[feat] is not None and w[feat] != [])

# Total number of processed sites; denominator for the percentages below.
n = len(processed_data)

# Per-feature count of sites that have a value for that feature.
n_tld, n_domain, n_tags = map(get_num_featues, ("tld", "domain", "metatags"))
n_titles, n_descriptions, n_keywords = map(get_num_featues, ("title", "description", "keywords"))
n_links = get_num_featues("links")

# Print results
# Each line reports the share of sites (in percent) that have the feature.
for label, count in (
    ("TLD", n_tld),
    ("domain", n_domain),
    ("tags", n_tags),
    ("title", n_titles),
    ("description", n_descriptions),
    ("keywords", n_keywords),
    ("links", n_links),
):
    print(f"ℹ️ Number of sites with {label}: {count/n*100:.2f}%")

### GPT

In [None]:
# Raw data
# Instantiate the object described by the `data` node of the "gpt" config.
gpt_data = hydra.utils.instantiate(gpt_cfg.data)

# Rebind the notebook-level names to the GPT corpus. processed_data is read by
# the following cells; embedded_data uses the same name as in the Original
# Data cell so it never keeps pointing at another corpus's embeddings.
raw_data = gpt_data.get_raw_data()
processed_data = gpt_data.get_processed_data()
embedded_data = gpt_data.get_embeddings()

print(f"Total number of samples: {len(raw_data)}")
# Last expression of the cell, so its value is displayed as the cell output.
raw_data.head(5)

In [None]:
# Example of processed website
# Use the first site id in the processed data and look up its record.
wid = list(processed_data.keys())[0]
data = processed_data[wid]

print(f"Collected data on {list(data.keys())}")

# Show some examples
# (label, record key) pairs; the newline in the first label leaves a blank
# line between the key listing above and the field values.
for label, key in (
    ("\nTitle", "title"),
    ("Description", "description"),
    ("Keywords", "keywords"),
    ("Tags", "metatags"),
    ("Domain", "domain"),
    ("TLD", "tld"),
):
    print(f"{label}: {data[key]}")

In [None]:
def get_num_featues(feat, data=None):
    """Count the sites that have a usable value for one feature.

    A value counts as present unless it is ``None`` or an empty list.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; consider keeping a single definition.

    Args:
        feat: Key to look up in each site's record (e.g. ``"title"``).
        data: Mapping from site id to that site's record. When omitted, the
            notebook-level ``processed_data`` is used, so the result depends
            on which corpus was loaded most recently.

    Returns:
        The number of records in ``data`` whose ``feat`` value is present.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook global.
        data = processed_data
    return sum(1 for w in data.values() if w[feat] is not None and w[feat] != [])

# Total number of processed sites; denominator for the percentages below.
n = len(processed_data)

# Per-feature count of sites that have a value for that feature.
n_tld, n_domain, n_tags = map(get_num_featues, ("tld", "domain", "metatags"))
n_titles, n_descriptions, n_keywords = map(get_num_featues, ("title", "description", "keywords"))
n_links = get_num_featues("links")

# Print results
# Each line reports the share of sites (in percent) that have the feature.
for label, count in (
    ("TLD", n_tld),
    ("domain", n_domain),
    ("tags", n_tags),
    ("title", n_titles),
    ("description", n_descriptions),
    ("keywords", n_keywords),
    ("links", n_links),
):
    print(f"ℹ️ Number of sites with {label}: {count/n*100:.2f}%")

### Curlie

In [None]:
# Raw data
# Instantiate the object described by the `data` node of the "curlie" config
# (curlie_cfg, not gpt_cfg, so this section really inspects the Curlie corpus).
curlie_data = hydra.utils.instantiate(curlie_cfg.data)

# Rebind the notebook-level names to the Curlie corpus. processed_data is read
# by the following cells; embedded_data uses the same name as in the Original
# Data cell so it never keeps pointing at another corpus's embeddings.
raw_data = curlie_data.get_raw_data()
processed_data = curlie_data.get_processed_data()
embedded_data = curlie_data.get_embeddings()

print(f"Total number of samples: {len(raw_data)}")
# Last expression of the cell, so its value is displayed as the cell output.
raw_data.head(5)

In [None]:
# Example of processed website
# Use the first site id in the processed data and look up its record.
wid = list(processed_data.keys())[0]
data = processed_data[wid]

print(f"Collected data on {list(data.keys())}")

# Show some examples
# (label, record key) pairs; the newline in the first label leaves a blank
# line between the key listing above and the field values.
for label, key in (
    ("\nTitle", "title"),
    ("Description", "description"),
    ("Keywords", "keywords"),
    ("Tags", "metatags"),
    ("Domain", "domain"),
    ("TLD", "tld"),
):
    print(f"{label}: {data[key]}")

In [None]:
def get_num_featues(feat, data=None):
    """Count the sites that have a usable value for one feature.

    A value counts as present unless it is ``None`` or an empty list.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; consider keeping a single definition.

    Args:
        feat: Key to look up in each site's record (e.g. ``"title"``).
        data: Mapping from site id to that site's record. When omitted, the
            notebook-level ``processed_data`` is used, so the result depends
            on which corpus was loaded most recently.

    Returns:
        The number of records in ``data`` whose ``feat`` value is present.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook global.
        data = processed_data
    return sum(1 for w in data.values() if w[feat] is not None and w[feat] != [])

# Total number of processed sites; denominator for the percentages below.
n = len(processed_data)

# Per-feature count of sites that have a value for that feature.
n_tld, n_domain, n_tags = map(get_num_featues, ("tld", "domain", "metatags"))
n_titles, n_descriptions, n_keywords = map(get_num_featues, ("title", "description", "keywords"))
n_links = get_num_featues("links")

# Print results
# Each line reports the share of sites (in percent) that have the feature.
for label, count in (
    ("TLD", n_tld),
    ("domain", n_domain),
    ("tags", n_tags),
    ("title", n_titles),
    ("description", n_descriptions),
    ("keywords", n_keywords),
    ("links", n_links),
):
    print(f"ℹ️ Number of sites with {label}: {count/n*100:.2f}%")

## Labelers

---

There are multiple labeler instances that can be used to label the data: one based on human annotations and several GPT-based ones. The labelers are defined in the labelers module. These are:

* `human`: Uses human labels from the [Homepage2Vec paper](https://arxiv.org/abs/1905.09786) -- only works for the `original` corpus
* `gpt-labeler1`: Uses information on the `tld`, `domain`, `metatags`
* `gpt-labeler2`: Uses the same information as `gpt-labeler1`, plus `title`, `description`, and `keywords`
* `gpt-labeler3`: Uses the same information as `gpt-labeler2`, plus `links` and `text`

In [None]:
# TODO