### Curlie 10k Release

---

In this notebook, we prepare the annotated Curlie dataset for the public release.

In [1]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

import json
import os

import hydra
import rootutils

import pandas as pd

In [2]:
# Clear the global Hydra instance first so this cell can be re-executed,
# then initialize Hydra against the project's config directory
hydra.core.global_hydra.GlobalHydra.instance().clear()
h = hydra.initialize(version_base=None, config_path="../conf", job_name="eda")

# Set up the project root (starting from the current directory) and expose it
# through the project-root environment variable
root_path = rootutils.setup_root(".")
rootutils.set_root(path=root_path, project_root_env_var=True)

In [3]:
# Data folder, expressed relative to the notebook's working directory
DATA_DIR = os.path.join("..", "data")

# Folder that receives the released CSV files; created up front, and an
# already existing folder is accepted
RELEASE_DIR = os.path.join(DATA_DIR, "release")
os.makedirs(RELEASE_DIR, exist_ok=True)

### Load the data

---

#### Features

In [4]:
# Compose the "eda" config with the `data` group overridden to `curlie`
curlie_cfg = hydra.compose(config_name="eda", overrides=["data=curlie"])

# Instantiate the object described by the `data` section of that config
curlie_data = hydra.utils.instantiate(curlie_cfg.data)

# Get data for the individual stages of the pipeline.
# `raw_data` is later filtered on its "wid" column to look up the "url" column.
# NOTE(review): `processed_data` and `embedded_data` are not referenced again in
# the visible cells -- confirm they are still needed for the release.
raw_data = curlie_data.get_raw_data()
processed_data = curlie_data.get_processed_data()
embedded_data = curlie_data.get_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


#### Labels

In [5]:
# Labeler configurations whose labels are exported below
gpt_labeler_names = ["gpt3.5-oneshot-context2", "gpt4-zeroshot-context2"]

# One composed "eda" config per labeler, selected through the `labeler` override
gpt_labelers_cfg = {
    name: hydra.compose(config_name="eda", overrides=[f"labeler={name}"])
    for name in gpt_labeler_names
}

# Instantiate each labeler from its config; all of them receive the same data object
gpt_labelers = {
    name: hydra.utils.instantiate(labeler_cfg.labeler, data=curlie_data)
    for name, labeler_cfg in gpt_labelers_cfg.items()
}

#### Categories

In [6]:
# Load the category names: the top-level keys of `categories.json`, in file order
path = os.path.join(root_path, "data", "meta", "categories.json")
# Read the file as UTF-8 explicitly. Without `encoding`, `open` uses the platform
# locale, which can mis-decode non-ASCII category names on non-UTF-8 systems.
with open(path, encoding="utf-8") as f:
    categories = list(json.load(f).keys())

### Prepare the data

---

For each labeler, we build a dataset containing the `wid` and `url` columns plus one column per category holding the one-hot encoded labels.

In [7]:
# Build the `wid -> url` lookup once instead of scanning `raw_data` for every label.
# `keep="first"` preserves the previous behaviour of using the first matching row
# when a `wid` occurs more than once.
wid_to_url = (
    raw_data.drop_duplicates(subset="wid", keep="first")
    .set_index("wid")["url"]
    .to_dict()
)

# One (labeler name, DataFrame) pair per labeler; each frame has the columns
# `wid`, `url` and one column per category
dfs = []
for name, labeler in gpt_labelers.items():
    labels = labeler.get_labels()
    rows = []
    for wid, label in labels.items():
        # The lookup is done on the integer form of the label key
        key = int(wid)
        if key not in wid_to_url:
            # No raw record for this website, so there is no URL to release
            continue
        row = {"wid": wid, "url": wid_to_url[key]}
        # Position i of the label vector belongs to categories[i]
        row.update({categories[i]: v for i, v in enumerate(label["labels"])})
        rows.append(row)
    df = pd.DataFrame(rows)
    dfs.append((name, df))

Finally, save these into the predefined location:

In [9]:
# Resolve every output path before writing anything. Only the part of the labeler
# name before the first "-" ends up in the file name, so two labelers sharing that
# prefix would otherwise silently overwrite each other's CSV.
release_targets = {}
for name, df in dfs:
    path = os.path.join(RELEASE_DIR, f"curlie-10k-{name.split('-')[0]}.csv")
    if path in release_targets:
        other_name = release_targets[path][0]
        raise ValueError(
            f"Labelers {other_name!r} and {name!r} would both be written to {path!r}"
        )
    release_targets[path] = (name, df)

# Write one CSV per labeler, without the DataFrame index
for path, (name, df) in release_targets.items():
    df.to_csv(path, index=False)

---