# Data Loading

This notebook shows how to download and load the data and models used in
this project.


In [None]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

# External imports
import os
import time
from tqdm import tqdm
import pickle

# Internal imports
import ml_project_2_mlp.utils as utils

In [None]:
# Demo: Homepage2Vec model
# Clears any local copy of the model files, fetches them again from Google
# Drive, and then loads them from disk, timing both steps.
homepage2vec_path = os.path.join("..", "models", "homepage2vec")
expected_files = ["features.txt", "model.pt"]

# Delete previously downloaded files in pure Python rather than shelling out
# to `rm` (portable, no path interpolation into a shell command). Hidden files
# and sub-directories are kept, which is what the shell glob `*` left alone too.
if os.path.isdir(homepage2vec_path):
    for file_name in os.listdir(homepage2vec_path):
        file_path = os.path.join(homepage2vec_path, file_name)
        if not file_name.startswith(".") and os.path.isfile(file_path):
            os.remove(file_path)

# Step 1: download. The directory was just emptied, so the helper has to fetch.
start = time.time()
utils.download_if_not_present(
    dir_path=homepage2vec_path,
    gdrive_url="https://drive.google.com/u/2/uc?id=1bE8ttkcgH9nMCobXIjPx05ezipklvsJ3&export=download",
    expected_files=expected_files,
)

print(f"\n✅ (Down)loaded homepage2vec model ({(time.time() - start):.2f}s).")

# Step 2: load from disk.
# NOTE(review): the file order here differs from `expected_files` above;
# presumably utils.load returns objects in the order requested — confirm.
start = time.time()
model, features = utils.load(
    dir_path=homepage2vec_path,
    expected_files=["model.pt", "features.txt"],
)
print(f"✅ Loaded homepage2vec model ({(time.time() - start):.2f}s).")

In [None]:
# Demo: Raw Crowdsourced data
# Clears any local copy of the raw crowdsourced files, fetches them again from
# Google Drive, and then loads them from disk, timing both steps.
raw_crowdsourced_data_path = os.path.join("..", "data", "crowdsourced", "raw")
expected_files = ["labeled.csv", "categories.json"]

# Delete previously downloaded files in pure Python rather than shelling out
# to `rm` (portable, no path interpolation into a shell command). Hidden files
# and sub-directories are kept, which is what the shell glob `*` left alone too.
if os.path.isdir(raw_crowdsourced_data_path):
    for file_name in os.listdir(raw_crowdsourced_data_path):
        file_path = os.path.join(raw_crowdsourced_data_path, file_name)
        if not file_name.startswith(".") and os.path.isfile(file_path):
            os.remove(file_path)

# Step 1: download. The directory was just emptied, so the helper has to fetch.
start = time.time()
utils.download_if_not_present(
    dir_path=raw_crowdsourced_data_path,
    gdrive_url="https://drive.google.com/u/0/uc?id=1U1mDeKOkkdn0yVOGOEUWE7OQcZ_JAV-w&export=download",
    expected_files=expected_files,
)
print(f"\n✅ (Down)loaded raw crowdsourced data ({(time.time() - start):.2f}s).")

# Step 2: load from disk; one object is unpacked per entry in `expected_files`.
start = time.time()
labeled, categories = utils.load(
    dir_path=raw_crowdsourced_data_path,
    expected_files=expected_files,
)
print(f"✅ Loaded raw crowdsourced data ({(time.time() - start):.2f}s).")

# Preview the first rows of the labeled data as the cell output.
labeled.head()

In [None]:
# Demo: Processed Crowdsourced data
# Clears any local copy of the processed crowdsourced files, fetches them again
# from Google Drive, and then loads them from disk, timing both steps.
processed_crowdsourced_data_path = os.path.join(
    "..", "data", "crowdsourced", "processed"
)
expected_files = ["websites.csv", "content.pkl"]

# Delete previously downloaded files in pure Python rather than shelling out
# to `rm` (portable, no path interpolation into a shell command). Hidden files
# and sub-directories are kept, which is what the shell glob `*` left alone too.
if os.path.isdir(processed_crowdsourced_data_path):
    for file_name in os.listdir(processed_crowdsourced_data_path):
        file_path = os.path.join(processed_crowdsourced_data_path, file_name)
        if not file_name.startswith(".") and os.path.isfile(file_path):
            os.remove(file_path)

# Step 1: download. The directory was just emptied, so the helper has to fetch.
start = time.time()
utils.download_if_not_present(
    dir_path=processed_crowdsourced_data_path,
    gdrive_url="https://drive.google.com/u/0/uc?id=1Hyg6ASSVIdUHXagx2TUWwjXVIGUTIew_&export=download",
    expected_files=expected_files,
)
print(
    f"\n✅ (Down)loaded processed crowdsourced data ({(time.time() - start):.2f}s)."
)

# Step 2: load from disk; one object is unpacked per entry in `expected_files`.
start = time.time()
websites, content = utils.load(
    dir_path=processed_crowdsourced_data_path,
    expected_files=expected_files,
)
# Message now names the processed dataset (it previously read just
# "crowdsourced data", indistinguishable from the other demo cells).
print(f"✅ Loaded processed crowdsourced data ({(time.time() - start):.2f}s).")

# NOTE(review): `websites` is indexed with 0 before `.head()`; the type that
# utils.load returns for websites.csv is not visible here — confirm that this
# indexing is intended (the raw-data cell calls `.head()` directly).
websites[0].head()

In [None]:
# Demo: Embedded Crowdsourced data
# Clears any local copy of the embedded crowdsourced files, fetches them again
# from Google Drive, and then loads them from disk, timing both steps.
embedded_crowdsourced_data_path = os.path.join("..", "data", "crowdsourced", "embedded")
expected_files = ["embeddings.pt", "labels.pt"]

# Delete previously downloaded files in pure Python rather than shelling out
# to `rm` (portable, no path interpolation into a shell command). Hidden files
# and sub-directories are kept, which is what the shell glob `*` left alone too.
if os.path.isdir(embedded_crowdsourced_data_path):
    for file_name in os.listdir(embedded_crowdsourced_data_path):
        file_path = os.path.join(embedded_crowdsourced_data_path, file_name)
        if not file_name.startswith(".") and os.path.isfile(file_path):
            os.remove(file_path)

# Step 1: download. The directory was just emptied, so the helper has to fetch.
start = time.time()
utils.download_if_not_present(
    dir_path=embedded_crowdsourced_data_path,
    gdrive_url="https://drive.google.com/u/0/uc?id=1bYLc6DvZZT7JGVrSZjt54Ciw7qH6MMWn&export=download",
    expected_files=expected_files,
)
print(f"\n✅ (Down)loaded embedded crowdsourced data ({(time.time() - start):.2f}s).")

# Step 2: load from disk; one object is unpacked per entry in `expected_files`.
start = time.time()
embeddings, labels = utils.load(
    dir_path=embedded_crowdsourced_data_path,
    expected_files=expected_files,
)
# Message now names the embedded dataset (it previously read just
# "crowdsourced data", indistinguishable from the other demo cells).
print(f"✅ Loaded embedded crowdsourced data ({(time.time() - start):.2f}s).")

# Show the shapes of both loaded objects as the cell output.
embeddings.shape, labels.shape

In [None]:
# Demo: Curlie data (excludes raw HTML content)
# Clears any local copy of the Curlie files, then calls the loader twice and
# times each call: the first on an emptied directory, the second on whatever
# the first call left on disk.
#
# The path is built relative to the notebook like the other demo cells. It
# previously read `conf.DATA_PATH`, but `conf` is never imported in this
# notebook, so the cell raised NameError on a fresh kernel.
# NOTE(review): assumes conf.DATA_PATH pointed at ../data — confirm.
curlie_path = os.path.join("..", "data", "curlie")

# Delete previously downloaded files in pure Python rather than shelling out
# to `rm` (portable, no path interpolation into a shell command). Hidden files
# and sub-directories are kept, which is what the shell glob `*` left alone too.
if os.path.isdir(curlie_path):
    for file_name in os.listdir(curlie_path):
        file_path = os.path.join(curlie_path, file_name)
        if not file_name.startswith(".") and os.path.isfile(file_path):
            os.remove(file_path)

# First call: runs against the freshly emptied directory.
start = time.time()
curlie_data = utils.load_curlie_data(curlie_path)

print(f"\n✅ (Down)loaded Curlie data ({(time.time() - start):.2f}s).")

# Second call: same loader again, for comparison with the first timing.
start = time.time()
curlie_data = utils.load_curlie_data(curlie_path)

print(f"✅ Loaded Curlie data ({(time.time() - start):.2f}s).")