# Data Loading

This notebook shows how to download and load the data and models used in this
project.


In [None]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

# External imports
import os
import time
import logging

# Internal imports
import ml_project_2_mlp.utils as utils
import ml_project_2_mlp.conf as conf
import ml_project_2_mlp.log as log

In [None]:
# Configure logging through the project's `log` helper, passing the standard
# library's INFO level so that informational messages are not filtered out.
# NOTE(review): which loggers/handlers `setup_logging` touches is defined in
# ml_project_2_mlp.log and is not visible from this notebook.
log.setup_logging(logging.INFO)

In [None]:
# Directories the demo cells below read from and write to
paths = [conf.MODELS_PATH, conf.DATA_PATH]

# Make sure each of them exists before anything is loaded;
# `exist_ok=True` keeps this cell safe to re-run.
for directory in paths:
    os.makedirs(directory, exist_ok=True)

In [None]:
# Demo: Homepage2Vec model
homepage2vec_path = os.path.join(conf.MODELS_PATH, "homepage2vec")

# Clear the files left by a previous run so the first call below starts from an
# empty directory. This is done in pure Python rather than by shelling out to
# `rm`: it works for paths containing spaces, on platforms without `rm`, and
# stays quiet when the directory is missing or empty. Like the `rm <dir>/*`
# glob it replaces, dotfiles and subdirectories are left untouched.
if os.path.isdir(homepage2vec_path):
    for name in os.listdir(homepage2vec_path):
        cached_file = os.path.join(homepage2vec_path, name)
        is_removable = os.path.isfile(cached_file) or os.path.islink(cached_file)
        if is_removable and not name.startswith("."):
            os.remove(cached_file)

# Time the first call, which runs against the freshly cleared directory.
# perf_counter() is monotonic, so the measurement is immune to clock changes.
start = time.perf_counter()
model, features = utils.load_homepage2vec(homepage2vec_path)

print(f"\n✅ (Down)loaded homepage2vec model ({(time.perf_counter() - start):.2f}s).")

# Time a second call on the same directory for comparison
# (presumably served from the files the first call stored — see utils).
start = time.perf_counter()
model, features = utils.load_homepage2vec(homepage2vec_path)
print(f"✅ Loaded homepage2vec model ({(time.perf_counter() - start):.2f}s).")

In [None]:
# Demo: Crowdsourced data
crowdsourced_data = os.path.join(conf.DATA_PATH, "crowdsourced")

# Clear the files left by a previous run so the first call below starts from an
# empty directory. This is done in pure Python rather than by shelling out to
# `rm`: it works for paths containing spaces, on platforms without `rm`, and
# stays quiet when the directory is missing or empty. Like the `rm <dir>/*`
# glob it replaces, dotfiles and subdirectories are left untouched.
if os.path.isdir(crowdsourced_data):
    for name in os.listdir(crowdsourced_data):
        cached_file = os.path.join(crowdsourced_data, name)
        is_removable = os.path.isfile(cached_file) or os.path.islink(cached_file)
        if is_removable and not name.startswith("."):
            os.remove(cached_file)

# Time the first call, which runs against the freshly cleared directory.
# perf_counter() is monotonic, so the measurement is immune to clock changes.
start = time.perf_counter()
labeled, categories = utils.load_crowdsourced_data(crowdsourced_data)

print(f"\n✅ (Down)loaded crowdsourced data ({(time.perf_counter() - start):.2f}s).")

# Time a second call on the same directory for comparison
# (presumably served from the files the first call stored — see utils).
start = time.perf_counter()
labeled, categories = utils.load_crowdsourced_data(crowdsourced_data)

print(f"✅ Loaded crowdsourced data ({(time.perf_counter() - start):.2f}s).")

In [None]:
# Demo: Curlie data (excludes raw HTML content)
curlie_path = os.path.join(conf.DATA_PATH, "curlie")

# Clear the files left by a previous run so the first call below starts from an
# empty directory. This is done in pure Python rather than by shelling out to
# `rm`: it works for paths containing spaces, on platforms without `rm`, and
# stays quiet when the directory is missing or empty. Like the `rm <dir>/*`
# glob it replaces, dotfiles and subdirectories are left untouched.
if os.path.isdir(curlie_path):
    for name in os.listdir(curlie_path):
        cached_file = os.path.join(curlie_path, name)
        is_removable = os.path.isfile(cached_file) or os.path.islink(cached_file)
        if is_removable and not name.startswith("."):
            os.remove(cached_file)

# Time the first call, which runs against the freshly cleared directory.
# perf_counter() is monotonic, so the measurement is immune to clock changes.
start = time.perf_counter()
curlie_data = utils.load_curlie_data(curlie_path)

print(f"\n✅ (Down)loaded Curlie data ({(time.perf_counter() - start):.2f}s).")

# Time a second call on the same directory for comparison
# (presumably served from the files the first call stored — see utils).
start = time.perf_counter()
curlie_data = utils.load_curlie_data(curlie_path)

print(f"✅ Loaded Curlie data ({(time.perf_counter() - start):.2f}s).")