# Data Loading

This notebook shows how to download and load the data and models that are used
in this project.


In [1]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

# External imports
import os
import time
import logging

# Internal imports
import ml_project_2_mlp.utils as utils
import ml_project_2_mlp.conf as conf
import ml_project_2_mlp.log as log

In [2]:
# Set up project logging at INFO level (use logging.DEBUG for more detailed logs)
logger = log.setup_logging(logging.INFO)

In [3]:
# The loaders in the following cells need these directories to exist already,
# so create them up front; directories that are already present are left as-is.
paths = [conf.MODELS_PATH, conf.DATA_PATH]
for path in paths:
    os.makedirs(path, exist_ok=True)

In [4]:
# Demo: Homepage2Vec model
homepage2vec_path = os.path.join(conf.MODELS_PATH, "homepage2vec")
os.system(f"rm {homepage2vec_path}/*")

start = time.time()
model, features = utils.load_homepage2vec(homepage2vec_path)

print(f"\n✅ (Down)loaded homepage2vec model ({(time.time() - start):.2f}s).")

start = time.time()
model, features = utils.load_homepage2vec(homepage2vec_path)
print(f"✅ Loaded homepage2vec model ({(time.time() - start):.2f}s).")

[INFO] (ml_project_2_mlp.utils._download_if_not_present) - /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/models/homepage2vec doesn't exist. Downloading from Google Drive...
Downloading...
From: https://drive.google.com/u/0/uc?id=17EAb6wgORzbu3xYAIkATzUu-hCKiP6A0&export=download
To: /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/models/homepage2vec.zip
100%|██████████| 17.7M/17.7M [00:01<00:00, 11.5MB/s]


✅ (Down)loaded homepage2vec model (3.24s).
✅ Loaded homepage2vec model (0.00s).





In [5]:
# Demo: Crowdsourced data
crowdsourced_data = os.path.join(conf.DATA_PATH, "crowdsourced")

# Clear the files of any previous local copy before timing the first load.
# This is done with `os` rather than shelling out to `rm <dir>/*`, which is
# POSIX-only and breaks on paths containing spaces or shell metacharacters.
# Like the shell glob it replaces, it skips dotfiles, leaves sub-directories
# in place and keeps the directory itself.
if os.path.isdir(crowdsourced_data):
    with os.scandir(crowdsourced_data) as entries:
        stale_files = [
            entry.path
            for entry in entries
            if not entry.name.startswith(".")
            and not entry.is_dir(follow_symlinks=False)
        ]
    for stale_file in stale_files:
        os.remove(stale_file)

# First call: runs right after the local files were cleared.
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments.
start = time.perf_counter()
labeled, categories = utils.load_crowdsourced_data(crowdsourced_data)

print(f"\n✅ (Down)loaded crowdsourced data ({(time.perf_counter() - start):.2f}s).")

# Second call: same path again, timed separately for comparison.
start = time.perf_counter()
labeled, categories = utils.load_crowdsourced_data(crowdsourced_data)

print(f"✅ Loaded crowdsourced data ({(time.perf_counter() - start):.2f}s).")

[INFO] (ml_project_2_mlp.utils._download_if_not_present) - /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/data/crowdsourced doesn't exist. Downloading from Google Drive...
Downloading...
From: https://drive.google.com/u/0/uc?id=1JUU2YyY9uX4kH7-yYzmB6r9gzVtBBrfz&export=download
To: /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/data/crowdsourced.zip
100%|██████████| 269k/269k [00:00<00:00, 5.69MB/s]


✅ (Down)loaded crowdsourced data (1.55s).
✅ Loaded crowdsourced data (0.02s).





In [7]:
# Demo: Curlie data (excludes raw HTML content)
curlie_path = os.path.join(conf.DATA_PATH, "curlie")

# Unlike the demos above, the local copy is intentionally not cleared here, so
# only a single load from `curlie_path` is timed.
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments.
start = time.perf_counter()
curlie_data = utils.load_curlie_data(curlie_path)

print(f"✅ Loaded Curlie data ({(time.perf_counter() - start):.2f}s).")

✅ Loaded Curlie data (4.05s).
