# Data Loading

This notebook shows how to download/ load the data and models which are used in
this project.


In [1]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

# External imports
import os
import time
import logging

# Internal imports
import ml_project_2_mlp.utils as utils

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Demo: Homepage2Vec model
homepage2vec_path = os.path.join(conf.MODELS_PATH, "homepage2vec")
os.system(f"rm {homepage2vec_path}/*")

start = time.time()
model, features = utils.load_homepage2vec(homepage2vec_path)

print(f"\n✅ (Down)loaded homepage2vec model ({(time.time() - start):.2f}s).")

start = time.time()
model, features = utils.load_homepage2vec(homepage2vec_path)
print(f"✅ Loaded homepage2vec model ({(time.time() - start):.2f}s).")

In [18]:
# Demo: Raw Crowdsourced data
raw_crowdsourced_data_path = os.path.join("..", "data", "crowdsourced", "raw")
expected_files = ["labeled.csv", "categories.json"]
os.system(f"rm {raw_crowdsourced_data_path}/*")

start = time.time()
utils.download_if_not_present(
    dir_path=raw_crowdsourced_data_path,
    gdrive_url="https://drive.google.com/u/0/uc?id=1U1mDeKOkkdn0yVOGOEUWE7OQcZ_JAV-w&export=download",
    expected_files=expected_files,
)
print(
    f"\n✅ (Down)loaded raw crowdsourced data ({(time.time() - start):.2f}s).")

start = time.time()
labeled, categories = utils.load(
    dir_path=raw_crowdsourced_data_path,
    expected_files=expected_files,
)
print(f"✅ Loaded raw crowdsourced data ({(time.time() - start):.2f}s).")

labeled.head()

rm: ../data/crowdsourced/raw/*: No such file or directory
Downloading...
From: https://drive.google.com/u/0/uc?id=1U1mDeKOkkdn0yVOGOEUWE7OQcZ_JAV-w&export=download
To: /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/data/crowdsourced/raw.zip
100%|██████████| 267k/267k [00:00<00:00, 5.34MB/s]


✅ (Down)loaded raw crowdsourced data (1.10s).
✅ Loaded raw crowdsourced data (0.02s).





Unnamed: 0,HITId,HITTypeId,Title,Description,Keywords,Reward,CreationTime,MaxAssignments,RequesterAnnotation,AssignmentDurationInSeconds,...,Input.uid,Input.url,Input.screenshot,Input.title,Input.description,Input.class,Input.bin,Answer.taskAnswers,Approve,Reject
0,3U74KRR67M668DZKK1B6TSTOB6QTNJ,3WIABKGAUQLG11JEPH5ADA0NHT8E66,Select all categories that are relevant for th...,"Given a screenshot, title, and description of ...","categorize, websites, topic",$0.10,Thu Jun 03 06:06:21 PDT 2021,3,BatchId:4464614;OriginalHitTemplateId:928390873;,3600,...,1161124,www.pointlesssites.com,1161124.jpeg,PointlessSites.com Fun Things To Do When You'...,Are you bored? Want something fun to do? Check...,2,3,"[{""category-0"":""NO"",""category-1"":""NO"",""categor...",,
1,3U74KRR67M668DZKK1B6TSTOB6QTNJ,3WIABKGAUQLG11JEPH5ADA0NHT8E66,Select all categories that are relevant for th...,"Given a screenshot, title, and description of ...","categorize, websites, topic",$0.10,Thu Jun 03 06:06:21 PDT 2021,3,BatchId:4464614;OriginalHitTemplateId:928390873;,3600,...,1161124,www.pointlesssites.com,1161124.jpeg,PointlessSites.com Fun Things To Do When You'...,Are you bored? Want something fun to do? Check...,2,3,"[{""category-0"":""NO"",""category-1"":""NO"",""categor...",,
2,3U74KRR67M668DZKK1B6TSTOB6QTNJ,3WIABKGAUQLG11JEPH5ADA0NHT8E66,Select all categories that are relevant for th...,"Given a screenshot, title, and description of ...","categorize, websites, topic",$0.10,Thu Jun 03 06:06:21 PDT 2021,3,BatchId:4464614;OriginalHitTemplateId:928390873;,3600,...,1161124,www.pointlesssites.com,1161124.jpeg,PointlessSites.com Fun Things To Do When You'...,Are you bored? Want something fun to do? Check...,2,3,"[{""category-0"":""YES"",""category-1"":""NO"",""catego...",,
3,3ZURAPD2888TB1AUKU8JFH1KHWE1FC,3WIABKGAUQLG11JEPH5ADA0NHT8E66,Select all categories that are relevant for th...,"Given a screenshot, title, and description of ...","categorize, websites, topic",$0.10,Thu Jun 03 06:06:31 PDT 2021,3,BatchId:4464614;OriginalHitTemplateId:928390873;,3600,...,1081241,www.connecticutplastics.com,1081241.jpeg,"Medical Manufacturing Orthopaedic, Medical, S...",MW Life Sciences offers advanced medical manuf...,1,3,"[{""category-0"":""NO"",""category-1"":""YES"",""catego...",,
4,3ZURAPD2888TB1AUKU8JFH1KHWE1FC,3WIABKGAUQLG11JEPH5ADA0NHT8E66,Select all categories that are relevant for th...,"Given a screenshot, title, and description of ...","categorize, websites, topic",$0.10,Thu Jun 03 06:06:31 PDT 2021,3,BatchId:4464614;OriginalHitTemplateId:928390873;,3600,...,1081241,www.connecticutplastics.com,1081241.jpeg,"Medical Manufacturing Orthopaedic, Medical, S...",MW Life Sciences offers advanced medical manuf...,1,3,"[{""category-0"":""NO"",""category-1"":""NO"",""categor...",,


In [19]:
# Demo: Processed Crowdsourced data
processed_crowdsourced_data_path = os.path.join(
    "..", "data", "crowdsourced", "processed"
)
expected_files = ["websites.csv"]
os.system(f"rm {processed_crowdsourced_data_path}/*")

start = time.time()
utils.download_if_not_present(
    dir_path=processed_crowdsourced_data_path,
    gdrive_url="https://drive.google.com/u/0/uc?id=1Hyg6ASSVIdUHXagx2TUWwjXVIGUTIew_&export=download",
    expected_files=expected_files,
)
print(
    f"\n✅ (Down)loaded processed crowdsourced data ({(time.time() - start):.2f}s).")

start = time.time()
websites = utils.load(
    dir_path=processed_crowdsourced_data_path,
    expected_files=expected_files,
)
print(f"✅ Loaded crowdsourced data ({(time.time() - start):.2f}s).")

websites[0].head()

rm: ../data/crowdsourced/processed/*: No such file or directory
Downloading...
From: https://drive.google.com/u/0/uc?id=1Hyg6ASSVIdUHXagx2TUWwjXVIGUTIew_&export=download
To: /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/data/crowdsourced/processed.zip
100%|██████████| 13.9k/13.9k [00:00<00:00, 28.1MB/s]


✅ (Down)loaded processed crowdsourced data (0.90s).
✅ Loaded crowdsourced data (0.00s).





Unnamed: 0,Input.uid,Input.url,Arts,Business,Computers,Games,Health,Home,Kids_and_Teens,News,Recreation,Reference,Science,Shopping,Society,Sports
0,125542,www.arbeidstilsynet.no,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,246754,www.openluchtmuseum.nl,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,290883,www.sight.mksat.net,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,312868,daarb.narod.ru,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,382929,www.inexmoda.org.co,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# Demo: Embedded Crowdsourced data
embedded_crowdsourced_data_path = os.path.join("..", "data", "crowdsourced", "embedded")
expected_files = ["embeddings.pt", "labels.pt"]
os.system(f"rm {embedded_crowdsourced_data_path}/*")

start = time.time()
utils.download_if_not_present(
    dir_path=embedded_crowdsourced_data_path,
    gdrive_url="https://drive.google.com/u/0/uc?id=1bYLc6DvZZT7JGVrSZjt54Ciw7qH6MMWn&export=download",
    expected_files=expected_files,
)
print(f"\n✅ (Down)loaded embedded crowdsourced data ({(time.time() - start):.2f}s).")

start = time.time()
embeddings, labels = utils.load(
    dir_path=embedded_crowdsourced_data_path,
    expected_files=expected_files,
)
print(f"✅ Loaded crowdsourced data ({(time.time() - start):.2f}s).")

embeddings.shape, labels.shape

rm: ../data/crowdsourced/embedded/*: No such file or directory
Downloading...
From: https://drive.google.com/u/0/uc?id=1bYLc6DvZZT7JGVrSZjt54Ciw7qH6MMWn&export=download
To: /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/data/crowdsourced/embedded.zip
100%|██████████| 10.2M/10.2M [00:00<00:00, 11.2MB/s]


✅ (Down)loaded embedded crowdsourced data (2.52s).
✅ Loaded crowdsourced data (0.00s).





(torch.Size([769, 4665]), torch.Size([769, 14]))

In [None]:
# Demo: Curlie data (excludes raw HTML content)
curlie_path = os.path.join(conf.DATA_PATH, "curlie")
os.system(f"rm {curlie_path}/*")

start = time.time()
curlie_data = utils.load_curlie_data(curlie_path)

print(f"\n✅ (Down)loaded Curlie data ({(time.time() - start):.2f}s).")

start = time.time()
curlie_data = utils.load_curlie_data(curlie_path)

print(f"✅ Loaded Curlie data ({(time.time() - start):.2f}s).")