# Homepage2Vec Demo


In [1]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

import os
import torch

from homepage2vec.model import WebsiteClassifier as Homepage2Vec
from ml_project_2_mlp.homepage2vec.model import WebsiteClassifier as LocalHomepage2Vec

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths
# Every location is derived from ROOT_DIR so the notebook does not depend on
# one particular machine's directory layout.
# NOTE(review): ROOT_DIR = ".." assumes the kernel's working directory sits one
# level below the repository root -- confirm for your setup.
ROOT_DIR = ".."
MODEL_DIR = os.path.join(ROOT_DIR, "models")
HOMEPAGE2VEC_DIR = os.path.join(MODEL_DIR, "homepage2vec")
GPT_3_DIR = os.path.join(MODEL_DIR, "finetuned", "gpt3.5")
GPT_4_DIR = os.path.join(MODEL_DIR, "finetuned", "gpt4")

# Checkpoint paths
# Previously hard-coded as absolute paths inside a single user's home directory;
# now expressed relative to the repository's training-log directory.
# NOTE(review): assumes "logs/train/multiruns" lives directly under ROOT_DIR,
# as the original absolute paths suggest -- verify.
MULTIRUN_DIR = os.path.join(ROOT_DIR, "logs", "train", "multiruns")
CKPT_GPT3_PATH = os.path.join(
    MULTIRUN_DIR, "2023-12-19_00-31-07", "18", "checkpoints", "epoch_031.ckpt"
)
CKPT_GPT4_PATH = os.path.join(
    MULTIRUN_DIR, "2023-12-20_22-48-11", "9", "checkpoints", "epoch_039.ckpt"
)

In [3]:
# Website to classify: the URL passed to `fetch_website` by every model in the cells below
WEBSITE = "https://www.epfl.ch"

In [4]:
def sort_scores(scores: dict) -> dict:
    """Return a new dict with the entries of ``scores`` ordered from highest to lowest value.

    Entries with equal values keep their original relative order.
    """
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)

### Homepage2Vec

A quick demo of the online version of Homepage2Vec.

In [5]:
# Instantiate the classifier from the installed `homepage2vec` package
model = Homepage2Vec()

# Fetch the page behind WEBSITE through the model's own helper
website = model.fetch_website(WEBSITE)

# Run the model on the fetched page; it returns the per-class scores and the embeddings
scores, embeddings = model.predict(website)

# One "<class>: <score>" line per class, highest score first
print("Classes probabilities:")
print(*(f"{label}: {value}" for label, value in sort_scores(scores).items()), sep="\n")

Classes probabilities:
Science: 0.9571831822395325
Reference: 0.9026567935943604
Kids_and_Teens: 0.5716081857681274
Arts: 0.5242454409599304
Society: 0.27132657170295715
Business: 0.08574333041906357
News: 0.04988005757331848
Computers: 0.02889983355998993
Recreation: 0.02259485051035881
Health: 0.009944452904164791
Shopping: 0.0012295827036723495
Sports: 0.0008726614178158343
Home: 0.00036933409865014255
Games: 1.9969978893641382e-05


### Finetuned Homepage2Vec

In this section we load the weights of our **finetuned models** to check that everything works as expected.

**Finetuned on `curlie-gpt3.5-10k`**

In [6]:
# Instantiate the local classifier, pointing it at the GPT-3.5 finetuned model directory
model = LocalHomepage2Vec(model_dir=GPT_3_DIR)

# Fetch the page behind WEBSITE through the model's own helper
website = model.fetch_website(WEBSITE)

# Run the model on the fetched page; it returns the per-class scores and the embeddings
scores, embeddings = model.predict(website)

# One "<class>: <score>" line per class, highest score first
print("Classes probabilities:")
print(*(f"{label}: {value}" for label, value in sort_scores(scores).items()), sep="\n")

Classes probabilities:
Science: 0.8412251472473145
Reference: 0.7838622331619263
Society: 0.5793622732162476
Kids_and_Teens: 0.4533769190311432
Arts: 0.44181838631629944
Computers: 0.4027433693408966
News: 0.37343618273735046
Health: 0.3063579201698303
Business: 0.24803754687309265
Recreation: 0.19114521145820618
Sports: 0.0962551087141037
Home: 0.037589993327856064
Shopping: 0.028511211276054382
Games: 0.028346922248601913


**Finetuned on `curlie-gpt4-10k`**

In [7]:
# Instantiate the local classifier, pointing it at the GPT-4 finetuned model directory
model = LocalHomepage2Vec(model_dir=GPT_4_DIR)

# Fetch the page behind WEBSITE through the model's own helper
website = model.fetch_website(WEBSITE)

# Run the model on the fetched page; it returns the per-class scores and the embeddings
scores, embeddings = model.predict(website)

# One "<class>: <score>" line per class, highest score first
print("Classes probabilities:")
print(*(f"{label}: {value}" for label, value in sort_scores(scores).items()), sep="\n")

Classes probabilities:
Science: 0.9493160843849182
Reference: 0.659612774848938
Society: 0.49875521659851074
Business: 0.3018222749233246
Computers: 0.18622104823589325
News: 0.09135214984416962
Recreation: 0.08869662880897522
Arts: 0.05620817095041275
Kids_and_Teens: 0.04272356256842613
Health: 0.03221626207232475
Sports: 0.01675873063504696
Home: 0.003997097257524729
Shopping: 0.002112273359671235
Games: 0.0012009597849100828
