# Homepage2Vec Demo


In [1]:
# ruff: noqa
%reload_ext autoreload
%autoreload 2

import os
import torch

from homepage2vec.model import WebsiteClassifier as Homepage2Vec
from ml_project_2_mlp.homepage2vec.model import WebsiteClassifier as LocalHomepage2Vec

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory layout (relative paths, anchored one level above the working directory)
ROOT_DIR = ".."
MODEL_DIR = os.path.join(ROOT_DIR, "models")

# Model directories: the base homepage2vec folder and the two fine-tuned variants
HOMEPAGE2VEC_DIR = os.path.join(MODEL_DIR, "homepage2vec")
GPT_3_DIR, GPT_4_DIR = (
    os.path.join(MODEL_DIR, "finetuned", variant) for variant in ("gpt3.5", "gpt4")
)

# Checkpoint paths (only needed when loading raw Lightning checkpoints via `get_state_dict`)
# NOTE(review): assumes ROOT_DIR is the repository root that contains `logs/` — confirm before uncommenting.
# CKPT_GPT3_PATH = os.path.join(ROOT_DIR, "logs/train/multiruns/2023-12-21_10-50-22/66/checkpoints/epoch_031.ckpt")
# CKPT_GPT4_PATH = os.path.join(ROOT_DIR, "logs/train/multiruns/2023-12-21_10-50-22/73/checkpoints/epoch_046.ckpt")

In [3]:
# Homepage URL that every model in this notebook is asked to classify
WEBSITE = "https://www.epfl.ch"

In [4]:
def get_state_dict(ckpt_path: str, map_location="cpu") -> dict:
    """
    Load the wrapped model's state dict from a PyTorch Lightning checkpoint.

    Only entries whose key starts with the ``"model."`` prefix are kept, and
    exactly that leading prefix is removed from each key. Later occurrences of
    ``"model."`` inside a key (e.g. ``"model.sub_model.bias"``) are preserved.

    Args:
        ckpt_path: Path to the checkpoint file.
        map_location: Forwarded to ``torch.load``. Defaults to ``"cpu"`` so
            that checkpoints saved on a GPU can be loaded on a machine
            without one.

    Returns:
        Dict mapping the prefix-stripped parameter names to their tensors.

    Raises:
        KeyError: If the checkpoint has no ``"state_dict"`` entry.
    """
    prefix = "model."
    checkpoint = torch.load(ckpt_path, map_location=map_location)
    state_dict = checkpoint["state_dict"]
    # Strip only the leading prefix; `str.replace` would also mangle keys that
    # contain "model." further inside the name.
    return {
        key[len(prefix):]: tensor
        for key, tensor in state_dict.items()
        if key.startswith(prefix)
    }

def sort_scores(scores: dict) -> dict:
    """Return a new dict with the entries ordered from highest to lowest score."""
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

### Homepage2Vec

A quick demo of the online version of Homepage2Vec.

In [5]:
# Instantiate the classifier from the installed `homepage2vec` package
model = Homepage2Vec()

# Fetch the homepage configured in WEBSITE
website = model.fetch_website(WEBSITE)

# Run the prediction; it yields per-class scores and the embeddings
scores, embeddings = model.predict(website)

# Report every class on its own line, highest score first
print("Classes probabilities:")
print(*(f"{label}: {prob}" for label, prob in sort_scores(scores).items()), sep="\n")

Classes probabilities:
Science: 0.9571831822395325
Reference: 0.9026567935943604
Kids_and_Teens: 0.5716081857681274
Arts: 0.5242454409599304
Society: 0.27132657170295715
Business: 0.08574333041906357
News: 0.04988005757331848
Computers: 0.02889983355998993
Recreation: 0.02259485051035881
Health: 0.009944452904164791
Shopping: 0.0012295827036723495
Sports: 0.0008726614178158343
Home: 0.00036933409865014255
Games: 1.9969978893641382e-05


### Finetuned Homepage2Vec

In this section we load the weights of our **finetuned models** to check that everything works as expected.

**Finetuned on `curlie-gpt3.5-10k`**

In [6]:
# Instantiate the local classifier with the weights stored under GPT_3_DIR
model = LocalHomepage2Vec(model_dir=GPT_3_DIR)

# Fetch the homepage configured in WEBSITE
website = model.fetch_website(WEBSITE)

# Run the prediction; it yields per-class scores and the embeddings
scores, embeddings = model.predict(website)

# Report every class on its own line, highest score first
print("Classes probabilities:")
print(*(f"{label}: {prob}" for label, prob in sort_scores(scores).items()), sep="\n")

Classes probabilities:
Science: 0.7964304685592651
Reference: 0.7635273933410645
Society: 0.5921807289123535
News: 0.5681739449501038
Arts: 0.5368830561637878
Kids_and_Teens: 0.508224606513977
Computers: 0.3853667974472046
Business: 0.3469756543636322
Health: 0.33045274019241333
Recreation: 0.2595757842063904
Sports: 0.11109738796949387
Home: 0.10465463995933533
Shopping: 0.06366318464279175
Games: 0.031982336193323135


**Finetuned on `curlie-gpt4-10k`**

In [7]:
# Instantiate the local classifier with the weights stored under GPT_4_DIR
model = LocalHomepage2Vec(model_dir=GPT_4_DIR)

# Fetch the homepage configured in WEBSITE
website = model.fetch_website(WEBSITE)

# Run the prediction; it yields per-class scores and the embeddings
scores, embeddings = model.predict(website)

# Report every class on its own line, highest score first
print("Classes probabilities:")
print(*(f"{label}: {prob}" for label, prob in sort_scores(scores).items()), sep="\n")

Classes probabilities:
Science: 0.9800454378128052
Reference: 0.7734379768371582
Society: 0.369582861661911
Business: 0.17371125519275665
Recreation: 0.12732912600040436
Computers: 0.0544419139623642
Health: 0.02717871218919754
Arts: 0.008378472179174423
Sports: 0.007091645151376724
Kids_and_Teens: 0.002522727008908987
News: 0.00039661736809648573
Home: 0.000121747434604913
Shopping: 6.818987458245829e-05
Games: 1.6320313079631887e-05
