In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchsummary import summary
import pandas as pd

# Utility
import time
import wandb

from models.custom import RatingDataset
from models.cb import FeatureGenerator
from models.training import train_model, test_model
from utility import Mapper

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using " + DEVICE)
# Authenticate with Weights & Biases before any run is created.
wandb.login()

Using cuda


True

## Dataset

In [8]:
# Dataset variant to train on; it selects both the ratings folder and its mapper.
DATASET = "merged/full"

# `mapper` belongs to the selected dataset variant, `full_mapper` to the whole
# merged database. The ratings dataset is built with the full mapper.
mapper = Mapper.load(f"./database/{DATASET}/pydata/mapper.pkl")
full_mapper = Mapper.load(f"./database/merged/mapper.pkl")
data = RatingDataset(f"./database/{DATASET}", full_mapper, True)

# Entity counts come from the dataset-specific mapper and size the model later on.
NUM_USER = len(mapper.user_fwd_map)
print("Number of users:", NUM_USER)
NUM_ITEM = len(mapper.item_fwd_map)
print("Number of items:", NUM_ITEM)

Number of users: 200948
Number of items: 84432


## Hybrid Model Training

In [9]:
# Training parameters
EPOCHS = 20
BATCH = 4096
SEED = 291124

In [10]:
# Hyperparameters recorded alongside the run so that it can be compared later.
wandb_config = {
    "dataset": DATASET,
    "seed": SEED,
    "batch_size": BATCH,
    "epochs": EPOCHS,
    "device": DEVICE,
}
wandb.init(project="movie-recommendation-models", resume="allow", config=wandb_config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113026155554609, max=1.0…

### Train-Test Split

In [11]:
# 80/20 split driven by SEED.
train, test = data.split(0.8, seed=SEED)

# Only the training loader shuffles; the test loader keeps its default order.
train_loader = DataLoader(
    dataset=train,
    batch_size=BATCH,
    shuffle=True,
    num_workers=3,
)
test_loader = DataLoader(
    dataset=test,
    batch_size=BATCH,
    num_workers=3,
)

In [12]:
# NOTE(review): `f` and `n` are not referenced by any later cell visible in this
# notebook -- this looks like a scratch experiment; confirm before relying on it.
# Item-side feature generator built from the stored genre tensor ...
f = FeatureGenerator(
    torch.load("./models/cb/genres_with_ratings.pt")
).to(DEVICE)
# ... and a user embedding table with one row per user and the same width.
n = nn.Embedding(
    num_embeddings=NUM_USER,
    embedding_dim=f.num_feature,
    sparse=True,
).to(DEVICE)

In [13]:
class EmbededHybridNet(nn.Module):
    """Hybrid rating predictor combining fixed item features with learned user embeddings.

    Three feature families are handled in parallel (weighted genres, titles and
    plots, directors and casts). For each family the user embedding and the item
    features are concatenated and reduced to 2 values by a linear layer; the
    resulting 6 values go through a ReLU and a final linear layer that produces
    one score per (user, item) pair.

    Args:
        num_user: Number of rows in each user embedding table.
        genres: Passed to ``FeatureGenerator`` for the genre branch.
        titles_and_plots: Passed to ``FeatureGenerator`` for the title/plot branch.
        directors_and_casts: Passed to ``FeatureGenerator`` for the director/cast branch.

    NOTE(review): ``FeatureGenerator`` is defined outside this notebook. It is
    assumed to expose ``num_feature`` and to map a batch of item indices to a
    ``(batch, num_feature)`` tensor -- confirm against ``models.cb``.
    """

    def __init__(self, num_user: int, genres, titles_and_plots, directors_and_casts):
        super().__init__()
        # Item features
        self.item_weighted_genres = FeatureGenerator(genres)
        self.item_titles_plots = FeatureGenerator(titles_and_plots)
        self.item_directors_casts = FeatureGenerator(directors_and_casts)
        # User features: one embedding per family, as wide as the matching item features
        self.user_weighted_genres = nn.Embedding(num_user, self.item_weighted_genres.num_feature, sparse=True)
        self.user_titles_plots = nn.Embedding(num_user, self.item_titles_plots.num_feature, sparse=True)
        self.user_directors_casts = nn.Embedding(num_user, self.item_directors_casts.num_feature, sparse=True)
        # Features dense layers: [user ; item] (2 * num_feature) -> 2 values per family
        self.dense_weighted_genres = nn.Linear(self.item_weighted_genres.num_feature * 2, 2)
        self.dense_titles_plots = nn.Linear(self.item_titles_plots.num_feature * 2, 2)
        self.dense_directors_casts = nn.Linear(self.item_directors_casts.num_feature * 2, 2)
        # Dense: 3 families * 2 values -> 1 predicted score
        self.out = nn.Linear(6, 1, dtype=torch.float32)
        # Relu
        self.relu = nn.ReLU()

    # Implemented as ``forward`` (not ``__call__``) so that ``nn.Module.__call__``
    # keeps running registered forward / pre-forward hooks around it.
    # ``model(user, item)`` behaves exactly as before for callers.
    def forward(self, user, item):
        """Score each (user, item) pair.

        Args:
            user: Batch of user indices used to look up the user embeddings.
            item: Batch of item indices handed to the item feature generators.

        Returns:
            The output of the final ``nn.Linear(6, 1)`` layer, i.e. one value per pair.
        """
        # Weighted genres
        user_weighted_genres = self.user_weighted_genres(user)
        item_weighted_genres = self.item_weighted_genres(item)
        weighted_genres = torch.cat([user_weighted_genres, item_weighted_genres], dim=1)
        weighted_genres = self.dense_weighted_genres(weighted_genres)
        # Titles and plots
        user_titles_plots = self.user_titles_plots(user)
        item_titles_plots = self.item_titles_plots(item)
        titles_plots = torch.cat([user_titles_plots, item_titles_plots], dim=1)
        titles_plots = self.dense_titles_plots(titles_plots)
        # Directors and casts
        user_directors_casts = self.user_directors_casts(user)
        item_directors_casts = self.item_directors_casts(item)
        directors_casts = torch.cat([user_directors_casts, item_directors_casts], dim=1)
        directors_casts = self.dense_directors_casts(directors_casts)
        # Final denses: ReLU over the 6 concatenated branch outputs, then 6 -> 1
        x = self.relu(torch.cat([weighted_genres, titles_plots, directors_casts], dim=1))
        return self.out(x)

In [14]:
# SEED is logged to wandb and used for the split; also seed torch's global RNG so
# that weight initialisation (and the training loader's shuffling) is repeatable.
torch.manual_seed(SEED)

# map_location puts the stored tensors straight onto DEVICE, which also lets a
# checkpoint saved from a GPU be opened on a CPU-only machine.
genres = torch.load("./models/cb/genres_with_ratings.pt", map_location=DEVICE)
titles_and_plots = torch.load("./models/cb/titles_and_plots.pt", map_location=DEVICE)
directors_and_casts = torch.load("./models/cb/directors_and_cast.pt", map_location=DEVICE)

model = EmbededHybridNet(NUM_USER, genres, titles_and_plots, directors_and_casts).to(DEVICE)
loss_fn = nn.MSELoss()
l1 = nn.L1Loss()
# Learning rate spelled out: 1e-3 is the value recent PyTorch releases use when
# `lr` is omitted, while older releases require the argument.
optimizer = optim.SGD(model.parameters(), lr=1e-3)
# Lowest test loss seen so far; the training loop checkpoints whenever it improves.
best_val_loss = float("inf")
wandb.watch(model)

In [15]:
def logging(epoch, train_loss, train_mae, test_loss, test_mae, time):
    """Report one epoch's metrics to wandb and to stdout.

    Args:
        epoch: Epoch number shown in the message and logged as ``"epoch"``.
        train_loss: Training loss for the epoch.
        train_mae: Training mean absolute error for the epoch.
        test_loss: Test loss for the epoch.
        test_mae: Test mean absolute error for the epoch.
        time: Elapsed seconds; truncated to an int and shown as minutes/seconds.

    The printed line ends with a space instead of a newline so that the caller
    can append a status suffix on the same line.
    """
    minutes, seconds = divmod(int(time), 60)
    segments = [
        f"Epoch {epoch}: ",
        f"Train Loss: {train_loss:.4e} | ",
        f"Train MAE: {train_mae:.4e} | ",
        f"Test Loss: {test_loss:.4e} | ",
        f"Test MAE: {test_mae:.4e} | ",
        f"Time Taken: {minutes}m {seconds:02d}s",
    ]
    metrics = {
        "epoch": epoch,
        "train_loss": train_loss,
        "train_mae": train_mae,
        "test_loss": test_loss,
        "test_mae": test_mae,
    }
    wandb.log(metrics)
    print("".join(segments), end=" ")

In [16]:
# Train and evaluate once per epoch; checkpoint whenever the test loss improves.
for epoch in range(1, EPOCHS + 1):
    start_time = time.time()
    train_loss, train_mae = train_model(epoch, model, train_loader, loss_fn, optimizer, DEVICE)
    test_loss, test_mae = test_model(epoch, model, test_loader, loss_fn, DEVICE)
    time_taken = time.time() - start_time
    logging(epoch, train_loss, train_mae, test_loss, test_mae, time_taken)

    # logging() leaves the cursor on the same line: either just end the line ...
    if not (test_loss < best_val_loss):
        print()
        continue

    # ... or record the new best loss, save the weights, and say so on that line.
    best_val_loss = test_loss
    torch.save(model.state_dict(), "model.pth")
    print("> Checkpoint saved!")

Epoch 1: Train Loss: 5.8364e-02 | Train MAE: 1.8958e-01 | Test Loss: 4.6521e-02 | Test MAE: 1.7106e-01 | Time Taken: 5m 57s > Checkpoint saved!
Epoch 2: Train Loss: 4.5904e-02 | Train MAE: 1.7009e-01 | Test Loss: 4.5425e-02 | Test MAE: 1.6924e-01 | Time Taken: 6m 03s > Checkpoint saved!
Epoch 3: Train Loss: 4.5240e-02 | Train MAE: 1.6884e-01 | Test Loss: 4.5040e-02 | Test MAE: 1.6840e-01 | Time Taken: 6m 01s > Checkpoint saved!
Epoch 4: Train Loss: 4.4983e-02 | Train MAE: 1.6824e-01 | Test Loss: 4.4875e-02 | Test MAE: 1.6798e-01 | Time Taken: 6m 13s > Checkpoint saved!
Epoch 5: Train Loss: 4.4868e-02 | Train MAE: 1.6794e-01 | Test Loss: 4.4793e-02 | Test MAE: 1.6777e-01 | Time Taken: 6m 02s > Checkpoint saved!
Epoch 6: Train Loss: 4.4806e-02 | Train MAE: 1.6778e-01 | Test Loss: 4.4747e-02 | Test MAE: 1.6764e-01 | Time Taken: 6m 06s > Checkpoint saved!
Epoch 7: Train Loss: 4.4770e-02 | Train MAE: 1.6769e-01 | Test Loss: 4.4718e-02 | Test MAE: 1.6757e-01 | Time Taken: 6m 05s > Checkpoint

In [17]:
# End the wandb run opened by wandb.init() earlier in this notebook.
wandb.finish()

VBox(children=(Label(value='0.025 MB of 0.025 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_loss,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_mae,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,20.0
test_loss,0.04459
test_mae,0.16732
train_loss,0.04463
train_mae,0.16741


## Get recommendation

In [23]:
# Movie metadata, used further down to turn MovieIDs into titles.
movies = pd.read_csv("./database/merged/metadatas.csv")
full_mapper = Mapper.load(f"./database/merged/mapper.pkl")
# NOTE(review): the embedding tables are sized with NUM_USER (taken from `mapper`)
# while recommendations look users up through `full_mapper` -- confirm both
# mappers assign the same indices.
model = EmbededHybridNet(NUM_USER, genres, titles_and_plots, directors_and_casts).to(DEVICE)
# model.load_state_dict(torch.load("./models/hybrid/embeded_hybrid.pth"))
# map_location lets a checkpoint written on a GPU be restored on whatever DEVICE is in use.
model.load_state_dict(torch.load("./model.pth", map_location=DEVICE))

<All keys matched successfully>

In [24]:
def get_recommendation(user, model: nn.Module, top: int = 10):
    """Score every item for one user and return the highest-scoring ones.

    Args:
        user: External user identifier, looked up in ``full_mapper.user_fwd_map``.
        model: Module called as ``model(users, items)`` with two equal-length
            int64 index tensors; expected to return one score per pair.
        top: How many leading results to keep. Applied as a slice, so a value
            larger than the number of items simply returns all of them.

    Returns:
        A list of ``(item_id, score)`` tuples ordered by descending score, where
        ``item_id`` comes from ``full_mapper.item_inv_map`` and ``score`` is a
        Python float. Items with equal scores keep ascending index order.

    Side effects:
        Switches ``model`` to evaluation mode and leaves it there.

    NOTE(review): an unknown ``user`` presumably raises from the mapper lookup
    (``KeyError`` if the map is a dict) -- confirm against ``Mapper``.
    """
    user_idx = int(full_mapper.user_fwd_map[user])
    num_items = len(full_mapper.item_fwd_map)
    # Build the index tensors directly on the device instead of going through
    # Python lists with one entry per item.
    users = torch.full((num_items,), user_idx, dtype=torch.int64, device=DEVICE)
    items = torch.arange(num_items, dtype=torch.int64, device=DEVICE)
    model.eval()
    with torch.no_grad():
        # Flatten the per-pair scores into a 1-D tensor on the CPU.
        scores = model(users, items).reshape(-1).to("cpu")
    # A stable descending sort keeps the tie order of the previous list-based sort.
    order = torch.sort(scores, descending=True, stable=True).indices[:top]
    return [
        (full_mapper.item_inv_map[idx], score)
        for idx, score in zip(order.tolist(), scores[order].tolist())
    ]

In [25]:
# Example user: show a random sample of what they have already rated.
user_id = 10242
user_rated = (
    movies[["MovieID", "Title"]]
    # Right join: keep every rating of this user and attach a title to each.
    .merge(data.dataset[data.dataset["UserID"] == user_id], on="MovieID", how="right")
    .sort_values(by="Rating", ascending=False)
    .reindex(columns=["Title", "MovieID", "Rating"])
)
user_rated.sample(10)

Unnamed: 0,Title,MovieID,Rating
23,War of the Worlds (2005),34048,4.5
49,The Revenant (2015),139385,5.0
41,"Double, The (2013)",105835,5.0
27,Children of Men (2006),48774,5.0
15,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",4973,5.0
32,Coraline (2009),66097,5.0
50,Arrival (2016),164179,5.0
56,The Ballad of Buster Scruggs (2018),193944,4.5
48,Mad Max: Fury Road (2015),122882,5.0
2,Star Wars: Episode V - The Empire Strikes Back...,1196,5.0


In [26]:
# Print each recommended movie as: MovieID, title, predicted score.
for m_id, rate in get_recommendation(user_id, model):
    title = movies.loc[movies["MovieID"] == m_id, "Title"].values[0]
    print(m_id, title, rate)

26662 Kiki's Delivery Service (Majo no takkyûbin) (1989) 0.7105634808540344
2138 Watership Down (1978) 0.7100299000740051
76093 How to Train Your Dragon (2010) 0.7099891901016235
68954 Up (2009) 0.7094910740852356
5038 Flight of Dragons, The (1982) 0.7093707919120789
5971 My Neighbor Totoro (Tonari no Totoro) (1988) 0.7093394994735718
98243 Rise of the Guardians (2012) 0.7092991471290588
71745 Where the Wild Things Are (2009) 0.7092979550361633
364 Lion King, The (1994) 0.7090964913368225
62729 Niko & the Way to the Stars (a.k.a. The Flight Before Christmas) (Niko - Lentäjän poika) (2008) 0.7090679407119751
