In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchsummary import summary
import pandas as pd
import numpy as np

# Utility
import time
import wandb

from models.custom import RatingDataset
from models.cb import FeatureGenerator
from models.hybrid import EmbededHybridNet
from models.training import train_model, test_model
from utility import Mapper

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {DEVICE}")

# Log in to Weights & Biases up front; kept as the cell's last expression so
# its return value is displayed.
wandb.login()

Using cuda


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdat-tht225482[0m ([33mdat-tht225482-hust[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

## Dataset

In [2]:
DATASET = "merged/full"

# `mapper` belongs to the selected dataset and provides the user/item counts;
# `full_mapper` is loaded from the merged database root and handed to the dataset.
mapper = Mapper.load(f"./database/{DATASET}/pydata/mapper.pkl")
full_mapper = Mapper.load("./database/merged/mapper.pkl")

# NOTE(review): the meaning of the third positional flag is defined by
# RatingDataset and is not visible here - confirm and pass it by keyword.
data = RatingDataset(f"./database/{DATASET}", full_mapper, True)

NUM_USER = len(mapper.user_fwd_map)
print("Number of users:", NUM_USER)
NUM_ITEM = len(mapper.item_fwd_map)
print("Number of items:", NUM_ITEM)

Number of users: 200948
Number of items: 84432


## Hybrid Model Training

In [3]:
# Training parameters (also recorded in the W&B run config)
EPOCHS = 20      # upper bound of the epoch loop
BATCH = 4_096    # batch size shared by the train and test loaders
SEED = 291124    # seed passed to the train/test split

In [4]:
# Open (or resume, when possible) the W&B run and attach the settings of this
# session to it.
wandb.init(
    project="movie-recommendation-models",
    resume="allow",
    config=dict(
        dataset=DATASET,
        seed=SEED,
        batch_size=BATCH,
        epochs=EPOCHS,
        device=DEVICE,
    ),
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113010955555561, max=1.0…

### Train-Test Split

In [5]:
# 80/20 split driven by SEED.
train, test = data.split(0.8, seed=SEED)

# The shuffling loader gets its own generator seeded with SEED. Without it the
# batch order comes from the global RNG and differs between runs, even though
# SEED is what gets logged to W&B.
train_loader = DataLoader(
    train,
    batch_size=BATCH,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
# Evaluation order does not matter, so the test loader is not shuffled.
test_loader = DataLoader(test, batch_size=BATCH)

In [6]:
def init_uniform_rule(m):
    """Re-initialise an ``nn.Linear`` layer in place (usable with ``Module.apply``).

    Weights are drawn from U(-1/sqrt(n), 1/sqrt(n)) where ``n`` is the layer's
    ``in_features``; the bias, when the layer has one, is set to zero.
    Modules that are not ``nn.Linear`` are left untouched.

    Args:
        m: any ``nn.Module``; only ``nn.Linear`` instances are modified.
    """
    if not isinstance(m, nn.Linear):
        return

    bound = float(1.0 / np.sqrt(m.in_features))
    # Mutate the parameters under no_grad instead of going through `.data`.
    with torch.no_grad():
        m.weight.uniform_(-bound, bound)
        # Linear(..., bias=False) stores `bias` as None; nothing to reset then.
        if m.bias is not None:
            m.bias.zero_()

In [7]:
# Item feature tensors handed to the network. `map_location` lets files that
# were saved from a CUDA device load on a CPU-only machine as well; the
# trailing `.to(DEVICE)` is kept as a cheap guarantee of final placement.
genres = torch.load("./models/cb/genres_with_ratings.pt", map_location=DEVICE).to(DEVICE)
titles_and_plots = torch.load("./models/cb/titles_and_plots.pt", map_location=DEVICE).to(DEVICE)
directors_and_casts = torch.load("./models/cb/directors_and_cast.pt", map_location=DEVICE).to(DEVICE)

model = EmbededHybridNet(NUM_USER, genres, titles_and_plots, directors_and_casts).to(DEVICE)
# model.apply(init_uniform_rule)
# Continue from the last saved checkpoint instead of a fresh initialisation.
model.load_state_dict(torch.load("./model.pth", map_location=DEVICE))

loss_fn = nn.MSELoss()
l1 = nn.L1Loss()
# Learning rate spelled out (the value recent PyTorch releases use when `lr`
# is omitted) so the setting is visible and older releases accept the call.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# NOTE(review): this starts at +inf although a checkpoint was just loaded, so
# the first epoch of the training loop always overwrites "model.pth", even
# when it scores worse than the loaded weights.
best_val_loss = float("inf")
wandb.watch(model)

In [8]:
# Bare last expression: displays the module tree of the network as a sanity check.
model

EmbededHybridNet(
  (item_weighted_genres): FeatureGenerator()
  (item_titles_plots): FeatureGenerator()
  (item_directors_casts): FeatureGenerator()
  (user_weighted_genres): Embedding(200948, 20, sparse=True)
  (user_titles_plots): Embedding(200948, 2280, sparse=True)
  (user_directors_casts): Embedding(200948, 482, sparse=True)
  (dense_weighted_genres): Linear(in_features=40, out_features=2, bias=True)
  (dense_titles_plots): Linear(in_features=4560, out_features=20, bias=True)
  (dense_directors_casts): Linear(in_features=964, out_features=10, bias=True)
  (out): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
)

In [9]:
def logging(epoch, train_loss, train_mae, test_loss, test_mae, time):
    """Report the metrics of one epoch to W&B and to stdout.

    Args:
        epoch: epoch number shown in the message and logged as ``"epoch"``.
        train_loss, train_mae: training metrics for the epoch.
        test_loss, test_mae: evaluation metrics for the epoch.
        time: elapsed seconds; truncated to whole seconds for display.

    The printed line ends with a space instead of a newline so the caller can
    append a status to the same line.
    """
    minutes, seconds = divmod(int(time), 60)
    segments = (
        f"Train Loss: {train_loss:.4e}",
        f"Train MAE: {train_mae:.4e}",
        f"Test Loss: {test_loss:.4e}",
        f"Test MAE: {test_mae:.4e}",
        f"Time Taken: {minutes}m {seconds:02d}s",
    )
    message = f"Epoch {epoch}: " + " | ".join(segments)

    metrics = {
        "epoch": epoch,
        "train_loss": train_loss,
        "train_mae": train_mae,
        "test_loss": test_loss,
        "test_mae": test_mae,
    }
    wandb.log(metrics)
    print(message, end=" ")

In [10]:
for epoch in range(1, EPOCHS + 1):
    # Time one full epoch: the training pass plus the evaluation pass.
    start_time = time.time()
    train_loss, train_mae = train_model(epoch, model, train_loader, loss_fn, optimizer, DEVICE)
    test_loss, test_mae = test_model(epoch, model, test_loader, loss_fn, DEVICE)
    time_taken = time.time() - start_time

    # logging() leaves the cursor on the same line; the branches below finish it.
    logging(epoch, train_loss, train_mae, test_loss, test_mae, time_taken)

    # Written as `not (a < b)` on purpose: a NaN loss must count as "no improvement".
    if not (test_loss < best_val_loss):
        print()
        continue

    # New best evaluation loss: remember it and persist the weights.
    best_val_loss = test_loss
    torch.save(model.state_dict(), "model.pth")
    print("> Checkpoint saved!")

Epoch 1 --- Training: Batch 948/6251

KeyboardInterrupt: 

In [None]:
# Close the W&B run opened by wandb.init() once training is over.
wandb.finish()

## Get Recommendations

In [21]:
# Movie metadata, used to turn MovieIDs into titles.
movies = pd.read_csv("./database/merged/metadatas.csv")
full_mapper = Mapper.load("./database/merged/mapper.pkl")

# Fresh network with the same constructor arguments as in training, restored
# from the checkpoint written by the training loop.
model = EmbededHybridNet(NUM_USER, genres, titles_and_plots, directors_and_casts).to(DEVICE)
# Alternative checkpoint: "./models/hybrid/embeded_hybrid.pth"
# `map_location` keeps the load working when the checkpoint was saved from a
# CUDA device and this session runs on the CPU.
model.load_state_dict(torch.load("./model.pth", map_location=DEVICE))

<All keys matched successfully>

In [22]:
def get_recommendation(user, model: nn.Module, top: int = 10):
    """Score every item for one user and return the best-scored ones.

    Args:
        user: key looked up in ``full_mapper.user_fwd_map`` to obtain the
            user's index.
        model: network called as ``model(user_indices, item_indices)``; it must
            return exactly one score per item. It is switched to eval mode.
        top: maximum number of results. Values below 1 give an empty list and
            values above the number of items give all items.

    Returns:
        A list of ``(item_id, score)`` tuples ordered by descending score,
        where ``item_id`` comes from ``full_mapper.item_inv_map`` and ``score``
        is a Python float.

    Raises:
        ValueError: if the model does not return one score per item.
    """
    user_idx = int(full_mapper.user_fwd_map[user])
    num_items = len(full_mapper.item_fwd_map)
    # Clamp so a negative `top` cannot turn into an "all but the last" slice.
    limit = max(0, min(int(top), num_items))

    # Build the index tensors directly on the device instead of going through
    # Python lists of `num_items` elements.
    user_ids = torch.full((num_items,), user_idx, dtype=torch.int64, device=DEVICE)
    item_ids = torch.arange(num_items, dtype=torch.int64, device=DEVICE)

    model.eval()
    with torch.no_grad():
        scores = model(user_ids, item_ids).reshape(-1).to("cpu")
    if scores.numel() != num_items:
        raise ValueError(
            f"expected {num_items} scores from the model, got {scores.numel()}"
        )

    # Stable descending sort: equal scores stay in item-index order, like the
    # stable `sorted(..., reverse=True)` this replaces.
    ordered_scores, ordered_items = torch.sort(scores, descending=True, stable=True)
    return [
        (full_mapper.item_inv_map[item_idx], score)
        for item_idx, score in zip(
            ordered_items[:limit].tolist(), ordered_scores[:limit].tolist()
        )
    ]

In [23]:
user_id = 15375

# Everything this user rated, joined with the movie titles (right join keeps
# ratings whose movie has no metadata row), best-rated first.
user_rated = (
    movies[["MovieID", "Title"]]
    .merge(data.dataset[data.dataset["UserID"] == user_id], on="MovieID", how="right")
    .sort_values(by="Rating", ascending=False)
    .reindex(columns=["Title", "MovieID", "Rating"])
)

# Show at most 10 random rows: sample() raises ValueError when asked for more
# rows than exist, and the fixed random_state keeps the preview reproducible.
user_rated.sample(n=min(10, len(user_rated)), random_state=SEED)

Unnamed: 0,Title,MovieID,Rating
40,V for Vendetta (2006),44191,4.5
130,Curse of Aurore (2020),239172,2.5
42,"Da Vinci Code, The (2006)",45447,5.0
20,"Transporter, The (2002)",5574,4.0
24,Bruce Almighty (2003),6373,3.5
111,Belzebuth (2019),204584,2.5
72,"Expendables 2, The (2012)",91485,4.0
164,The Dark Web Tapes (2020),287423,0.5
54,Angels & Demons (2009),68554,4.5
76,Apartment 143 (2011),96634,2.0


In [None]:
# Build the MovieID -> Title lookup once instead of scanning the whole
# metadata frame for every recommendation. The first row wins on duplicate ids.
title_by_id = movies.drop_duplicates("MovieID").set_index("MovieID")["Title"]

for m_id, rate in get_recommendation(user_id, model):
    # Fall back to a placeholder rather than raising when an id has no metadata row.
    print(m_id, title_by_id.get(m_id, "<unknown title>"), rate)

318 Shawshank Redemption, The (1994) 0.7871029376983643
858 Godfather, The (1972) 0.7841407656669617
1464 Lost Highway (1997) 0.7833184599876404
3435 Double Indemnity (1944) 0.7824506759643555
58559 Dark Knight, The (2008) 0.7813116312026978
4848 Mulholland Drive (2001) 0.778712272644043
1221 Godfather: Part II, The (1974) 0.7774827480316162
44761 Brick (2005) 0.7748100161552429
183461 Godless (2017) 0.7734577655792236
4406 Man Who Shot Liberty Valance, The (1962) 0.7712690234184265
