In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchsummary import summary
import pandas as pd

# Utility
import time
import wandb

from models.custom import RatingDataset
from models.cb import FeatureGenerator
from models.hybrid import EmbededHybridNet
from models.training import train_model, test_model
from utility import Mapper

# Run on the GPU when torch reports CUDA is available; otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {DEVICE}")
# Authenticate with Weights & Biases; kept as the last expression so the
# cell displays the value returned by the login call.
wandb.login()

Using cuda


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdat-tht225482[0m ([33mdat-tht225482-hust[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

## Dataset

In [2]:
# Sub-directory of ./database holding the ratings used in this run.
DATASET = "merged/full"

# `mapper` is the mapper stored with the selected dataset; `full_mapper` is the
# one stored for the whole merged database.
mapper = Mapper.load(f"./database/{DATASET}/pydata/mapper.pkl")
full_mapper = Mapper.load("./database/merged/mapper.pkl")

# NOTE(review): the dataset is built with `full_mapper` while the user/item
# counts below come from `mapper` -- confirm both share the same index space.
# The meaning of the trailing `True` flag is defined in models.custom.
data = RatingDataset(f"./database/{DATASET}", full_mapper, True)

NUM_USER = len(mapper.user_fwd_map)
NUM_ITEM = len(mapper.item_fwd_map)
print("Number of users:", NUM_USER)
print("Number of items:", NUM_ITEM)

Number of users: 200948
Number of items: 84432


## Hybrid Model Training

In [3]:
# Training parameters
EPOCHS = 40      # number of passes over the training loader
BATCH = 4096     # batch size shared by the train and test loaders
SEED = 291124    # seed for the train/test split and for torch's RNG

# Seed torch's global generator so that weight initialisation and the
# shuffled DataLoader order are repeatable across kernel restarts.
# The trailing semicolon keeps the returned Generator out of the cell output.
torch.manual_seed(SEED);

In [4]:
# Open the wandb run for this training session and record the
# hyper-parameters defined above as the run configuration.
wandb.init(
    project="movie-recommendation-models",
    resume="allow",
    config=dict(
        dataset=DATASET,
        seed=SEED,
        batch_size=BATCH,
        epochs=EPOCHS,
        device=DEVICE,
    ),
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113115211143547, max=1.0…

### Train-Test Split

In [5]:
# Split the ratings with a fixed seed (0.8 is presumably the training
# fraction -- confirm against RatingDataset.split).
train, test = data.split(0.8, seed=SEED)

# Only the training loader reshuffles each epoch; evaluation order is fixed.
train_loader = DataLoader(
    train,
    batch_size=BATCH,
    shuffle=True,
    num_workers=3,
)
test_loader = DataLoader(
    test,
    batch_size=BATCH,
    shuffle=False,
    num_workers=3,
)

In [6]:
# Pre-computed item feature tensors. `map_location` places them directly on
# DEVICE, so files saved from a CUDA session also load on a CPU-only host
# (a plain torch.load would try to restore them onto the original device).
genres = torch.load("./models/cb/genres_with_ratings.pt", map_location=DEVICE)
titles_and_plots = torch.load("./models/cb/titles_and_plots.pt", map_location=DEVICE)
directors_and_casts = torch.load("./models/cb/directors_and_cast.pt", map_location=DEVICE)

model = EmbededHybridNet(NUM_USER, genres, titles_and_plots, directors_and_casts).to(DEVICE)

# MSE is the optimisation objective; `l1` is an L1 (mean absolute error)
# criterion kept available alongside it.
loss_fn = nn.MSELoss()
l1 = nn.L1Loss()

# Learning rate spelled out (same value as the torch.optim.SGD default) so
# the setting is visible in the notebook.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# Lowest test loss seen so far; drives checkpointing in the training loop.
best_val_loss = float("inf")
wandb.watch(model)

In [7]:
# Bare expression: the notebook renders the module's repr (its sub-module tree).
model

EmbededHybridNet(
  (item_weighted_genres): FeatureGenerator()
  (item_titles_plots): FeatureGenerator()
  (item_directors_casts): FeatureGenerator()
  (user_weighted_genres): Embedding(200948, 20, sparse=True)
  (user_titles_plots): Embedding(200948, 702, sparse=True)
  (user_directors_casts): Embedding(200948, 485, sparse=True)
  (dense_weighted_genres): Linear(in_features=40, out_features=2, bias=True)
  (dense_titles_plots): Linear(in_features=1404, out_features=2, bias=True)
  (dense_directors_casts): Linear(in_features=970, out_features=2, bias=True)
  (out): Linear(in_features=6, out_features=1, bias=True)
  (relu): ReLU()
)

In [8]:
def logging(epoch, train_loss, train_mae, test_loss, test_mae, time):
    """Report one epoch's metrics to wandb and to stdout.

    Args:
        epoch: Epoch number shown in the message and logged to wandb.
        train_loss: Training loss for the epoch.
        train_mae: Training mean absolute error for the epoch.
        test_loss: Test loss for the epoch.
        test_mae: Test mean absolute error for the epoch.
        time: Elapsed time in seconds; truncated to whole seconds for display.

    The printed line ends with a space instead of a newline so the caller
    can append a status marker on the same line.
    """
    minutes, seconds = divmod(int(time), 60)
    parts = [
        f"Train Loss: {train_loss:.4e}",
        f"Train MAE: {train_mae:.4e}",
        f"Test Loss: {test_loss:.4e}",
        f"Test MAE: {test_mae:.4e}",
        f"Time Taken: {minutes}m {seconds:02d}s",
    ]
    message = f"Epoch {epoch}: " + " | ".join(parts)
    # The elapsed time is only printed; it is not part of the wandb payload.
    metrics = {
        "epoch": epoch,
        "train_loss": train_loss,
        "train_mae": train_mae,
        "test_loss": test_loss,
        "test_mae": test_mae,
    }
    wandb.log(metrics)
    print(message, end=" ")

In [9]:
# Train for EPOCHS epochs, evaluating after each one and checkpointing the
# weights whenever the test loss improves on the best value seen so far.
for epoch in range(1, EPOCHS + 1):
    start_time = time.time()
    train_loss, train_mae = train_model(
        epoch, model, train_loader, loss_fn, optimizer, DEVICE
    )
    test_loss, test_mae = test_model(epoch, model, test_loader, loss_fn, DEVICE)
    time_taken = time.time() - start_time
    logging(epoch, train_loss, train_mae, test_loss, test_mae, time_taken)

    # `logging` leaves the cursor on the same line; finish it with either a
    # bare newline (no improvement) or the checkpoint marker.
    if not test_loss < best_val_loss:
        print()
        continue

    best_val_loss = test_loss
    torch.save(model.state_dict(), "model.pth")
    print("> Checkpoint saved!")

Epoch 1: Train Loss: 7.5762e-02 | Train MAE: 2.1720e-01 | Test Loss: 5.1840e-02 | Test MAE: 1.8050e-01 | Time Taken: 10m 22s > Checkpoint saved!
Epoch 2: Train Loss: 4.8513e-02 | Train MAE: 1.7415e-01 | Test Loss: 4.6372e-02 | Test MAE: 1.6999e-01 | Time Taken: 9m 08s > Checkpoint saved!
Epoch 3: Train Loss: 4.5322e-02 | Train MAE: 1.6794e-01 | Test Loss: 4.4443e-02 | Test MAE: 1.6621e-01 | Time Taken: 7m 55s > Checkpoint saved!
Epoch 4: Train Loss: 4.3898e-02 | Train MAE: 1.6513e-01 | Test Loss: 4.3378e-02 | Test MAE: 1.6408e-01 | Time Taken: 9m 41s > Checkpoint saved!
Epoch 5: Train Loss: 4.3029e-02 | Train MAE: 1.6338e-01 | Test Loss: 4.2664e-02 | Test MAE: 1.6263e-01 | Time Taken: 7m 51s > Checkpoint saved!
Epoch 6: Train Loss: 4.2410e-02 | Train MAE: 1.6210e-01 | Test Loss: 4.2123e-02 | Test MAE: 1.6152e-01 | Time Taken: 7m 30s > Checkpoint saved!
Epoch 7: Train Loss: 4.1922e-02 | Train MAE: 1.6106e-01 | Test Loss: 4.1680e-02 | Test MAE: 1.6055e-01 | Time Taken: 9m 20s > Checkpoin

In [10]:
# End the wandb run that was opened with wandb.init earlier in the notebook.
wandb.finish()

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
test_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_mae,█▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,40.0
test_loss,0.03838
test_mae,0.152
train_loss,0.0384
train_mae,0.15206


## Get recommendations

In [11]:
# Movie metadata, used below to turn MovieIDs into titles.
movies = pd.read_csv("./database/merged/metadatas.csv")
full_mapper = Mapper.load("./database/merged/mapper.pkl")

# Rebuild the network, then restore the weights checkpointed by the training loop.
model = EmbededHybridNet(NUM_USER, genres, titles_and_plots, directors_and_casts).to(DEVICE)
# Alternative checkpoint location, kept for reference:
# model.load_state_dict(torch.load("./models/hybrid/embeded_hybrid.pth"))
# `map_location` lets a checkpoint written on a CUDA machine load on a
# CPU-only host as well.
model.load_state_dict(torch.load("./model.pth", map_location=DEVICE))

<All keys matched successfully>

In [12]:
def get_recommendation(user, model: nn.Module, top: int = 10):
    """Score every known item for one user and return the best-scoring ones.

    Args:
        user: Raw user id, translated through ``full_mapper.user_fwd_map``.
        model: Module called as ``model(users, items)`` with two int64 index
            tensors of equal length; it must produce one score per pair.
            The model is switched to eval mode as a side effect.
        top: Maximum number of recommendations to return.

    Returns:
        A list of ``(item_id, score)`` tuples ordered by descending score,
        where ``item_id`` comes from ``full_mapper.item_inv_map`` and
        ``score`` is a Python float. Items with equal scores keep their
        index order.

    Raises:
        KeyError: If ``user`` is not present in ``full_mapper.user_fwd_map``.
        ValueError: If the model does not return exactly one score per item.
    """
    user_idx = int(full_mapper.user_fwd_map[user])
    num_items = len(full_mapper.item_fwd_map)

    # Build the index tensors directly on the device instead of going
    # through Python lists with one entry per item.
    users = torch.full((num_items,), user_idx, dtype=torch.int64, device=DEVICE)
    items = torch.arange(num_items, dtype=torch.int64, device=DEVICE)

    model.eval()
    with torch.no_grad():
        scores: torch.Tensor = model(users, items)

    # Accept both (N,) and (N, 1) outputs.
    scores = scores.reshape(-1).to("cpu")
    if scores.numel() != num_items:
        raise ValueError(
            f"model returned {scores.numel()} scores for {num_items} items"
        )

    # A stable descending sort keeps index order among ties, matching what
    # Python's sorted(..., reverse=True) gives for equal keys.
    ranked_scores, ranked_items = torch.sort(scores, descending=True, stable=True)
    return [
        (full_mapper.item_inv_map[item_idx], score)
        for item_idx, score in zip(
            ranked_items[:top].tolist(), ranked_scores[:top].tolist()
        )
    ]

In [13]:
# User whose rating history and recommendations are inspected below.
user_id = 15375

# All of this user's ratings joined with movie titles, highest rating first.
# The right join keeps every rating even when a MovieID has no metadata row.
user_rated = (
    pd.merge(
        movies[["MovieID", "Title"]],
        data.dataset[data.dataset["UserID"] == user_id],
        on="MovieID",
        how="right",
    )
    .sort_values(by="Rating", ascending=False)
    .reindex(columns=["Title", "MovieID", "Rating"])
)

# Show up to 10 random ratings; seeded so the preview is the same on every
# run, and clamped so users with fewer than 10 ratings do not raise.
user_rated.sample(n=min(10, len(user_rated)), random_state=SEED)

Unnamed: 0,Title,MovieID,Rating
12,Along Came a Spider (2001),4238,4.0
97,The Waiting (2016),156903,4.0
59,Triangle (2009),74228,4.0
147,A House on the Bayou (2021),266756,3.5
163,Death Count (2022),287421,1.0
112,The Odds (2019),206135,4.0
82,"Conjuring, The (2013)",103688,5.0
159,The Pope's Exorcist (2023),286097,4.0
51,Taken (2008),59369,4.0
100,Pyewacket (2017),177611,2.5


In [14]:
# Print each recommended MovieID with its title and predicted score.
recommendations = get_recommendation(user_id, model)
for m_id, rate in recommendations:
    # First metadata row matching this MovieID supplies the title.
    title = movies.loc[movies["MovieID"] == m_id, "Title"].iloc[0]
    print(m_id, title, rate)

159817 Planet Earth (2006) 0.9146255254745483
171011 Planet Earth II (2016) 0.9060686826705933
318 Shawshank Redemption, The (1994) 0.9021295309066772
858 Godfather, The (1972) 0.8916270732879639
170705 Band of Brothers (2001) 0.8788440227508545
527 Schindler's List (1993) 0.8732806444168091
1221 Godfather: Part II, The (1974) 0.8714938163757324
1203 12 Angry Men (1957) 0.8691617250442505
296 Pulp Fiction (1994) 0.8582024574279785
356 Forrest Gump (1994) 0.8548520803451538
