In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchsummary import summary

import time
import wandb
import pandas as pd

from models.cf import EmbededRatingNet, EmbededDotNet
from models.custom import RatingDataset
from models.training import train_model, test_model
from utility import Mapper

# Pick the compute backend once for the whole notebook: the GPU when PyTorch
# reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using " + DEVICE)

Using cuda


In [2]:
# Log in to Weights & Biases up front, before the run is created further down.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

## Dataset

In [3]:
# Sub-directory of ./database that holds the prepared dataset to train on.
DATASET = "merged/full"

# Two mapper pickles are loaded: the one stored next to the selected dataset
# and the one stored at the root of the merged database.
mapper = Mapper.load(f"./database/{DATASET}/pydata/mapper.pkl")
full_mapper = Mapper.load("./database/merged/mapper.pkl")

# NOTE(review): the dataset is built with `full_mapper`, while the user/item
# counts below are taken from `mapper` — confirm both share one index space.
data = RatingDataset(f"./database/{DATASET}", full_mapper, True)

# Sizes of the user and item vocabularies, used later to size the model.
NUM_USER = len(mapper.user_fwd_map)
print("Number of users:", NUM_USER)
NUM_ITEM = len(mapper.item_fwd_map)
print("Number of items:", NUM_ITEM)

Number of users: 200948
Number of items: 84432


## Deep Model Training

In [4]:
# Model parameters
NUM_FACTOR = 8  # third constructor argument of the models below; logged to W&B as "num_factor"
# Training parameters
EPOCHS = 20  # number of iterations of the training loop
BATCH = 4096  # batch size shared by the train and test DataLoaders
SEED = 291124  # seed passed to the train/test split and recorded in the W&B config

In [5]:
# Open a W&B run for this experiment and record the notebook-level settings
# with it. NOTE(review): optimizer settings are not part of this config.
wandb.init(
    project="movie-recommendation-models",
    resume="allow",
    config={
        "dataset": DATASET,
        "seed": SEED,
        "batch_size": BATCH,
        "epochs": EPOCHS,
        "num_factor": NUM_FACTOR,
        "device": DEVICE,
    },
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113895699963905, max=1.0…

Train-Test Split

In [6]:
# Seeded split of the ratings into two partitions.
# Presumably 0.8 is the training fraction — confirm against RatingDataset.split.
train, test = data.split(0.8, seed=SEED)

In [7]:
# Batched loaders over the two partitions. Only the training loader asks for
# shuffling; the test loader keeps DataLoader's default ordering.
train_loader = DataLoader(train, batch_size=BATCH, shuffle=True, num_workers=3)
test_loader = DataLoader(test, batch_size=BATCH, num_workers=3)

### CF Embedded Rating

In [8]:
# Seed torch's global RNG before the model is built so that parameter
# initialisation (and later draws from the same RNG) repeat across runs.
# Until now SEED was only passed to the data split and logged to W&B.
torch.manual_seed(SEED)

# Swap the commented line with the active one to train the other architecture.
# model = EmbededRatingNet(NUM_USER, NUM_ITEM, NUM_FACTOR).to(DEVICE)
model = EmbededDotNet(NUM_USER, NUM_ITEM, NUM_FACTOR).to(DEVICE)
loss_fn = nn.MSELoss()  # objective handed to train_model / test_model
l1 = nn.L1Loss()  # not referenced by the other visible cells
# Learning rate written out explicitly: it matches the value SGD falls back to
# in torch releases where `lr` is optional, and keeps the cell working on
# releases where `lr` is a required argument.
optimizer = optim.SGD(model.parameters(), lr=1e-3)
# Lowest test loss seen so far; the training loop checkpoints when it improves.
best_val_loss = float("inf")
wandb.watch(model)

Model training

In [9]:
def logging(epoch, train_loss, train_mae, test_loss, test_mae, time):
    """Report one epoch's metrics to W&B and print them on a single line.

    Args:
        epoch: Epoch number shown in the line and logged under "epoch".
        train_loss: Training loss, printed in scientific notation.
        train_mae: Training mean absolute error.
        test_loss: Test loss.
        test_mae: Test mean absolute error.
        time: Elapsed seconds for the epoch; truncated to an int and shown
            as minutes and seconds. It is printed only, not sent to W&B.

    The printed line ends with a space rather than a newline, so the caller
    can append a suffix on the same line.
    """
    elapsed = int(time)
    minutes, seconds = divmod(elapsed, 60)
    segments = [
        f"Epoch {epoch}: ",
        f"Train Loss: {train_loss:.4e} | ",
        f"Train MAE: {train_mae:.4e} | ",
        f"Test Loss: {test_loss:.4e} | ",
        f"Test MAE: {test_mae:.4e} | ",
        f"Time Taken: {minutes}m {seconds:02d}s",
    ]
    line = "".join(segments)
    metrics = {
        "epoch": epoch,
        "train_loss": train_loss,
        "train_mae": train_mae,
        "test_loss": test_loss,
        "test_mae": test_mae,
    }
    wandb.log(metrics)
    print(line, end=" ")

In [10]:
# One pass per epoch: train, evaluate, report, then checkpoint on improvement.
for epoch in range(1, EPOCHS + 1):
    start_time = time.time()
    train_loss, train_mae = train_model(
        epoch, model, train_loader, loss_fn, optimizer, DEVICE
    )
    test_loss, test_mae = test_model(epoch, model, test_loader, loss_fn, DEVICE)
    time_taken = time.time() - start_time
    logging(epoch, train_loss, train_mae, test_loss, test_mae, time_taken)

    # logging() leaves the cursor on the same line; finish that line here.
    if not (test_loss < best_val_loss):
        print()
        continue

    # New best test loss: remember it and overwrite the checkpoint on disk.
    best_val_loss = test_loss
    torch.save(model.state_dict(), "model.pth")
    print("> Checkpoint saved!")

Epoch 1: Train Loss: 8.3640e+00 | Train MAE: 2.2496e+00 | Test Loss: 8.3076e+00 | Test MAE: 2.2418e+00 | Time Taken: 5m 29s > Checkpoint saved!
Epoch 2: Train Loss: 8.2560e+00 | Train MAE: 2.2349e+00 | Test Loss: 8.2015e+00 | Test MAE: 2.2272e+00 | Time Taken: 5m 36s > Checkpoint saved!
Epoch 3: Train Loss: 8.1515e+00 | Train MAE: 2.2204e+00 | Test Loss: 8.0986e+00 | Test MAE: 2.2129e+00 | Time Taken: 5m 40s > Checkpoint saved!
Epoch 4: Train Loss: 8.0497e+00 | Train MAE: 2.2062e+00 | Test Loss: 7.9987e+00 | Test MAE: 2.1989e+00 | Time Taken: 5m 38s > Checkpoint saved!
Epoch 5: Train Loss: 7.9512e+00 | Train MAE: 2.1922e+00 | Test Loss: 7.9016e+00 | Test MAE: 2.1850e+00 | Time Taken: 5m 24s > Checkpoint saved!
Epoch 6: Train Loss: 7.8553e+00 | Train MAE: 2.1785e+00 | Test Loss: 7.8072e+00 | Test MAE: 2.1715e+00 | Time Taken: 5m 24s > Checkpoint saved!
Epoch 7: Train Loss: 7.7623e+00 | Train MAE: 2.1651e+00 | Test Loss: 7.7155e+00 | Test MAE: 2.1581e+00 | Time Taken: 5m 25s > Checkpoint

In [11]:
# Close the W&B run opened earlier in the notebook now that training is done.
wandb.finish()

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_loss,██▇▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▁▁
test_mae,██▇▇▆▆▆▅▅▄▄▄▃▃▃▂▂▂▁▁
train_loss,██▇▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▁▁
train_mae,██▇▇▆▆▆▅▅▄▄▄▃▃▃▂▂▂▁▁

0,1
epoch,20.0
test_loss,6.71955
test_mae,2.00338
train_loss,6.75208
train_mae,2.00878


## Get recommendations

In [12]:
# Movie metadata, used below to turn MovieIDs into titles.
movies = pd.read_csv("./database/merged/metadatas.csv")
full_mapper = Mapper.load("./database/merged/mapper.pkl")

# Rebuild the architecture first, then load weights into it. To inspect the
# other architecture, use the two commented lines instead of the active ones.
# model = EmbededRatingNet(NUM_USER, NUM_ITEM, NUM_FACTOR).to(DEVICE)
# model.load_state_dict(torch.load("models/cf/embeded_rating.pth", map_location=DEVICE))
model = EmbededDotNet(NUM_USER, NUM_ITEM, NUM_FACTOR).to(DEVICE)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model.load_state_dict(torch.load("./model.pth", map_location=DEVICE))

<All keys matched successfully>

In [13]:
def get_recommendation(user, model: nn.Module, top: int = 10):
    """Score every item for one user and return the highest-scoring ones.

    Args:
        user: Key looked up in ``mapper.user_fwd_map`` to get the user's
            index; a missing key raises from that lookup.
        model: Called as ``model(users, items)`` with two int64 index tensors
            of length ``NUM_ITEM`` placed on ``DEVICE``.
        top: How many leading results to keep (applied as a slice).

    Returns:
        A list of ``(item_id, score)`` tuples ordered by descending score,
        where ``item_id`` comes from ``mapper.item_inv_map`` and ``score`` is
        a Python float. Ties keep ascending item-index order.

    Note:
        The model is switched to eval mode and left that way. Items the user
        has already rated are not filtered out.
    """
    user_idx = int(mapper.user_fwd_map[user])

    # Build both index tensors directly on the device instead of going
    # through NUM_ITEM-long Python lists.
    item_ids = torch.arange(NUM_ITEM, dtype=torch.int64, device=DEVICE)
    users = torch.full_like(item_ids, user_idx)

    model.eval()
    with torch.no_grad():
        # Flatten so that both (N,) and (N, 1) model outputs are handled.
        scores: torch.Tensor = model(users, item_ids).reshape(-1)

    # A stable descending sort keeps the original order among equal scores,
    # matching a stable Python sort with reverse=True. Only the `top` winners
    # are copied back to the host.
    order = torch.sort(scores, descending=True, stable=True).indices[:top]
    best_scores = scores[order].tolist()
    return [
        (mapper.item_inv_map[idx], float(score))
        for idx, score in zip(order.tolist(), best_scores)
    ]

In [14]:
# Raw UserID whose rating history and recommendations are inspected below.
user_id = 102234

# This user's ratings joined with movie titles, highest rating first.
# how="right" keeps every rating even when the movie has no metadata row.
user_rated = (
    movies[["MovieID", "Title"]]
    .merge(data.dataset[data.dataset["UserID"] == user_id], on="MovieID", how="right")
    .sort_values(by="Rating", ascending=False)
    .reindex(columns=["Title", "MovieID", "Rating"])
)

# Show up to 10 random rows. The sample size is capped so users with fewer
# than 10 ratings do not raise, and seeded so the displayed rows are stable
# across re-runs.
user_rated.sample(n=min(10, len(user_rated)), random_state=SEED)

Unnamed: 0,Title,MovieID,Rating
128,Superman II (1980),2641,3.0
141,"Fistful of Dollars, A (Per un pugno di dollari...",2951,3.0
57,Star Trek II: The Wrath of Khan (1982),1374,4.0
44,"Godfather: Part II, The (1974)",1221,4.0
85,Lethal Weapon 3 (1992),2002,3.0
38,Raiders of the Lost Ark (Indiana Jones and the...,1198,5.0
50,Indiana Jones and the Last Crusade (1989),1291,4.0
104,Rambo: First Blood Part II (1985),2402,4.0
144,For Your Eyes Only (1981),2989,4.0
148,"Longest Day, The (1962)",3062,4.0


In [15]:
# Print each recommended MovieID with its title and predicted score.
for m_id, rate in get_recommendation(user_id, model):
    # Single .loc lookup instead of chained indexing; the first match wins.
    titles = movies.loc[movies["MovieID"] == m_id, "Title"]
    # A recommended id may have no metadata row; don't abort the listing.
    title = titles.iloc[0] if not titles.empty else "<unknown title>"
    print(m_id, title, rate)

155599 The Daughter (2016) 6.888847351074219
211077 Invasion Planet Earth (2019) 6.566414833068848
135613 Rabies (2010) 6.542375564575195
131808 The Postman's White Nights (2014) 6.458860874176025
851 Basquiat (1996) 6.401679992675781
41564 Kid & I, The (2005) 6.330195426940918
177269 Get the Girl (2017) 6.253493309020996
144530 Overheard (2009) 6.134182929992676
59895 Dust in the Wind (1987) 6.039362907409668
146958 Camera D'Albergo (1981) 6.036156177520752
