In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import time
import wandb
import pandas as pd

from models.cf import EmbededRatingNet, EmbededDotNet
from models.custom import RatingDataset
from models.training import train_model, test_model
from utility import Mapper

# Run on the GPU when PyTorch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE}")

Using cuda


In [17]:
# Log in to Weights & Biases up front; the training section below opens a run with wandb.init().
wandb.login()

True

## Dataset

In [18]:
# Dataset variant to use; it also selects the directory the ratings are read from.
DATASET = "merged/full"

# Two mappers are loaded: one stored next to the selected dataset and one for
# the whole merged database.
# NOTE(review): the dataset is built with `full_mapper`, while the user/item
# counts below are taken from `mapper` — confirm both share one index space.
mapper = Mapper.load(f"./database/{DATASET}/pydata/mapper.pkl")
full_mapper = Mapper.load("./database/merged/mapper.pkl")
data = RatingDataset(f"./database/{DATASET}", full_mapper, True)

# Counts are passed to the model constructors further down.
NUM_USER = len(mapper.user_fwd_map)
print("Number of users:", NUM_USER)
NUM_ITEM = len(mapper.item_fwd_map)
print("Number of items:", NUM_ITEM)

Number of users: 200948
Number of items: 84432


## Deep Model Training

In [19]:
# Model parameters
# Third constructor argument of the CF networks; presumably the embedding size — TODO confirm.
NUM_FACTOR = 12
# Training parameters
# Number of iterations of the train/evaluate loop.
EPOCHS = 20
# Mini-batch size shared by the train and test DataLoaders.
BATCH = 4096
# Seed passed to the train/test split and recorded in the wandb run config.
SEED = 291124
# Learning rate handed to the SGD optimizer.
LR = 1.0

In [20]:
# Hyper-parameters and environment details recorded with the run.
run_config = {
    "dataset": DATASET,
    "seed": SEED,
    "batch_size": BATCH,
    "epochs": EPOCHS,
    "num_factor": NUM_FACTOR,
    "device": DEVICE,
    "learning_rate": LR,
}

# Open the Weights & Biases run that the training loop logs into.
wandb.init(project="movie-recommendation-models", resume="allow", config=run_config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112730422205964, max=1.0…

Train-Test Split

In [21]:
# Split the ratings into two subsets, seeded with SEED.
# NOTE(review): 0.8 is presumably the training fraction — confirm against RatingDataset.split.
train, test = data.split(0.8, seed=SEED)

In [22]:
# Options common to both loaders; only the training loader shuffles its samples.
loader_options = {"batch_size": BATCH, "num_workers": 3}
train_loader = DataLoader(train, shuffle=True, **loader_options)
test_loader = DataLoader(test, **loader_options)

### CF Embedded Rating

In [23]:
# Seed PyTorch's global RNG so that runs logged under the same `seed` are
# repeatable. Previously SEED only reached `data.split`, so anything drawn from
# the global RNG (parameter initialisation, and the DataLoader shuffle order
# when no explicit generator is given) differed from run to run.
torch.manual_seed(SEED)

# Alternative architecture: EmbededRatingNet(NUM_USER, NUM_ITEM, NUM_FACTOR).to(DEVICE)
model = EmbededDotNet(NUM_USER, NUM_ITEM, NUM_FACTOR).to(DEVICE)

# Mean squared error is the objective passed to train_model / test_model.
loss_fn = nn.MSELoss()
# L1 criterion kept available; the training cells shown here do not pass it anywhere.
l1 = nn.L1Loss()
optimizer = optim.SGD(model.parameters(), lr=LR)

# Lowest test loss seen so far; the training loop checkpoints when it improves.
best_val_loss = float("inf")

# Let Weights & Biases track the model for the active run.
wandb.watch(model)

Model training

In [24]:
def logging(epoch, train_loss, train_mae, test_loss, test_mae, time):
    """Report one epoch's metrics to Weights & Biases and to stdout.

    Args:
        epoch: Epoch number being reported.
        train_loss: Training loss for the epoch.
        train_mae: Training mean absolute error for the epoch.
        test_loss: Test loss for the epoch.
        test_mae: Test mean absolute error for the epoch.
        time: Elapsed seconds for the epoch; truncated to whole seconds.

    The summary line is printed with a trailing space and no newline, so the
    caller can append a status message on the same line.
    """
    minutes, seconds = divmod(int(time), 60)
    parts = [
        f"Epoch {epoch}: ",
        f"Train Loss: {train_loss:.4e} | ",
        f"Train MAE: {train_mae:.4e} | ",
        f"Test Loss: {test_loss:.4e} | ",
        f"Test MAE: {test_mae:.4e} | ",
        f"Time Taken: {minutes}m {seconds:02d}s",
    ]
    summary = "".join(parts)

    # The elapsed time goes to stdout only; it is not sent to wandb.
    metrics = {
        "epoch": epoch,
        "train_loss": train_loss,
        "train_mae": train_mae,
        "test_loss": test_loss,
        "test_mae": test_mae,
    }
    wandb.log(metrics)
    print(summary, end=" ")

In [25]:
# Each epoch: call train_model on the training loader, test_model on the test
# loader, log the four metrics, and save a checkpoint whenever the test loss
# drops below the best value seen so far.
for epoch in range(1, EPOCHS + 1):
    start_time = time.time()
    train_loss, train_mae = train_model(
        epoch, model, train_loader, loss_fn, optimizer, DEVICE
    )
    test_loss, test_mae = test_model(epoch, model, test_loader, loss_fn, DEVICE)
    time_taken = time.time() - start_time
    logging(epoch, train_loss, train_mae, test_loss, test_mae, time_taken)

    # `logging` leaves the cursor on the same line, so each path below ends it.
    if not test_loss < best_val_loss:
        print()
        continue

    best_val_loss = test_loss
    torch.save(model.state_dict(), "model.pth")
    print("> Checkpoint saved!")

Epoch 1: Train Loss: 3.2025e+00 | Train MAE: 1.2714e+00 | Test Loss: 1.6514e+00 | Test MAE: 9.6372e-01 | Time Taken: 6m 35s > Checkpoint saved!
Epoch 2: Train Loss: 1.3415e+00 | Train MAE: 8.9608e-01 | Test Loss: 1.1714e+00 | Test MAE: 8.5646e-01 | Time Taken: 7m 01s > Checkpoint saved!
Epoch 3: Train Loss: 1.0438e+00 | Train MAE: 8.2817e-01 | Test Loss: 9.8987e-01 | Test MAE: 8.1426e-01 | Time Taken: 6m 12s > Checkpoint saved!
Epoch 4: Train Loss: 9.0965e-01 | Train MAE: 7.9666e-01 | Test Loss: 8.9364e-01 | Test MAE: 7.9150e-01 | Time Taken: 5m 56s > Checkpoint saved!
Epoch 5: Train Loss: 8.3258e-01 | Train MAE: 7.7824e-01 | Test Loss: 8.3377e-01 | Test MAE: 7.7723e-01 | Time Taken: 7m 01s > Checkpoint saved!
Epoch 6: Train Loss: 7.8214e-01 | Train MAE: 7.6606e-01 | Test Loss: 7.9282e-01 | Test MAE: 7.6739e-01 | Time Taken: 7m 56s > Checkpoint saved!
Epoch 7: Train Loss: 7.4645e-01 | Train MAE: 7.5734e-01 | Test Loss: 7.6300e-01 | Test MAE: 7.6014e-01 | Time Taken: 8m 45s > Checkpoint

In [26]:
# Close the Weights & Biases run opened by wandb.init() now that training is done.
wandb.finish()

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_loss,█▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
test_mae,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,20.0
test_loss,0.63702
test_mae,0.72559
train_loss,0.60505
train_mae,0.71858


## Get Recommendations

In [27]:
# Movie metadata; used further down to turn MovieIDs into titles.
movies = pd.read_csv("./database/merged/metadatas.csv")
full_mapper = Mapper.load("./database/merged/mapper.pkl")

# Rebuild the network and restore the checkpoint written by the training loop.
# Alternative: EmbededRatingNet(NUM_USER, NUM_ITEM, NUM_FACTOR) with "models/cf/embeded_rating.pth".
model = EmbededDotNet(NUM_USER, NUM_ITEM, NUM_FACTOR).to(DEVICE)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a CUDA model still loads in a CPU-only session.
model.load_state_dict(torch.load("./model.pth", map_location=DEVICE))

<All keys matched successfully>

In [28]:
def get_recommendation(user, model: nn.Module, top: int = 10):
    """Score every item for one user and return the highest-scoring ones.

    Args:
        user: Key of the user in ``mapper.user_fwd_map`` (raises ``KeyError``
            if the user is unknown).
        model: Network called as ``model(user_indices, item_indices)``; it is
            switched to eval mode as a side effect.
        top: Number of entries to keep; applied as a slice, so values larger
            than the number of items simply return every item.

    Returns:
        A list of ``(item_id, score)`` tuples sorted by descending score, where
        ``item_id`` comes from ``mapper.item_inv_map`` and ``score`` is the raw
        model output as a Python float. Items the user has already rated are
        not filtered out.
    """
    user_idx = int(mapper.user_fwd_map[user])

    # Build the index tensors directly on the target device instead of going
    # through NUM_ITEM-long Python lists.
    item_indices = torch.arange(NUM_ITEM, dtype=torch.int64, device=DEVICE)
    user_indices = torch.full_like(item_indices, user_idx)

    model.eval()
    with torch.no_grad():
        # reshape(-1) accepts both (N,) and (N, 1) shaped outputs.
        scores: torch.Tensor = model(user_indices, item_indices).reshape(-1)

    # A stable descending sort keeps tied items in index order, matching what
    # a stable Python sort with reverse=True produces, without extracting
    # every score one element at a time.
    ranked_scores, ranked_items = torch.sort(scores, descending=True, stable=True)
    best_items = ranked_items[:top].cpu().tolist()
    best_scores = ranked_scores[:top].cpu().tolist()
    return [
        (mapper.item_inv_map[item], float(score))
        for item, score in zip(best_items, best_scores)
    ]

In [29]:
user_id = 102234

# Ratings this user has given, joined with movie titles. how="right" keeps
# every rating row even when the metadata table has no entry for that movie.
user_rated = (
    pd.merge(
        movies[["MovieID", "Title"]],
        data.dataset[data.dataset["UserID"] == user_id],
        on="MovieID",
        how="right",
    )
    .sort_values(by="Rating", ascending=False)
    .reindex(columns=["Title", "MovieID", "Rating"])
)

# Show a reproducible random sample. Capping the size avoids the ValueError
# DataFrame.sample raises when asked for more rows than the frame holds.
user_rated.sample(min(10, len(user_rated)), random_state=SEED)

Unnamed: 0,Title,MovieID,Rating
86,"Mask of Zorro, The (1998)",2006,4.0
94,Willow (1988),2193,3.0
37,"Princess Bride, The (1987)",1197,4.0
3,Waterworld (1995),208,4.0
36,Star Wars: Episode V - The Empire Strikes Back...,1196,4.0
78,Deep Impact (1998),1876,4.0
69,Conan the Barbarian (1982),1587,3.0
133,Allan Quatermain and the Lost City of Gold (1987),2748,2.0
31,"Ghost and the Darkness, The (1996)",1049,4.0
19,Mission: Impossible (1996),648,4.0


In [30]:
# Print each recommendation as: MovieID, title, raw model score.
for m_id, rate in get_recommendation(user_id, model):
    title = movies.loc[movies["MovieID"] == m_id, "Title"].iloc[0]
    print(m_id, title, rate)

261715 Help (Movie) (2021) 10.27344799041748
270926 Reclaiming Amy (2021) 9.936103820800781
261083 Dune World (2021) 9.629478454589844
221728 Into The Void (2019) 9.330317497253418
244512 The Kings of Appletown (2008) 9.250347137451172
276975 Final Move (2006) 9.208444595336914
122619 Nervous Ticks (1992) 9.187948226928711
250376 Subject to Review (2019) 9.155195236206055
194933 Funeralopolis : A Suburban Portrait (2017) 8.91994857788086
164073 La macchinazione (2016) 8.904389381408691
