In [9]:
# NOTE: the former `!conda activate filippo` line was removed. A `!` command runs in a
# throwaway subshell, so it cannot change the environment of the already-running kernel,
# and here it only produced "CondaError: Run 'conda init' before 'conda activate'".
# Pick the environment by selecting the matching Jupyter kernel instead.
import os
import pandas as pd

# Project-local helper; prints a VRAM usage summary for the current machine.
import utils
utils.print_vram_info()

import torch
print("Cuda version", torch.version.cuda)


CondaError: Run 'conda init' before 'conda activate'

Total VRAM: 31.25 GB
Available VRAM: 29.21 GB
Used VRAM: 1.54 GB
Percentage Used: 6.5%
Torch version 2.7.1+cu128
Cuda version 12.8


In [2]:
from pathlib import Path

# Directory with the crawled rating dumps: "special.csv" plus shards whose file name
# (without extension) consists only of digits.
ratings_dir = Path("crawled/ratings/")

# How many numeric shards to load on top of "special.csv".
num_csv = 2

# Path.glob yields entries in an arbitrary, filesystem-dependent order, so the shards are
# sorted before slicing; otherwise `csv_files[:num_csv]` could pick different files on
# different machines or runs. Shorter names sort first, then lexicographically, which is
# numeric order for ids that are not zero-padded.
csv_files = sorted(
    (f for f in ratings_dir.glob("*.csv") if f.stem.isdigit()),
    key=lambda f: (len(f.stem), f.stem),
)

# Stack "special.csv" and the selected shards into one frame with a fresh 0..n-1 index.
df_all = pd.concat(
    [pd.read_csv(ratings_dir / "special.csv")]
    + [pd.read_csv(f) for f in csv_files[:num_csv]],
    ignore_index=True,
)

len(df_all)

379832

In [3]:
# Tunable filtering thresholds (previously inline magic numbers).
MIN_RATINGS_PER_MOVIE = 30   # a movie needs at least this many ratings in df_all to be kept
N_SAMPLED_USERS = 100        # number of users kept, taken in order of first appearance

print("Total ratings size: ", len(df_all))

# Keep only ratings of movies that were rated often enough.
movie_popularity = df_all["movie_id"].value_counts()
popular_movies = movie_popularity[movie_popularity >= MIN_RATINGS_PER_MOVIE].index
df = df_all[df_all["movie_id"].isin(popular_movies)]
print("Ratings size after movie filtering: ", len(df))

# Keep only the first N_SAMPLED_USERS distinct users (order of appearance, not random).
unique_users = df["user_id"].unique()
sampled_users = unique_users[0:N_SAMPLED_USERS]
# Explicit .copy(): `df` gets new columns assigned later in the notebook, and writing into a
# frame obtained by filtering another frame triggers pandas' SettingWithCopyWarning.
df = df[df["user_id"].isin(sampled_users)].copy()
print("Ratings size after user filtering: ", len(df))

Total ratings size:  379832
Ratings size after movie filtering:  147188
Ratings size after user filtering:  72042


In [4]:
from fastai.data.block import DataBlock, CategoryBlock, RegressionBlock
from fastai.data.transforms import ColReader
from fastai.collab import CollabDataLoaders
import numpy as np

# Per-movie popularity feature: log(1 + number of ratings the movie has in `df`).
# `assign` returns a new frame instead of writing a column into `df` in place; `df` was
# produced by boolean filtering, so an in-place column write raises SettingWithCopyWarning
# and mutates the input of this cell. Re-running the cell gives the same result.
df = df.assign(
    movie_popularity=df['movie_id'].map(np.log1p(df['movie_id'].value_counts()))
)

# TODO: a split may leave a user or movie only in the validation set. When that happens the
# pipeline crashes, so we may need categorical encodings that tolerate unseen categories,
# or a cold-start split by user.
# NOTE(review): no `splitter` is passed, so fastai's default split is used — presumably a
# random, unseeded one; confirm, and pass a seeded splitter if reproducible splits matter.
dblock = DataBlock(
    # Two categorical inputs (user, movie), one numeric input (popularity), numeric target.
    blocks=(CategoryBlock, CategoryBlock, RegressionBlock, RegressionBlock),
    getters=[
        ColReader('user_id'),
        ColReader('movie_id'),
        ColReader('movie_popularity'),
        ColReader('rating'),
    ],
    # The first three getters are model inputs; the remaining one ('rating') is the target.
    n_inp=3,
)
dls = dblock.dataloaders(df, bs=64)

In [5]:
from fastai.learner import Module
import torch.nn as nn
import torch
import sys

def sigmoid_range(x, y_range):
    """Squash ``x`` through a sigmoid and rescale it into the interval ``y_range``.

    Args:
        x: tensor of unbounded scores.
        y_range: ``(low, high)`` pair giving the bounds of the output interval.

    Returns:
        Tensor of the same shape as ``x`` with values between ``low`` and ``high``.
    """
    lower, upper = y_range
    span = upper - lower
    squashed = torch.sigmoid(x)
    return squashed * span + lower

def create_random_parameter(sizes):
    """Create a trainable parameter of shape ``sizes`` initialised from N(0, 0.01^2).

    Args:
        sizes: iterable of ints giving the dimensions of the parameter.

    Returns:
        An ``nn.Parameter`` filled with small normally distributed values.
    """
    weights = torch.zeros(*sizes)
    weights.normal_(mean=0, std=0.01)  # in-place fill; small values keep initial scores near 0
    return nn.Parameter(weights)
                        
class DotProductBias(Module):
    """Dot-product collaborative-filtering model with biases and a popularity penalty.

    The raw score for a (user, movie) pair is
    ``dot(user_factors, movie_factors) + user_bias + movie_bias
    - popularity_penalty * movie_popularity``, which is then squashed into ``y_range``.

    Args:
        n_users: number of rows in the user embedding and bias tables.
        n_movies: number of rows in the movie embedding and bias tables.
        n_factors: length of each latent-factor vector.
        y_range: ``(low, high)`` bounds of the predicted rating.
        popularity_penalty: weight subtracted per unit of ``movie_popularity``;
            the default 0.1 reproduces the previously hard-coded value.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5), popularity_penalty=0.1):
        # No explicit super().__init__() call, same as the original code that used fastai's
        # `Module` as the base class.
        self.user_factors = create_random_parameter([n_users, n_factors])
        self.movie_factors = create_random_parameter([n_movies, n_factors])
        self.user_bias = create_random_parameter([n_users])
        self.movie_bias = create_random_parameter([n_movies])
        self.y_range = y_range
        self.popularity_penalty = popularity_penalty

    def forward(self, user_indices, movie_indices, movie_popularity):
        """Predict ratings for aligned batches of users, movies and popularity values.

        Args:
            user_indices: integer tensor of row indices into the user tables.
            movie_indices: integer tensor of row indices into the movie tables.
            movie_popularity: tensor of popularity values, one per pair
                (assumes the same leading dimension as the index tensors — TODO confirm).

        Returns:
            Tensor of predicted ratings bounded by ``self.y_range``.
        """
        user_factors = self.user_factors[user_indices]
        movie_factors = self.movie_factors[movie_indices]
        user_bias = self.user_bias[user_indices]
        movie_bias = self.movie_bias[movie_indices]

        # Per-pair dot product over the factor dimension.
        interaction = (user_factors * movie_factors).sum(dim=1)
        res = interaction + user_bias + movie_bias - self.popularity_penalty * movie_popularity

        return sigmoid_range(res, self.y_range)

In [6]:
from fastai.learner import Learner
from fastai.learner import MSELossFlat

# Size the embedding tables from the category vocabularies, because the indices fed to the
# model are positions in those vocabularies. Counting unique values in `dls.items` only
# matches as long as the vocabulary holds exactly those values and nothing else.
n_users = len(dls.train_ds.vocab[0])   # vocab[0] -> first CategoryBlock ('user_id')
n_movies = len(dls.train_ds.vocab[1])  # vocab[1] -> second CategoryBlock ('movie_id')

# 50 latent factors per user / movie.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [7]:
# One-cycle training run: 2 epochs, peak learning rate 0.003, weight decay 0.1.
learn.fit_one_cycle(
    2,
    lr_max=0.003,
    wd=0.1,
)

epoch,train_loss,valid_loss,time
0,0.627797,0.635843,00:04
1,0.566519,0.577532,00:04


In [8]:
# Category vocabularies of the two categorical inputs (users first, movies second).
user_vocab = dls.train_ds.vocab[0]
movie_vocab = dls.train_ds.vocab[1]

# Reverse lookups: raw label -> position in its vocabulary (the index the model expects).
user2idx = {label: position for position, label in enumerate(user_vocab)}
movie2idx = {label: position for position, label in enumerate(movie_vocab)}

def predict_ratings(user_name, top_n=100):
    """Rank the movies a user has not rated yet by predicted rating.

    Reads the notebook-level `df`, `model`, `user2idx`, `movie2idx` and `movie_vocab`.

    Args:
        user_name: raw user id; must be a key of `user2idx`.
        top_n: maximum number of recommendations to return (default 100).

    Returns:
        List of ``(movie_name, predicted_rating, movie_popularity)`` tuples sorted by
        predicted rating, highest first, truncated to ``top_n`` entries. Empty if the
        user has already rated every movie in the vocabulary.

    Raises:
        KeyError: if ``user_name`` is not in `user2idx`.
    """
    if user_name not in user2idx:
        raise KeyError(f"unknown user: {user_name!r}")
    user_index = user2idx[user_name]

    # Build the watched set once; rebuilding it per vocabulary entry is quadratic.
    watched_movies = set(df.loc[df['user_id'] == user_name, 'movie_id'].astype(str))
    movie_pop_map = (
        df.drop_duplicates('movie_id')
          .set_index('movie_id')['movie_popularity']
          .to_dict()
    )
    candidates = [movie for movie in movie_vocab if movie not in watched_movies]
    if not candidates:
        return []

    device = next(model.parameters()).device
    movie_ids = torch.tensor([movie2idx[movie] for movie in candidates], device=device)
    # Repeat the single user index so it lines up with every candidate movie.
    user_ids = torch.full_like(movie_ids, user_index)
    popularity = torch.tensor(
        [movie_pop_map[movie] for movie in candidates],
        dtype=torch.float32,
        device=device,
    )

    # Score all candidates in one batched pass; no gradients are needed for inference.
    with torch.no_grad():
        ratings = model(user_ids, movie_ids, popularity)

    prediction_list = list(zip(candidates, ratings.tolist(), popularity.tolist()))
    prediction_list.sort(key=lambda entry: entry[1], reverse=True)
    return prediction_list[:top_n]

# Example: show the recommendations for one user. The name must be a key of `user2idx`,
# because `predict_ratings` looks the user up there.
predict_ratings("drakman")

[('a-brighter-summer-day', 4.3573317527771, 2.7725887298583984),
 ('psycho', 4.355795383453369, 4.127134323120117),
 ('some-like-it-hot', 4.346652507781982, 3.4011974334716797),
 ('twin-peaks', 4.335939407348633, 3.0910425186157227),
 ('come-and-see', 4.303749084472656, 3.465735912322998),
 ('once-upon-a-time-in-the-west', 4.302332878112793, 3.2188758850097656),
 ('twin-peaks-the-return', 4.295563697814941, 3.0910425186157227),
 ('taste-of-cherry', 4.289790630340576, 2.995732307434082),
 ('all-about-my-mother', 4.284285068511963, 2.7725887298583984),
 ('casablanca', 4.238131046295166, 3.6635615825653076),
 ('the-battle-of-algiers', 4.233630657196045, 2.890371799468994),
 ('a-separation', 4.232206344604492, 2.7725887298583984),
 ('one-flew-over-the-cuckoos-nest', 4.23062801361084, 3.4011974334716797),
 ('before-sunset', 4.215855598449707, 3.6375861167907715),
 ('the-apartment', 4.2122907638549805, 3.5835189819335938),
 ('lawrence-of-arabia', 4.20846700668335, 3.178053855895996),
 ('sing

In [9]:
# Save the trained learner under the name below (the recorded output shows the resulting
# path under models/ with a .pth suffix appended).
# NOTE(review): the file name reads like "movies with 50+ ratings, 500 users, 100 embedding
# factors", but this notebook keeps movies with >= 30 ratings, samples 100 users and builds
# the model with 50 factors — confirm and rename if the name is stale.
learn.save('movie_50+_user_500_embeddings_100.pkl')

Path('models/movie_50+_user_500_embeddings_100.pkl.pth')