In [None]:
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as numpy
import pandas as pd
import torch
import torch.nn as nn
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

## Prepare data

In [None]:
# Read the ratings table from disk, then print its column/dtype summary.
ratings_csv = "./data/MovieLens/ratings.csv"
df = pd.read_csv(ratings_csv)
df.info()

In [None]:
# (distinct users, distinct movies) present in the ratings table.
tuple(df[column].nunique() for column in ("userId", "movieId"))

In [None]:
# Number of rows per distinct rating value.
df["rating"].value_counts()

In [None]:
# (number of rows, number of columns) of the ratings table.
df.shape

In [None]:
class MovieDataset(Dataset):
    """Map-style dataset of aligned (user, movie, rating) records.

    Args:
        users: Indexable sequence of integer user indices.
        movies: Indexable sequence of integer movie indices, aligned with
            ``users``.
        ratings: Indexable sequence of numeric ratings, aligned with ``users``.
    """

    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        """Return the number of records (the length of ``users``)."""
        return len(self.users)

    def __getitem__(self, item):
        """Return the record at position ``item`` as a dict of scalar tensors.

        Returns:
            dict with keys ``"users"`` and ``"movies"`` (``torch.long``, usable
            as embedding indices) and ``"ratings"`` (``torch.float32``).
        """
        return {
            "users": torch.tensor(self.users[item], dtype=torch.long),
            "movies": torch.tensor(self.movies[item], dtype=torch.long),
            # The rating is a regression target: keep it floating point so a
            # fractional rating such as 3.5 is not truncated to 3.
            "ratings": torch.tensor(self.ratings[item], dtype=torch.float32),
        }

In [None]:
class RecSysModel(nn.Module):
    """Rating regressor built from a user embedding and a movie embedding.

    The two embeddings are concatenated and passed through a single linear
    layer that produces one score per (user, movie) pair.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the movie embedding table.
        embed_dim: Width of each embedding vector. Defaults to 32.
    """

    def __init__(self, n_users, n_movies, embed_dim=32):
        super().__init__()

        self.user_embed = nn.Embedding(n_users, embed_dim)
        self.movie_embed = nn.Embedding(n_movies, embed_dim)
        # The linear layer consumes the concatenation of both embeddings.
        self.out = nn.Linear(2 * embed_dim, 1)

    def forward(self, users, movies, ratings=None):
        """Score each (user, movie) pair.

        Args:
            users: 1-D long tensor of user indices, shape ``(batch,)``.
            movies: 1-D long tensor of movie indices, shape ``(batch,)``.
            ratings: Accepted but not used by the forward pass.

        Returns:
            Float tensor of shape ``(batch, 1)`` with the predicted scores.
        """
        user_embeds = self.user_embed(users)
        movie_embeds = self.movie_embed(movies)
        features = torch.cat([user_embeds, movie_embeds], dim=1)
        return self.out(features)

In [None]:
# Re-index raw user and movie ids with label encoders so the encoded values
# can be used as embedding row indices.
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()

df.userId = lbl_user.fit_transform(df.userId.values)
df.movieId = lbl_movie.fit_transform(df.movieId.values)

BATCH_SIZE = 28
NUM_SPLITS = 5
# Fixed seed for the fold assignment: without it every run draws a different
# split, so a checkpoint reloaded in a later session would be evaluated on
# samples it was trained on.
SEED = 42

dataset = MovieDataset(
    users=df.userId.values,
    movies=df.movieId.values,
    ratings=df.rating.values,
)
kfold = KFold(n_splits=NUM_SPLITS, shuffle=True, random_state=SEED)

# One (train, test) loader pair per fold; entry i of each list belongs to fold i.
train_dataloaders = []
test_dataloaders = []

for train_ids, test_ids in kfold.split(dataset):
    # Each sampler restricts its loader to the fold's index subset.
    train_subsampler = SubsetRandomSampler(train_ids)
    test_subsampler = SubsetRandomSampler(test_ids)

    # drop_last=True discards the final partial batch, so every batch holds
    # exactly BATCH_SIZE samples.
    train_dataloaders.append(
        DataLoader(dataset, batch_size=BATCH_SIZE, sampler=train_subsampler, drop_last=True)
    )
    test_dataloaders.append(
        DataLoader(dataset, batch_size=BATCH_SIZE, sampler=test_subsampler, drop_last=True)
    )

# Show one training batch as a sanity check.
next(iter(train_dataloaders[0]))

## Build model

In [None]:
# Size the embedding tables from the fitted encoders: one row per known class.
n_users = len(lbl_user.classes_)
n_movies = len(lbl_movie.classes_)

model = RecSysModel(n_users=n_users, n_movies=n_movies)
model = model.to(device)

# Adam optimizer; StepLR scales the learning rate by 0.8 every 3 scheduler steps.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=3, gamma=0.8)
loss_fn = nn.MSELoss()

## Training

In [None]:
epochs = 100
train_loss_values = []

# Train on one fold's training split only. Rotating through all folds would
# show the model every sample, including the ones held out in
# test_dataloaders[0], and make any evaluation on that split meaningless.
TRAIN_FOLD = 0
train_loader = train_dataloaders[TRAIN_FOLD]

model.train()
for epoch in range(epochs):
    running_loss = 0.0
    for train_data in train_loader:
        users = train_data["users"].to(device)
        movies = train_data["movies"].to(device)
        ratings = train_data["ratings"].to(device)

        output = model(users, movies)

        # Reshape the targets to the model's (batch, 1) output. Inferring the
        # batch dimension avoids hard-coding the batch size here.
        rating = ratings.view(-1, 1).to(torch.float32)

        loss = loss_fn(output, rating)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    # Average over the batches of the loader that was actually iterated.
    avg_train_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch}: {avg_train_loss}')
    train_loss_values.append(avg_train_loss)

## Save model

In [None]:
# Write the trained weights to disk. The target directory is created first
# because torch.save does not create missing parent directories.
model_path = Path("models/movielens_model.pth")
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(obj=model.state_dict(), f=model_path)

In [None]:
# Restore the saved weights. map_location places the loaded tensors on the
# active device, so a checkpoint written on a GPU machine also loads on a
# CPU-only one.
model.load_state_dict(torch.load(f="models/movielens_model.pth", map_location=device))

In [None]:
# Line plot of the values collected in train_loss_values.
fig, ax = plt.subplots()
ax.plot(train_loss_values)
plt.show()

## Validation

In [None]:
# Per-sample predictions and targets gathered over the held-out split.
model_output_list = []
target_rating_list = []

model.eval()
correct, total = 0, 0
with torch.inference_mode():
    for test_data in test_dataloaders[0]:
        users = test_data['users'].to(device)
        movies = test_data['movies'].to(device)
        ratings = test_data['ratings'].to(device)

        preds = model(users, movies)
        # Flatten (batch, 1) -> (batch,) so predictions align with the targets.
        flat_preds = preds.view(-1)

        # A prediction counts as correct when, rounded to the nearest integer,
        # it equals the target. Sizes come from the batch itself, so a short
        # final batch would also be handled.
        correct += (torch.round(flat_preds) == ratings).sum().item()
        total += len(users)

        # Keep every sample: an RMSE over per-batch means would average the
        # errors away and understate the true per-rating error.
        model_output_list.extend(flat_preds.tolist())
        target_rating_list.extend(ratings.tolist())

# Take the square root explicitly; the `squared=False` shortcut is deprecated
# and absent from recent scikit-learn releases.
rms = mean_squared_error(target_rating_list, model_output_list) ** 0.5
print(f"rms: {rms}")
print(f'Accuracy: {round(correct / total * 100, 2)}')

In [None]:
# user index -> list of (rounded predicted rating, true rating) pairs.
user_est_true = defaultdict(list)

with torch.inference_mode():
    for test_data in test_dataloaders[0]:
        users = test_data['users'].to(device)
        movies = test_data['movies'].to(device)
        ratings = test_data['ratings'].to(device)

        model_output = model(users, movies)

        # Convert the whole batch to Python scalars once, then walk the rows.
        batch_rows = zip(
            users.tolist(),
            movies.tolist(),
            model_output[:, 0].tolist(),
            ratings.tolist(),
        )
        for user_id, movie_id, raw_pred, true_rating in batch_rows:
            pred_rating = round(raw_pred)

            print(f'User {user_id} and Movie {movie_id}:')
            print(f'Prediction: {pred_rating}, True: {true_rating}')
            user_est_true[user_id].append((pred_rating, true_rating))
        
        

In [None]:
def relevance_counts_at_k(user_ratings, k, threshold):
    """Count relevant and recommended items for a single user.

    Args:
        user_ratings: List of ``(estimated, true)`` rating pairs for one user.
        k: Number of top items, ranked by estimated rating, to treat as the
            recommendation list.
        threshold: Minimum rating for an item to count as relevant (true
            rating) or recommended (estimated rating).

    Returns:
        Tuple ``(n_rel, n_rec_k, n_rel_and_rec_k)``: relevant items overall,
        recommended items within the top ``k``, and items within the top ``k``
        that are both relevant and recommended.
    """
    # Rank by estimated rating, best first, without mutating the caller's list.
    ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
    top_k = ranked[:k]

    n_rel = sum(true_r >= threshold for _, true_r in ranked)
    n_rec_k = sum(est >= threshold for est, _ in top_k)
    n_rel_and_rec_k = sum(
        (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
    )
    return n_rel, n_rec_k, n_rel_and_rec_k


# The metric works on plain Python numbers, so no torch context is needed.
k = 100
threshold = 3.5

precisions = dict()
recalls = dict()

for user_id, user_ratings in user_est_true.items():
    n_rel, n_rec_k, n_rel_and_rec_k = relevance_counts_at_k(user_ratings, k, threshold)
    print(f'User {user_id}, n_rel {n_rel}, n_rec_k {n_rec_k}, n_rel_and_rec_k {n_rel_and_rec_k}')

    # Precision@k: share of recommended items that are relevant (0 when
    # nothing was recommended).
    precisions[user_id] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
    # Recall@k: share of relevant items that were recommended (0 when the user
    # has no relevant items).
    recalls[user_id] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
        

In [None]:
# Average the per-user metrics over all users and report them.
mean_precision = sum(precisions.values()) / len(precisions)
print(f'precision at {k}: {mean_precision}')
mean_recall = sum(recalls.values()) / len(recalls)
print(f'recall @ {k}: {mean_recall}')