In [None]:
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as numpy
import pandas as pd
import torch
import torch.nn as nn
from sklearn import model_selection, metrics, preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [None]:
# Pick the compute device once: CUDA when a usable GPU is present, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

## Prepare data

In [None]:
# Read the ratings table from disk and print its column / dtype summary.
ratings_csv = "./data/MovieLens/ratings.csv"
df = pd.read_csv(ratings_csv)
df.info()

In [None]:
# (number of distinct users, number of distinct movies) in the ratings table.
tuple(df[column].nunique() for column in ("userId", "movieId"))

In [None]:
# How many rows carry each distinct rating value.
df["rating"].value_counts()

In [None]:
# (rows, columns) of the ratings table.
df.shape

In [None]:
class MovieDataset(Dataset):
    """Map-style dataset of (user, movie, rating) triples.

    Args:
        users: Indexable sequence of integer user indices.
        movies: Indexable sequence of integer movie indices, aligned with ``users``.
        ratings: Indexable sequence of numeric ratings, aligned with ``users``.
    """

    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        """Return the number of samples (the length of ``users``)."""
        return len(self.users)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of scalar tensors.

        ``users`` and ``movies`` are ``torch.long`` so they can index embedding
        tables. ``ratings`` is ``torch.float32``: it is a regression target, and
        casting it to an integer dtype would truncate fractional ratings
        (e.g. 3.5 -> 3).
        """
        return {
            "users": torch.tensor(self.users[item], dtype=torch.long),
            "movies": torch.tensor(self.movies[item], dtype=torch.long),
            "ratings": torch.tensor(self.ratings[item], dtype=torch.float32),
        }

In [None]:
class RecSysModel(nn.Module):
    """Embedding-based rating predictor.

    Each user and each movie gets a learned embedding vector; the two vectors
    are concatenated and passed through a single linear layer that outputs one
    value per (user, movie) pair.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the movie embedding table.
        embed_dim: Width of each embedding vector. Defaults to 32, the value
            that was previously hard-coded.
    """

    def __init__(self, n_users, n_movies, embed_dim=32):
        super().__init__()

        self.user_embed = nn.Embedding(n_users, embed_dim)
        self.movie_embed = nn.Embedding(n_movies, embed_dim)
        # The linear head consumes the concatenated user + movie embeddings.
        self.out = nn.Linear(2 * embed_dim, 1)

    def forward(self, users, movies, ratings=None):
        """Predict one value per (user, movie) pair.

        Args:
            users: Long tensor of user indices, one per sample.
            movies: Long tensor of movie indices, aligned with ``users``.
            ratings: Unused; accepted only so existing call sites keep working.

        Returns:
            Tensor with one row per sample and a single column.
        """
        user_embeds = self.user_embed(users)
        movie_embeds = self.movie_embed(movies)
        return self.out(torch.cat([user_embeds, movie_embeds], dim=1))

In [None]:
# Re-encode the raw user and movie ids with label encoders so they can be used
# as embedding row indices.
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()
df["userId"] = lbl_user.fit_transform(df["userId"].values)
df["movieId"] = lbl_movie.fit_transform(df["movieId"].values)

# Hold out 10% of the rows for testing, stratified on the rating value.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["rating"].values,
)

print(df_train["userId"].values, df_train["movieId"].values, df_train["rating"].values)


def _as_movie_dataset(frame):
    """Wrap the id and rating columns of ``frame`` in a MovieDataset."""
    return MovieDataset(
        users=frame["userId"].values,
        movies=frame["movieId"].values,
        ratings=frame["rating"].values,
    )


train_dataset = _as_movie_dataset(df_train)
test_dataset = _as_movie_dataset(df_test)

In [None]:
# Both loaders use the same settings: mini-batches of 4 with shuffling enabled.
loader_options = {"batch_size": 4, "shuffle": True}
train_loader = DataLoader(dataset=train_dataset, **loader_options)
test_loader = DataLoader(dataset=test_dataset, **loader_options)

# Pull a single training batch to inspect what the loader yields.
dataloader_data = next(iter(train_loader))
dataloader_data

## Build model

In [None]:

# Size the embedding tables from the number of classes each encoder has seen.
n_users = len(lbl_user.classes_)
n_movies = len(lbl_movie.classes_)
model = RecSysModel(n_users=n_users, n_movies=n_movies)
model = model.to(device)

# Adam with its default settings, a StepLR schedule (step_size=3, gamma=0.7)
# attached to it, and mean-squared-error as the training loss.
optimizer = torch.optim.Adam(params=model.parameters())
sch = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=3, gamma=0.7)
loss_fn = nn.MSELoss()

In [None]:
# Sanity figures, one per line: encoded user count, encoded movie count,
# largest encoded movie id, and number of training samples.
for value in (
    len(lbl_user.classes_),
    len(lbl_movie.classes_),
    df.movieId.max(),
    len(train_dataset),
):
    print(value)

## Training

Manually run a forward pass

In [None]:
# Show the index tensors of the sample batch, each followed by its shape.
for field in ('users', 'movies'):
    batch_ids = dataloader_data[field]
    print(batch_ids)
    print(batch_ids.size())

# Stand-alone layers with the same sizes as the ones inside RecSysModel, used
# to step through the forward computation by hand.
user_embed = nn.Embedding(num_embeddings=len(lbl_user.classes_), embedding_dim=32)
movie_embed = nn.Embedding(num_embeddings=len(lbl_movie.classes_), embedding_dim=32)

out = nn.Linear(in_features=64, out_features=1)

In [None]:
# Look up the embedding vectors for the users and movies of the sample batch.
user_embeds = user_embed(dataloader_data['users'])
movie_embeds = movie_embed(dataloader_data['movies'])

# For each lookup, print its shape first and then its values.
for label, embeds in (('user_embeds', user_embeds), ('movie_embeds', movie_embeds)):
    print(f'{label} {embeds.size()}')
    print(f'{label} {embeds}')

In [None]:
# Join the user and movie embeddings side by side along the feature axis.
concat_embeds = torch.cat((user_embeds, movie_embeds), dim=1)
print(f'output: {concat_embeds.size()}')
print(f'output: {concat_embeds}')

# Feed the joined features through the linear layer.
output = out(concat_embeds)
print(f'output: {output}')

In [None]:
# Run the actual model on the sample batch with autograd tracking disabled.
batch_users = dataloader_data['users'].to(device)
batch_movies = dataloader_data['movies'].to(device)
with torch.inference_mode():
    model_output = model(batch_users, batch_movies)
    print(f'model_output: {model_output}, size: {model_output.size()}')

In [None]:
rating = dataloader_data['ratings']

# Print the targets as loaded, the targets reshaped to 4 rows, and the model's
# predictions for the same batch.
for shown in (rating, rating.view(4, -1), model_output):
    print(shown)

# Sum of the targets, then the gap between summed predictions and summed targets.
rating_total = rating.sum()
print(rating_total)
print(model_output.sum() - rating_total)

Training Loop

In [None]:
# Training configuration and running statistics.
epochs = 3
total_loss = 0.0            # sum of per-sample squared errors since the last report
plot_steps, print_steps = 5000, 5000
step_cnt = 0                # number of training samples processed so far
samples_since_report = 0    # number of samples accumulated in total_loss
next_report_at = plot_steps
all_losses_list = []

model.train()
for epoch in range(epochs):
    for i, train_data in enumerate(train_loader):
        users = train_data["users"].to(device)
        movies = train_data["movies"].to(device)
        # Reshape the targets to one column so they line up with the model
        # output; -1 lets the row count follow the actual batch size, so a
        # smaller final batch no longer raises.
        rating = train_data["ratings"].to(device).view(-1, 1).to(torch.float32)

        output = model(users, movies)
        loss = loss_fn(output, rating)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is the batch mean, so weight it by the batch size to get a
        # running sum that can be averaged per sample.
        batch_len = len(users)
        total_loss += loss.item() * batch_len
        samples_since_report += batch_len
        step_cnt += batch_len

        # Report once every plot_steps samples; using >= keeps reporting alive
        # even when step_cnt never lands exactly on a multiple of plot_steps.
        if step_cnt >= next_report_at:
            avg_loss = total_loss / samples_since_report
            print(f'epoch {epoch} loss at step: {step_cnt} is {avg_loss}')
            all_losses_list.append(avg_loss)
            total_loss = 0.0
            samples_since_report = 0
            next_report_at += plot_steps

    # Advance the learning-rate schedule once per epoch; without this call the
    # scheduler never changes the learning rate.
    sch.step()

In [None]:
# Make sure the target directory exists before saving; otherwise torch.save
# fails because it does not create missing parent directories.
os.makedirs("models", exist_ok=True)
torch.save(obj=model.state_dict(), f="models/movielens_model.pth")

In [None]:
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can still be loaded on a CPU-only one.
model.load_state_dict(torch.load(f="models/movielens_model.pth", map_location=device))

In [None]:
# Plot the running-average training loss recorded by the training loop, with a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(all_losses_list)
ax.set_title("Training loss")
ax.set_xlabel(f"Report index (one point per {plot_steps} training samples)")
ax.set_ylabel("Average loss")
plt.show()

In [None]:
# Per-sample predictions and targets collected over the whole test set.
model_output_list = []
target_rating_list = []

n_preview = 3  # only echo the first few batches to keep the cell output short

model.eval()

with torch.inference_mode():
    for i, test_data in enumerate(test_loader):
        users = test_data['users'].to(device)
        movies = test_data['movies'].to(device)
        ratings = test_data['ratings'].to(device)

        model_output = model(users, movies)

        # Keep one entry per sample. Averaging within each batch first would
        # cancel out individual errors and understate the RMSE.
        model_output_list.extend(model_output.flatten().tolist())
        target_rating_list.extend(ratings.flatten().tolist())

        if i < n_preview:
            print(f'Model output: {model_output}, target rating: {ratings}')

# Take the square root explicitly rather than passing squared=False, which is
# deprecated/removed in recent scikit-learn releases.
rms = mean_squared_error(target_rating_list, model_output_list) ** 0.5
print(f"rms: {rms}")

In [None]:
# Maps each user id to a list of (predicted rating, true rating) pairs taken
# from the test set.
user_est_true = defaultdict(list)

with torch.inference_mode():
    for test_data in test_loader:
        users = test_data['users'].to(device)
        movies = test_data['movies'].to(device)
        ratings = test_data['ratings'].to(device)

        model_output = model(users, movies)

        # Convert the whole batch to plain Python numbers once, then walk the
        # samples in order.
        batch_rows = zip(
            users.tolist(),
            movies.tolist(),
            model_output[:, 0].tolist(),
            ratings.tolist(),
        )
        for user_id, movie_id, pred_rating, true_rating in batch_rows:
            print(f'User {user_id} and Movie {movie_id}:')
            print(f'Prediction: {pred_rating}, True: {true_rating}')
            user_est_true[user_id].append((pred_rating, true_rating))
        
        

In [None]:
# Ranking-quality settings: judge the top-k predictions of each user, and
# treat a rating as "relevant" when it is at least `threshold`.
k = 100
threshold = 3.5

precisions = {}
recalls = {}

with torch.inference_mode():
    for user_id, user_ratings in user_est_true.items():
        # Highest predicted rating first (sorted in place).
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        # Items the user actually rated as relevant, over all of their test items.
        n_rel = sum(1 for _, true_r in user_ratings if true_r >= threshold)

        # Top-k items that the model predicts to be relevant.
        n_rec_k = sum(1 for est, _ in top_k if est >= threshold)

        # Top-k items that are both predicted relevant and actually relevant.
        n_rel_and_rec_k = sum(
            1 for est, true_r in top_k
            if true_r >= threshold and est >= threshold
        )
        print(f'User {user_id}, n_rel {n_rel}, n_rec_k {n_rec_k}, n_rel_and_rec_k {n_rel_and_rec_k}')

        # An empty denominator yields a score of 0 instead of a division error.
        if n_rec_k != 0:
            precisions[user_id] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[user_id] = 0

        if n_rel != 0:
            recalls[user_id] = n_rel_and_rec_k / n_rel
        else:
            recalls[user_id] = 0
        

In [None]:
# Average the per-user scores over all users that appear in the test set.
mean_precision = sum(precisions.values()) / len(precisions)
mean_recall = sum(recalls.values()) / len(recalls)
print(f'precision at {k}: {mean_precision}')
print(f'recall @ {k}: {mean_recall}')