In [69]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn import preprocessing
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import random
from tabulate import tabulate
from tqdm import tqdm

In [70]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [71]:
# Load the test ratings table. The cells shown below read only the `user_id`,
# `movie_id` and `rating` columns.
# NOTE(review): the `Unnamed: 0.1` / `Unnamed: 0` columns in the preview look like
# index columns written out by earlier `to_csv` calls — confirm they can be dropped.
data = pd.read_csv('data/test_dataset.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,user_id,movie_id,rating,timestamp
0,62812,888,487,2,880180265
1,8041,266,6,5,878970503
2,37057,534,150,4,879618338
3,65941,756,750,3,888443398
4,32169,221,1010,4,881061049


In [72]:
class RecommendationModel(nn.Module):
    """Rating regressor built from user and movie embeddings plus a small MLP.

    The user and movie ids are embedded separately, the two embeddings are
    concatenated, and the result is passed through two hidden layers
    (linear -> dropout -> ReLU) and a final linear layer with one output.
    """

    def __init__(self, n_users, n_movies, user_embed_dim=96, movie_embed_dim=64,
                 hidden_dim=32, dropout=0.5):
        """
        Args:
            n_users (int): number of unique users (rows of the user embedding table)
            n_movies (int): number of unique movies (rows of the movie embedding table)
            user_embed_dim (int): width of each user embedding
            movie_embed_dim (int): width of each movie embedding
            hidden_dim (int): width of both hidden layers
            dropout (float): dropout probability applied after each hidden linear layer

        The defaults reproduce the original fixed architecture (96 + 64 -> 32 -> 32 -> 1),
        and the attribute names are unchanged, so existing checkpoints still load.
        """
        super().__init__()

        self.user_embed = nn.Embedding(n_users, user_embed_dim)
        self.movie_embed = nn.Embedding(n_movies, movie_embed_dim)

        # The first layer consumes the concatenated embeddings, so its input width
        # is derived from the two embedding widths instead of being hard-coded.
        self.fc1 = nn.Linear(user_embed_dim + movie_embed_dim, hidden_dim)
        self.drop1 = nn.Dropout(p=dropout)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.drop2 = nn.Dropout(p=dropout)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, users, movies):
        """
        Predict a rating for every (user, movie) pair.

        Args:
            users (torch.Tensor): integer ids of users
            movies (torch.Tensor): integer ids of movies, same shape as ``users``

        Returns:
            torch.Tensor: predicted ratings with the shape of the inputs plus a
            trailing dimension of size 1 (``(batch, 1)`` for 1-D id batches)
        """
        # Embedding of user id
        user_id_embeds = self.user_embed(users)
        # Embedding of movie id
        movie_id_embeds = self.movie_embed(movies)

        # Concatenate along the feature (last) dimension; for 1-D id batches this
        # is the same as dim=1, and it also works for ids with extra leading dims.
        x = torch.cat([user_id_embeds, movie_id_embeds], dim=-1)

        x = self.fc1(x)
        x = self.drop1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        x = self.relu2(x)
        x = self.fc3(x)

        return x

In [73]:
class MovieDataset(Dataset):
    """
    Map-style dataset over three parallel sequences: user ids, movie ids and ratings.
    """

    def __init__(self, user_ids, movie_ids, ratings):
        """
        Keep references to the three parallel, index-aligned sequences.

        Args:
            user_ids: ids of users
            movie_ids: ids of movies
            ratings: ratings of the corresponding movies
        """
        self.user_ids, self.movie_ids, self.ratings = user_ids, movie_ids, ratings

    def __len__(self):
        """
        Number of samples, taken from the length of the user-id sequence.

        Returns:
            int: length of the dataset
        """
        return len(self.user_ids)

    def __getitem__(self, idx):
        """
        Fetch one sample as tensors.

        Args:
            idx (int): index of the required item

        Returns:
            dict: with the keys
                user_ids: user id at ``idx`` as a ``torch.long`` tensor
                movie_ids: movie id at ``idx`` as a ``torch.long`` tensor
                ratings: rating at ``idx`` as a ``torch.float32`` tensor
        """
        user = torch.tensor(self.user_ids[idx], dtype=torch.long)
        movie = torch.tensor(self.movie_ids[idx], dtype=torch.long)
        score = torch.tensor(self.ratings[idx], dtype=torch.float32)
        return {"user_ids": user, "movie_ids": movie, "ratings": score}

In [74]:
# Wrap the id and rating columns of the loaded frame in the custom dataset.
test_dataset = MovieDataset(
    user_ids=data["user_id"].values,
    movie_ids=data["movie_id"].values,
    ratings=data["rating"].values,
)

In [75]:
# Number of samples per evaluation batch (passed to the DataLoader below).
test_batch_size = 16

In [76]:
# Serve the test set in dataset order (shuffle=False) using two worker processes.
# NOTE(review): drop_last=True discards a trailing incomplete batch, so up to
# test_batch_size - 1 samples could be left out of the evaluation — confirm the
# dataset length is a multiple of the batch size.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=test_batch_size,
    shuffle=False,
    num_workers=2,
    drop_last=True
)

In [77]:
# The embedding table sizes must match the checkpoint, otherwise
# load_state_dict rejects the weights with a shape mismatch.
model = RecommendationModel(
    n_users=943,
    n_movies=1682,
).to(device)
model_path = '../models/best_model.pth'
# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved on a GPU machine can still be loaded when only the CPU is available.
model.load_state_dict(torch.load(model_path, map_location=device))

loss_func = nn.MSELoss()

In [78]:
def test_model(model, test_loader, loss_func):
    """
    Evaluate ``model`` on ``test_loader`` and print the mean per-batch loss.

    For every batch the square root of ``loss_func(output, rating)`` is taken
    (RMSE when ``loss_func`` is ``nn.MSELoss``), and the printed value is the
    average of these per-batch values. The model is switched to eval mode and
    left in eval mode.

    Args:
        model (nn.Module): model called as ``model(user_ids, movie_ids)``
        test_loader: iterable of batches, each a dict with the keys
            "user_ids", "movie_ids" and "ratings"
        loss_func: callable taking ``(output, target)`` and returning a loss tensor

    Returns:
        None: the result is only printed.
    """
    model.eval()

    # Send inputs to wherever the model's weights live rather than relying on a
    # notebook-level `device` variable being defined and in sync with the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    test_running_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        for test_data in tqdm(test_loader):
            output = model(
                test_data["user_ids"].to(model_device),
                test_data["movie_ids"].to(model_device),
            )
            ratings = test_data["ratings"]
            # Reshape with the batch's own size, so a final partial batch (or a
            # loader built with a different batch size) does not break the view.
            rating = ratings.view(ratings.size(0), -1).to(torch.float32).to(model_device)
            loss = torch.sqrt(loss_func(output, rating))
            # .sum() keeps this working if loss_func returns unreduced losses.
            test_running_loss += loss.sum().item()
            n_batches += 1

    # An empty loader has no defined loss; report NaN instead of dividing by zero.
    mean_loss = test_running_loss / n_batches if n_batches else float("nan")
    print(f"Test loss: {mean_loss}")

In [79]:
# Evaluate the loaded checkpoint on the test loader and print its loss.
test_model(model, test_loader, loss_func)

100%|██████████| 1250/1250 [00:03<00:00, 412.02it/s]

Test loss: 0.9468653615474701





In [80]:
from collections import defaultdict

def get_rating_pairs(valid_loader, model):
    """
    Collect predicted and true ratings for every sample, grouped by user.

    The model is run in eval mode (so dropout is disabled) without gradient
    tracking; its previous train/eval mode is restored before returning.

    Args:
        valid_loader: iterable of batches, each a dict with the tensor entries
            'user_ids', 'movie_ids' and 'ratings'
        model (nn.Module): model called as ``model(user_ids, movie_ids)`` whose
            output has one prediction per row in column 0

    Returns:
        defaultdict(list): maps each user id to a list of dicts with the keys
        'movie_id', 'predicted_rating' and 'true_rating', in loader order.
    """
    rating_pairs = defaultdict(list)

    # Send inputs to wherever the model's weights live rather than relying on a
    # notebook-level `device` variable being defined and in sync with the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in valid_loader:
                user_ids = batch['user_ids']
                movie_ids = batch['movie_ids']
                ratings = batch['ratings']

                output = model(
                    user_ids.to(model_device),
                    movie_ids.to(model_device)
                )

                # Convert each tensor to Python scalars once per batch instead of
                # calling .item() for every element.
                rows = zip(
                    user_ids.tolist(),
                    movie_ids.tolist(),   # previously read from user_ids by mistake
                    output[:, 0].tolist(),
                    ratings.tolist(),
                )
                for user_id, movie_id, pred_rating, true_rating in rows:
                    rating_pairs[user_id].append({
                        'movie_id': movie_id,
                        "predicted_rating": pred_rating,
                        "true_rating": true_rating
                    })
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return rating_pairs

In [81]:
# Gather per-user predicted/true rating pairs over the test loader.
rating_pairs = get_rating_pairs(test_loader, model)

In [82]:
def calculate_metrics(rating_pairs, k=10, threshold=3.5):
    """
    Compute per-user precision@k and recall@k and print their averages.

    An item is "relevant" when its true rating is >= ``threshold`` and
    "recommended" when it is among the user's top-``k`` items by predicted
    rating and its predicted rating is >= ``threshold``.

    Args:
        rating_pairs (dict): maps user id -> list of dicts with the keys
            'predicted_rating' and 'true_rating'. The lists are not modified.
        k (int): number of top predicted items considered per user
        threshold (float): rating at or above which an item counts as relevant
            or recommended

    Returns:
        tuple(dict, dict): ``(precisions, recalls)`` keyed by user id. A user with
        no recommended items gets precision 0; a user with no relevant items gets
        recall 0. Both dicts are empty when ``rating_pairs`` is empty.
    """
    precisions = {}
    recalls = {}
    for user_id, user_ratings in rating_pairs.items():

        # Rank on a copy so the caller's lists keep their original order.
        ranked = sorted(user_ratings, key=lambda x: x['predicted_rating'], reverse=True)
        top_k = ranked[:k]

        # Total number of relevant items for this user
        total_relevant = sum(x['true_rating'] >= threshold for x in user_ratings)

        # Number of top@k items that are predicted relevant (i.e. recommended)
        n_rec_k = sum(x['predicted_rating'] >= threshold for x in top_k)

        # Number of recommended items @k that are truly relevant
        n_rel_and_rec_k = sum(
            ((x['true_rating'] >= threshold) and (x['predicted_rating'] >= threshold))
            for x in top_k
        )

        precisions[user_id] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[user_id] = n_rel_and_rec_k / total_relevant if total_relevant != 0 else 0

    # Guard the averages so an empty input reports 0.0 instead of dividing by zero.
    n_users = len(precisions)
    mean_precision = sum(precisions.values()) / n_users if n_users else 0.0
    mean_recall = sum(recalls.values()) / n_users if n_users else 0.0

    print(f"precision @ {k}: {mean_precision}")
    print(f"recall @ {k} : {mean_recall}")
    return precisions, recalls

In [83]:
# Per-user precision/recall with the function's defaults (k=10, threshold=3.5).
precisions, recalls = calculate_metrics(rating_pairs)

precision @ 10: 0.6845010615711244
recall @ 10 : 0.5097653159096427
