# Project 2. InfoExplorers.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset
import pickle
import random

In [2]:
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

## Data loading

In [3]:
class RatingsData(TensorDataset):
    """
    Class overriding TensorDataset for returning user and movie id along with corresponding rating
    """
    
    def __init__(self, root, train=True):
        self.n_users = 943
        self.n_items = 1682
        
        filename = '/kaggle/input/dis-project-2-recommender-systems/train_ratings.csv'

        df = pd.read_csv(filename)

        ratio = 0.7

        if train:
            df = df
        else:
            df = df.iloc[int(len(df) * ratio):]

        # Mapping for movie ids based on movies.csv dataset
        sorted_unique_ids = sorted(pd.read_csv('/kaggle/input/dis-project-2-recommender-systems/movies.csv')['movieId'].unique())
        id_to_index_mapping = {id: idx for idx, id in enumerate(sorted_unique_ids)}
        
        # Replace values in the 'movieId' column with the corresponding indices
        df['movieId'] = df['movieId'].map(id_to_index_mapping)

        # Mapping for user ids
        sorted_unique_ids = sorted(df['userId'].unique())
        id_to_index_mapping = {id: idx for idx, id in enumerate(sorted_unique_ids)}
        
        # Replace values in the 'userId' column with the corresponding indices
        df['userId'] = df['userId'].map(id_to_index_mapping)
        
        # Convert user ids, item ids, and ratings to PyTorch tensors
        user_ids = torch.LongTensor(df.userId.values)
        item_ids = torch.LongTensor(df.movieId.values)
        ratings = torch.Tensor(df.rating.values)

        super(RatingsData, self).__init__(user_ids, item_ids, ratings)

In [4]:
df_movies = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems/movies.csv')
df_links = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems/links.csv')[['movieId', 'tmdbId', 'imdbId']]
df_movies = df_movies.merge(df_links, how='left', on='movieId')

# One-hot encoding for genre
df_movies = pd.concat([df_movies, df_movies.genres.str.get_dummies('|')], axis=1)
df_movies = df_movies.drop(columns=['(no genres listed)', 'genres'])

sorted_unique_ids = sorted(df_movies['movieId'].unique())

# Creating a mapping of IDs to indices based on the sorted order
id_to_index_mapping = {id: idx for idx, id in enumerate(sorted_unique_ids)}

# Replacing values in column movieId with the corresponding indices
df_movies['movieId'] = df_movies['movieId'].map(id_to_index_mapping)

df_movies.head()

Unnamed: 0,movieId,title,tmdbId,imdbId,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,Toy Story (1995),862.0,114709,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Jumanji (1995),8844.0,113497,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Grumpier Old Men (1995),15602.0,113228,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,3,Waiting to Exhale (1995),31357.0,114885,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,4,Father of the Bride Part II (1995),11862.0,113041,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Extracting movie details by id

In [5]:
try:
    with open('/kaggle/input/project-2-precomputed/movie_info.pickle', 'rb') as handle:
        movie_info = pickle.load(handle)
except:
    movie_info = []
    for tmdb_id in tqdm(df_movies.tmdbId):
        try:
            movie_detail = movie.details(tmdb_id)
            movie_info.append([movie_detail['vote_average'], 
                               movie_detail['overview'], 
                               movie_detail['popularity']])
        except:
            movie_info.append([])
    with open('movie_info.pickle', 'wb') as handle:
        pickle.dump(movie_info, handle, protocol=pickle.HIGHEST_PROTOCOL)


## Processing movie information

In [6]:
movie_scores = []
movie_plot = []
movie_popularity = []

with open('/kaggle/input/project-2-precomputed/movie_plot.pickle', 'rb') as handle:
    movie_plot = pickle.load(handle)

for movie_detail in tqdm(movie_info):
    try:
        movie_scores.append(movie_detail[0])
        movie_popularity.append(movie_detail[2])
    except:
        movie_scores.append(np.NaN)
        movie_popularity.append(np.NaN)

  0%|          | 0/9742 [00:00<?, ?it/s]

In [7]:
df_movies['ratings'] = movie_scores
df_movies['popularity'] = movie_popularity

df_movies.head()

Unnamed: 0,movieId,title,tmdbId,imdbId,Action,Adventure,Animation,Children,Comedy,Crime,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,ratings,popularity
0,0,Toy Story (1995),862.0,114709,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,7.97,120.836
1,1,Jumanji (1995),8844.0,113497,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,7.238,18.995
2,2,Grumpier Old Men (1995),15602.0,113228,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,6.494,19.23
3,3,Waiting to Exhale (1995),31357.0,114885,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,6.183,19.823
4,4,Father of the Bride Part II (1995),11862.0,113041,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,6.235,23.093


## Choosing a device

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')
if torch.cuda.is_available():
    print(f'Cuda device: {torch.cuda.get_device_name(torch.cuda.current_device())}')

Using device: cuda
Cuda device: Tesla P100-PCIE-16GB


## Ratings dataset

We will train the recommender system on the dataset in which element consists of these values:
* `user_id` - id of the user (the smallest user id is 1)
* `item_id` - id of the movie (the smallest item id is 1)
* `rating` - rating given by the user to the movie
* `movie_plot` - embeddings of textual movie description
* `movie_features` - additional features of the movie including:
    * popularity
    * public rating
    * genre

The recommender system need to predict the rating for any given pair of `user_id` and `item_id`.

We measure the objective of the predicted ratings using the mean-squared error (MSE) loss

In [9]:
trainset = RatingsData(root=None, train=True)
testset = RatingsData(root=None, train=False)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=10, shuffle=False)

x = trainset[5]
print(f'user_id={x[0]}, item_id={x[1]}, rating={x[2]}')

user_id=599, item_id=1548, rating=3.5


## Our recommender system

In [10]:
class RecommenderSystem(nn.Module):
    """
    Neural network for movie recommender system class.
    """
    
    def __init__(self, n_users, n_items):
        """
        Args:
          n_users: Number of users.
          n_items: Number of items.
        """
        super(RecommenderSystem, self).__init__()
        self.embedding_user = nn.Embedding(n_users+1,10)
        self.embedding_item = nn.Embedding(n_items+1,10)
        self.user_linear = nn.Linear(10, 20)
        self.content_linear = nn.Linear(384, 20)
        self.layer_item = nn.Sequential(
                    nn.Linear(51, 60),
                    nn.Tanh(),
                    nn.Dropout(0.3),
                    nn.Linear(60, 10)
                    )
        
        self.layers = nn.Sequential(
                    nn.Linear(20, 40),
                    nn.Tanh(),
                    nn.Dropout(0.3),
                    nn.Linear(40, 10),
                    nn.Tanh(),
                    nn.Dropout(0.2),
                    nn.Linear(10,1)
                    )
                    
        self.attention = nn.MultiheadAttention(embed_dim=20, num_heads=1)
        
    def forward(self, user_ids, item_ids, item_feature, plot_feature):
        """
        Args:
          user_ids of shape (batch_size): User ids (starting from 1).
          item_ids of shape (batch_size): Item ids (starting from 1).
        
        Returns:
          outputs of shape (batch_size): Predictions of ratings.
        """
        emb_user = self.embedding_user(user_ids)
        emb_item = self.embedding_item(item_ids)
        

        user_output = F.relu(self.user_linear(emb_user))
        content_output = F.relu(self.content_linear(plot_feature))


        emb_plot, _ = self.attention(user_output, content_output, content_output)
        fea_item = self.layer_item(torch.concat([item_feature, emb_item, emb_plot],1))
        emb_concat = torch.concat([emb_user, fea_item],1)
        return self.layers(emb_concat).flatten()

## Training the model


In [11]:
movie_plot_arr = np.stack(movie_plot).astype('float32')

movie_features= df_movies.drop(columns=['movieId', 'title', 'tmdbId', 'imdbId']).to_numpy().astype('float32')

# min max scaling
movie_features = (movie_features- np.nanmin(movie_features,0)) / (np.nanmax(movie_features,0) - np.nanmin(movie_features,0))
movie_features[np.isnan(movie_features)] = 0

# convert to tensor
with torch.no_grad():
    item_features = torch.tensor(movie_features)
    plot_tensor = torch.tensor(movie_plot_arr)
item_features.shape

torch.Size([9742, 21])

In [12]:
model = RecommenderSystem(610, 9742)
model.to(device)

RecommenderSystem(
  (embedding_user): Embedding(611, 10)
  (embedding_item): Embedding(9743, 10)
  (user_linear): Linear(in_features=10, out_features=20, bias=True)
  (content_linear): Linear(in_features=384, out_features=20, bias=True)
  (layer_item): Sequential(
    (0): Linear(in_features=51, out_features=60, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=60, out_features=10, bias=True)
  )
  (layers): Sequential(
    (0): Linear(in_features=20, out_features=40, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=40, out_features=10, bias=True)
    (4): Tanh()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=10, out_features=1, bias=True)
  )
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=20, out_features=20, bias=True)
  )
)

In [13]:
def get_mse(a, crit, loader):
    mse_total = 0
    iters = 0
    a.eval()
    with torch.no_grad():
        for ser_ids, item_ids, rating in loader:
            item_fea = item_features[item_ids]
            plot_emb = plot_tensor[item_ids]
            prediction = a(ser_ids.to(device), item_ids.to(device), item_fea.to(device), plot_emb.to(device))
            mse_total += crit(prediction, rating.to(device)).item()
            iters+= 1
    return mse_total/iters

In [14]:
%%time

optim = torch.optim.Adam(model.parameters(), lr=0.0005,  weight_decay=0.001)
criterion = nn.MSELoss()
for i in range(30):
    model.train()
    for ser_ids, item_ids, rating in trainloader:
        optim.zero_grad()
        item_fea = item_features[item_ids]
        plot_emb = plot_tensor[item_ids]
        
        ser_ids = ser_ids.to(device)
        item_ids = item_ids.to(device)
        item_fea = item_fea.to(device)
        plot_emb = plot_emb.to(device)
        rating = rating.to(device)
        
        prediction = model(ser_ids, item_ids, item_fea, plot_emb)
        
        prediction = prediction.to(device)
        
        loss = criterion(prediction, rating)
        loss.backward()
        optim.step()
    print("Epoch", i)
    print("Training", get_mse(model, criterion, trainloader))
    

Epoch 0
Training 1.01279311415838
Epoch 1
Training 0.8317830290103044
Epoch 2
Training 0.7238096151972327
Epoch 3
Training 0.7005365376952905
Epoch 4
Training 0.671235517270269
Epoch 5
Training 0.6733648877196348
Epoch 6
Training 0.6665107038393705
Epoch 7
Training 0.6636461965423498
Epoch 8
Training 0.6563038687143757
Epoch 9
Training 0.6553885578837294
Epoch 10
Training 0.6540631399013749
Epoch 11
Training 0.6493217334216287
Epoch 12
Training 0.6527543984285568
Epoch 13
Training 0.6561883169358234
Epoch 14
Training 0.6455395755574893
Epoch 15
Training 0.645772067842527
Epoch 16
Training 0.6536338724453078
Epoch 17
Training 0.6445480351089155
Epoch 18
Training 0.6432231988274639
Epoch 19
Training 0.6481692409146546
Epoch 20
Training 0.6412285164926601
Epoch 21
Training 0.6444451751180794
Epoch 22
Training 0.6487432974392533
Epoch 23
Training 0.6530958561552559
Epoch 24
Training 0.642676012613432
Epoch 25
Training 0.6445797899326414
Epoch 26
Training 0.6378044583101398
Epoch 27
Trainin

## Testing the model

In [15]:
test_set = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems/test_set_no_ratings.csv')
test_set['movieId'] = test_set['movieId'].map(id_to_index_mapping)
sorted_unique_users = sorted(pd.read_csv('/kaggle/input/dis-project-2-recommender-systems/train_ratings.csv')['userId'].unique())

userid_to_index_mapping = {id: idx for idx, id in enumerate(sorted_unique_users)}
test_set['userId'] = test_set['userId'].map(userid_to_index_mapping)
test_set.head()

Unnamed: 0,Id,userId,movieId
0,0,431,7333
1,1,287,412
2,2,598,3222
3,3,41,2250
4,4,74,1211


In [16]:
model.eval()
test_users = torch.LongTensor(test_set.userId.to_numpy())
test_items = torch.LongTensor(test_set.movieId.to_numpy())
prediction = model(test_users.to(device),test_items.to(device), item_features[test_items].to(device), plot_tensor[test_items].to(device))
prediction

tensor([2.5970, 3.3230, 2.5361,  ..., 3.8921, 3.4670, 3.1534], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)

In [17]:
test_set['rating'] = prediction.cpu().detach().numpy()
test_set.head()

Unnamed: 0,Id,userId,movieId,rating
0,0,431,7333,2.597031
1,1,287,412,3.323043
2,2,598,3222,2.536095
3,3,41,2250,3.78812
4,4,74,1211,3.372283


## Creating submission

In [18]:
submission= test_set.drop(columns=['userId', 'movieId'])
submission.head()

Unnamed: 0,Id,rating
0,0,2.597031
1,1,3.323043
2,2,2.536095
3,3,3.78812
4,4,3.372283


In [19]:
submission.to_csv('submission.csv', index=False)