# Recommendation model 

* Write summary here *

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime

In [5]:
# Read csv file
def read_data(path, includeDates=False):
    
    if includeDates:
        data = pd.read_csv(path) 
                          #parse_dates=['timestamp'])
    else:
        data = pd.read_csv(path)
    return data

Reading the csv files and checking out the data

In [7]:
raiting_data = read_data('data/ratings.csv', True)
movie_data = read_data('data/movies.csv')
tags_data = read_data('data/tags.csv')
links_data = read_data('data/links.csv')
display(raiting_data.head(5))
display(movie_data.head(5))
display(tags_data.head(5))
display(links_data.head(5))

FileNotFoundError: [Errno 2] File data/ratings.csv does not exist: 'data/ratings.csv'

Checking out the raiting data and movie csv further

In [133]:
# Checking out data types
print(raiting_data.dtypes)
print()
print(movie_data.dtypes)

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

movieId     int64
title      object
genres     object
dtype: object


In [134]:
# Checking out number of instances
print(raiting_data.shape)
print(movie_data.shape)

(7570586, 4)
(58098, 3)


In [135]:
# Checking out number of unique movies
# Output interpretation: Around 20k of the movies have not been reviewed. 
print(len(raiting_data['movieId'].unique()))
print(len(movie_data['movieId'].unique()))

38230
58098


In [136]:
# Checking out number of unique users who have rated
len(raiting_data['userId'].unique())

77932

### Re-sampling the dataset
Takes 25% of the training data by selecting 25% of the unique users randomly

In [137]:
np.random.seed(42)
random_userIds = np.random.choice(raiting_data['userId'].unique(), 
                                size=int(len(raiting_data['userId'].unique())*0.25), 
                                replace=False)
print("Reduced length of users: ", len(random_userIds))
raiting_data_new = raiting_data.loc[raiting_data['userId'].isin(random_userIds)]
print("Reduced size of dataset: ", raiting_data_new.shape)

Reduced length of users:  19483
Reduced size of dataset:  (1909347, 4)


In [138]:
# Displaying differences in the new dataset. 
display(raiting_data_new.head(10))

Unnamed: 0,userId,movieId,rating,timestamp
778,5,47,4.0,1209126049
779,5,50,5.0,1209039937
780,5,293,3.5,1209040005
781,5,296,5.0,1209040010
782,5,318,5.0,1209040104
783,5,527,4.5,1209126022
784,5,728,3.0,1209039379
785,5,778,5.0,1209040034
786,5,858,4.5,1209125953
787,5,1147,4.5,1209039420


## Leave one out split
The timestamp column will be used for leave one out. The most recen review is used as the test set. This will eliminate the risk of training on the most recent review and test on earlier reviews. If this would be the case, we would hence make incorrect predicitons since the model would be trained on non-chronological data. 

In [139]:
# Ignore warning
pd.options.mode.chained_assignment = None  # default='warn'


# Convert string timestamp to date time format
raiting_data_new['timestamp'] = [datetime.fromtimestamp(int(x)) for x in raiting_data_new['timestamp']]

# Rank 
raiting_data_new['rank_latest'] = raiting_data_new.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
print(raiting_data_new.head(10))
print(raiting_data_new.shape)

train_ratings = raiting_data_new[raiting_data_new['rank_latest'] != 1.0]
test_ratings = raiting_data_new[raiting_data_new['rank_latest'] == 1.0]

     userId  movieId  rating           timestamp  rank_latest
778       5       47     4.0 2008-04-25 14:20:49          2.0
779       5       50     5.0 2008-04-24 14:25:37         43.0
780       5      293     3.5 2008-04-24 14:26:45         33.0
781       5      296     5.0 2008-04-24 14:26:50         32.0
782       5      318     5.0 2008-04-24 14:28:24         17.0
783       5      527     4.5 2008-04-25 14:20:22          3.0
784       5      728     3.0 2008-04-24 14:16:19         70.0
785       5      778     5.0 2008-04-24 14:27:14         27.0
786       5      858     4.5 2008-04-25 14:19:13          6.0
787       5     1147     4.5 2008-04-24 14:17:00         66.0
(1909347, 5)


In [145]:
print(train_ratings.shape)
print(test_ratings.shape)
display(train_ratings.head(4))
display(test_ratings.head(4))

(1889864, 5)
(19483, 5)


Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
778,5,47,4.0,2008-04-25 14:20:49,2.0
779,5,50,5.0,2008-04-24 14:25:37,43.0
780,5,293,3.5,2008-04-24 14:26:45,33.0
781,5,296,5.0,2008-04-24 14:26:50,32.0


Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
810,5,5995,5.0,2008-04-25 14:21:33,1.0
903,7,4903,3.5,2009-03-31 12:00:34,1.0
1101,13,838,3.5,2014-10-21 16:34:53,1.0
2191,33,737,1.0,2007-08-14 08:48:02,1.0


In [147]:
train_ratings.drop(columns=['timestamp', 'rank_latest'], inplace=True)
test_ratings.drop(columns=['timestamp', 'rank_latest'], inplace=True)

## Implicit feedback

Raiting a movie is considered explicit feedback. That is, the users have interacted and explicity stated their feedback. Typically, explicit feedback is quanittative (e.g. raiting 1-5)

Implicit feedback is the opposite: it is collected indirectly from user interaction - i.e. in this context, figuring out how / predicting whether the user will interact with a movie. 

By binarizing the dataset (0 for no interaction and 1 for interaction) we will convert the explicit feedback to implicit feedback. 

Afterwards, 4 negative samples are randomly generated for each user that represents movies that a user has not interacted with.  

In [193]:
train_ratings.loc[:, 'rating'] = 1
print(train_ratings)
# Get a list of all movie IDs
all_movieIds = raiting_data_new['movieId'].unique()
print(len(all_movieIds))
# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))
# 4:1 ratio of negative to positive samples
num_negatives = 4

for (u, i) in user_item_set:
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = np.random.choice(all_movieIds) 
        # check that the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)

        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative

         userId  movieId  rating
778           5       47       1
779           5       50       1
780           5      293       1
781           5      296       1
782           5      318       1
...         ...      ...     ...
7570570   77930     2616       1
7570571   77930     3247       1
7570572   77930     3396       1
7570573   77930     3424       1
7570574   77930     3698       1

[1889864 rows x 3 columns]
25439


In [194]:
print(len(users))
print(len(items))
print(len(labels))

9449320
9449320
9449320


In [195]:
interactions = {'userId': users, 'moveId': items, 'rating': labels}
training_data_new = pd.DataFrame(data=interactions)

In [202]:
print(training_data_new.head(6))
print(training_data_new.shape)
check_user = training_data_new[training_data_new['userId'] == 5]
print(check_user)

   userId  moveId  rating
0   10956    5421       1
1   10956  113216       0
2   10956   93267       0
3   10956    3597       0
4   10956    1903       0
5   59010     434       1
(9449320, 3)
         userId  moveId  rating
27295         5    5137       1
27296         5   27891       0
27297         5  134120       0
27298         5   26274       0
27299         5    5975       0
...         ...     ...     ...
9449305       5   52952       1
9449306       5   83337       0
9449307       5   88382       0
9449308       5   90474       0
9449309       5  160590       0

[355 rows x 3 columns]


## Recap

So far, we have been preprocessed the data, i.e. converted the explicit feedback to implicit feedback. We have also made a split, where each unique user's most recent review serves as the test set, and the rest of the data is training data. Furthermore, we generated, for each user, four samples which the user has not interacted with. Thus, we have labels for a binary classificaiton problem (1 = interacted, 0 = has not interacted)

Now, we need to:
1. Determine which users or items are similar to one another?
2. Classify whether a user is likely to interact (watch) a movie based on 1.
3. Measure / evaluate different options. 



In [None]:
import torch
from torch.utils.data import Dataset

class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    
    """

    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [1]:

import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds),
                          batch_size=512, num_workers=4)

ModuleNotFoundError: No module named 'torch'

In [2]:
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1
all_movieIds = ratings['movieId'].unique()

model = NCF(num_users, num_items, train_ratings, all_movieIds)

trainer = pl.Trainer(max_epochs=5, gpus=1, reload_dataloaders_every_epoch=True,
                     progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)

trainer.fit(model)

NameError: name 'ratings' is not defined

In [3]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in test_user_item_set:
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

NameError: name 'test_ratings' is not defined