In [1]:
#import libraries
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# Seed every RNG this notebook draws from: NumPy drives the user subsample and
# the negative sampling, torch drives the model's weight initialisation.
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)

### Load data

In [2]:
# Load the MovieLens-1M ratings file: one "userId::movieId::rating::timestamp"
# record per line, no header row.
# A multi-character separator such as "::" is only handled by pandas' Python
# parsing engine, so it is requested explicitly rather than via a warning fallback.
# `timestamp` is deliberately left as raw epoch seconds; it is converted to
# datetimes in a separate cell further down.
ratings_df = pd.read_csv(
    "../input/movielens-1m-dataset/ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=["userId", "movieId", "rating", "timestamp"],
)
ratings_df.head(2)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


### Data preprocessing 
#### According to leave-one-out validation strategy 

In [3]:
# Keep a random 30% of the users (sampled without replacement) and only their ratings.
unique_user_ids = ratings_df['userId'].unique()
rand_userIds = np.random.choice(unique_user_ids,
                                size=int(len(unique_user_ids) * 0.3),
                                replace=False)

# .copy() turns the filtered rows into an independent frame, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
ratings_df = ratings_df.loc[ratings_df['userId'].isin(rand_userIds)].copy()

print('There are {} rows of data from {} users'.format(len(ratings_df), len(rand_userIds)))

There are 300162 rows of data from 1812 users


In [4]:
# Convert epoch seconds to datetimes in one vectorised, timezone-independent call.
# datetime.fromtimestamp() converts to the kernel's *local* time, so the values
# depended on the machine, and a DST fall-back hour could reorder ratings made
# within that hour, which in turn changes which rating is ranked as the latest.
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')

In [5]:
ratings_df.sample(2)

Unnamed: 0,userId,movieId,rating,timestamp
817968,4912,3051,4,2000-07-04 02:21:53
594694,3620,1475,2,2000-10-30 00:55:32


#### Fetch the latest rating for each user as test data

In [6]:
# Rank every user's ratings from newest (rank 1) to oldest.
# method='first' gives tied timestamps distinct ranks in row order.
ratings_df['rank_latest'] = (
    ratings_df.groupby('userId')['timestamp'].rank(method='first', ascending=False)
)

In [7]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
799,10,2622,5,2000-12-31 02:03:32,218.0
800,10,648,4,2000-12-31 01:08:45,382.0
801,10,2628,3,2000-12-31 02:06:48,210.0
802,10,3358,5,2000-12-31 01:32:58,321.0
803,10,3359,3,2000-12-31 01:45:25,280.0


#### Convert every rating to 1 (meaning the user interacted with the item), since we are modelling implicit feedback

In [8]:
# Implicit feedback: every recorded rating counts as one positive interaction.
ratings_df = ratings_df.assign(rating=1)

In [9]:
ratings_df["rating"].head()

799    1
800    1
801    1
802    1
803    1
Name: rating, dtype: int64

### Train-test split
#### The most recent rating as test and the remaining as train for each user

In [10]:
# Leave-one-out split: the rank-1 (most recent) rating of each user is held
# out for testing, every other rating goes to training.
is_latest = ratings_df['rank_latest'] == 1
test_ratings_df = ratings_df[is_latest]
train_ratings_df = ratings_df[~is_latest]

In [11]:
# Keep only the user, movie and label columns; timestamp and rank_latest are dropped.
kept_columns = ['userId', 'movieId', 'rating']
train_ratings_df = train_ratings_df[kept_columns]
test_ratings_df = test_ratings_df[kept_columns]

In [12]:
len(train_ratings_df)

298350

In [13]:
len(test_ratings_df)

1812

In [14]:
test_ratings_df.userId.nunique()

1812

In [15]:
# Spot-check a few random training rows. `rating` is already 1 everywhere
# because it was overwritten on ratings_df before the split.

train_ratings_df.sample(5)

Unnamed: 0,userId,movieId,rating
873081,5271,1230,1
727426,4352,2431,1
727268,4351,3566,1
150292,968,1912,1
36809,245,3918,1


In [16]:
train_ratings_df.rating.unique()

array([1])

In [17]:
test_ratings_df.rating.unique()

array([1])

### Leave-one-out validation

In [18]:
# Candidate pool for negatives: every movie id that occurs in ratings_df
all_movieIds = ratings_df['movieId'].unique()

# Parallel lists that together form the training examples
users, items, labels = [], [], []

# Observed (user, movie) pairs of the training split; also used as the lookup
# that rejects sampled movies the user has already interacted with
user_item_set = set(zip(train_ratings_df['userId'], train_ratings_df['movieId']))

# Negatives drawn per positive, i.e. a 5:1 ratio of negative to positive samples
num_negatives = 5

for user_id, positive_item in tqdm(user_item_set):
    # the observed interaction itself is the positive example
    users.append(user_id)
    items.append(positive_item)
    labels.append(1)

    # rejection sampling: draw until num_negatives un-interacted movies are accepted
    accepted = 0
    while accepted < num_negatives:
        candidate = np.random.choice(all_movieIds)
        if (user_id, candidate) in user_item_set:
            continue  # the user has interacted with this movie; draw again
        users.append(user_id)
        items.append(candidate)
        labels.append(0)
        accepted += 1

  0%|          | 0/298350 [00:00<?, ?it/s]

In [19]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for training, with sampled negatives.

    Every distinct (userId, movieId) pair in ``ratings_df`` yields one positive
    example (label 1), immediately followed by ``num_negatives`` randomly drawn
    movies that the same user has no pair with in ``ratings_df`` (label 0).

    Negatives are drawn with ``np.random.choice`` by rejection sampling, so each
    user needs at least one id in ``all_movieIds`` they have not interacted
    with; otherwise the sampling loop never terminates.

    Args:
        ratings_df (pd.DataFrame): Dataframe with ``userId`` and ``movieId`` columns
        all_movieIds (array-like): Movie ids that negatives are drawn from
        num_negatives (int): Negatives sampled per positive example. Defaults to 5.

    Raises:
        ValueError: If ``num_negatives`` is negative.
    """

    def __init__(self, ratings_df, all_movieIds, num_negatives=5):
        self.users, self.items, self.labels = self.get_dataset(
            ratings_df, all_movieIds, num_negatives
        )

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings_df, all_movieIds, num_negatives=5):
        """Build the (users, items, labels) tensors described in the class docstring.

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Equal-length tensors
            of user ids, movie ids and 0/1 labels.
        """
        if num_negatives < 0:
            raise ValueError("num_negatives must be non-negative")

        users, items, labels = [], [], []
        # Distinct observed pairs: the source of positives and the rejection lookup.
        user_item_set = set(zip(ratings_df['userId'], ratings_df['movieId']))

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                # Redraw until the movie is one this user has not interacted with.
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

### The NCF model

In [20]:
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF) model.

    A user id and an item id are each looked up in an 8-dimensional embedding
    table; the two vectors are concatenated and passed through two ReLU dense
    layers and a sigmoid output that scores the interaction in (0, 1).

    Args:
        num_users (int): Number of rows in the user embedding table
        num_items (int): Number of rows in the item embedding table
        ratings_df (pd.DataFrame): Dataframe containing the movie ratings for training
        all_movieIds (list): List containing all movieIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings_df, all_movieIds):
        super().__init__()
        # Layers are created in a fixed order so parameter initialisation
        # consumes the torch RNG in the same sequence on every run.
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=8)
        self.fc2 = nn.Linear(in_features=8, out_features=8)
        self.output = nn.Linear(in_features=8, out_features=1)
        # Kept so train_dataloader() can build the training dataset on demand.
        self.ratings = ratings_df
        self.all_movieIds = all_movieIds

    def forward(self, user_input, item_input):
        """Return the predicted interaction probability for each (user, item) pair."""
        # Look up both embeddings and join them along the feature axis
        joined = torch.cat(
            [self.user_embedding(user_input), self.item_embedding(item_input)],
            dim=-1,
        )

        # Two dense layers with ReLU activations
        hidden = torch.relu(self.fc1(joined))
        hidden = torch.relu(self.fc2(hidden))

        # Sigmoid output layer
        return torch.sigmoid(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Compute the binary cross-entropy loss for one batch."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        # Labels are reshaped to a float column to match the (batch, 1) predictions.
        targets = labels.view(-1, 1).float()
        return nn.functional.binary_cross_entropy(predicted_labels, targets)

    def configure_optimizers(self):
        """Use Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new training dataset (and its negative samples) on every call."""
        # NOTE(review): no shuffle=True, so batches follow the dataset's
        # construction order - confirm this is intended.
        train_dataset = MovieLensTrainDataset(self.ratings, self.all_movieIds)
        return DataLoader(train_dataset, batch_size=512, num_workers=5)

In [21]:
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
799,10,2622,1,2000-12-31 02:03:32,218.0
800,10,648,1,2000-12-31 01:08:45,382.0


### Train the model

In [22]:
# Raw ids index the embedding tables directly, hence max id + 1 rows each.
num_users, num_items = ratings_df[['userId', 'movieId']].max() + 1

# Every movie id present in the ratings (train + test)
all_movieIds = ratings_df['movieId'].unique()

model = NCF(
    num_users=num_users,
    num_items=num_items,
    ratings_df=train_ratings_df,
    all_movieIds=all_movieIds,
)

In [23]:
num_users

6041

In [24]:
# Train for 20 epochs. The dataloader is rebuilt every epoch, so
# NCF.train_dataloader() constructs a new MovieLensTrainDataset, and with it a
# new draw of negative samples, each time.
trainer = pl.Trainer(
    max_epochs=20,
    # Use one GPU when available instead of failing on CPU-only machines.
    gpus=1 if torch.cuda.is_available() else None,
    # Replaces `reload_dataloaders_every_epoch=True`, which is deprecated since v1.4.
    reload_dataloaders_every_n_epochs=1,
    progress_bar_refresh_rate=50,
    logger=False,
    checkpoint_callback=False,
)

trainer.fit(model)

  "`reload_dataloaders_every_epoch` is deprecated in v1.4 and will be removed in v1.6."


Training: -1it [00:00, ?it/s]

### Evaluating recommendation system using NCF based on Hit Ratio (HR@k) for top-k movies 

For each user, randomly select 99 items that the user has not interacted with.
Combine these 99 items with the test item (the actual item that the user last interacted with). We now have 100 items.
Run the model on these 100 items, and rank them according to their predicted probabilities.
Select the top 10 items from the list of 100 items. If the test item is present within the top 10 items, then we say that this is a hit.
Repeat the process for all users. The Hit Ratio is then the fraction of users for whom the test item was a hit.

In [25]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings_df['userId'], test_ratings_df['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings_df.groupby('userId')['movieId'].apply(list).to_dict()

NUM_EVAL_NEGATIVES = 99  # un-interacted items ranked against each test item
TOP_K = 10               # a hit means the test item is ranked within the top K

# Built once instead of once per user
all_movieId_set = set(all_movieIds)

hits = []

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    for (u, i) in tqdm(test_user_item_set):
        not_interacted_items = list(all_movieId_set - set(user_interacted_items[u]))

        # replace=False keeps the sampled negatives distinct; the default
        # (with replacement) could repeat items and leave fewer real competitors.
        # The size is capped so users with few un-interacted items do not raise.
        num_sampled = min(NUM_EVAL_NEGATIVES, len(not_interacted_items))
        selected_not_interacted = list(
            np.random.choice(not_interacted_items, num_sampled, replace=False)
        )
        test_items = selected_not_interacted + [i]

        # Score every candidate for this user on whatever device the model lives on.
        user_tensor = torch.tensor([u] * len(test_items), device=model.device)
        item_tensor = torch.tensor(test_items, device=model.device)
        predicted_labels = model(user_tensor, item_tensor).view(-1).cpu().numpy()

        # Positions of the TOP_K highest scores, best first. `pos` is used so the
        # index does not share a name with the test item `i`.
        top_positions = np.argsort(predicted_labels)[::-1][:TOP_K].tolist()
        top10_items = [test_items[pos] for pos in top_positions]

        hits.append(1 if i in top10_items else 0)

print("The Hit Ratio @ 10 is {:.3f}".format(np.average(hits)))

  0%|          | 0/1812 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.449
