In [1]:
# import packages
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl

In [2]:
# set random seed
np.random.seed(123)

In [3]:
# load data from local archive/rating.csv
def loadData():
    ratings = pd.read_csv('archive/rating.csv', parse_dates=['timestamp'])
    return ratings

ratings = loadData()

In [4]:
# set a percentage of data. In the following program, only use num% of raw data.
cutDataPercent = 0.3

In [5]:
# only use num% of data, here we use 30%
def cutData(num, ratings):
    rand_userIds = np.random.choice(ratings['userId'].unique(), size=int(len(ratings['userId'].unique())*num), replace=False)
    ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]
    return ratings

ratings = cutData(cutDataPercent, ratings)

In [6]:
# show dataframe: ratings head
print(ratings.head())

     userId  movieId  rating           timestamp
236       3        1     4.0 1999-12-11 13:36:47
237       3       24     3.0 1999-12-14 12:54:08
238       3       32     4.0 1999-12-11 13:14:07
239       3       50     5.0 1999-12-11 13:13:38
240       3      160     3.0 1999-12-14 12:54:08


In [7]:
# ratings shape, 6027314 rows and 4 columns
print(ratings.shape)

(6027314, 4)


In [8]:
# ratings has 41547 unique users
print(len(pd.unique(ratings['userId'])))

41547


In [9]:
# ratings has 21917 unique movies
print(len(pd.unique(ratings['movieId'])))

21917


In [10]:
# each user rated 275 movies on average
print(ratings.shape[0] / len(pd.unique(ratings['movieId'])))

275.0063421088653


In [11]:
# split train and test according to the timestamp. 
# the latest rating interaction between a user and a movie is a test sample
def trainTestSplit(ratings):
    ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method = 'first', ascending=False)
    train_ratings = ratings[ratings['rank_latest'] != 1]
    test_ratings = ratings[ratings['rank_latest'] == 1]
    train_ratings = train_ratings[['userId', 'movieId', 'rating']]
    test_ratings = test_ratings[['userId', 'movieId', 'rating']]
    return train_ratings, test_ratings

train_ratings, test_ratings = trainTestSplit(ratings)

In [12]:
print(train_ratings.shape)

(5985767, 3)


In [13]:
print(test_ratings.shape)

(41547, 3)


In [14]:
# the number of test samples is equalt to about 0.694% the number of train samples
print(test_ratings.shape[0] / train_ratings.shape[0])

0.006940965126106646


In [15]:
# we get rid of the timestamp columns now
print(train_ratings.head())

     userId  movieId  rating
236       3        1     4.0
237       3       24     3.0
238       3       32     4.0
239       3       50     5.0
240       3      160     3.0


In [16]:
# getting the maximum of userId and movieId and all unique movieId
# valuables will be used later
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1
all_movieIds = ratings['movieId'].unique()
print(num_users)
print(num_items)
print(all_movieIds)

138492
131259
[    1    24    32 ... 63692 73202 86664]


In [17]:
# presentation:
# Baseline example
# converting the dataset into an implicit feedback dataset
# label all ratings as 1
present1_train_rating = train_ratings
present1_train_rating.loc[:, 'rating'] = 1
print(present1_train_rating.sample(5))

          userId  movieId  rating
16817522  116317     3464       1
9236185    63832     1333       1
11659680   80516    54286       1
14469299   99977    48516       1
865895      5784     5387       1


In [18]:
# presentation:
# Improved example:
# we designed to keep the rating value
# but only label these ratings that are greater than the user's quantile(num) value to 1
# In this case, we do one more step than the baseline example
# our defined implicit feedback dataset makes sure that users like the movie
user_rating_set = ratings[['userId', 'rating']]
user_rating = user_rating_set.groupby('userId')['rating'].quantile([0.5]).unstack().reset_index()
print(user_rating.head())

   userId  0.5
0       3  4.0
1       6  4.0
2      10  4.0
3      11  4.5
4      17  4.0


In [19]:
# Another try with quantile value settings to 0.25
# we can see that userId 6 and 11 has a lower like movie rating number 
user_rating = user_rating_set.groupby('userId')['rating'].quantile([0.25]).unstack().reset_index()
print(user_rating.head())

   userId  0.25
0       3   4.0
1       6   3.0
2      10   4.0
3      11   3.5
4      17   4.0


In [37]:
# In our designed algorithms, we labeled the rating of the train_rating dataset to 1
# when the rating value is greater than the user's quantile(num) rating value
# for example:
present2_train_rating = train_ratings

rating_q = {}
for i in range(len(user_rating)):
    rating_q[user_rating.iloc[i][0]] = user_rating.iloc[i][1]

In [50]:
for i in range(len(present2_train_rating)):
    if present2_train_rating.iloc[i]['rating'] >= rating_q[present2_train_rating.iloc[i]['userId']]:
        present2_train_rating.iloc[i]['rating'] = 1
    else:
        present2_train_rating.iloc[i]['rating'] = 0

In [58]:
# here is a random sample that one rating is labeled to 1 with our algorithm
print(present2_train_rating.sample(5))

          userId  movieId  rating
6823733    46956      529       0
12464981   86121     1207       0
15309064  105836     1912       0
10722756   74142    79166       1
14249366   98420     3698       0


In [59]:
# so far, we have successfully generated our label 1 (positive) samples
# now, we are going to add label 0 (negative) samples
# here we set the ratio of the number of positive samples to the number of negative samples equals to 1:3
ratio = 3

In [60]:
present_users, present_items, present_labels = [], [], []
present_user_item_set = set(zip(ratings['userId'], ratings['movieId']))
ratio = 3
for u, i in present_user_item_set:
    present_users.append(u)
    present_items.append(i)
    present_labels.append(1)
    for _ in range(ratio):
        negative_item = np.random.choice(all_movieIds)
        while (u, negative_item) in present_user_item_set:
            negative_item = np.random.choice(all_movieIds)
        present_users.append(u)
        present_items.append(negative_item)
        present_labels.append(0)

In [61]:
# now our train dataset looked like this
for i in range(20):
    print(f"{i} users:{present_users[i]} movieId:{present_items[i]} label:{present_labels[i]}")

0 users:5070 movieId:1270 label:1
1 users:5070 movieId:25800 label:0
2 users:5070 movieId:96724 label:0
3 users:5070 movieId:3159 label:0
4 users:12009 movieId:3476 label:1
5 users:12009 movieId:43244 label:0
6 users:12009 movieId:107079 label:0
7 users:12009 movieId:89908 label:0
8 users:109462 movieId:3265 label:1
9 users:109462 movieId:113203 label:0
10 users:109462 movieId:114760 label:0
11 users:109462 movieId:980 label:0
12 users:15617 movieId:1419 label:1
13 users:15617 movieId:54262 label:0
14 users:15617 movieId:8649 label:0
15 users:15617 movieId:91503 label:0
16 users:73201 movieId:435 label:1
17 users:73201 movieId:108216 label:0
18 users:73201 movieId:878 label:0
19 users:73201 movieId:97946 label:0


In [62]:
# Then, we combined our algorithm to the class MovieLensTrainDataset
# besides the original base algorithm
# we have our algorithms with other five different set of parameters:
class MovieLensTrainDataset(Dataset):
    """MovieLens Pytorch Dataset for Training
    Args:
        ratings(pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    """
    def __init__(self, ratings, all_movieIds, algorithm):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds, algorithm)
        
    def __len__(self):
        return len(self.users)
    
    def __getitem__ (self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]
    
    def get_dataset(self, ratings, all_movieIds, algorithm):
        users, items, labels = [], [], []
        
        # Algorithm 1 is the BASELINE with number of negatives as 3
        if algorithm == 1:
            user_item_set = set(zip(ratings['userId'], ratings['movieId']))
            num_negatives = 3
            for u, i in user_item_set:
                users.append(u)
                items.append(i)
                labels.append(1)
                for _ in range(num_negatives):
                    negative_item = np.random.choice(all_movieIds)
                    while (u, negative_item) in user_item_set:
                        negative_item = np.random.choice(all_movieIds)
                    users.append(u)
                    items.append(negative_item)
                    labels.append(0)
        # Algorithm 2 is the first test with number of negatives as 2 and quantile 0.5
        elif algorithm == 2:
            user_item_set = set(zip(ratings['userId'], ratings['movieId'], ratings['rating']))
            user_rating_set = ratings[['userId', 'rating']]
            user_rating = user_rating_set.groupby('userId')['rating'].quantile([0.5]).unstack().reset_index()
            rating_q = {}
            num_negatives = 2
            for i in range(len(user_rating)):
                rating_q[user_rating.iloc[i][0]] = user_rating.iloc[i][1]
            for u, i, r in user_item_set:
                users.append(u)
                items.append(i)
                if r >= rating_q[u]:
                    labels.append(1)
                else:
                    labels.append(0)
                for _ in range(num_negatives):
                    negative_item = np.random.choice(all_movieIds)
                    while (u, negative_item, r) in user_item_set:
                        negative_item = np.random.choice(all_movieIds)
                    users.append(u)
                    items.append(negative_item)
                    labels.append(0)
        # Algorithm 3 is the first test with number of negatives as 3 and quantile 0.25
        elif algorithm == 3:
            user_item_set = set(zip(ratings['userId'], ratings['movieId'], ratings['rating']))
            user_rating_set = ratings[['userId', 'rating']]
            user_rating = user_rating_set.groupby('userId')['rating'].quantile([0.25]).unstack().reset_index()
            rating_q = {}
            num_negatives = 3
            for i in range(len(user_rating)):
                rating_q[user_rating.iloc[i][0]] = user_rating.iloc[i][1]
            for u, i, r in user_item_set:
                users.append(u)
                items.append(i)
                if r >= rating_q[u]:
                    labels.append(1)
                else:
                    labels.append(0)
                for _ in range(num_negatives):
                    negative_item = np.random.choice(all_movieIds)
                    while (u, negative_item, r) in user_item_set:
                        negative_item = np.random.choice(all_movieIds)
                    users.append(u)
                    items.append(negative_item)
                    labels.append(0)
        # Algorithm 4 is the first test with number of negatives as 4 and quantile 0.25
        elif algorithm == 4:
            user_item_set = set(zip(ratings['userId'], ratings['movieId'], ratings['rating']))
            user_rating_set = ratings[['userId', 'rating']]
            user_rating = user_rating_set.groupby('userId')['rating'].quantile([0.25]).unstack().reset_index()
            rating_q = {}
            num_negatives = 4
            for i in range(len(user_rating)):
                rating_q[user_rating.iloc[i][0]] = user_rating.iloc[i][1]
            for u, i, r in user_item_set:
                users.append(u)
                items.append(i)
                if r >= rating_q[u]:
                    labels.append(1)
                else:
                    labels.append(0)
                for _ in range(num_negatives):
                    negative_item = np.random.choice(all_movieIds)
                    while (u, negative_item, r) in user_item_set:
                        negative_item = np.random.choice(all_movieIds)
                    users.append(u)
                    items.append(negative_item)
                    labels.append(0)
        # Algorithm 5 is the first test with number of negatives as 5 and quantile 0.25
        elif algorithm == 5:
            user_item_set = set(zip(ratings['userId'], ratings['movieId'], ratings['rating']))
            user_rating_set = ratings[['userId', 'rating']]
            user_rating = user_rating_set.groupby('userId')['rating'].quantile([0.25]).unstack().reset_index()
            rating_q = {}
            num_negatives = 5
            for i in range(len(user_rating)):
                rating_q[user_rating.iloc[i][0]] = user_rating.iloc[i][1]
            for u, i, r in user_item_set:
                users.append(u)
                items.append(i)
                if r >= rating_q[u]:
                    labels.append(1)
                else:
                    labels.append(0)
                for _ in range(num_negatives):
                    negative_item = np.random.choice(all_movieIds)
                    while (u, negative_item, r) in user_item_set:
                        negative_item = np.random.choice(all_movieIds)
                    users.append(u)
                    items.append(negative_item)
                    labels.append(0)
        # Algorithm 6 is the first test with number of negatives as 6 and quantile 0.1
        elif algorithm == 6:
            user_item_set = set(zip(ratings['userId'], ratings['movieId'], ratings['rating']))
            user_rating_set = ratings[['userId', 'rating']]
            user_rating = user_rating_set.groupby('userId')['rating'].quantile([0.1]).unstack().reset_index()
            rating_q = {}
            num_negatives = 6
            for i in range(len(user_rating)):
                rating_q[user_rating.iloc[i][0]] = user_rating.iloc[i][1]
            for u, i, r in user_item_set:
                users.append(u)
                items.append(i)
                if r >= rating_q[u]:
                    labels.append(1)
                else:
                    labels.append(0)
                for _ in range(num_negatives):
                    negative_item = np.random.choice(all_movieIds)
                    while (u, negative_item, r) in user_item_set:
                        negative_item = np.random.choice(all_movieIds)
                    users.append(u)
                    items.append(negative_item)
                    labels.append(0)
        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [63]:
# Next the following function is our model: Neural Collective Filter
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_movieIds, algorithm):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        self.algorithm = algorithm
        
    def forward(self, user_input, item_input):
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds, self.algorithm), batch_size=512, num_workers=4)

In [64]:
# The following is our Test function
# The method name is "Hit 10"
def testing(ratings, test_ratings, all_movieIds, model):
    # First, we get user-item pairs for testing
    # Which we prepared earlier
    # the test samples are users' latest rating
    test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

    # Next, we dict of all items that are interacted with by each user
    user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

    # Then, we create a hits list
    # First, we know that for a user, the user rated movie A
    # Next, we get the list of movies that the user never interactive with
    # We then combine the 99 un-interactive negative samples with 1 test positive sample
    # By running the NCF model, we will get a label value from 0 to 1 for each case
    # The larger the label values is, the more positive the sample will be
    # Therefore, we sort the result list, and then get the top 10 movies
    # If the test sample movie is in the top 10 predicted movies, we treat the result as hit
    hits = []
    for (u,i) in test_user_item_set:
        interacted_items = user_interacted_items[u]
        not_interacted_items = set(all_movieIds) - set(interacted_items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
        test_items = selected_not_interacted + [i]
        
        predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                            torch.tensor(test_items)).detach().numpy())
        
        top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
        
        if i in top10_items:
            hits.append(1)
        else:
            hits.append(0)
            
    # Finally, for all users, we get an average of their hits of a specific user.
    # the higher the hit ratio is, the more presice our model is
    print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

In [65]:
print("Method1:")
model = NCF(num_users, num_items, train_ratings, all_movieIds, algorithm=1)
trainer = pl.Trainer(max_epochs=5, gpus=None, reload_dataloaders_every_epoch=True,
                 progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)
trainer.fit(model)
testing(ratings, test_ratings, all_movieIds, model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 1.1 M 
1 | item_embedding | Embedding | 1.1 M 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.645     Total estimated model params size (MB)


Method1:


Training: 0it [00:00, ?it/s]

RuntimeError: DataLoader worker (pid(s) 16688) exited unexpectedly

In [None]:
print("Method2:")
model = NCF(num_users, num_items, train_ratings, all_movieIds, algorithm=2)
trainer = pl.Trainer(max_epochs=5, gpus=None, reload_dataloaders_every_epoch=True,
                 progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)
trainer.fit(model)
testing(ratings, test_ratings, all_movieIds, model)

In [None]:
print("Method3:")
model = NCF(num_users, num_items, train_ratings, all_movieIds, algorithm=3)
trainer = pl.Trainer(max_epochs=5, gpus=None, reload_dataloaders_every_epoch=True,
                 progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)
trainer.fit(model)
testing(ratings, test_ratings, all_movieIds, model)

In [None]:
print("Method4:")
model = NCF(num_users, num_items, train_ratings, all_movieIds, algorithm=4)
trainer = pl.Trainer(max_epochs=5, gpus=None, reload_dataloaders_every_epoch=True,
                 progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)
trainer.fit(model)
testing(ratings, test_ratings, all_movieIds, model)

In [None]:
print("Method5:")
model = NCF(num_users, num_items, train_ratings, all_movieIds, algorithm=5)
trainer = pl.Trainer(max_epochs=5, gpus=None, reload_dataloaders_every_epoch=True,
                 progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)
trainer.fit(model)
testing(ratings, test_ratings, all_movieIds, model)

In [None]:
print("Method6:")
model = NCF(num_users, num_items, train_ratings, all_movieIds, algorithm=6)
trainer = pl.Trainer(max_epochs=5, gpus=None, reload_dataloaders_every_epoch=True,
                 progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)
trainer.fit(model)
testing(ratings, test_ratings, all_movieIds, model)