In [1]:
import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset,TensorDataset

from tqdm import tqdm_notebook

# Model Structure

In [2]:
class MF(nn.Module):
    def __init__(self, dim, num_users, num_items):
        super(MF, self).__init__()
        self.user_embeddings = nn.Embedding(num_users+1, dim)
        self.item_embeddings = nn.Embedding(num_items+1, dim)

    def forward(self, user_id, item_id):
        user_embedding = self.user_embeddings(user_id)
        item_embedding = self.item_embeddings(item_id)
        
        cross_value = user_embedding * item_embedding
        return cross_value
    
class MLP(nn.Module):
    def __init__(self, dim, num_users, num_items, layer_sizes):
        super(MLP, self).__init__()
        self.user_embeddings = nn.Embedding(num_users+1, dim)
        self.item_embeddings = nn.Embedding(num_items+1, dim)
        
        self.linears = []
        prev_size = dim * 2
        for layer in layer_sizes:
            self.linears.append(nn.Linear(prev_size, layer))
            prev_size = layer
    
        self.relu = nn.ReLU()
    def forward(self, user_id, item_id):
        user_embedding = self.user_embeddings(user_id)
        item_embedding = self.item_embeddings(item_id)
        
        input_embedding = torch.cat((user_embedding, item_embedding), 1)
        for linear in self.linears:
            input_embedding = linear(input_embedding)
            input_embedding = self.relu(input_embedding)
        return input_embedding
    
class NCF(nn.Module):
    def __init__(self, num_users, num_items, MF_dim, MLP_dim, MLP_layers):
        super(NCF,self).__init__()
        self.mf = MF(MF_dim, num_users, num_items)
        self.mlp = MLP(MLP_dim, num_users, num_items, MLP_layers)
        
        self.linear = nn.Linear(MF_dim + MLP_layers[-1], 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, user_id, item_id):
        MF_features = self.mf(user_id, item_id)
        MLP_features = self.mlp(user_id, item_id)
        
        input_features = torch.cat((MF_features, MLP_features), 1)
        X = self.linear(input_features)
        return self.sigmoid(X)

# Data

In [3]:
movie_1m_ratings = pd.read_csv("Data/ml-1m/ratings.dat", sep="::", names=['user_id', 'item_id', 'rating', 'timestamp'], engine="python")
num_users = movie_1m_ratings.user_id.unique().shape[0]
num_movies = movie_1m_ratings.item_id.unique().shape[0]

In [4]:
max_user_id = max(movie_1m_ratings.user_id)
max_movieId = max(movie_1m_ratings.item_id)

In [5]:
sparsity = 1 - len(movie_1m_ratings) / (num_users * num_movies)
print("number of users: {}, number of movies: {}, sparsity: {}".format(num_users, num_movies, sparsity))
movie_1m_ratings.head(5)

number of users: 6040, number of movies: 3706, sparsity: 0.9553163743776871


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
valid_user = []
for i, satisfied in enumerate(movie_1m_ratings.groupby("user_id").size() > 20):
    if satisfied:
        valid_user.append(i+1)

In [8]:
valid_ratings = movie_1m_ratings[movie_1m_ratings.user_id.isin(valid_user)]
valid_ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [9]:
user_choice_dict = {}
for _, rating in valid_ratings.iterrows():
    user, item = rating["user_id"], rating["item_id"]
    if user not in user_choice_dict:
        user_choice_dict[user] = []
    user_choice_dict[user].append(item)

In [None]:
def generate_train_test(user_choice_dict, train_ratio, negative_count):
    X_train, y_train, X_test, y_test = [], [], [], []
    for user in user_choice_dict:
        viewed_movies = user_choice_dict[user]
        negative_sample = np.random.choice([movie for movie in range(1, num_movies+1) if movie not in viewed_movies], negative_count)
        
        #Positive train test split
        train_size, test_size = int(len(viewed_movies) * train_ratio), len(viewed_movies) - int(len(viewed_movies) * train_ratio)
        for movie in viewed_movies[:train_size]:
            X_train.append([user, movie])
            y_train.append(1)
        for movie in viewed_movies[:test_size]:
            X_test.append([user, movie])
            y_test.append(1)
        
        #Negative train test split
        train_neg_size, test_neg_size = int(len(negative_sample) * train_ratio), len(negative_sample) - int(len(negative_sample) * train_ratio)
        for movie in negative_sample[:train_size]:
            X_train.append([user, movie])
            y_train.append(0)
        for movie in negative_sample[:test_size]:
            X_test.append([user, movie])
            y_test.append(0)
        
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
    
train_X, train_y, test_X, test_y = generate_train_test(user_choice_dict, 0.9, 100)

In [None]:
train_X, train_y, test_X, test_y = torch.from_numpy(train_X), torch.from_numpy(train_y), torch.from_numpy(test_X), torch.from_numpy(test_y),
train_dataset = TensorDataset(train_X, train_y)
test_dataset = TensorDataset(test_X, test_y)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [71]:
def ndcg_score(true_rel, pred_rel, k=10):
    top_k = np.argsort(pred_rel)[::-1][:k]
    DCG = 0
    for i, k in enumerate(top_k):
        DCG += true_rel[k] / np.log2(i+2)
    
    
    top_real_k = np.argsort(true_rel)[::-1][:k]
    iDCG = 0
    for i, k in enumerate(top_real_k):
        iDCG += true_rel[k] / np.log2(i+2)
    #print(iDCG)
    return DCG / iDCG

In [72]:
def accuarcy(y_pred, y_true):
    return ((y_pred > 0.5) == y_true).float().mean()

def train_model(model, train_loader, test_loader, max_epoch, lr):
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr)
    epoch_acc = evaluate(model, test_loader)
    print("epoch {}, test acc: {}".format(0, epoch_acc))
    for epoch in range(max_epoch):
        epoch_loss = 0
        step = 0
        
        for batch, labels in tqdm_notebook(train_loader):
            y_pred = model(batch[:, 0], batch[:, 1])
            loss = loss_fn(y_pred, labels.float())
            epoch_loss += loss
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            step += 1
        hr, ndcg = evaluate(model, test_loader)
        print("epoch {} loss: {}, test hr: {}, test ndcg: {}".format(epoch+1, epoch_loss / step, hr, ndcg))
        
    return model

def evaluate(model, test_loader):
    users = {}
    with torch.no_grad():
        for batch, labels in test_loader:
            y_pred = model(batch[:, 0], batch[:, 1])
            y_pred.squeeze()
            for user, label, pred in zip(batch[:, 0], labels, y_pred):
                user, label, pred = int(user), int(label), float(pred)
                if user not in users:
                    users[user] = []
                users[user].append((pred,label))
    hr, ndcg_10 = [], []
    #print(users)
    for user in users:
        users[user].sort()
        preds = [pair[0] for pair in users[user]]
        labels = [pair[1] for pair in users[user]]
        if 1 in labels[-10:]:
            hr.append(1)
        else:
            hr.append(0)
            
        ndcg_10.append(ndcg_score(labels, preds))
    return np.mean(hr), np.mean(ndcg_10)

In [73]:
ncf = NCF(max_user_id, max_movieId, 64, 64, [32, 16])
ncf = train_model(ncf, train_loader, test_loader, 10, 0.001)

epoch 0, test acc: (0.10043668122270742, 0.04568414541542124)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 1 loss: 0.35151252150535583, test hr: 0.6036278132348001, test ndcg: 0.3921154571267811


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 2 loss: 0.29491209983825684, test hr: 0.6224386966745046, test ndcg: 0.41924952349961925


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 3 loss: 0.2856585681438446, test hr: 0.6056432650319113, test ndcg: 0.41133720946137003


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 4 loss: 0.26921546459198, test hr: 0.5779308028216326, test ndcg: 0.3917590830909779


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 5 loss: 0.24288102984428406, test hr: 0.5688612697346321, test ndcg: 0.3832185103460798


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 6 loss: 0.21018429100513458, test hr: 0.5606315082297615, test ndcg: 0.3693746430727033


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 7 loss: 0.177230566740036, test hr: 0.5757473967080954, test ndcg: 0.36965975374413385


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 8 loss: 0.14912430942058563, test hr: 0.5844810211622439, test ndcg: 0.36901428510823947


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 9 loss: 0.12632933259010315, test hr: 0.5883439704400403, test ndcg: 0.3683329548929555


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 10 loss: 0.10807961225509644, test hr: 0.579106483036614, test ndcg: 0.3527290676150416


# Experiment in the paper

In [10]:
valid_ratings.sort_values(by=["timestamp"])

Unnamed: 0,user_id,item_id,rating,timestamp
1000138,6040,858,4,956703932
1000153,6040,2384,4,956703954
999873,6040,593,5,956703954
1000007,6040,1961,4,956703977
1000192,6040,2019,5,956703977
...,...,...,...,...
825793,4958,2399,1,1046454338
825438,4958,1407,5,1046454443
825724,4958,3264,4,1046454548
825731,4958,2634,3,1046454548


In [11]:
def generate_train_test(user_choice_dict, negative_count):
    X_train, y_train, X_test, y_test = [], [], [], []
    for user in user_choice_dict:
        viewed_movies = user_choice_dict[user]
        negative_sample = np.random.choice([movie for movie in range(1, num_movies+1) if movie not in viewed_movies], negative_count+100)
        
        #Positive train test split
        for movie in viewed_movies[:-1]:
            X_train.append([user, movie])
            y_train.append(1)
        X_test.append([user, viewed_movies[-1]])
        y_test.append(1)
        
        #Negative train test split
        for movie in negative_sample[:negative_count]:
            X_train.append([user, movie])
            y_train.append(0)
        for movie in negative_sample[negative_count:]:
            X_test.append([user, movie])
            y_test.append(0)
        
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
    
train_X, train_y, test_X, test_y = generate_train_test(user_choice_dict, 50)

In [12]:
train_X, train_y, test_X, test_y = torch.from_numpy(train_X), torch.from_numpy(train_y), torch.from_numpy(test_X), torch.from_numpy(test_y),
train_dataset = TensorDataset(train_X, train_y)
test_dataset = TensorDataset(test_X, test_y)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [74]:
ncf = NCF(max_user_id, max_movieId, 64, 64, [32, 16])
ncf = train_model(ncf, train_loader, test_loader, 2, 0.001)

epoch 0, test acc: (0.09119919381928115, 0.04032472144764332)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 1 loss: 0.3476249575614929, test hr: 0.5945582801477998, test ndcg: 0.3962543592397766


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40320.0), HTML(value='')))


epoch 2 loss: 0.2937599718570709, test hr: 0.6241182398387639, test ndcg: 0.4119669393931554
