In [1]:
import os
import numpy as np 
import pandas as pd 
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import SparseAdam,Adam,Adagrad,SGD

In [2]:
COLS = ['user_id', 'movie_id', 'rating', 'timestamp']
train_data = pd.read_csv("ml-100k/u1.base",sep='\t', names=COLS)
test_data = pd.read_csv("ml-100k/u1.test",sep='\t', names=COLS)
n_users, n_items = 943,1682

### Sequential model without user information

In [3]:
class LSTMReccomenderTupleModel(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, num_items, num_output):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.item_embeddings = nn.Embedding(num_items, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, num_output)
        self.hidden = self.init_hidden()

    def init_hidden(self):
    	# initialize both hidden layers
        return (Variable(torch.zeros(1, 1, self.hidden_dim)),
                Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sequence):
        embeddings = self.item_embeddings(sequence)
        output, self.hidden = self.lstm(embeddings.view(len(sequence), 1, -1),
                                        self.hidden)
        rating_scores = self.linear(output.view(len(sequence), -1))
        return rating_scores

    def predict(self, sequence):
        rating_scores = self.forward(sequence)
        return rating_scores

In [4]:
class SequentialModelItems():
    def __init__(self, embedding_dim = 64,hidden_dim = 128, n_output = 1, n_items=1682):
        self.model = LSTMReccomenderTupleModel(embedding_dim, hidden_dim, n_items+1, n_output)
        
    def get_pairs(self,dataset):
        def chunks(lst, n):
            for i in range(0, len(lst), n):
                yield lst[i:i + n]
        produced_dataset=[]
        unique_ids=list(set(dataset.user_id.tolist()))
        for ident in unique_ids:
            sorted_ds=dataset[dataset.user_id==ident].sort_values(by='timestamp', ascending=True)
            list_movies=sorted_ds.movie_id.tolist()
            list_ratings=list(np.asarray(sorted_ds.rating.tolist()))
            chunked_movies=list(chunks(list_movies,10))
            chunked_ratings=list(chunks(list_ratings,10))
            chunked_ratings[-1].extend([0] * (10 - len(chunked_ratings[-1])))
            chunked_movies[-1].extend([0] * (10 - len(chunked_movies[-1])))
            zipped=list(zip(chunked_movies,chunked_ratings))
            produced_dataset.extend(zipped)
        return produced_dataset

    def fit(self,training_dataset, epochs=5):
        training_dataset=np.asarray(self.get_pairs(training_dataset))
        loss_fn = nn.MSELoss()
        optimizer = Adam(self.model.parameters(), lr=0.0001)
        loss=0
        for epoch in range(epochs):
            print('EPOCH {} ; LOSS {}'.format(epoch,loss))
            for sequence, target_ratings in training_dataset:
                self.model.zero_grad()
                self.model.hidden = self.model.init_hidden()
                # convert sequence to PyTorch variables
                sequence_var = Variable(torch.LongTensor(sequence.astype('int64')))
                sequence_var=sequence_var.view(-1,1)
                # forward pass
                ratings_scores = self.model(sequence_var)
                target_ratings_var = Variable(torch.FloatTensor(target_ratings.astype('float32')))
                target_ratings_var=target_ratings_var.view(-1,1)
                # compute loss
                loss = loss_fn(ratings_scores, target_ratings_var)
                # backpropagate
                loss.backward()
                # update weights
                optimizer.step()
                
    def predict_list(self, sequence):
        return self.model.predict(torch.LongTensor(np.asarray(sequence).astype('int64')))
    
    def predict(self, feed_dict):
        res=self.model.predict(torch.LongTensor(np.asarray(feed_dict['item_id']).astype('int64')))
        return list(res.detach().numpy().transpose()[0])
    
    def user_items(self,user_id,dataset):
        return dataset[dataset.user_id==user_id].movie_id.tolist()
    
    def reccommend_for_user(self,user_id,dataset):
        user_items=self.user_items(user_id,dataset)
        predicted_rank=self.predict_list(user_items)
        df = pd.DataFrame(list(zip(user_items, predicted_rank)), 
               columns =['movie_id', 'predicted_rank']) 
        return df.sort_values(by=['predicted_rank'],ascending=False)

### Sequential model with user information

In [5]:
class LSTMRecommenderTripletsModel(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, num_items, num_users, num_output):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.item_embeddings = nn.Embedding(num_items, embedding_dim)
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.lstm = nn.LSTM(2*embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, num_output)
        self.hidden = self.init_hidden()

    def init_hidden(self):
    	# initialize both hidden layers
        return (Variable(torch.zeros(1, 1, self.hidden_dim)),
                Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sequence_items, sequence_users):
        embeddings_items = self.item_embeddings(sequence_items)
        embeddings_users = self.item_embeddings(sequence_users)
        embeddings= torch.cat([embeddings_users, embeddings_items], 1)
        output, self.hidden = self.lstm(embeddings.view(len(sequence_items), 1, -1),
                                        self.hidden)
        rating_scores = self.linear(output.view(len(sequence_items), -1))
        return rating_scores

    def predict(self, sequence_items, sequence_users):
        rating_scores = self.forward(sequence_items, sequence_users)
        return rating_scores

In [6]:
class SequentialModelItemsAndUsers():
    def __init__(self, embedding_dim = 64,hidden_dim = 128, n_output = 1, n_items=1682,n_users=943):
        self.model = LSTMRecommenderTripletsModel(embedding_dim, hidden_dim, n_items+1,n_users+1, n_output)
        
    def get_triplets(self,dataset):
        def chunks(lst, n):
            for i in range(0, len(lst), n):
                yield lst[i:i + n]
        produced_dataset=[]
        unique_ids=list(set(dataset.user_id.tolist()))
        for ident in unique_ids:
            sorted_ds=dataset[dataset.user_id==ident].sort_values(by='timestamp', ascending=True)
            list_movies=sorted_ds.movie_id.tolist()
            list_ratings=list(np.asarray(sorted_ds.rating.tolist()))
            list_user=[ident for i in range(len(list_movies))]
            chunked_movies=list(chunks(list_movies,10))
            chunked_ratings=list(chunks(list_ratings,10))
            chunked_ids=list(chunks(list_user,10))
            chunked_ratings[-1].extend([0] * (10 - len(chunked_ratings[-1])))
            chunked_movies[-1].extend([0] * (10 - len(chunked_movies[-1])))
            chunked_ids[-1].extend([0] * (10 - len(chunked_ids[-1])))
            zipped=list(zip(chunked_movies,chunked_ratings,chunked_ids))
            produced_dataset.extend(zipped)
        return produced_dataset

    def fit(self,training_dataset, epochs=5):
        training_dataset=np.asarray(self.get_triplets(training_dataset))
        loss_fn = nn.MSELoss()
        optimizer = Adam(self.model.parameters(), lr=0.0001)
        loss=0
        for epoch in range(epochs):
            print('EPOCH {} ; LOSS {}'.format(epoch,loss))
            for sequence_items,target_ratings, sequence_users in training_dataset:
                self.model.zero_grad()
                self.model.hidden = self.model.init_hidden()
                # convert sequence to PyTorch variables
                sequence_items_var = Variable(torch.LongTensor(sequence_items.astype('int64')))
                sequence_users_var = Variable(torch.LongTensor(sequence_users.astype('int64')))
                sequence_items_var=sequence_items_var.view(-1,1)
                sequence_users_var=sequence_users_var.view(-1,1)
                # forward pass
                ratings_scores = self.model(sequence_items_var,sequence_users_var)
                target_ratings_var = Variable(torch.FloatTensor(target_ratings.astype('float32')))
                target_ratings_var=target_ratings_var.view(-1,1)
                # compute loss
                loss = loss_fn(ratings_scores, target_ratings_var)
                # backpropagate
                loss.backward()
                # update weights
                optimizer.step()
                
    def predict_list(self, sequence_items, sequence_users):
        return self.model.predict(torch.LongTensor(np.asarray(sequence_items).astype('int64')),torch.LongTensor(np.asarray(sequence_users).astype('int64')))
    
    def predict(self, feed_dict):
        res=self.model.predict(torch.LongTensor(np.asarray(feed_dict['item_id']).astype('int64')),torch.LongTensor(np.asarray(feed_dict['user_id']).astype('int64')))
        return list(res.detach().numpy().transpose()[0])
    
    def user_items_tuple(self,user_id,dataset):
        filtered=dataset[dataset.user_id==user_id]
        return filtered.movie_id.tolist(),filtered.user_id.tolist()
    
    def reccommend_for_user(self,user_id,dataset):
        items,users=self.user_items_tuple(user_id,dataset)
        predicted_rank=self.predict_list(items,users)
        df = pd.DataFrame(list(zip(items, predicted_rank)), 
               columns =['movie_id', 'predicted_rank']) 
        return df.sort_values(by=['predicted_rank'],ascending=False)

### Models training

#### Training sets do not require to be sorted ( they are sorted and splitted internally)

In [7]:
seqModel2=SequentialModelItems()
seqModel2.fit(train_data,epochs=7)

EPOCH 0 ; LOSS 0
EPOCH 1 ; LOSS 2.9284868240356445
EPOCH 2 ; LOSS 2.319791555404663
EPOCH 3 ; LOSS 2.0916574001312256
EPOCH 4 ; LOSS 1.9914429187774658
EPOCH 5 ; LOSS 1.9151272773742676
EPOCH 6 ; LOSS 1.8313179016113281


In [38]:
seqModel3=SequentialModelItemsAndUsers()
seqModel3.fit(train_data,epochs=7)

EPOCH 0 ; LOSS 0
EPOCH 1 ; LOSS 3.1749045848846436
EPOCH 2 ; LOSS 2.4581174850463867
EPOCH 3 ; LOSS 1.8393666744232178
EPOCH 4 ; LOSS 1.5326457023620605
EPOCH 5 ; LOSS 1.3921520709991455
EPOCH 6 ; LOSS 1.3025230169296265


### Tests

#### Ratings in test set must be sorted within users by timestamp !!!

In [39]:
sorted_test_data=test_data.sort_values(by=['user_id','timestamp'])

In [40]:
def get_top_n_for_user(user_id,n,dataset):
    return dataset[dataset.user_id==user_id].sort_values(by=['rating'], ascending=False).head(n)

In [41]:
get_top_n_for_user(1,20,sorted_test_data)

Unnamed: 0,user_id,movie_id,rating,timestamp
92,1,196,5,874965677
95,1,202,5,875072442
42,1,96,5,875072716
34,1,81,5,875072865
80,1,174,5,875073198
50,1,108,5,875240920
68,1,150,5,876892196
3,1,14,5,874965706
82,1,177,5,876892701
78,1,170,5,876892856


In [42]:
seqModel3.reccommend_for_user(1,sorted_test_data).head(20)

Unnamed: 0,movie_id,predicted_rank
41,23,"[tensor(4.1626, grad_fn=<SelectBackward>)]"
105,12,"[tensor(4.1510, grad_fn=<SelectBackward>)]"
27,98,"[tensor(4.1478, grad_fn=<SelectBackward>)]"
46,134,"[tensor(4.1066, grad_fn=<SelectBackward>)]"
36,185,"[tensor(4.0940, grad_fn=<SelectBackward>)]"
26,64,"[tensor(4.0792, grad_fn=<SelectBackward>)]"
94,265,"[tensor(3.9574, grad_fn=<SelectBackward>)]"
49,186,"[tensor(3.9472, grad_fn=<SelectBackward>)]"
108,208,"[tensor(3.9377, grad_fn=<SelectBackward>)]"
72,170,"[tensor(3.8939, grad_fn=<SelectBackward>)]"


In [13]:
seqModel2.reccommend_for_user(1,sorted_test_data).head(10)

Unnamed: 0,movie_id,predicted_rank
46,134,"[tensor(4.0938, grad_fn=<SelectBackward>)]"
49,186,"[tensor(4.0055, grad_fn=<SelectBackward>)]"
41,23,"[tensor(3.9705, grad_fn=<SelectBackward>)]"
37,56,"[tensor(3.9678, grad_fn=<SelectBackward>)]"
44,84,"[tensor(3.9644, grad_fn=<SelectBackward>)]"
50,188,"[tensor(3.9629, grad_fn=<SelectBackward>)]"
45,184,"[tensor(3.9572, grad_fn=<SelectBackward>)]"
129,221,"[tensor(3.9493, grad_fn=<SelectBackward>)]"
38,96,"[tensor(3.9433, grad_fn=<SelectBackward>)]"
48,97,"[tensor(3.9386, grad_fn=<SelectBackward>)]"


### How to predict ?

##### lists

In [14]:
seqModel2.predict_list(sorted_test_data.movie_id.tolist())

tensor([[3.8941],
        [3.9205],
        [3.9375],
        ...,
        [3.7074],
        [3.8353],
        [3.7019]], grad_fn=<AddmmBackward>)

In [15]:
seqModel3.predict_list(sorted_test_data.movie_id.tolist(),sorted_test_data.user_id.tolist())

tensor([[3.2652],
        [3.2102],
        [3.3175],
        ...,
        [3.4286],
        [3.9038],
        [3.5249]], grad_fn=<AddmmBackward>)

##### dict

In [16]:
feed_dict={'user_id':sorted_test_data.user_id.tolist(), 'item_id': np.array(sorted_test_data.movie_id.tolist())}

In [17]:
seqModel2.predict(feed_dict)

[3.8319108,
 3.846501,
 3.8766775,
 3.8598316,
 3.8322628,
 4.03856,
 3.8439035,
 3.8163867,
 3.822704,
 3.7308633,
 3.6850598,
 3.5998073,
 3.6239045,
 3.7160885,
 3.6649816,
 3.5520048,
 3.7644804,
 3.6304111,
 3.5727503,
 3.740289,
 3.7372892,
 3.8235924,
 3.6875393,
 3.6593099,
 3.6756852,
 3.73042,
 3.7695243,
 3.9039078,
 3.8675597,
 3.9793808,
 3.872669,
 3.8697538,
 3.8386917,
 3.841914,
 3.886632,
 3.96991,
 3.995641,
 4.116791,
 4.063075,
 4.0356035,
 3.9407442,
 4.040572,
 4.0110884,
 4.000478,
 4.022271,
 4.012355,
 4.1342106,
 3.9319694,
 3.982066,
 4.04272,
 3.9997303,
 3.899889,
 3.8291402,
 3.8042805,
 3.7467167,
 3.8622248,
 3.742757,
 3.7051775,
 3.6774466,
 3.702121,
 3.5168905,
 3.561638,
 3.4795957,
 3.5693657,
 3.4386585,
 3.62767,
 3.680071,
 3.637584,
 3.7569063,
 3.797395,
 3.8246276,
 3.8924315,
 3.914514,
 3.786567,
 3.83066,
 3.7844265,
 3.795789,
 3.7727606,
 3.6579258,
 3.689099,
 3.7924557,
 3.8052113,
 3.7254715,
 3.805698,
 3.7620018,
 3.7365716,
 3.768

In [18]:
seqModel3.predict(feed_dict)

[3.7265854,
 3.7202563,
 3.7787983,
 3.6731281,
 3.7462504,
 3.6291852,
 3.5614676,
 3.5124688,
 3.650086,
 3.5393775,
 3.5939615,
 3.3536096,
 3.3142533,
 3.3999019,
 3.2862632,
 3.091951,
 3.3130527,
 3.1778448,
 3.2622979,
 3.4755716,
 3.3562126,
 3.6869776,
 3.7648475,
 3.72937,
 3.723263,
 3.8779402,
 3.7598693,
 3.9585283,
 3.847313,
 3.9251812,
 3.7352607,
 3.609797,
 3.5335355,
 3.539591,
 3.6183925,
 3.7008736,
 3.6635873,
 3.731172,
 3.735754,
 3.6456947,
 3.4652965,
 3.6983469,
 3.737363,
 3.731348,
 3.745563,
 3.5672262,
 3.8502898,
 3.534627,
 3.3347,
 3.381678,
 3.5836105,
 3.4328103,
 3.3689227,
 3.4910922,
 3.330913,
 3.3545315,
 3.3265796,
 3.3956163,
 3.4147115,
 3.2517033,
 3.0081167,
 3.1220393,
 3.0062456,
 3.138073,
 2.9210467,
 3.1868885,
 3.2476792,
 3.1457992,
 3.3089328,
 3.4076722,
 3.4755483,
 3.4675686,
 3.5600178,
 3.4178457,
 3.6030867,
 3.5368338,
 3.651566,
 3.3590548,
 3.0944986,
 3.1392684,
 3.310743,
 3.4310303,
 3.4303827,
 3.3034344,
 3.0532422,
 2

## Leave one out evaluation

In [16]:
def model_test(model, file_name):
    COLS = ['user_id', 'movie_id', 'rating', 'timestamp']
    train_data = pd.read_csv("Data/movielens.train_explicit_ds",sep='\t', names=COLS)

    model.fit(train_data,epochs=7)

    test_items = np.load('Data/test_items.npy', allow_pickle=True)
    test_users = np.load('Data/test_users.npy', allow_pickle=True)
    predictions = []
    for users, items in zip(test_users, test_items):
        feed_dict={'user_id': users, 'item_id': items}
        p = model.predict(feed_dict)
        predictions.append(p)
    
    np.save('Predictions/'+file_name, predictions)

In [17]:
model_test(SequentialModelItemsAndUsers(), 'seq_iu')
model_test(SequentialModelItems(), 'seq_i')

EPOCH 0 ; LOSS 0
EPOCH 1 ; LOSS 2.852264881134033
EPOCH 2 ; LOSS 2.5232255458831787
EPOCH 3 ; LOSS 2.180002450942993
EPOCH 4 ; LOSS 1.9156014919281006
EPOCH 5 ; LOSS 1.7311371564865112
EPOCH 6 ; LOSS 1.5907890796661377
EPOCH 0 ; LOSS 0
EPOCH 1 ; LOSS 2.810211658477783
EPOCH 2 ; LOSS 2.1997873783111572
EPOCH 3 ; LOSS 1.945755124092102
EPOCH 4 ; LOSS 1.8014581203460693
EPOCH 5 ; LOSS 1.702870488166809
EPOCH 6 ; LOSS 1.624518871307373
