In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split

from tqdm.notebook import tnrange, tqdm_notebook
from sklearn.metrics import mean_squared_error

In [2]:
ratings = pd.read_csv('data/rating.csv')

In [3]:
ratings.drop(ratings.columns[[-1]], axis=1, inplace=True)

# Get the user with at least 5 ratings
relevant_users = ratings.groupby("userId", as_index=False).agg({'movieId':len})
relevant_users = relevant_users[relevant_users.movieId > 4].drop('movieId', axis=1)

dataset = ratings.merge(relevant_users)

# Get 100 most rated movies
top_100 = ratings.movieId.value_counts()[:100].index.values
dataset = dataset[dataset['movieId'].isin(top_100)]

In [4]:
idx2movie = dataset.movieId.unique()
movie2idx = {movie:idx for idx, movie in enumerate(idx2movie)}

dataset['movieId'] = dataset['movieId'].apply(lambda movie: movie2idx[movie])

idx2user = dataset.userId.unique()
user2idx = {user:idx for idx, user in enumerate(idx2user)}

dataset['userId'] = dataset['userId'].apply(lambda user: user2idx[user])

In [5]:
dataset.head()

Unnamed: 0,userId,movieId,rating
2,0,0,3.5
3,0,1,3.5
4,0,2,3.5
8,0,3,4.0
9,0,4,4.0


In [6]:
dataset = np.array(dataset, dtype = 'int')

In [7]:
nb_users = len(user2idx)
nb_movies = 100

In [8]:
def Convert(data):
    new_data = []
    for id_users in tnrange(1, nb_users + 1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data

ratings_matrix = Convert(dataset)

  0%|          | 0/134428 [00:00<?, ?it/s]

In [9]:
test = np.zeros((len(user2idx), len(movie2idx)))
train = np.array(ratings_matrix)

In [10]:
for user_train, user_test in zip(train, test):
        
        nonzero = user_train.nonzero()[0]
        
        # Select 20% for testing
        size = nonzero.shape[0]//5
        test_ratings = np.random.choice(nonzero, size=size, replace=False)
        
        # Keep the records for testing
        user_test[test_ratings] = user_train[test_ratings]
        # Zero out for training
        user_train[test_ratings] = 0

In [11]:
# Converting the data into Torch tensors
train_matrix_tensor = torch.FloatTensor(train)
test_matrix_tensor = torch.FloatTensor(test)

In [12]:
class AutoEnconderNN(nn.Module):

    def __init__(self):
        """Define the main components of the network including the loss and optimizer."""
        super(AutoEnconderNN, self).__init__()

        self.encoder = nn.Sequential(
            
            nn.Linear(nb_movies, 80),
            nn.ReLU(),
            
            nn.Linear(80, 40),
            nn.ReLU()
        )

        self.decoder = nn.Sequential(
            
            nn.Linear(40, 80),
            nn.ReLU(),
            
            nn.Linear(80, nb_movies)
        )
        
        self.cross_entropy_loss = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.parameters(), lr=.01, momentum=0.9)
    
    def forward(self, x):
        """Perform forward pass."""
        
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def predict(self, x):
        '''
        Get hard class predictions from the 
        feature data
        '''
        predictions = self.forward(x)
        
        return predictions

In [13]:
AE = AutoEnconderNN()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(AE.parameters(), lr = 0.01, weight_decay = 0.5)

In [14]:
train_dataloader = DataLoader(train_matrix_tensor, batch_size=1000, shuffle=True)

In [15]:
nb_epoch = 25
for epoch in tnrange(0, nb_epoch, desc="Total epochs: "):
    train_loss = 0
    s = 0.
    for users in train_dataloader:
        input = Variable(users).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = AE(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.data*mean_corrector)
            s += 1.
            optimizer.step()
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))
    

Total epochs:   0%|          | 0/25 [00:00<?, ?it/s]

epoch: 0 loss: tensor(0.1007)
epoch: 1 loss: tensor(0.0380)
epoch: 2 loss: tensor(0.0292)
epoch: 3 loss: tensor(0.0292)
epoch: 4 loss: tensor(0.0292)
epoch: 5 loss: tensor(0.0292)
epoch: 6 loss: tensor(0.0292)
epoch: 7 loss: tensor(0.0292)
epoch: 8 loss: tensor(0.0292)
epoch: 9 loss: tensor(0.0292)
epoch: 10 loss: tensor(0.0292)
epoch: 11 loss: tensor(0.0292)
epoch: 12 loss: tensor(0.0292)
epoch: 13 loss: tensor(0.0292)
epoch: 14 loss: tensor(0.0292)
epoch: 15 loss: tensor(0.0292)
epoch: 16 loss: tensor(0.0292)
epoch: 17 loss: tensor(0.0292)
epoch: 18 loss: tensor(0.0292)
epoch: 19 loss: tensor(0.0292)
epoch: 20 loss: tensor(0.0292)
epoch: 21 loss: tensor(0.0292)
epoch: 22 loss: tensor(0.0292)
epoch: 23 loss: tensor(0.0292)
epoch: 24 loss: tensor(0.0291)


In [16]:
test_loss = 0
s = 0.
for id_user in tnrange(nb_users):
        
    input = Variable(test_matrix_tensor[id_user]).unsqueeze(0)
    target = Variable(train_matrix_tensor[id_user]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:
        output = AE(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
print('test loss: '+str(test_loss/s))

  0%|          | 0/134428 [00:00<?, ?it/s]

test loss: tensor(0.8870)


In [17]:
sample = Variable(test_matrix_tensor[0]).unsqueeze(0)

In [18]:
true_vals = Variable(train_matrix_tensor[0]).unsqueeze(0)

In [19]:
pred = AE(sample).detach().numpy()

In [20]:
np.sqrt(mean_squared_error(true_vals, pred))

3.5398285