## Auto Encoder

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

### Importing the dataset

In [2]:
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, engine='python', encoding='latin-1')
users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, engine='python', encoding='latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, engine='python', encoding='latin-1')

In [3]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
ratings.head() 

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


***User 1 rated movie 1193 a 5/5. The last column is a timestamp***

### Preparing training and test set

In [6]:
training_set = pd.read_csv('ml-100k/u1.base', sep='\t')
training_set = np.array(training_set, dtype = 'int') #we need to convert int so we can use pytorch tensors
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')

In [7]:
training_set

array([[        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       [        1,         4,         3, 876893119],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

### Getting the number of users and movies

***We want the max of users in the training and test***

In [8]:
nb_users = int(max(max(training_set[:, 0], ), max(test_set[:, 0]))) #total number of users
nb_movies = int(max(max(training_set[:, 1], ), max(test_set[:, 1])))#total number of movies

In [9]:
nb_users

943

In [10]:
nb_movies

1682

***This is the max users and movies for this training set***

### Converting the data into an array with users in lines and movies in columns

In [11]:
def convert(data): 
    """
    This creates a list of lists. This will create a obseration of each user for the movies they have seen, we will 
    mark unseen movies as zero.
    """
    new_data = [] #initialize a new list
    for id_users in range(1,nb_users+1): #create a list for each user. We want to consider all users so we add a +1
        id_movies = data[:,1][data[:,0] == id_users]  #contains all indexs of movies that were rated
        id_ratings = data[:,2][data[:,0] == id_users] #we will get all the id_users for each ratings
        ratings = np.zeros(nb_movies) #we zero out all the movies, if someone rated it, we replace it
        ratings[id_movies-1] = id_ratings #we have this from id_movies. We need to start at 0 since this is a list
        new_data.append(ratings)
    return new_data

In [12]:
training_set = convert(training_set)
test_set = convert(test_set)

In [13]:
for i in range(5):
    print('User: ',i,'ratings of movies',training_set[i])

User:  0 ratings of movies [0. 3. 4. ... 0. 0. 0.]
User:  1 ratings of movies [4. 0. 0. ... 0. 0. 0.]
User:  2 ratings of movies [0. 0. 0. ... 0. 0. 0.]
User:  3 ratings of movies [0. 0. 0. ... 0. 0. 0.]
User:  4 ratings of movies [0. 0. 0. ... 0. 0. 0.]


### Converting the data into torch tensors

***Tensors are multidimensional matrixes, which we will need for this type of data***

In [14]:
training_set = torch.FloatTensor(training_set) #This argument takes a list of lists. This is why we created the convert method
test_set = torch.FloatTensor(test_set)

  training_set = torch.FloatTensor(training_set) #This argument takes a list of lists. This is why we created the convert method


In [15]:
for i in range(5):
    print('User: ',i,'ratings of movies',training_set[i])

User:  0 ratings of movies tensor([0., 3., 4.,  ..., 0., 0., 0.])
User:  1 ratings of movies tensor([4., 0., 0.,  ..., 0., 0., 0.])
User:  2 ratings of movies tensor([0., 0., 0.,  ..., 0., 0., 0.])
User:  3 ratings of movies tensor([0., 0., 0.,  ..., 0., 0., 0.])
User:  4 ratings of movies tensor([0., 0., 0.,  ..., 0., 0., 0.])


### Creating the architecture of a Neural Network

In [32]:
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__() #we want inheritence of the parent class
        self.fc1 = nn.Linear(nb_movies, 20) #first full connection to our autoencoder. The first input feature is the # of movies, second input is the amount of nodes we create. We experiment with this
        self.fc2 = nn.Linear(20, 10) #we want 10 neurons in the second layer based on the previous connection we made
        self.fc3 = nn.Linear(10, 20) #now we are doing some deconstruction
        self.fc4 = nn.Linear(20, nb_movies) #more deconstruction
        self.activation = nn.Sigmoid()
    def forward(self, x): #this will do the forward propogation, x is our input vector of features
        x = self.activation(self.fc1(x)) #we are taking the input vector of features and transforming it
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x) #specificity of the auto-encoders, we do not need to use activation on last connection, this is now a vector of predicted ratings
        return x

In [37]:
sae = SAE()
criterion = nn.MSELoss() #we will use this to measure the mean squared error, this comes from nn module
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5) #we need to input paramters, learning rate, decay: regulate convergence

### Training the SAE

In [39]:
nb_epoch = 200 #we can choose more or less
for epoch in range(1, nb_epoch + 1): #in each epoch, we will loop over each observation. One loop to loop over each epoch, the other to loop over. each user
    train_loss = 0 #we need to keep track of the loss
    s = 0. #this will be used to compute the root squared mean error
    for id_user in range(nb_users): #this will loop over all the users
        input = Variable(training_set[id_user]).unsqueeze(0) #this is a vector of 1 dimension, so I modified it by adding another dimension. This is the batch
        target = input.clone() #we want to now create the target
        if torch.sum(target.data > 0)> 0: #this will only look at users who rated at least one movie. This will save us a lot of memory
            output =  sae(input) #we want a vector of predicted ratings. This will call the forward method in the sae class
            target.require_grad = False #we dont want to compute gradient descent for target since we already are for input
            output[target == 0] = 0 #we want to not include movie ratings that a user didnt see when calcuating error
            loss = criterion(output, target) #this computes the loss error. Input is real and predicted
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) #we only want movies that have non zero ratings. This last part makes sure our den is never zero, avoid infinte computation
            loss.backward() #we will call the backward method, this increases or decreases weights, decides direction
            train_loss += np.sqrt(loss.data*mean_corrector) #updating the train/loss
            s += 1.
            optimizer.step() #this will apply optimizer to update weights. This decides the intensity of the weights
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))

epoch: 1 loss: tensor(1.0966)
epoch: 2 loss: tensor(1.0534)
epoch: 3 loss: tensor(1.0384)
epoch: 4 loss: tensor(1.0308)
epoch: 5 loss: tensor(1.0264)
epoch: 6 loss: tensor(1.0241)
epoch: 7 loss: tensor(1.0216)
epoch: 8 loss: tensor(1.0207)
epoch: 9 loss: tensor(1.0194)
epoch: 10 loss: tensor(1.0189)
epoch: 11 loss: tensor(1.0185)
epoch: 12 loss: tensor(1.0178)
epoch: 13 loss: tensor(1.0175)
epoch: 14 loss: tensor(1.0172)
epoch: 15 loss: tensor(1.0170)
epoch: 16 loss: tensor(1.0166)
epoch: 17 loss: tensor(1.0164)
epoch: 18 loss: tensor(1.0161)
epoch: 19 loss: tensor(1.0161)
epoch: 20 loss: tensor(1.0158)
epoch: 21 loss: tensor(1.0159)
epoch: 22 loss: tensor(1.0159)
epoch: 23 loss: tensor(1.0159)
epoch: 24 loss: tensor(1.0156)
epoch: 25 loss: tensor(1.0155)
epoch: 26 loss: tensor(1.0152)
epoch: 27 loss: tensor(1.0150)
epoch: 28 loss: tensor(1.0139)
epoch: 29 loss: tensor(1.0122)
epoch: 30 loss: tensor(1.0097)
epoch: 31 loss: tensor(1.0098)
epoch: 32 loss: tensor(1.0056)
epoch: 33 loss: t

***We can expect a difference of .91 between each rating on average***

### Testing the SAE

In [45]:
test_loss = 0 
s = 0. 
for id_user in range(nb_users): 
        input = Variable(training_set[id_user]).unsqueeze(0) 
        target = Variable(test_set[id_user]).unsqueeze(0) #the real ratings of the test set
        if torch.sum(target.data > 0)> 0: 
            output =  sae(input) 
            target.require_grad = False 
            output[target == 0] = 0 
            loss = criterion(output, target) 
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) 
            test_loss += np.sqrt(loss.data*mean_corrector) 
            s += 1.
print('loss: '+str(test_loss/s))

loss: tensor(0.9530)


***We predicted the test set with a difference of .95 between each rating on average***

### Predicitons for a single user

In [46]:
user_id = 0
movie_title = movies.iloc[:nb_movies, 1:2]
user_rating = training_set.data.numpy()[user_id, :].reshape(-1,1)
user_target = test_set.data.numpy()[user_id, :].reshape(-1,1)
 
user_input = Variable(training_set[user_id]).unsqueeze(0)
predicted = sae(user_input)
predicted = predicted.data.numpy().reshape(-1,1)
 
# Join all info in one dataset
result_array = np.hstack([movie_title, user_target, predicted])
result_array = result_array[result_array[:, 1] > 0]
result_df = pd.DataFrame(data=result_array, columns=['Movie', 'Target Rating', 'Predicted'])

In [47]:
result_df

Unnamed: 0,Movie,Target Rating,Predicted
0,GoldenEye (1995),3.0,3.876193
1,Dracula: Dead and Loving It (1995),5.0,4.519506
2,Nixon (1995),5.0,4.009555
3,Sense and Sensibility (1995),3.0,3.28435
4,Money Train (1995),4.0,3.493617
...,...,...,...
131,Legends of the Fall (1994),2.0,2.84911
132,Major Payne (1994),4.0,4.160089
133,Little Odessa (1994),1.0,2.43795
134,My Crazy Life (Mi vida loca) (1993),4.0,2.576608


***The person with user id = 0 is pritned with their movie ratings and what the model predicted it would be***