In [1]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.nn.parallel 
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
movies = pd.read_csv('movies.dat',sep = '::',header = None, engine = 'python', encoding = 'latin-1')
users =  pd.read_csv('users.dat',sep = '::',header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ratings.dat',sep = '::',header = None, engine = 'python', encoding = 'latin-1')

In [3]:
training_set = pd.read_csv('u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')#converting training set into array from dataframe
test_set = pd.read_csv('u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')
print(training_set)
print(test_set)

[[        1         2         3 876893171]
 [        1         3         4 878542960]
 [        1         4         3 876893119]
 ...
 [      943      1188         3 888640250]
 [      943      1228         3 888640275]
 [      943      1330         3 888692465]]
[[        1        10         3 875693118]
 [        1        12         5 878542960]
 [        1        14         5 874965706]
 ...
 [      459       934         3 879563639]
 [      460        10         3 882912371]
 [      462       682         5 886365231]]


In [10]:
#converting training and test set into a matrix where rows are going to be users, columns are going be movies and cells are going to be ratings
tot_users = int(max(max(training_set[:,0]),max(test_set[:,0])))
print(tot_users)
tot_movies = int(max(max(training_set[:,1]),max(test_set[:,1])))
print(tot_movies)
def convert(data):
  #here we are going to convert into list of lists instead of a 2d array as we are going to use torch in further
  new_data = []
  for i_users in range(1,tot_users+1):
    i_movies = data[:,1][data[:,0] == i_users]
    i_ratings = data[:,2][data[:,0] == i_users]
    ratings = np.zeros(tot_movies)
    ratings[i_movies - 1] = i_ratings
    new_data.append(list(ratings))
  return new_data

training_set = convert(training_set)
test_set = convert(test_set)

print(training_set)
print(test_set)

943
1682


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [4]:
#creating the architecture of neural network i.e., stacked auto encoder
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(tot_movies, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20) #reconstructing
        self.fc4 = nn.Linear(20, tot_movies)
        self.activation = nn.Sigmoid()
    def forward(self, x):
        x = self.activation(self.fc1(x)) #starting to encode
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x)) #starting to decode
        x = self.fc4(x)
        return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)


In [None]:
#training SAE
tot_epoch = 200
for epoch in range(1, tot_epoch + 1):
    train_loss = 0
    s = 0.
    for i_user in range(tot_users):
        input = Variable(training_set[i_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = tot_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.data[0]*mean_corrector)
            s += 1.
            optimizer.step()
    print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/s))


In [None]:
#testing SAE
test_loss = 0
s = 0.
for i_user in range(tot_users):
    input = Variable(training_set[i_user]).unsqueeze(0)
    target = Variable(test_set[i_user])
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = tot_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data[0]*mean_corrector)
        s += 1.
print('test_loss: ' + str(test_loss/s))
