### datasets : https://grouplens.org/datasets/movielens 

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Machine de Bolzmane
import pandas as pd
import numpy as np
import torch
import torch.nn as nn # contruire des réseaux de neurones
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data 
from torch.autograd import Variable

In [4]:
movies = pd.read_csv("ml-1m/movies.dat",sep="::", header=None, engine="python",encoding="latin-1")

In [5]:
movies # identiant && nom de film && type de film 

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [6]:
users = pd.read_csv("ml-1m/users.dat",sep="::", header=None, engine="python",encoding="latin-1")

In [7]:
users # Identifiant && sex (F || M) && ages && categorie socio-prof && code postal

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [8]:
ratings = pd.read_csv("ml-1m/ratings.dat",sep="::", header=None, engine="python",encoding="latin-1")

In [9]:
ratings # Identifiant users && numero de film  && la notes && timestamp 

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [10]:
## Préparation des données training
training_set = pd.read_csv("ml-100k/u1.base",delimiter="\t",header=None)

In [11]:
training_set=np.array(training_set,dtype="int") # convertir en array

In [12]:
training_set

array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

In [15]:
test_set = pd.read_csv("ml-100k/u1.test",delimiter="\t",header=None)
test_set=np.array(test_set,dtype="int")

In [16]:
# Getting the number of users and movies
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

In [17]:
# Converting the data into an array with users in lines and movies in columns
def convert(data):
    new_data = []
    for id_users in range(1, nb_users + 1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data
training_set = convert(training_set)
test_set = convert(test_set)

In [18]:
# Converting the data into Torch tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [22]:
# Creating the architecture of the Neural Network
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(nb_movies, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, nb_movies)
        self.activation = nn.Sigmoid()
    def forward(self, x): # encoder et décoder l'informatio jusqu'à la sortie
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5) ## weight_decay = le taux pour diminiuer l'apprentissage

In [20]:
# Training the SAE
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0) # unsqueeze : ajouter une dime
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.
            optimizer.step()
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))

epoch: 1 loss: 1.7715216420328952
epoch: 2 loss: 1.0968416144595083
epoch: 3 loss: 1.0534766252575039
epoch: 4 loss: 1.0384800752288426
epoch: 5 loss: 1.0307393042081323
epoch: 6 loss: 1.0265650557182449
epoch: 7 loss: 1.0238976805201137
epoch: 8 loss: 1.0218469846613543
epoch: 9 loss: 1.0211554585666096
epoch: 10 loss: 1.0197087685778412
epoch: 11 loss: 1.0186750954902462
epoch: 12 loss: 1.0182622608496072
epoch: 13 loss: 1.0177925236162308
epoch: 14 loss: 1.0175210403323567
epoch: 15 loss: 1.0172530580307153
epoch: 16 loss: 1.016725302413842
epoch: 17 loss: 1.0166457709612216
epoch: 18 loss: 1.0164272238803829
epoch: 19 loss: 1.0161379040051388
epoch: 20 loss: 1.0162436473073095
epoch: 21 loss: 1.0157986933535323
epoch: 22 loss: 1.0159995524629746
epoch: 23 loss: 1.0157513720211122
epoch: 24 loss: 1.0158052900917953
epoch: 25 loss: 1.0157917434175396
epoch: 26 loss: 1.0155794677098062
epoch: 27 loss: 1.015430355904924
epoch: 28 loss: 1.0151310021444757
epoch: 29 loss: 1.0126245797900

In [21]:
# Testing the SAE
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = Variable(test_set[id_user])
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[0, target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.item()*mean_corrector)
        s += 1.
print('test loss: '+str(test_loss/s))

test loss: 0.9489126494257722
