## Recommendation System for Movies in Pytorch using the Boltzmann Machine

#### Load Data

In [1]:
!wget https://github.com/ENSP-AI-Mentoring/Restricted-Boltzmann-machine-RBM-/raw/main/dataset.zip

--2021-11-27 18:12:59--  https://github.com/ENSP-AI-Mentoring/Restricted-Boltzmann-machine-RBM-/raw/main/dataset.zip
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ENSP-AI-Mentoring/Restricted-Boltzmann-machine-RBM-/main/dataset.zip [following]
--2021-11-27 18:13:00--  https://raw.githubusercontent.com/ENSP-AI-Mentoring/Restricted-Boltzmann-machine-RBM-/main/dataset.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6583411 (6.3M) [application/zip]
Saving to: ‘dataset.zip’


2021-11-27 18:13:00 (94.0 MB/s) - ‘dataset.zip’ saved [6583411/6583411]



In [2]:
!unzip dataset.zip

Archive:  dataset.zip
   creating: dataset/
   creating: dataset/ml-1m/
  inflating: dataset/ml-1m/README    
  inflating: dataset/ml-1m/movies.dat  
  inflating: dataset/ml-1m/ratings.dat  
  inflating: dataset/ml-1m/users.dat  
   creating: dataset/ml-100k/
  inflating: dataset/ml-100k/u1.test  
  inflating: dataset/ml-100k/u1.base  


#### Loading librairies

In [13]:
# import the Libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import variable
import random
import os

#### Part 1: Data Preprocessing

In [4]:
# import the dataset
movies = pd.read_csv('dataset/ml-1m/movies.dat', sep='::', header = None, engine='python', encoding='latin-1')
users = pd.read_csv('dataset/ml-1m/users.dat', sep='::', header = None, engine='python', encoding='latin-1')
ratings = pd.read_csv('dataset/ml-1m/ratings.dat', sep='::', header = None, engine='python', encoding='latin-1')

In [5]:
# preparing the training set and test set
training_set = pd.read_csv('dataset/ml-100k/u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')
test_set = pd.read_csv('dataset/ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')

In [6]:
# Getting the number of users and movies
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

In [7]:
# converting the data into array with users in lines and movies in column
def convert(data):
    new_data = []
    for id_users in range(1, nb_users+1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data
training_set = convert(training_set)
test_set = convert(test_set)

In [8]:
# converting the data into torch tensor
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [9]:
# converting the ratings into binary rating 0 for (dislike) and 1 for (like)
training_set[training_set == 0] = -1
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0
training_set[training_set >=3 ] = 1
test_set[test_set == 0] = -1
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0
test_set[test_set >=3 ] = 1

#### For reproductibility

In [14]:
# Set the seed value all over the place to make this reproducible.
seed  = 30

def seed_value(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

seed_value(seed)

#### Part 2: Building the Restricted Boltzman Machine

In [10]:
# Creating the architecture of the Neural Network
class RBM():
    # nv - number of visible nodes
    # nh - number of hidden  nodes
    # W - wigths
    # a - bias of hidden nodes given visible nodes
    # b - bias of visible nodes given hidden nodes
    def __init__(self, nv, nh):
        self.W = torch.randn(nh, nv)
        self.a = torch.randn(1, nh)
        self.b = torch.randn(1, nv)
        
    # sample the probabilites of hidden nodes given visible nodes
    # x - visable neuron v with given probability p_h_given_v 
    # mm - product of two Torch tensors
    # t() - transpose
    # self.a.expand_as(wx) - we make sure that the bias are applied to each line of the minibatch
    # p_h_given_v - probability that hidden node is activated to given visable node (that is we will find some pattern)
    # bernoulli - our outcame is binary, a user likes a movie or not. bernoulli gives a vector wicth consits from 1s and 0s. 1 coresponds that a neuron was activated and 0 wasn't
    # bernoulli:
    # 1 with probability p
    # 0 with probability 1-p
    def sample_h(self, x):
        wx = torch.mm(x, self.W.t())
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    
    # sample the probabilites of visible nodes given hidden nodes
    # y - visable neuron
    def sample_v(self, y):
        wy = torch.mm(y, self.W)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    
    # Contrastive Divergence
    # v0 - rating of all movies of one user
    # vk - visible nodes obtained after case samplings
    # ph0 probabilities that  at the first iteration the hidden node = 1 with givven v0
    # phk probabilities that  at the first iteration the hidden node = 1 with givven vk (after k-samling)
    def train(self, v0, vk, ph0, phk):
        self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((ph0 - phk), 0)

nv = len(training_set[0])
nh = 100
batch_size = 100
rbm = RBM(nv, nh)

#### Part 3: Training and Testing the RBM model

In [11]:
# Training the RBM model
nb_epoch = 10
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    for id_user in range(0, nb_users - batch_size, batch_size):
        vk = training_set[id_user:id_user+batch_size]
        v0 = training_set[id_user:id_user+batch_size]
        ph0,_ = rbm.sample_h(v0)
        
        # loop for Contrastive Divergence
        for k in range(10):
            _,hk = rbm.sample_h(vk)
            _,vk = rbm.sample_v(hk)
            # we freeze nodes with rating = -1 
            vk[v0<0] = v0[v0<0]
            
        phk,_ = rbm.sample_h(vk)
        rbm.train(v0, vk, ph0, phk)
        train_loss += torch.mean(torch.abs(v0[v0>=0] - vk[v0>=0]))
        s += 1.
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))

epoch: 1 loss: tensor(0.3492)
epoch: 2 loss: tensor(0.2108)
epoch: 3 loss: tensor(0.2468)
epoch: 4 loss: tensor(0.2468)
epoch: 5 loss: tensor(0.2467)
epoch: 6 loss: tensor(0.2494)
epoch: 7 loss: tensor(0.2474)
epoch: 8 loss: tensor(0.2476)
epoch: 9 loss: tensor(0.2466)
epoch: 10 loss: tensor(0.2447)


In [12]:
# Testing the RBM Model
test_loss = 0
s = 0.
for id_user in range(nb_users):
    v = training_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    if len(vt[vt>=0]) > 0:
        _,h = rbm.sample_h(v)
        _,v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt>=0] - v[vt>=0]))
        s += 1.
print('test loss: '+str(test_loss/s))

test loss: tensor(0.2508)
