## Data preprocessing

In [1]:
# Import libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

from google.colab import drive

In [43]:
# Connect drive
drive.mount('/content/gdrive')
drive_path = '/content/gdrive/MyDrive/Boltzmann_machines'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [46]:
# Load sets
# Paths
movies_file = os.path.join(drive_path, 'ml-1m/movies.dat')
users_file = os.path.join(drive_path, 'ml-1m/users.dat')
ratings_file = os.path.join(drive_path, 'ml-1m/ratings.dat')

#Load
movies = pd.read_csv(movies_file, sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv(users_file, sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv(ratings_file, sep = '::', header = None, engine = 'python', encoding = 'latin-1')

# Display
print(' MOVIES', movies.head(3), '\n\n', 'USERS', users.head(3), '\n\n', 'RATINGS', ratings.head(3))

 MOVIES    0                        1                             2
0  1         Toy Story (1995)   Animation|Children's|Comedy
1  2           Jumanji (1995)  Adventure|Children's|Fantasy
2  3  Grumpier Old Men (1995)                Comedy|Romance 

 USERS    0  1   2   3      4
0  1  F   1  10  48067
1  2  M  56  16  70072
2  3  M  25  15  55117 

 RATINGS    0     1  2          3
0  1  1193  5  978300760
1  1   661  3  978302109
2  1   914  3  978301968


In [82]:
# Load training and test sets

# Paths
training_set_file = os.path.join(drive_path, 'ml-100k/u1.base')
test_set_file = os.path.join(drive_path, 'ml-100k/u1.test')

# Load
# 0th is row, 1st column = user, 2nd = movie, 3rd = rating, 4th = timestamp
training_set = pd.read_csv(training_set_file, delimiter = '\t')
test_set = pd.read_csv(test_set_file, delimiter = '\t')

# Display
print(' TRAINING', training_set.head(3), '\n\n', 'TESTING', test_set.head(3))

# Turn to arrays (same values still)
training_set = np.array(training_set, dtype = 'int')
test_set = np.array(test_set, dtype = 'int')

 TRAINING    1  1.1  5  874965758
0  1    2  3  876893171
1  1    3  4  878542960
2  1    4  3  876893119 

 TESTING    1   6  5  887431973
0  1  10  3  875693118
1  1  12  5  878542960
2  1  14  5  874965706


In [83]:
# Get total number of users and movies, across
# Gives total across train and test data (cross-validation)
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

In [84]:
# Convert data into array with a user on each line and movies in columns
def convert(data):
    # Create list of list. Each list corresponds to a user, and their movie ratings
    new_data = []
    # Add ratings into user's list
    for id_users in range(1, nb_users + 1):
      id_movies = data[:, 1][data[:, 0] == id_users]
      id_ratings = data[:, 2][data[:, 0] == id_users]
      # Fill with zeros
      ratings = np.zeros(nb_movies)
      # Replace zeros with real ratings
      ratings[id_movies - 1] = id_ratings
      new_data.append(list(ratings))
    return new_data

# Contains 943 rows of lists. In each list is the user's ratings of each movie
# Moves without a rating just have a 0
training_set = convert(training_set)
training_set = convert(test_set)

In [85]:
# Convert data into Torch tensors to enable manipulation in PyTorch
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

Up to here, all the data preprocessing could be used for other types of models. After here, it's specific to Botlzmann models

In [88]:
# Convert ratings into binary (1 = liked, 0 = not liked, -1 = no rating)
training_set[training_set == 0] = -1
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0
training_set[training_set >= 3] = 1

test_set[test_set == 0] = -1
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0
test_set[test_set >= 3] = 1

## Boltzmann model

In [None]:
class RBM():
  # Self reference, visible nodes, hidden nodes
  def __init__(self, nv, nh):
      # Initialise weights
      self.W = torch.randn(nh, nv)
      # Probability of hidden nodes, given visible nodes. 1 = batch, nh = bias
      self.a = torch.randn(1, nh)
      # Bias for visible nodes
      self.b = torch.randn(1, nv)