# Boltzman Machine

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
movies = pd.read_csv('./ml-1m/movies.dat', sep='::',
                     header=None, engine='python', encoding='latin-1')
users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None,
                    engine='python', encoding='latin-1')
ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::',
                      header=None, engine='python', encoding='latin-1')

In [3]:
print(f'{movies.head()}')
print('\n-------------------------------------------------------------------\n')
print(f'{movies.describe()}')
print('\n-------------------------------------------------------------------\n')
print(f'{movies.info()}')
print('\n-------------------------------------------------------------------\n')
print(f'{movies.columns}')

   0                                   1                             2
0  1                    Toy Story (1995)   Animation|Children's|Comedy
1  2                      Jumanji (1995)  Adventure|Children's|Fantasy
2  3             Grumpier Old Men (1995)                Comedy|Romance
3  4            Waiting to Exhale (1995)                  Comedy|Drama
4  5  Father of the Bride Part II (1995)                        Comedy

-------------------------------------------------------------------

                 0
count  3883.000000
mean   1986.049446
std    1146.778349
min       1.000000
25%     982.500000
50%    2010.000000
75%    2980.500000
max    3952.000000

-------------------------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
0    3883 non-null int64
1    3883 non-null object
2    3883 non-null object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
None

-----------------------

In [4]:
print(f'{users.head()}')
print('\n-------------------------------------------------------------------\n')
print(f'{users.describe()}')
print('\n-------------------------------------------------------------------\n')
print(f'{users.info()}')
print('\n-------------------------------------------------------------------\n')
print(f'{users.columns}')

   0  1   2   3      4
0  1  F   1  10  48067
1  2  M  56  16  70072
2  3  M  25  15  55117
3  4  M  45   7  02460
4  5  M  25  20  55455

-------------------------------------------------------------------

                 0            2            3
count  6040.000000  6040.000000  6040.000000
mean   3020.500000    30.639238     8.146854
std    1743.742145    12.895962     6.329511
min       1.000000     1.000000     0.000000
25%    1510.750000    25.000000     3.000000
50%    3020.500000    25.000000     7.000000
75%    4530.250000    35.000000    14.000000
max    6040.000000    56.000000    20.000000

-------------------------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
0    6040 non-null int64
1    6040 non-null object
2    6040 non-null int64
3    6040 non-null int64
4    6040 non-null object
dtypes: int64(3), object(2)
memory usage: 236.0+ KB
None

--------------------------

In [5]:
print(f'{ratings.head()}')
print('\n-------------------------------------------------------------------\n')
print(f'{ratings.describe()}')
print('\n-------------------------------------------------------------------\n')
print(f'{ratings.info()}')
print('\n-------------------------------------------------------------------\n')
print(f'{ratings.columns}')

   0     1  2          3
0  1  1193  5  978300760
1  1   661  3  978302109
2  1   914  3  978301968
3  1  3408  4  978300275
4  1  2355  5  978824291

-------------------------------------------------------------------

                  0             1             2             3
count  1.000209e+06  1.000209e+06  1.000209e+06  1.000209e+06
mean   3.024512e+03  1.865540e+03  3.581564e+00  9.722437e+08
std    1.728413e+03  1.096041e+03  1.117102e+00  1.215256e+07
min    1.000000e+00  1.000000e+00  1.000000e+00  9.567039e+08
25%    1.506000e+03  1.030000e+03  3.000000e+00  9.653026e+08
50%    3.070000e+03  1.835000e+03  4.000000e+00  9.730180e+08
75%    4.476000e+03  2.770000e+03  4.000000e+00  9.752209e+08
max    6.040000e+03  3.952000e+03  5.000000e+00  1.046455e+09

-------------------------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
0    1000209 non-null int64
1    1000209

In [6]:
for num in range(len(movies.columns)):
    print(movies.iloc[:,num].value_counts())

2047    1
2656    1
2712    1
661     1
2708    1
657     1
2704    1
653     1
2700    1
649     1
2696    1
645     1
2692    1
641     1
2688    1
637     1
2684    1
633     1
2680    1
629     1
2676    1
625     1
2672    1
621     1
2668    1
617     1
2664    1
613     1
2660    1
665     1
       ..
3323    1
3379    1
1306    1
1330    1
3375    1
1326    1
3371    1
1322    1
3367    1
1318    1
3363    1
1314    1
3359    1
1310    1
3355    1
3351    1
1278    1
1302    1
3347    1
1298    1
3343    1
1294    1
3339    1
1290    1
3335    1
1286    1
3331    1
1282    1
3327    1
2049    1
Name: 0, Length: 3883, dtype: int64
Newton Boys, The (1998)                                 1
One Tough Cop (1998)                                    1
Hamlet (1990)                                           1
Promise, The (Versprechen, Das) (1994)                  1
D3: The Mighty Ducks (1996)                             1
Adventures of Milo and Otis, The (1986)                 1
Foreve

In [7]:
for num in range(len(users.columns)):
    print(users.iloc[:,num].value_counts())

2047    1
2724    1
2712    1
665     1
4763    1
2716    1
669     1
4767    1
2720    1
673     1
4771    1
677     1
661     1
4775    1
2728    1
681     1
4779    1
2732    1
685     1
4783    1
2736    1
689     1
4759    1
2708    1
4851    1
2692    1
2680    1
633     1
4731    1
2684    1
       ..
3423    1
5472    1
1378    1
3427    1
5476    1
1382    1
3403    1
1354    1
5448    1
3399    1
1326    1
3375    1
5424    1
1330    1
3379    1
5428    1
1334    1
3383    1
5432    1
1338    1
3387    1
5436    1
1342    1
3391    1
5440    1
1346    1
3395    1
5444    1
1350    1
2049    1
Name: 0, Length: 6040, dtype: int64
M    4331
F    1709
Name: 1, dtype: int64
25    2096
35    1193
18    1103
45     550
50     496
56     380
1      222
Name: 2, dtype: int64
4     759
0     711
7     679
1     528
17    502
12    388
14    302
20    281
2     267
16    241
6     236
10    195
3     173
15    144
13    142
11    129
5     112
9      92
19     72
18     70
8      17
Nam

In [8]:
for num in range(len(ratings.columns)):
    print(ratings.iloc[:,num].value_counts())

4169    2314
1680    1850
4277    1743
1941    1595
1181    1521
889     1518
3618    1344
2063    1323
1150    1302
1015    1286
5795    1277
4344    1271
1980    1260
2909    1258
1449    1243
4510    1240
424     1226
4227    1222
5831    1220
3391    1216
3841    1216
4508    1211
1088    1176
5367    1169
3808    1158
549     1152
1285    1144
3224    1115
3539    1078
4543    1050
        ... 
5309      20
703       20
932       20
160       20
5027      20
2292      20
1664      20
5904      20
3633      20
4365      20
4628      20
5258      20
2530      20
5168      20
5380      20
3552      20
4749      20
2819      20
6038      20
5012      20
4244      20
5525      20
3222      20
250       20
665       20
761       20
1493      20
5533      20
2714      20
2696      20
Name: 0, Length: 6040, dtype: int64
2858    3428
260     2991
1196    2990
1210    2883
480     2672
2028    2653
589     2649
2571    2590
1270    2583
593     2578
1580    2538
1198    2514
608     2513
27

In [9]:
training_set = pd.read_csv('ml-100k/u1.base', delimiter='\t')
training_set = np.array(training_set, dtype='int')
test_set = pd.read_csv('ml-100k/u1.test', delimiter='\t')
test_set = np.array(test_set, dtype='int')

### Getting the number of users and movies

In [None]:
test_set.reshape()

In [10]:
num_users = int(max(max(training_set[:, 0]), max(test_set[:, 0])))
num_movies = int(max(max(training_set[:, 1]), max(test_set[:, 1])))

In [11]:
print(f'num users = {num_users}')
print(f'num users = {num_movies}')

num users = 943
num users = 1682


### Converting the data into an array with users in lines and movies in columns

In [12]:
def convert_(data, num_users, num_movies):
    new_data = []
    for id_users in range(1, num_users + 1):
        id_movies = data[:, 1][data[:, 0] == id_users]
        id_ratings = data[:, 2][data[:, 0] == id_users]
        ratings = np.zeros(num_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data

training_set = convert_(training_set, num_users, num_movies)
test_set = convert_(test_set, num_users, num_movies)

### Converting the data into Torch tensors

In [13]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

### Converting the ratings into binary ratings 1 (Liked) or 0 (Not Liked)

In [14]:
training_set[training_set <= 1] = -1
training_set[training_set == 2] = 0
training_set[training_set == 3] = 0
training_set[training_set >= 4] = 1
test_set[test_set <= 1] = -1
test_set[test_set == 2] = 0
test_set[test_set == 3] = 0
test_set[test_set >= 4] = 1

### Creating the architecture of the Neural Network

- __nv__ = Number of visible nodes
- __nh__ = Number of hidden nodes
- __self.W__ = initializes all the nodes in a matrix (nh X nv) on a
               normal distribution with mean = 0, variance = 1
- __self.a__ = initializes the bias (p(h) given v) <- Bias of hidden nodes
- __self.b__ = initializes the bias (p(v) given h) <- Bias of visible nodes
- __wx__ = product of two tensors. W is weight of tensors, X is the neuron
- __activation__ = linear function of neurons. Product of two tensors + bias expanded as product of two tensors
- v0 = input vector - rating from each user
- vk = 
- ph0 = vector of probability
- phk = probabilities of hidden vectors

In [18]:
class RBM():
    def __init__(self, nv, nh):
        self.W = torch.randn(nh, nv)
        self.a = torch.randn(1, nh)
        self.b = torch.randn(1, nv)

    def sample_h(self, x):
        wx = torch.mm(x, self.W.t())
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)

    def sample_v(self, y):
        wy = torch.mm(y, self.W)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)

    def train(self, v0, vk, ph0, phk):
        tester_torch = (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        self.W += tester_torch
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((ph0 - phk), 0)

### Training the RBM

In [24]:
nv = len(training_set[0])
nh = 100
batch_size = 29
rbm = RBM(nv, nh)
num_epoch = 12

In [25]:
for epoch in range(1, (num_epoch + 1)):
    train_loss = 0
    s = 0
    for num in range(0, (num_users - batch_size), batch_size):
        vk = training_set[num:(num + batch_size)]
        v0 = training_set[num:(num + batch_size)]
        ph0, _ = rbm.sample_h(v0)
        for k in range(10):
            _, hk = rbm.sample_h(vk)
            _, vk = rbm.sample_v(hk)
            vk[v0 < 0] = v0[v0 < 0]
        phk, _ = rbm.sample_h(vk)
        rbm.train(v0, vk, ph0, phk)
        train_loss += torch.mean(torch.abs(v0[v0 >= 0] - vk[v0 >= 0]))
        s += 1
    print((f'epoch: {epoch}, loss: {train_loss/s}'))

epoch: 1, loss: 0.43191051483154297
epoch: 2, loss: 0.4179733991622925
epoch: 3, loss: 0.417701780796051
epoch: 4, loss: 0.42119625210762024
epoch: 5, loss: 0.42068758606910706
epoch: 6, loss: 0.4216765761375427
epoch: 7, loss: 0.4183974266052246
epoch: 8, loss: 0.4204780161380768
epoch: 9, loss: 0.4182189404964447
epoch: 10, loss: 0.41963592171669006
epoch: 11, loss: 0.42202845215797424
epoch: 12, loss: 0.4204881191253662


### Testing the RBM

In [26]:
test_loss = 0
s = 0.
for id_user in range(num_users):
    v = training_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    if len(vt[vt >= 0]) > 0:
        _, h = rbm.sample_h(v)
        _, v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt >= 0] - v[vt >= 0]))
        s += 1.
print('test loss: '+str(test_loss/s))

test loss: tensor(0.4428)
