#  Get test data ready

In [28]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

import torch
import torchvision as tv
from torchvision import datasets, transforms
from torch.utils.data import DataLoader as DL

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.utils import resample

In [29]:
#import of review data
cols = ["user id","item id","rating","timestamp"]
#encoding using ISO-8859-1 is used because utf-8 does not support all the characters in movie names
df_data = pd.read_csv("ml-100k/u.data",sep="\t",names=cols,header=None,encoding="ISO-8859-1")

In [30]:
#import of moviedata
cols = ["movie id","movie title","release date","video release date","IMDb URL","unknown",
        "Action","Adventure","Animation","Children's","Comedy","Crime","Documentary",
        "Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi",
        "Thriller","War","Western"]

df_movie = pd.read_csv("ml-100k/u.item",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [31]:
#import of user data
cols = ["user id","age","gender","occupation","zip code"]
df_user = pd.read_csv("ml-100k/u.user",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [32]:
#frequency binning the ages into age groups as it will be easier for future analysis
df_user['age_group'] = pd.qcut(df_user['age'],q=10,precision=0)

#the bins are of unequal size due to repeating values in a bin
df_user['age_group'].value_counts()

(6.0, 20.0]     109
(23.0, 26.0]    105
(35.0, 40.0]    100
(31.0, 35.0]     98
(29.0, 31.0]     96
(40.0, 46.0]     94
(46.0, 51.0]     93
(20.0, 23.0]     92
(51.0, 73.0]     85
(26.0, 29.0]     71
Name: age_group, dtype: int64

In [33]:
#categorize age_group, gender and occupation
df_user['age_group'] = LE.fit_transform(df_user['age_group'])
df_user['gender'] = LE.fit_transform(df_user['gender'])
df_user['occupation'] = LE.fit_transform(df_user['occupation'])

# Neural Network Part 

In [34]:
#create class for the neural network

'''
fully connected layer = fc
nn.Linear(input, ouput)
initial input =  number of columns = 21
middle layers = 3 layers of 64 neurons
final output = number of ratings (0-5) = 6
'''

class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(21, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 64)
        self.fc5 = nn.Linear(64, 64)
        self.fc6 = nn.Linear(64, 6)

#ReLU activation function on hidden layers
#Use log_softmax for output to get probability for classes
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = self.fc6(x)
        
        return F.log_softmax(x, dim=1)
    
#view created network

net = Net()
net

Net(
  (fc1): Linear(in_features=21, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=64, bias=True)
  (fc6): Linear(in_features=64, out_features=6, bias=True)
)

# Check with test sets

In [35]:
#import of review data
cols = ["user id","item id","rating","timestamp"]
#encoding using ISO-8859-1 is used because utf-8 does not support all the characters in movie names
u_train = pd.read_csv("ml-100k/u1.base",sep="\t",names=cols,header=None,encoding="ISO-8859-1")
u_test = pd.read_csv("ml-100k/u1.test",sep="\t",names=cols,header=None,encoding="ISO-8859-1")

##keep changing above files to u2.base, u2.test, etc.

In [36]:
#join all three dataframes
training = pd.merge(pd.merge(u_train,
                  df_user[["user id",
                           "age_group",
                           "gender",
                           "occupation"]],
                  on='user id',
                  how='left'),
              df_movie,
              left_on = 'item id',
              right_on = 'movie id',
              how ='left')

testing = pd.merge(pd.merge(u_test,
                  df_user[["user id",
                           "age_group",
                           "gender",
                           "occupation"]],
                  on='user id',
                  how='left'),
              df_movie,
              left_on = 'item id',
              right_on = 'movie id',
              how ='left')

In [37]:
#drop unneccessary features
training.drop(["movie id",
        "movie title",
        "release date",
        "video release date",
        "IMDb URL",
        "unknown",
        "user id",
        "item id",
        "timestamp"],axis=1, inplace=True)

testing.drop(["movie id",
        "movie title",
        "release date",
        "video release date",
        "IMDb URL",
        "unknown",
        "user id",
        "item id",
        "timestamp"],axis=1, inplace=True)

testing.rating.value_counts()

4    6778
3    5182
5    4457
2    2192
1    1391
Name: rating, dtype: int64

In [38]:
#balance the training ratings using downsampling
ns = 5000
training_1 = training[training.rating == 1]
training_2 = training[training.rating == 2]
training_3 = training[training.rating == 3]
training_4 = training[training.rating == 4]
training_5 = training[training.rating == 5]

#downsample all classes except 1 to 4719
training_2_downsampled = resample(training_2,
                                 replace = False,
                                 n_samples = ns)
training_3_downsampled = resample(training_3,
                                 replace = False,
                                 n_samples = ns)
training_4_downsampled = resample(training_4,
                                 replace = False,
                                 n_samples = ns)
training_5_downsampled = resample(training_5,
                                 replace = False,
                                 n_samples = ns)

training = pd.concat([training_1, training_2_downsampled,
                     training_3_downsampled,
                     training_4_downsampled,
                     training_5_downsampled])

training.rating.value_counts()

5    5000
4    5000
3    5000
2    5000
1    4719
Name: rating, dtype: int64

In [39]:
#balance the testing ratings using downsampling
ns = 1000
testing_1 = testing[testing.rating == 1]
testing_2 = testing[testing.rating == 2]
testing_3 = testing[testing.rating == 3]
testing_4 = testing[testing.rating == 4]
testing_5 = testing[testing.rating == 5]

#downsample all classes except 1 to 1391
testing_2_downsampled = resample(testing_2,
                                 replace = False,
                                 n_samples = ns)
testing_3_downsampled = resample(testing_3,
                                 replace = False,
                                 n_samples = ns)
testing_4_downsampled = resample(testing_4,
                                 replace = False,
                                 n_samples = ns)
testing_5_downsampled = resample(testing_5,
                                 replace = False,
                                 n_samples = ns)

testing = pd.concat([testing_1, testing_2_downsampled,
                     testing_3_downsampled,
                     testing_4_downsampled,
                     testing_5_downsampled])

testing.rating.value_counts()

1    1391
3    1000
2    1000
5    1000
4    1000
Name: rating, dtype: int64

In [40]:
#resetting the index
training.reset_index(inplace = True, drop = True)
testing.reset_index(inplace = True, drop = True)

In [41]:
#categorize age_group, gender and occupation
training['age_group'] = LE.fit_transform(training['age_group'])
training['gender'] = LE.fit_transform(training['gender'])
training['occupation'] = LE.fit_transform(training['occupation'])

testing['age_group'] = LE.fit_transform(testing['age_group'])
testing['gender'] = LE.fit_transform(testing['gender'])
testing['occupation'] = LE.fit_transform(testing['occupation'])

In [42]:
#prepare data for PyTorch
rank_train = training['rating'].values
training_input = training.drop(["rating"], axis=1)
train = []

for index,row in training_input.iterrows():
    t = (torch.tensor(row.values), rank_train[index])
    train.append(t)
train = tuple(train)

rank_test = testing['rating'].values
testing_input = testing.drop(["rating"], axis=1)

test = []

for index,row in testing_input.iterrows():
    t = (torch.tensor(row.values), rank_test[index])
    test.append(t)
test = tuple(test)

In [43]:
#divide into batches
'''
batch_size = how many inputs to pass to model at a time
shuffle = to shuffle inputs or not
'''
trainset = DL(train, batch_size=32, shuffle=True)
testset = DL(test, batch_size=32, shuffle=True)

In [44]:
#lr = learning rate = 0.001
opt = optim.Adam(net.parameters(), lr = 0.001)

#EPOCHS = number of times to iterate over dataset
EPOCHS = 5

#train the network
'''
loss = error
zero_grad() = makes gradient zero after batch
nll_loss = calculates loss to update weights
if data is 1 hot vector, use mean squared error
backward() = propogate the weights backward
opt.step() = adjusts the weights
'''
for epoch in range(EPOCHS):
    for data in trainset:
        X, y = data
        net.zero_grad()
        output = net(X.view(-1, 21).float())
        loss = F.nll_loss(output, y)
        loss.backward()
        opt.step() 
    print(loss)

tensor(1.6264, grad_fn=<NllLossBackward>)
tensor(1.5517, grad_fn=<NllLossBackward>)
tensor(1.5462, grad_fn=<NllLossBackward>)
tensor(1.5498, grad_fn=<NllLossBackward>)
tensor(1.5764, grad_fn=<NllLossBackward>)


In [45]:
#check the model
'''
no_grad() = as test data will not be used for optimization,
we do not need to calculate gradient for it
'''
correct = 0
total = 0

with torch.no_grad():
    for data in testset:
        X, y = data
        output = net(X.view(-1, 21).float())
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ",correct/total)

Accuracy:  0.26470042663698756
