#  Get test data ready

In [1557]:
import pandas as pd

import torch
import torchvision as tv
from torchvision import datasets, transforms
from torch.utils.data import DataLoader as DL

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.utils import resample

In [1558]:
#load the movie catalogue (u.item): descriptive fields followed by one 0/1 flag per genre
movie_info_fields = ["movie id", "movie title", "release date", "video release date", "IMDb URL"]
genre_flags = ["unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy",
               "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
               "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
cols = movie_info_fields + genre_flags

#the file has no header row, so the column names are supplied explicitly
df_movie = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    names=cols,
    header=None,
    encoding="ISO-8859-1",
)

In [1559]:
#load the user table (u.user): pipe-separated, no header row
cols = ["user id", "age", "gender", "occupation", "zip code"]
df_user = pd.read_csv(
    "ml-100k/u.user",
    names=cols,
    header=None,
    sep="|",
    encoding="ISO-8859-1",
)

In [1560]:
#bucket the ages into quantile-based groups (q=10) so later steps can treat age as a category
age_deciles = pd.qcut(df_user['age'], q=10, precision=0)
df_user['age_group'] = age_deciles

#the groups do not hold equal counts because identical ages cannot be split across bins
#df_user['age_group'].value_counts()

# Check with test sets

In [1561]:
#import of review data
#FOLD selects which numbered train/test split is loaded (u<FOLD>.base / u<FOLD>.test);
#change this single value instead of editing both file names
FOLD = 5

cols = ["user id","item id","rating","timestamp"]
#the files are read as ISO-8859-1, matching the movie and user loads
#(the non-default encoding is needed for the movie titles in u.item)
u_train = pd.read_csv(f"ml-100k/u{FOLD}.base",sep="\t",names=cols,header=None,encoding="ISO-8859-1")
u_test = pd.read_csv(f"ml-100k/u{FOLD}.test",sep="\t",names=cols,header=None,encoding="ISO-8859-1")

In [1562]:
#join ratings with user demographics and movie attributes
user_features = df_user[["user id", "age_group", "gender", "occupation"]]


def _attach_user_and_movie(ratings):
    """Left-join a ratings frame with the user demographics, then with the movie table."""
    with_user = pd.merge(ratings, user_features, on='user id', how='left')
    return pd.merge(with_user,
                    df_movie,
                    left_on='item id',
                    right_on='movie id',
                    how='left')


training = _attach_user_and_movie(u_train)
testing = _attach_user_and_movie(u_test)

In [1563]:
#one-hot encode age_group, gender and occupation
#gender/occupation categories are pinned to the full user table so the indicator
#columns do not depend on which users happen to appear in this split
training['age_group'] = pd.Categorical(training['age_group'])
training['gender'] = pd.Categorical(
    training['gender'],
    categories=sorted(df_user['gender'].dropna().unique()))
training['occupation'] = pd.Categorical(
    training['occupation'],
    categories=sorted(df_user['occupation'].dropna().unique()))

#dtype="uint8" keeps the indicators numeric; newer pandas defaults to bool, and a row
#mixing bool and int columns becomes an object array that torch.tensor cannot convert
age_group_dummies = pd.get_dummies(training['age_group'], dtype="uint8")
gender_dummies = pd.get_dummies(training['gender'], dtype="uint8")
occupation_dummies = pd.get_dummies(training['occupation'], dtype="uint8")

training = pd.concat([training,
                age_group_dummies,
                gender_dummies,
                occupation_dummies], axis=1)

#the source columns are now redundant with their indicator columns
training = training.drop(columns=['age_group',
        'gender',
        'occupation'])

#verify categorization
#training.head()

In [1564]:
#one-hot encode age_group, gender and occupation
#gender/occupation categories are pinned to the full user table so the indicator
#columns do not depend on which users happen to appear in this split
testing['age_group'] = pd.Categorical(testing['age_group'])
testing['gender'] = pd.Categorical(
    testing['gender'],
    categories=sorted(df_user['gender'].dropna().unique()))
testing['occupation'] = pd.Categorical(
    testing['occupation'],
    categories=sorted(df_user['occupation'].dropna().unique()))

#dtype="uint8" keeps the indicators numeric; newer pandas defaults to bool, and a row
#mixing bool and int columns becomes an object array that torch.tensor cannot convert
age_group_dummies = pd.get_dummies(testing['age_group'], dtype="uint8")
gender_dummies = pd.get_dummies(testing['gender'], dtype="uint8")
occupation_dummies = pd.get_dummies(testing['occupation'], dtype="uint8")

testing = pd.concat([testing,
                age_group_dummies,
                gender_dummies,
                occupation_dummies], axis=1)

#the source columns are now redundant with their indicator columns
testing = testing.drop(columns=['age_group',
        'gender',
        'occupation'])

#verify categorization
#testing.head()

In [1565]:
#drop identifier and descriptive columns that are not fed to the model
unused_columns = ["movie id",
                  "movie title",
                  "release date",
                  "video release date",
                  "IMDb URL",
                  "unknown",
                  "user id",
                  "item id",
                  "timestamp"]

#the same columns are removed from both splits so their layouts stay aligned
training = training.drop(columns=unused_columns)
testing = testing.drop(columns=unused_columns)

In [1566]:
#print(training.rating.value_counts())
#print(testing.rating.value_counts())

In [1567]:
#balance the training ratings: every rating class (1-5) is resampled, with
#replacement, to exactly ns rows, then the five classes are stacked back together
ns = 25000

#random_state is fixed per class so the draw is repeatable
training = pd.concat([
    resample(training[training.rating == rating],
             replace = True,
             n_samples = ns,
             random_state=123)
    for rating in range(1, 6)
])

#training.rating.value_counts()

In [1568]:
#balance the testing ratings: every rating class (1-5) is resampled, with
#replacement, to exactly ns rows, then the five classes are stacked back together
#NOTE(review): because the test split itself is resampled, the accuracy reported at the
#end is measured on this class-balanced resample, not on the original test distribution
ns = 6500

#random_state is fixed per class so the draw is repeatable
testing = pd.concat([
    resample(testing[testing.rating == rating],
             replace = True,
             n_samples = ns,
             random_state=123)
    for rating in range(1, 6)
])

#testing.rating.value_counts()

In [1569]:
#renumber the rows 0..n-1; the old index labels are discarded rather than kept as a column
training = training.reset_index(drop=True)
testing = testing.reset_index(drop=True)

In [1570]:
#print(training.shape)
#print(testing.shape)

In [1571]:
#prepare data for PyTorch


def _to_samples(features, labels):
    """Pair every feature row with its label.

    features: DataFrame of numeric/boolean model inputs, one row per sample.
    labels:   sequence of ratings, in the same row order as `features`.

    Returns a tuple of (row_tensor, label) pairs. The whole frame is converted
    in one step instead of row by row, and rows are matched to labels by
    position, so the result does not depend on the frame's index labels.
    """
    # float32 is what the network consumes; casting here also copes with
    # frames that mix integer and boolean indicator columns.
    rows = torch.tensor(features.values.astype("float32"))
    return tuple(zip(rows, labels))


#every column except the rating is a model input
n_input = training.shape[1] - 1

rank_train = training['rating'].values
training_input = training.drop(["rating"], axis=1)
train = _to_samples(training_input, rank_train)

rank_test = testing['rating'].values
testing_input = testing.drop(["rating"], axis=1)
test = _to_samples(testing_input, rank_test)

# Neural Network 

In [1572]:
#size the hidden layers of the neural network
#
#fully connected layer = fc, built with nn.Linear(input, output)
#input width   = n_input, the number of feature columns (the original note records 51)
#hidden layers = 3 layers of n_hidden_neurons units each
#output width  = 6 classes, so a rating can be used directly as a class index

n_hidden_neurons = int(2 * n_input / 3 + 6)

class Net(nn.Module):
    """Feed-forward classifier: three tanh hidden layers and a log-softmax output.

    Parameters
    ----------
    n_in : int, optional
        Number of input features. Defaults to the notebook-level `n_input`.
    n_hidden : int, optional
        Width of each hidden layer. Defaults to the notebook-level
        `n_hidden_neurons`.
    n_out : int, optional
        Number of output classes (default 6).
    """

    def __init__(self, n_in=None, n_hidden=None, n_out=6):
        super().__init__()
        # Fall back to the notebook-level sizes so a bare `Net()` call keeps working.
        if n_in is None:
            n_in = n_input
        if n_hidden is None:
            n_hidden = n_hidden_neurons
        self.fc1 = nn.Linear(n_in, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_out)

    def forward(self, x):
        """Return per-class log-probabilities for a batch `x` of shape (batch, n_in)."""
        # tanh activation on the hidden layers
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = torch.tanh(hidden(x))
        # log_softmax over the class dimension; pairs with F.nll_loss during training
        return F.log_softmax(self.fc4(x), dim=1)
    
#instantiate the network
#(uncomment the trailing `net` to display its layers)

net = Net()
#net

In [1573]:
#divide into batches
#batch_size = how many inputs to pass to the model at a time
#shuffle    = whether to shuffle the inputs
loader_options = dict(batch_size=64, shuffle=True)
trainset = DL(train, **loader_options)
testset = DL(test, **loader_options)

In [1574]:
#Adam optimiser over all network parameters; lr = learning rate = 0.001
opt = optim.Adam(net.parameters(), lr = 0.001)

#EPOCHS = number of times to iterate over the dataset
EPOCHS = 10

#train the network
#  loss        = error on the current batch
#  zero_grad() = clears the gradients left over from the previous batch
#  nll_loss    = negative log-likelihood loss used to update the weights
#                (the original note suggests mean squared error for one-hot targets)
#  backward()  = back-propagates the loss to compute the gradients
#  opt.step()  = adjusts the weights
#the value printed after each epoch is the loss of that epoch's last batch only
for epoch in range(EPOCHS):
    for X, y in trainset:
        net.zero_grad()
        batch = X.view(-1, n_input).float()
        output = net(batch)
        loss = F.nll_loss(output, y)
        loss.backward()
        opt.step()
    print(loss)

tensor(1.5542, grad_fn=<NllLossBackward>)
tensor(1.4800, grad_fn=<NllLossBackward>)
tensor(1.4322, grad_fn=<NllLossBackward>)
tensor(1.6306, grad_fn=<NllLossBackward>)
tensor(1.4752, grad_fn=<NllLossBackward>)
tensor(1.4655, grad_fn=<NllLossBackward>)
tensor(1.6238, grad_fn=<NllLossBackward>)
tensor(1.3882, grad_fn=<NllLossBackward>)
tensor(1.2404, grad_fn=<NllLossBackward>)
tensor(1.2477, grad_fn=<NllLossBackward>)


In [1575]:
#check the model
#no_grad(): the test data is not used for optimization, so no gradients are tracked
correct = 0
total = 0

with torch.no_grad():
    for X, y in testset:
        output = net(X.view(-1, n_input).float())
        #predicted class = index of the largest log-probability in each row;
        #the whole batch is compared at once instead of sample by sample
        predicted = output.argmax(dim=1)
        correct += (predicted == y).sum().item()
        total += y.size(0)

print("Accuracy: ",round(correct/total, 4))

Accuracy:  0.2849
