#  Get test data ready

In [580]:
import pandas as pd

import torch
import torchvision as tv
from torchvision import datasets, transforms
from torch.utils.data import DataLoader as DL

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.utils import resample

In [581]:
#import of moviedata
cols = ["movie id","movie title","release date","video release date","IMDb URL","unknown",
        "Action","Adventure","Animation","Childrens","Comedy","Crime","Documentary",
        "Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi",
        "Thriller","War","Western"]

df_movie = pd.read_csv("ml-100k/u.item",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [582]:
#import of user data
cols = ["user id","age","gender","occupation","zip code"]
df_user = pd.read_csv("ml-100k/u.user",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [583]:
#frequency binning the ages into age groups as it will be easier for future analysis
df_user['age_group'] = pd.qcut(df_user['age'],q=10,precision=0)

#the bins are of unequal size due to repeating values in a bin
df_user['age_group'].value_counts()

(6.0, 20.0]     109
(23.0, 26.0]    105
(35.0, 40.0]    100
(31.0, 35.0]     98
(29.0, 31.0]     96
(40.0, 46.0]     94
(46.0, 51.0]     93
(20.0, 23.0]     92
(51.0, 73.0]     85
(26.0, 29.0]     71
Name: age_group, dtype: int64

# Check with test sets

In [584]:
#import of review data
cols = ["user id","item id","rating","timestamp"]
#encoding using ISO-8859-1 is used because utf-8 does not support all the characters in movie names
u_train = pd.read_csv("ml-100k/u2.base",sep="\t",names=cols,header=None,encoding="ISO-8859-1")
u_test = pd.read_csv("ml-100k/u2.test",sep="\t",names=cols,header=None,encoding="ISO-8859-1")

##keep changing above files to u2.base, u2.test, etc.

In [585]:
#join all three dataframes
training = pd.merge(pd.merge(u_train,
                  df_user[["user id",
                           "age_group",
                           "gender",
                           "occupation"]],
                  on='user id',
                  how='left'),
              df_movie,
              left_on = 'item id',
              right_on = 'movie id',
              how ='left')

testing = pd.merge(pd.merge(u_test,
                  df_user[["user id",
                           "age_group",
                           "gender",
                           "occupation"]],
                  on='user id',
                  how='left'),
              df_movie,
              left_on = 'item id',
              right_on = 'movie id',
              how ='left')

In [586]:
#categorize age_group, gender and occupation using 1-hot encoder
training['age_group'] = pd.Categorical(training['age_group'])
training['gender'] = pd.Categorical(training['gender'])
training['occupation'] = pd.Categorical(training['occupation'])

age_group_dummies = pd.get_dummies(training['age_group'])
gender_dummies = pd.get_dummies(training['gender'])
occupation_dummies = pd.get_dummies(training['occupation'])

training = pd.concat([training,
                age_group_dummies,
                gender_dummies,
                occupation_dummies], axis=1)

training.drop(['age_group',
        'gender',
        'occupation'], axis=1, inplace=True)

#verify categorization
training.head()

Unnamed: 0,user id,item id,rating,timestamp,movie id,movie title,release date,video release date,IMDb URL,unknown,...,marketing,none,other,programmer,retired,salesman,scientist,student,technician,writer
0,1,3,4,878542960,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,...,0,0,0,0,0,0,0,0,1,0
1,1,4,3,876893119,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,...,0,0,0,0,0,0,0,0,1,0
2,1,5,3,889751712,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,...,0,0,0,0,0,0,0,0,1,0
3,1,6,5,887431973,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,...,0,0,0,0,0,0,0,0,1,0
4,1,7,4,875071561,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,...,0,0,0,0,0,0,0,0,1,0


In [587]:
#categorize age_group, gender and occupation using 1-hot encoder
testing['age_group'] = pd.Categorical(testing['age_group'])
testing['gender'] = pd.Categorical(testing['gender'])
testing['occupation'] = pd.Categorical(testing['occupation'])

age_group_dummies = pd.get_dummies(testing['age_group'])
gender_dummies = pd.get_dummies(testing['gender'])
occupation_dummies = pd.get_dummies(testing['occupation'])

testing = pd.concat([testing,
                age_group_dummies,
                gender_dummies,
                occupation_dummies], axis=1)

testing.drop(['age_group',
        'gender',
        'occupation'], axis=1, inplace=True)

#verify categorization
testing.head()

Unnamed: 0,user id,item id,rating,timestamp,movie id,movie title,release date,video release date,IMDb URL,unknown,...,marketing,none,other,programmer,retired,salesman,scientist,student,technician,writer
0,1,1,5,874965758,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,...,0,0,0,0,0,0,0,0,1,0
1,1,2,3,876893171,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,...,0,0,0,0,0,0,0,0,1,0
2,1,8,1,875072484,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,...,0,0,0,0,0,0,0,0,1,0
3,1,9,5,878543541,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,...,0,0,0,0,0,0,0,0,1,0
4,1,21,1,878542772,21,Muppet Treasure Island (1996),16-Feb-1996,,http://us.imdb.com/M/title-exact?Muppet%20Trea...,0,...,0,0,0,0,0,0,0,0,1,0


In [588]:
#drop unneccessary features
training.drop(["movie id",
        "movie title",
        "release date",
        "video release date",
        "IMDb URL",
        "unknown",
        "user id",
        "item id",
        "timestamp"],axis=1, inplace=True)

testing.drop(["movie id",
        "movie title",
        "release date",
        "video release date",
        "IMDb URL",
        "unknown",
        "user id",
        "item id",
        "timestamp"],axis=1, inplace=True)

In [589]:
print(training.rating.value_counts())
print(testing.rating.value_counts())

4    27294
3    21811
5    16857
2     9185
1     4853
Name: rating, dtype: int64
4    6880
3    5334
5    4344
2    2185
1    1257
Name: rating, dtype: int64


In [590]:
training.columns = [       'rating',        'Action',     'Adventure',     'Animation',
          'Childrens',        'Comedy',         'Crime',   'Documentary',
               'Drama',       'Fantasy',     'Film-Noir',        'Horror',
             'Musical',       'Mystery',       'Romance',        'Sci-Fi',
            'Thriller',           'War',       'Western', 'age_20',
          'age_23',    'age_26',    'age_29',    'age_31',
          'age_35',    'age_40',    'age_46',    'age_51',
          'age_73',             'F',             'M', 'administrator',
              'artist',        'doctor',      'educator',      'engineer',
       'entertainment',     'executive',    'healthcare',     'homemaker',
              'lawyer',     'librarian',     'marketing',          'none',
               'other',    'programmer',       'retired',      'salesman',
           'scientist',       'student',    'technician',        'writer']

training.columns

Index(['rating', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'age_20', 'age_23', 'age_26', 'age_29', 'age_31', 'age_35', 'age_40',
       'age_46', 'age_51', 'age_73', 'F', 'M', 'administrator', 'artist',
       'doctor', 'educator', 'engineer', 'entertainment', 'executive',
       'healthcare', 'homemaker', 'lawyer', 'librarian', 'marketing', 'none',
       'other', 'programmer', 'retired', 'salesman', 'scientist', 'student',
       'technician', 'writer'],
      dtype='object')

In [591]:
testing.columns = [       'rating',        'Action',     'Adventure',     'Animation',
          'Childrens',        'Comedy',         'Crime',   'Documentary',
               'Drama',       'Fantasy',     'Film-Noir',        'Horror',
             'Musical',       'Mystery',       'Romance',        'Sci-Fi',
            'Thriller',           'War',       'Western', 'age_20',
          'age_23',    'age_26',    'age_29',    'age_31',
          'age_35',    'age_40',    'age_46',    'age_51',
          'age_73',             'F',             'M', 'administrator',
              'artist',        'doctor',      'educator',      'engineer',
       'entertainment',     'executive',    'healthcare',     'homemaker',
              'lawyer',     'librarian',     'marketing',          'none',
               'other',    'programmer',       'retired',      'salesman',
           'scientist',       'student',    'technician',        'writer']

testing.columns

Index(['rating', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'age_20', 'age_23', 'age_26', 'age_29', 'age_31', 'age_35', 'age_40',
       'age_46', 'age_51', 'age_73', 'F', 'M', 'administrator', 'artist',
       'doctor', 'educator', 'engineer', 'entertainment', 'executive',
       'healthcare', 'homemaker', 'lawyer', 'librarian', 'marketing', 'none',
       'other', 'programmer', 'retired', 'salesman', 'scientist', 'student',
       'technician', 'writer'],
      dtype='object')

In [592]:
#balance the training ratings using downsampling
'''
ns = 5000
training_1 = training[training.rating == 1]
training_2 = training[training.rating == 2]
training_3 = training[training.rating == 3]
training_4 = training[training.rating == 4]
training_5 = training[training.rating == 5]

#downsample all classes except 1 to 4719
training_2_downsampled = resample(training_2,
                                 replace = False,
                                 n_samples = ns)
training_3_downsampled = resample(training_3,
                                 replace = False,
                                 n_samples = ns)
training_4_downsampled = resample(training_4,
                                 replace = False,
                                 n_samples = ns)
training_5_downsampled = resample(training_5,
                                 replace = False,
                                 n_samples = ns)

training = pd.concat([training_1, training_2_downsampled,
                     training_3_downsampled,
                     training_4_downsampled,
                     training_5_downsampled])
'''
ns = 25000
training_1 = training[training.rating == 1]
training_2 = training[training.rating == 2]
training_3 = training[training.rating == 3]
training_4 = training[training.rating == 4]
training_5 = training[training.rating == 5]

#downsample all classes to 30,000
training_1_upsampled = resample(training_1,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)
training_2_upsampled = resample(training_2,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)
training_3_upsampled = resample(training_3,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)
training_4_upsampled = resample(training_4,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)
training_5_upsampled = resample(training_5,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)

training = pd.concat([training_1_upsampled,
                training_2_upsampled,
                training_3_upsampled,
                training_4_upsampled,
                training_5_upsampled])
#'''
training.rating.value_counts()

5    25000
4    25000
3    25000
2    25000
1    25000
Name: rating, dtype: int64

In [593]:
#balance the testing ratings using downsampling
'''
ns = 1000
testing_1 = testing[testing.rating == 1]
testing_2 = testing[testing.rating == 2]
testing_3 = testing[testing.rating == 3]
testing_4 = testing[testing.rating == 4]
testing_5 = testing[testing.rating == 5]

#downsample all classes except 1 to 1391
testing_2_downsampled = resample(testing_2,
                                 replace = False,
                                 n_samples = ns)
testing_3_downsampled = resample(testing_3,
                                 replace = False,
                                 n_samples = ns)
testing_4_downsampled = resample(testing_4,
                                 replace = False,
                                 n_samples = ns)
testing_5_downsampled = resample(testing_5,
                                 replace = False,
                                 n_samples = ns)

testing = pd.concat([testing_1, testing_2_downsampled,
                     testing_3_downsampled,
                     testing_4_downsampled,
                     testing_5_downsampled])
'''

ns = 6500
testing_1 = testing[testing.rating == 1]
testing_2 = testing[testing.rating == 2]
testing_3 = testing[testing.rating == 3]
testing_4 = testing[testing.rating == 4]
testing_5 = testing[testing.rating == 5]

#downsample all classes to 30,000
testing_1_upsampled = resample(testing_1,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)
testing_2_upsampled = resample(testing_2,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)
testing_3_upsampled = resample(testing_3,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)
testing_4_upsampled = resample(testing_4,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)
testing_5_upsampled = resample(testing_5,
                                 replace = True,
                                 n_samples = ns,
                                 random_state=123)

testing = pd.concat([testing_1_upsampled,
                testing_2_upsampled,
                testing_3_upsampled,
                testing_4_upsampled,
                testing_5_upsampled])
#'''
testing.rating.value_counts()

5    6500
4    6500
3    6500
2    6500
1    6500
Name: rating, dtype: int64

In [594]:
#resetting the index
training.reset_index(inplace = True, drop = True)
testing.reset_index(inplace = True, drop = True)

In [595]:
print(training.shape)
print(testing.shape)

(125000, 52)
(32500, 52)


In [596]:
#prepare data for PyTorch

n_input = training.shape[1] - 1

rank_train = training['rating'].values
training_input = training.drop(["rating"], axis=1)
train = []

for index,row in training_input.iterrows():
    t = (torch.tensor(row.values), rank_train[index])
    train.append(t)
train = tuple(train)

rank_test = testing['rating'].values
testing_input = testing.drop(["rating"], axis=1)

test = []

for index,row in testing_input.iterrows():
    t = (torch.tensor(row.values), rank_test[index])
    test.append(t)
test = tuple(test)

# Neural Network 

In [597]:
#create class for the neural network

'''
fully connected layer = fc
nn.Linear(input, ouput)
initial input =  number of columns = 51
middle layers = 3 layers of 64 neurons
final output = number of ratings (0-5) = 6
'''

n_hidden_neurons = int((2*n_input/3)+6)

class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(n_input, n_hidden_neurons)
        self.fc2 = nn.Linear(n_hidden_neurons, n_hidden_neurons)
        #self.fc3 = nn.Linear(n_hidden_neurons, n_hidden_neurons)
        self.fc4 = nn.Linear(n_hidden_neurons, 6)

#ReLU activation function on hidden layers
#Use log_softmax for output to get probability for classes
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return F.log_softmax(x, dim=1)
    
#view created network

net = Net()
net

Net(
  (fc1): Linear(in_features=51, out_features=40, bias=True)
  (fc2): Linear(in_features=40, out_features=40, bias=True)
  (fc4): Linear(in_features=40, out_features=6, bias=True)
)

In [598]:
#divide into batches
'''
batch_size = how many inputs to pass to model at a time
shuffle = to shuffle inputs or not
'''
trainset = DL(train, batch_size=64, shuffle=True)
testset = DL(test, batch_size=64, shuffle=True)

In [599]:
#lr = learning rate = 0.001
opt = optim.Adam(net.parameters(), lr = 0.001)

#EPOCHS = number of times to iterate over dataset
EPOCHS = 10

#train the network
'''
loss = error
zero_grad() = makes gradient zero after batch
nll_loss = calculates loss to update weights
if data is 1 hot vector, use mean squared error
backward() = propogate the weights backward
opt.step() = adjusts the weights
'''
for epoch in range(EPOCHS):
    for data in trainset:
        X, y = data
        net.zero_grad()
        output = net(X.view(-1, n_input).float())
        loss = F.nll_loss(output, y)
        loss.backward()
        opt.step() 
    print(loss)

tensor(1.3998, grad_fn=<NllLossBackward>)
tensor(1.5398, grad_fn=<NllLossBackward>)
tensor(1.5691, grad_fn=<NllLossBackward>)
tensor(1.6022, grad_fn=<NllLossBackward>)
tensor(1.7537, grad_fn=<NllLossBackward>)
tensor(1.2994, grad_fn=<NllLossBackward>)
tensor(1.6687, grad_fn=<NllLossBackward>)
tensor(1.5160, grad_fn=<NllLossBackward>)
tensor(1.6685, grad_fn=<NllLossBackward>)
tensor(1.3833, grad_fn=<NllLossBackward>)


In [600]:
#check the model
'''
no_grad() = as test data will not be used for optimization,
we do not need to calculate gradient for it
'''
correct = 0
total = 0

with torch.no_grad():
    for data in testset:
        X, y = data
        output = net(X.view(-1, n_input).float())
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ",round(correct/total, 4))

Accuracy:  0.3122
