In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [0]:
movies_url = "https://raw.githubusercontent.com/AkshaysNimbalkar/-100daysofMLCode/master/day15_16%2C_17/movies.dat"
ratings_url = "https://raw.githubusercontent.com/AkshaysNimbalkar/-100daysofMLCode/master/day15_16%2C_17/ratings.dat"
users_url = "https://raw.githubusercontent.com/AkshaysNimbalkar/-100daysofMLCode/master/day15_16%2C_17/users.dat"

In [7]:
movies_df = pd.read_csv(movies_url, sep='::', names=['MovieId', 'Title', 'Genre'], encoding="ISO-8859-1")  
movies_df.head()


  """Entry point for launching an IPython kernel.


Unnamed: 0,MovieId,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# Ratings are in following format:
UserId::MovieId::Rating::Timestamp

-UserId range betwen 1 and 6040
-MovieId range between 1 and 3952
-Ratings are made on sclae of 5 star (Whole star ratings only)
-Timestamp is in seconds since the epoch as returned by time(2)
-Each user has at least 20 ratings


In [8]:
ratings_df = pd.read_csv(ratings_url, sep='::', names=['UserId', 'MovieId', 'Rating', 'Timestamp'], encoding="ISO-8859-1")  
ratings_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
users_df = pd.read_csv(users_url, sep='::', names=['UserId', 'Gender', 'Age', 'Occupation', 'Zip-Code'], encoding="ISO-8859-1")  
users_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,UserId,Gender,Age,Occupation,Zip-Code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [10]:
print(movies_df.shape)
print(users_df.shape)

(3883, 3)
(6040, 5)


In [11]:
num_movies = max(movies_df.iloc[:,0]) + 1
num_users = max(users_df.iloc[:,0]) + 1

print("Num of Movies: ",num_movies)
print("Num of Users: ",num_users)

Num of Movies:  3953
Num of Users:  6041


In [15]:
# now will create a matrix where rows as users and columns as movies:

# np.zeros returns array with given shape, d-type and filled with zeros ex. np.zeros(shape, d-type, order)
# shape(num_users, num_movies), d-type = default is float, order = {'C':'F'} default row-major (C-style) or column-major (Fortran-style) 

user_ratings = np.zeros((num_users, num_movies))
user_ratings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [0]:
#%%timeit : 1 loop, best of 3: 1min 20s per loop

for index, row in ratings_df.iterrows():
    user_id = row["UserId"]
    movie_id = row["MovieId"]
    rating = row['Rating']
    user_ratings[user_id, movie_id] = rating

In [18]:
user_ratings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.]])

In [19]:
# split data into training and test set

training_set, test_set = train_test_split(user_ratings, test_size=0.2)
print(len(training_set), len(test_set))

4832 1209


In [21]:
# Converting numpy arrays(training and test set) into pytorch tensors and detect gpu

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
training_set = torch.Tensor(training_set)
test_set = torch.Tensor(test_set)
print(device)

cuda:0


In [0]:
# Defining Hyperparameters: Theses are the parameters which you can experiments in order to improve accurasy.

HIDDEN_OUTER_SIZE = 128 # no of neurons in outer hidden layer
HIDDEN_INNER_SIZE = 128 # no of neurons in inner hidden layer/bottleneck
BATCH_SIZE = 16
NUM_EPOCHS = 50 # total number of passes will make over training data
LEARNING_RATE = 0.0001 # how much weights change in each step of back-propogation: tto high you will miss optimum value too low you will never reach

# WEIGHT_DECAY (regularization): makes training bit more stable, reduce over-fitting
WEIGHT_DECAY = 0.005

In [0]:
# Define neural network(Deep encoder)

class DAE(nn.Module):
  def __init__(self):
    
    # initialize pytorch neural network module
    super(DAE, self).__init__()
    
    # FIRST argument of linear is input size and second is output size
    # We are defining 3 hidden layers and 1 output layer
    self.fc1 = nn.Linear(num_movies, HIDDEN_OUTER_SIZE)
    self.fc2 = nn.Linear(HIDDEN_OUTER_SIZE, HIDDEN_INNER_SIZE)
    self.fc3 = nn.Linear(HIDDEN_INNER_SIZE, HIDDEN_OUTER_SIZE)
    self.fc4 = nn.Linear(HIDDEN_OUTER_SIZE, num_movies)
    self.activation = nn.Sigmoid()
   
  def forward(self, x):
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    x = self.activation(self.fc3(x))
    x = self.fc4(x)
    return x

# Define Loss Function:
# This is mean squared error, but avaraged by actual number of rated movies instead of averageding by Total number of movies:

def mse_loss_masked(input, target, num_labels):
  return torch.div(torch.sum((input - target)**2), num_labels)
  



In [0]:
dae = DAE()

# Assign the model to selected devices:(CUDA or CPU)
dae.to(device)

#Adam optimizer
optimizer = optim.Adam(dae.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)


In [43]:
# Train the Model

for epoch in range(1, NUM_EPOCHS + 1):
  train_loss = 0
  
  # number of users who at least rated one movie to reduce the computation
  step = 0
  row_idx = 0
  
  while row_idx < len(training_set):
    
    # add empty dimension for batch size of 1
    input = training_set[row_idx:row_idx + BATCH_SIZE, :]
    
    # target is copy of input
    target = input.clone()
    
    # we must send input and target to correct device
    input, target = input.to(device), target.to(device)
    
    # since target is clone of input we dont need to calculate its gradient
    target.require_grad = False
    
    # number of movies where rating is not zero
    num_labels = torch.sum(target > 0)
    
    # if user has rated at least one movie:
    if num_labels > 0:
      
      # get predicted rating for this user
      output = dae(input)
      
      # Zero out the predicted ratings for any movies the user hasnt rated
      # we dont want in this loss calculation
      
      output[target == 0] = 0
      loss = mse_loss_masked(output, target, num_labels)
      loss_value = loss.detach().cpu().numpy()
      train_loss += loss_value 
      
      # bcakpropogate the loss gradient into the network:
      loss.backward()
      
      # run optimizer to update the weights
      optimizer.step()
      step += 1
    row_idx += BATCH_SIZE
  print('epoch: ' + str(epoch) + ' loss:'+ str(train_loss/step))  

epoch: 1 loss:5.575309000662621
epoch: 2 loss:3.3353951166797158
epoch: 3 loss:2.911176588756359
epoch: 4 loss:2.6960284295461037
epoch: 5 loss:2.7004466333136654
epoch: 6 loss:2.884063443600737
epoch: 7 loss:2.6708520548233134
epoch: 8 loss:2.3432257384653914
epoch: 9 loss:1.9571518720380519
epoch: 10 loss:1.8787035922341
epoch: 11 loss:1.843202328839839
epoch: 12 loss:1.7748417206947378
epoch: 13 loss:1.4886377552487202
epoch: 14 loss:1.5036247609466906
epoch: 15 loss:1.6326644783777906
epoch: 16 loss:1.463016581456393
epoch: 17 loss:1.6062133055649057
epoch: 18 loss:1.5320140344417648
epoch: 19 loss:1.534221995350541
epoch: 20 loss:1.5677885468432446
epoch: 21 loss:1.5997351305374246
epoch: 22 loss:1.670418052483868
epoch: 23 loss:1.7102988741255754
epoch: 24 loss:1.8096032130797177
epoch: 25 loss:1.7918168657662852
epoch: 26 loss:1.7865007614457844
epoch: 27 loss:1.730126184738235
epoch: 28 loss:1.6998579221845462
epoch: 29 loss:1.6600994819047434
epoch: 30 loss:1.5853737829536791


In [47]:
# Evaluate the model

test_loss = 0
step = 0

for row_idx in range(len(test_set)):
  # unsqueeze(0) adds the batch dimention to matrix (size of 1)
  
  input = test_set[row_idx,:].unsqueeze(0)
  target = input.clone()
  target.require_grad = False
  input, target = input.to(device), target.to(device)
  num_labels = torch.sum(target > 0)
  if num_labels > 0:
    step += 1
    output = dae(input)
    output[target == 0] = 0
    loss = mse_loss_masked(output, target, num_labels)
    loss_value = loss.detach().cpu().numpy()
    test_loss += loss_value
print(' Test loss:'+ str(test_loss/step))    
    
    
  
  
  

 Test loss:1.6438384054612285
