## **Collaborative Filtering**

In [1]:
#Required packages
import numpy as np
import pandas as pd
import random
import math
from random import randrange
from numpy.linalg import norm
from math import sqrt
from sklearn.metrics import mean_squared_error

In [2]:
#Importing Data from Drive
def _read_csv_skipping_bad_lines(path):
  """Read a comma-separated file into a DataFrame, dropping malformed rows.

  `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0,
  where passing it raises TypeError. `on_bad_lines='warn'` is the documented
  replacement for `error_bad_lines=False` with the default `warn_bad_lines`:
  malformed rows are skipped and a warning is emitted for each.
  """
  try:
    return pd.read_csv(path, sep=',', on_bad_lines='warn')
  except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`; use the legacy flag there.
    return pd.read_csv(path, sep=',', error_bad_lines=False)

train_data = _read_csv_skipping_bad_lines('/content/training_dataset.csv')
test_data = _read_csv_skipping_bad_lines('/content/test_dataset.csv')

In [3]:
#Pivot the training ratings into a matrix with one row per user_id and one
#column per item_id, and start every user's mean rating at 0
feature_matrix = train_data.pivot_table(
    values='rating',
    index='user_id',
    columns='item_id',
)
user_mean = [0] * len(feature_matrix)

In [4]:
#Row / column labels of the rating matrix, plus two separate NumPy copies of it:
#one will hold the plain ratings, the other the mean-centred (adjusted) ratings
users = list(feature_matrix.index)
items = list(feature_matrix.columns)
cosSimFMatrix = feature_matrix.copy().to_numpy()
adjCosSimFMatrix = feature_matrix.copy().to_numpy()

In [5]:
#Calculating the user mean values for the data
# The ratings are read from `feature_matrix` (NaN = not rated) instead of
# `cosSimFMatrix`: the next cell replaces the NaNs in `cosSimFMatrix` with 0, so
# re-running this cell afterwards would otherwise average over every unrated
# item as if it had been rated 0.
for row_idx, user_ratings in enumerate(feature_matrix.values):
  rating_sum = 0
  rating_count = 0
  for rating in user_ratings:
    if not math.isnan(rating):
      rating_sum = rating_sum + rating
      rating_count += 1
  # A row without any rating gets a mean of 0 (the value it was initialised with).
  user_mean[row_idx] = rating_sum / rating_count if rating_count != 0 else 0

In [6]:
#Calculating Similarity and Adjusted similarity matrices
# Both matrices are recomputed from `feature_matrix` on every run, so executing
# this cell more than once cannot subtract the user mean a second time.
#   cosSimFMatrix    : the rating where one exists, 0 where the user has not rated the item
#   adjCosSimFMatrix : rating minus that user's mean where rated, 0 elsewhere
rating_values = feature_matrix.values
rated_mask = ~np.isnan(rating_values)
# Column vector so that each user's mean is subtracted from that user's row.
user_mean_col = np.asarray(user_mean, dtype=float).reshape(-1, 1)
# Written into the existing arrays (rather than rebinding the names) so their
# memory layout stays exactly as created above.
cosSimFMatrix[...] = np.where(rated_mask, rating_values, 0)
adjCosSimFMatrix[...] = np.where(rated_mask, rating_values - user_mean_col, 0)

In [7]:
#Cosine similarity between two item (column) vectors
def simMov(feature_matrix, i, j):
  """Return the cosine similarity between columns `i` and `j` of `feature_matrix`.

  Parameters
  ----------
  feature_matrix : 2-D NumPy array; columns are compared (indexed as `[:, i]`).
  i, j : column positions of the two vectors to compare.

  Returns
  -------
  dot(col_i, col_j) / (||col_i|| * ||col_j||), or 0.0 when either column has
  zero norm. Without that guard the division is 0/0, which yields NaN plus a
  RuntimeWarning, and a NaN similarity cannot be ordered reliably by callers
  that sort on it.
  """
  col_i = feature_matrix[:, i]
  col_j = feature_matrix[:, j]
  denominator = norm(col_i) * norm(col_j)
  if denominator == 0:
    return 0.0
  return np.dot(col_i, col_j) / denominator

In [8]:
#Considering the (at most) 20 most similar movies whose similarity is at least 0.05 (5%)
def similarMovies(feature_matrix,items,user_id,item_id,val=0.05,top_n=20):
  """Find the items most similar to `item_id` among those with a non-zero entry for the user.

  Parameters
  ----------
  feature_matrix : 2-D NumPy array indexed as [user row][item column].
  items : sequence of item labels; only its length is used (number of columns).
  user_id : row position of the user in `feature_matrix`.
  item_id : column position of the target item.
  val : minimum similarity a neighbour must reach (default 0.05).
  top_n : maximum number of neighbours returned (default 20).

  Returns
  -------
  List of `(similarity, column_index)` tuples sorted by descending similarity
  (ties: higher column index first), at most `top_n` long.

  NOTE(review): an entry equal to 0 is treated as "not rated". In the
  mean-centred matrix a rating that equals the user's mean is also 0 and is
  therefore skipped as well -- confirm this is intended.
  """
  sim_movies = []
  for i in range(len(items)):
    if i == item_id or feature_matrix[user_id][i] == 0:
      continue
    sim = simMov(feature_matrix, item_id, i)
    # Filter before sorting: the comparison is False for NaN, so an undefined
    # similarity never reaches the sort, where it could not be ordered reliably.
    if sim >= val:
      sim_movies.append((sim, i))
  sim_movies.sort(reverse=True)
  return sim_movies[:top_n]

In [9]:
# Obtaining Cosine Similarity
def cosineSimilarity(feature_matrix,items,user_mean, user, item):
  """Predict `user`'s rating of `item` as a similarity-weighted average of the user's own ratings.

  Parameters
  ----------
  feature_matrix : 2-D NumPy array of ratings indexed as [user row][item column],
    with 0 where no rating exists.
  items : list of item labels in column order.
  user_mean : per-user mean rating, indexed by user row.
  user : user label; looked up in the notebook-level `users` list (not a parameter).
  item : item label.

  Returns
  -------
  sum(sim * rating) / sum(sim) over the neighbours returned by `similarMovies`.
  Falls back to the user's mean rating when the item is not in `items` or when
  no neighbour qualifies (the latter previously raised ZeroDivisionError).

  Raises
  ------
  ValueError : if `user` is not present in `users`.
  """
  use_index = users.index(user)
  if item not in items:
    return user_mean[use_index]
  item_index = items.index(item)
  topsim = similarMovies(feature_matrix, items, use_index, item_index)
  weighted_sum = 0
  weight_total = 0
  for sim, neighbour_index in topsim:
    weighted_sum += sim * feature_matrix[use_index][neighbour_index]
    weight_total += sim
  if weight_total == 0:
    # No usable neighbour: nothing to average, so use the user's mean instead.
    return user_mean[use_index]
  return weighted_sum / weight_total

In [10]:
# Obtaining Adjusted Cosine Similarity
def adjustedCosineSimilarity(feature_matrix,items,user_mean, user, item):
  """Predict `user`'s rating of `item` from mean-centred ratings.

  Parameters
  ----------
  feature_matrix : 2-D NumPy array indexed as [user row][item column]; the
    notebook passes the mean-centred matrix (rating minus user mean, 0 where
    no rating exists).
  items : list of item labels in column order.
  user_mean : per-user mean rating, indexed by user row.
  user : user label; looked up in the notebook-level `users` list (not a parameter).
  item : item label.

  Returns
  -------
  user_mean[user] + sum(sim * entry) / sum(sim) over the neighbours returned by
  `similarMovies`. Falls back to the user's mean rating when the item is not in
  `items` or when no neighbour qualifies (the latter previously raised
  ZeroDivisionError).

  Raises
  ------
  ValueError : if `user` is not present in `users`.
  """
  use_index = users.index(user)
  if item not in items:
    return user_mean[use_index]
  item_index = items.index(item)
  topsim = similarMovies(feature_matrix, items, use_index, item_index)
  weighted_sum = 0
  weight_total = 0
  for sim, neighbour_index in topsim:
    weighted_sum += sim * feature_matrix[use_index][neighbour_index]
    weight_total += sim
  if weight_total == 0:
    # No usable neighbour: the predicted deviation is undefined, so return the mean.
    return user_mean[use_index]
  # Add the user's mean back to the similarity-weighted average of the entries.
  return weighted_sum / weight_total + user_mean[use_index]

In [11]:
#Test set: the user / item id columns (pandas Series) and the true ratings (NumPy array)
items_id = test_data["item_id"]
users_id = test_data["user_id"]
test_ratings = test_data["rating"].to_numpy()

In [12]:
#Predict every test-set rating with the plain cosine-similarity model
cosineSimilarity_pred = [
  cosineSimilarity(cosSimFMatrix, items, user_mean, users_id[row], items_id[row])
  for row in range(len(users_id))
]

In [13]:
#Predict every test-set rating with the adjusted (mean-centred) cosine-similarity model
adjustedCosineSimilarity_pred = [
  adjustedCosineSimilarity(adjCosSimFMatrix, items, user_mean, users_id[row], items_id[row])
  for row in range(len(users_id))
]

In [14]:
#Root-mean-squared-error helper (calculation taken from the project description)
def RMSE(y_actual, y_predicted):
  """Return the RMSE between `y_actual` and `y_predicted`, rounded to 4 decimal places."""
  mse = mean_squared_error(y_actual, y_predicted)
  return round(sqrt(mse), 4)

In [15]:
#Score the cosine-similarity predictions against the test-set ratings
rMSECosSim = RMSE(y_actual=test_ratings, y_predicted=cosineSimilarity_pred)
print("RMSE obtained for Cosine Similarity:", rMSECosSim)

RMSE obtained for Cosine Similarity: 0.9393


In [16]:
#Score the adjusted-cosine predictions against the test-set ratings
rMSEAdjCosSim = RMSE(y_actual=test_ratings, y_predicted=adjustedCosineSimilarity_pred)
print("RMSE obtained for Adjusted Cosine Similarity ", rMSEAdjCosSim)

RMSE obtained for Adjusted Cosine Similarity  0.8829
