# Collaborative Filtering
#### Collaborative filtering is a popular technique used in recommendation systems


In [1]:
#import necessary libraries
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
from scipy import sparse
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
#### Sample rows from movies.dat, the '::'-delimited file we are going to read:
#### 1::Toy Story (1995)::Animation|Children's|Comedy
#### 2::Jumanji (1995)::Adventure|Children's|Fantasy
#### 3::Grumpier Old Men (1995)::Comedy|Romance
#### 4::Waiting to Exhale (1995)::Comedy|Drama
#### 5::Father of the Bride Part II (1995)::Comedy

In [3]:
# Location of the MovieLens 1M files (NOTE: machine-specific absolute path).
filepath = 'D:\\Data_Science\\Recommender systems\\ml-1m\\ml-1m\\'
filename = 'movies.dat'
# Each row has three '::'-separated fields: movie id, title, genre tags.
# Name all three and index by the movie id explicitly instead of relying on
# pandas promoting the unnamed leading field to the index.
columns = ['movieid', 'title', 'tags']
data_movie_names = pd.read_csv(
    filepath + filename,
    sep='::',
    header=None,
    names=columns,
    index_col='movieid',
    # Multi-character separators are only handled by the python engine; asking
    # for it explicitly avoids the ParserWarning about the silent fallback.
    engine='python',
)

data_movie_names.head()

  after removing the cwd from sys.path.


Unnamed: 0,title,tags
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [4]:
# Keep only the title column (still a DataFrame, same index).
data_movie_names = data_movie_names.loc[:, ['title']]

In [5]:
# Lookup tables between the DataFrame index and the movie title:
#   idx_to_movie_names_dict : index -> title
#   movie_names_to_idx_dict : title -> index (reverse lookup; for duplicate
#                             titles the last index seen wins)
idx_to_movie_names_dict = data_movie_names['title'].to_dict()
movie_names_to_idx_dict = {
    title: movie_idx for movie_idx, title in idx_to_movie_names_dict.items()
}


In [6]:
# Location of the MovieLens 1M files (NOTE: machine-specific absolute path).
filepath = 'D:\\Data_Science\\Recommender systems\\ml-1m\\ml-1m\\'
filename = 'ratings.dat'
# One rating per row: userid::movieid::ratings::timestamp
columns = ['userid', 'movieid','ratings','timestamp']
data = pd.read_csv(
    filepath + filename,
    sep='::',
    header=None,
    names=columns,
    # Multi-character separators are only handled by the python engine; asking
    # for it explicitly avoids the ParserWarning about the silent fallback.
    engine='python',
)
data.head()

  after removing the cwd from sys.path.


Unnamed: 0,userid,movieid,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
# Drop the timestamp and shuffle the rows so the positional split below is random.
data = data[['userid', 'movieid','ratings']]
# A fixed seed keeps the train/test split (and every metric derived from it)
# reproducible across kernel restarts.
data = shuffle(data, random_state=42)
data.head()

Unnamed: 0,userid,movieid,ratings
258055,1579,1234,4
884635,5342,780,4
186760,1156,3831,4
657890,3965,1653,3
303438,1803,785,4


In [8]:
# Size of the ratings table: (number of rating rows, number of columns kept above)
data.shape

(1000209, 3)

In [9]:
# Fraction of rows assigned to the training split; the remainder is the test split.
train_data = 0.8
split_at = int(data.shape[0] * train_data)
train_set, test_set = data.iloc[:split_at], data.iloc[split_at:]
print('data : {}'.format(data.shape))
print('train_set : {}'.format(train_set.shape))
print('test_set : {}'.format(test_set.shape))

data : (1000209, 3)
train_set : (800167, 3)
test_set : (200042, 3)


In [10]:
# Persist both splits next to the raw data; they are read back line by line below.
# index=False is the documented way to omit the row index (index=None only
# worked because it is falsy).
train_set.to_csv(filepath + 'train_set.csv', index=False)
test_set.to_csv(filepath + 'test_set.csv', index=False)

In [24]:
# Build the training interaction dictionary and the id <-> index lookup tables.
#   interaction_dict        : {userid: {movieid: rating}}
#   uid_to_idx / idx_to_uid : userid  <-> dense index, assigned in order of first appearance
#   cid_to_idx / idx_to_cid : movieid <-> dense index, assigned in order of first appearance

interaction_dict = {}
cid_to_idx = {}
idx_to_cid = {}
uid_to_idx = {}
idx_to_uid = {}
cidx = 0
uidx = 0

input_file = filepath + 'train_set.csv'
with open(input_file) as fp:
    next(fp)  # skip the CSV header row
    for line in fp:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        row = line.split(',')
        uid = int(row[0])
        cid = int(row[1])
        rating = float(row[2])
        if uid not in uid_to_idx:
            uid_to_idx[uid] = uidx
            idx_to_uid[uidx] = uid
            interaction_dict[uid] = {}
            uidx += 1

        if cid not in cid_to_idx:
            cid_to_idx[cid] = cidx
            idx_to_cid[cidx] = cid
            cidx += 1

        interaction_dict[uid][cid] = rating
# No explicit fp.close(): the with-statement has already closed the file.

In [25]:
# Report how many distinct users and movies appear in the training split.
n_users = len(uid_to_idx)
n_movies = len(cid_to_idx)
print("unique users : {}".format(n_users))
print("unique movies : {}".format(n_movies))

unique users : 6040
unique movies : 3680


In [26]:
# Flatten interaction_dict into three parallel lists (one entry per rating).
# NOTE: the list names are swapped relative to their contents: `row` collects
# movie indices and `column` collects user indices.
row = []
column = []
values = []

for uid, rated in interaction_dict.items():
    user_pos = uid_to_idx[uid]
    for cid, rating in rated.items():
        row.append(cid_to_idx[cid])
        column.append(user_pos)
        values.append(rating)

In [14]:
# Build the sparse rating matrix in one call from the coordinate lists.
# csr_matrix takes (data, (row_ind, col_ind)); `column` holds the user indices and
# `row` the movie indices, so users end up on the matrix rows and movies on the columns.
interaction_matrix = sparse.csr_matrix((values,(column,row)))
interaction_matrix

<6040x3680 sparse matrix of type '<class 'numpy.float64'>'
	with 800167 stored elements in Compressed Sparse Row format>

In [15]:
# (number of users, number of movies) in the training split
interaction_matrix.shape

(6040, 3680)

In [123]:
# Sweep the number of latent factors k and report how closely the rank-k truncated
# SVD reconstructs the training matrix.
# NOTE: the error is measured over every cell of the densified training matrix, so
# unrated entries count as zeros; it is a reconstruction error, not a held-out error.
embeddings_size = [16,32,64,128,256,512,1024]
mse_list = []
# The dense target is identical for every k, so densify it once outside the loop.
dense_interactions = interaction_matrix.toarray()
st_time = time.time()
for k in embeddings_size:
    u, s, vh = sparse.linalg.svds(interaction_matrix, k=k)
    # Scaling the columns of u by s equals u @ diag(s) without building a k x k matrix.
    predictions_matrix = (u * s) @ vh
    mse = mean_squared_error(dense_interactions, predictions_matrix)
    mse_list.append(mse)
    # The reported time is cumulative since the start of the sweep.
    print('k :{}\t mse :{}\t time : {}'.format(k,mse,round(time.time() - st_time , 2)))

k :16	 mse :0.33327416050979375
k :32	 mse :0.31223357551803893
k :64	 mse :0.28405088354564206
k :128	 mse :0.24278177417598504
k :256	 mse :0.18423160031801383
k :512	 mse :0.1110213954529193
k :1024	 mse :0.04156350014605643


In [27]:
# Factorise the rating matrix with k=1024 latent factors and rebuild the dense
# user x movie score matrix from the three factors.
user_embeddings, sigma, movie_embeddings = sparse.linalg.svds(interaction_matrix, k=1024)
print('user_embeddings : {}'.format(user_embeddings.shape))
print('sigma : {}'.format(sigma.shape))
print('movie_embeddings : {}'.format(movie_embeddings.shape))

# svds returns the singular values as a 1-D array; expand them to a diagonal matrix.
sigma = np.diag(sigma)
scaled_users = user_embeddings @ sigma
predictions_matrix = scaled_users @ movie_embeddings
print(predictions_matrix.shape)

user_embeddings : (6040, 1024)
sigma : (1024,)
movie_embeddings : (1024, 3680)
(6040, 3680)


In [28]:
def get_predictions(user_id , n_recommendations):
    """Print a user's rated movies followed by their top unseen recommendations.

    Reads the notebook-level lookups ``uid_to_idx``, ``interaction_dict``,
    ``idx_to_cid`` and ``idx_to_movie_names_dict`` and the dense
    ``predictions_matrix``.

    Parameters
    ----------
    user_id
        A key of ``uid_to_idx`` (a user id seen in the training split).
    n_recommendations : int
        Maximum number of recommended movies to print.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``uid_to_idx``.
    """
    if user_id not in uid_to_idx:
        raise KeyError('unknown user_id : {}'.format(user_id))
    idx = uid_to_idx[user_id]
    print('user_id : {}\t index : {}'.format(user_id , idx))
    print('\nWATCHED MOVIES :')
    watched_ratings = interaction_dict.get(user_id, {})
    for cid, rating in watched_ratings.items():
        print('movieId : {}\trating:{}\tname : {}'.format(cid, rating, idx_to_movie_names_dict.get(cid)))
    # Set for O(1) membership checks while filtering the ranking below.
    watched = set(watched_ratings)

    pred_cid_scores = predictions_matrix[idx]
    # Column indices ordered from highest to lowest predicted score.
    ranked = np.argsort(pred_cid_scores)[::-1]
    print('\nRECOMMENDED MOVIES :')
    shown = 0
    for i in ranked:
        if shown >= n_recommendations:
            break
        cid = idx_to_cid.get(i)
        # Skip movies the user already rated, as get_predictions_files does;
        # otherwise the list is dominated by the user's own history.
        if cid in watched:
            continue
        rating = pred_cid_scores[i]
        name = idx_to_movie_names_dict.get(cid)
        print('movieId : {}\trating:{}\tname : {}'.format(cid,rating, name))
        shown += 1
        
    
# Example: print the rated movies and 20 recommendations for user 2.
get_predictions(user_id = 2 , n_recommendations = 20)

user_id : 2	 index : 1715

WATCHED MOVIES :
movieId : 3107	rating:2.0	name : Backdraft (1991)
movieId : 3068	rating:4.0	name : Verdict, The (1982)
movieId : 95	rating:2.0	name : Broken Arrow (1996)
movieId : 593	rating:5.0	name : Silence of the Lambs, The (1991)
movieId : 1188	rating:4.0	name : Strictly Ballroom (1992)
movieId : 1357	rating:5.0	name : Shine (1996)
movieId : 1193	rating:5.0	name : One Flew Over the Cuckoo's Nest (1975)
movieId : 480	rating:5.0	name : Jurassic Park (1993)
movieId : 1090	rating:2.0	name : Platoon (1986)
movieId : 920	rating:5.0	name : Gone with the Wind (1939)
movieId : 902	rating:2.0	name : Breakfast at Tiffany's (1961)
movieId : 1408	rating:3.0	name : Last of the Mohicans, The (1992)
movieId : 2353	rating:4.0	name : Enemy of the State (1998)
movieId : 3071	rating:4.0	name : Stand and Deliver (1987)
movieId : 3699	rating:2.0	name : Starman (1984)
movieId : 349	rating:4.0	name : Clear and Present Danger (1994)
movieId : 2321	rating:3.0	name : Pleasantvill

In [29]:
def get_predictions_files(user_id , n_recommendations):
    """Return the ids of the top-scoring movies the user has not rated yet.

    Reads the notebook-level lookups ``uid_to_idx``, ``interaction_dict`` and
    ``idx_to_cid`` and the dense ``predictions_matrix``.

    Parameters
    ----------
    user_id
        A key of ``uid_to_idx`` (a user id seen in the training split).
    n_recommendations : int
        Maximum number of movie ids to return. Values <= 0 yield an empty set.

    Returns
    -------
    set
        Up to ``n_recommendations`` movie ids, taken in descending order of
        predicted score, excluding movies present in the user's
        ``interaction_dict`` entry.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``uid_to_idx``.
    """
    if user_id not in uid_to_idx:
        raise KeyError('unknown user_id : {}'.format(user_id))
    # Check the budget up front: counting after the first append meant that a
    # request for 0 items never hit the stop condition and returned everything.
    if n_recommendations <= 0:
        return set()

    idx = uid_to_idx[user_id]
    # Set membership is O(1); the ranking loop may test thousands of movies.
    watched = set(interaction_dict.get(user_id, ()))

    # Column indices ordered from highest to lowest predicted score.
    ranked = np.argsort(predictions_matrix[idx])[::-1]
    recommended_cid = set()
    for i in ranked:
        cid = idx_to_cid.get(i)
        if cid in watched:
            continue
        recommended_cid.add(cid)
        if len(recommended_cid) == n_recommendations:
            break

    return recommended_cid

In [34]:
# Top-20 unseen-movie recommendations for every user in the training split.
prediction_dict = {
    known_user: get_predictions_files(user_id = known_user , n_recommendations = 20 )
    for known_user in uid_to_idx
}

In [35]:
# Build the held-out interaction dictionary and id <-> index lookup tables from the
# test split, mirroring the structures built for the training split.
#   interaction_dict_test             : {userid: {movieid: rating}}
#   uid_to_idx_test / idx_to_uid_test : userid  <-> dense index, in order of first appearance
#   cid_to_idx_test / idx_to_cid_test : movieid <-> dense index, in order of first appearance
interaction_dict_test = {}
cid_to_idx_test = {}
idx_to_cid_test = {}
uid_to_idx_test = {}
idx_to_uid_test = {}
cidx = 0
uidx = 0

input_file = filepath + 'test_set.csv'
with open(input_file) as fp:
    next(fp)  # skip the CSV header row
    for line in fp:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        row = line.split(',')
        uid = int(row[0])
        cid = int(row[1])
        rating = float(row[2])
        if uid not in uid_to_idx_test:
            uid_to_idx_test[uid] = uidx
            idx_to_uid_test[uidx] = uid
            interaction_dict_test[uid] = {}
            uidx += 1

        if cid not in cid_to_idx_test:
            cid_to_idx_test[cid] = cidx
            idx_to_cid_test[cidx] = cid
            cidx += 1

        interaction_dict_test[uid][cid] = rating
# No explicit fp.close(): the with-statement has already closed the file.

In [44]:
# Score the recommendations against the held-out test ratings.
# A "hit" is a recommended movie that the user rated in the test split.
# Users without any test ratings are skipped entirely.
hits = 0
misses = 0
actual_watched = 0
total_predicted = 0
for user_id, predicted in prediction_dict.items():
    actual_ratings = interaction_dict_test.get(user_id)
    if actual_ratings is None:
        continue
    actual = set(actual_ratings)
    total_predicted += len(predicted)
    actual_watched += len(actual)
    hits += len(predicted & actual)

misses  =  actual_watched - hits

# Ratios are computed once, after the loop. Dividing inside the loop raised
# ZeroDivisionError whenever the first user had no test ratings.
nan = float('nan')
max_precision = total_predicted / actual_watched if actual_watched else nan
precision = hits / total_predicted if total_predicted else nan
recall = hits / actual_watched if actual_watched else nan
# Label the metrics with the list length actually used rather than a hard-coded 10.
top_k = max((len(p) for p in prediction_dict.values()), default=0)

print('Hits : {}\tMisses : {}\tactual_watched:{}\tactual_predicted:{}'.format(hits,misses,actual_watched,total_predicted))
print('Max Prescision :{}\tPrecision@{k}:{} , Recall@{k}:{}'.format(max_precision , precision , recall , k=top_k))

hits : 7907	misses : 192135	actual_watched:200042	actual_predicted:120740
Max Prescision :0.6035732496175803	Precision@10:0.06548782507868146 , Recall@10:0.039526699393127446
