# Collaborative Filtering
#### Collaborative filtering is a popular technique used in recommendation systems


In [1]:
#import necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import spotlight
import torch
from scipy import sparse
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

%matplotlib inline

In [2]:
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.interactions import Interactions

In [3]:
#### The file we are going to read is '::'-delimited, for example:
#### 1::Toy Story (1995)::Animation|Children's|Comedy
#### 2::Jumanji (1995)::Adventure|Children's|Fantasy
#### 3::Grumpier Old Men (1995)::Comedy|Romance
#### 4::Waiting to Exhale (1995)::Comedy|Drama
#### 5::Father of the Bride Part II (1995)::Comedy

In [4]:
# Directory that holds the MovieLens 1M files.
# NOTE(review): absolute, machine-specific path - point it at your own copy of the data.
filepath = 'D:\\Data_Science\\Recommender systems\\ml-1m\\ml-1m\\'
filename = 'movies.dat'
# Each line has three '::'-separated fields (id, title, genres) and there is no header row.
columns = ['movieid', 'title', 'tags']
# A multi-character separator is only handled by the Python parsing engine; requesting it
# explicitly avoids the fallback warning. The movie id is used as the index explicitly
# instead of relying on pandas promoting the first, unnamed field.
data_movie_names = pd.read_csv(
    filepath + filename,
    sep='::',
    header=None,
    names=columns,
    index_col='movieid',
    engine='python',
)

data_movie_names.head()

  after removing the cwd from sys.path.


Unnamed: 0,title,tags
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [5]:
# Keep only the title column (still a DataFrame, same index).
data_movie_names = data_movie_names.loc[:, ['title']]

In [6]:
# Forward lookup: DataFrame index -> movie title.
idx_to_movie_names_dict = data_movie_names['title'].to_dict()
# Reverse lookup: movie title -> DataFrame index.
# If a title occurs more than once, the last index seen wins.
movie_names_to_idx_dict = {
    movie_name: movie_idx
    for movie_idx, movie_name in idx_to_movie_names_dict.items()
}


In [7]:
# Directory that holds the MovieLens 1M files.
# NOTE(review): absolute, machine-specific path - point it at your own copy of the data.
filepath = 'D:\\Data_Science\\Recommender systems\\ml-1m\\ml-1m\\'
filename = 'ratings.dat'
columns = ['userid', 'movieid','ratings','timestamp']
# A multi-character separator is only handled by the Python parsing engine; requesting it
# explicitly avoids the fallback warning.
data = pd.read_csv(
    filepath + filename,
    sep='::',
    header=None,
    names=columns,
    engine='python',
)
data.head()

  after removing the cwd from sys.path.


Unnamed: 0,userid,movieid,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
# Drop the timestamp column, then shuffle the rows with a fixed seed so that the
# positional train/test split made later is repeatable.
rating_columns = ['userid', 'movieid','ratings']
data = shuffle(data[rating_columns], random_state=7)
data.head()

Unnamed: 0,userid,movieid,ratings
989001,5972,593,5
984978,5952,2401,4
820569,4933,1805,2
373691,2181,587,2
417291,2513,1641,5


In [9]:
#The dataset contains one row per rating; show its (rows, columns) shape
data.shape

(1000209, 3)

In [10]:
# Fraction of the rows that goes into the training set; the remainder is the test set.
train_data = 0.8
# Positional cut point: rows before it are train, rows from it onwards are test.
n_train = int(len(data) * train_data)
train_set, test_set = data.iloc[:n_train], data.iloc[n_train:]
print('data : {}'.format(data.shape))
print('train_set : {}'.format(train_set.shape))
print('test_set : {}'.format(test_set.shape))

data : (1000209, 3)
train_set : (800167, 3)
test_set : (200042, 3)


In [11]:
# Write both splits next to the raw data as CSV files (with a header row, without the index).
for split_filename, split_frame in (('train_set.csv', train_set), ('test_set.csv', test_set)):
    split_frame.to_csv(filepath + split_filename, index = None)

In [12]:
#make interaction dictionary
# interaction_dict        : raw user id -> {raw movie id -> rating}
# uid_to_idx / idx_to_uid : raw user id  <-> contiguous 0-based index (first-seen order)
# cid_to_idx / idx_to_cid : raw movie id <-> contiguous 0-based index (first-seen order)

interaction_dict = {}
cid_to_idx = {}
idx_to_cid = {}
uid_to_idx ={}
idx_to_uid = {}
cidx = 0
uidx = 0

input_file = filepath + 'train_set.csv'
# The context manager closes the file on exit, so no explicit close() is needed.
with open(input_file) as fp:
    next(fp, None)  # skip the header row; tolerate an empty file
    for line in fp:
        fields = line.split(',')
        uid = int(fields[0])
        cid = int(fields[1])
        rating = float(fields[2])

        # First time this user is seen: assign the next free user index.
        if uid not in uid_to_idx:
            uid_to_idx[uid] = uidx
            idx_to_uid[uidx] = uid
            interaction_dict[uid] = {}
            uidx += 1

        # First time this movie is seen: assign the next free movie index.
        if cid not in cid_to_idx:
            cid_to_idx[cid] = cidx
            idx_to_cid[cidx] = cid
            cidx += 1

        interaction_dict[uid][cid] = rating

In [13]:
print("unique users : {}".format(len(uid_to_idx)))
print("unique movies : {}".format(len(cid_to_idx)))

unique users : 6040
unique movies : 3679


In [14]:
#interaction_dict
# Flatten the nested ratings dict into three parallel lists, one entry per rating.
# NOTE: despite their names, `row` collects movie indices and `column` collects user
# indices; the names are kept because later cells refer to them.
row = []
column = []
values = []

for uid, user_ratings in interaction_dict.items():
    user_index = uid_to_idx[uid]
    for cid, rating in user_ratings.items():
        row.append(cid_to_idx[cid])
        column.append(user_index)
        values.append(rating)

In [15]:
#Build the sparse user x movie matrix in a single call from the parallel lists:
#`values` are the entries, `column` is passed as the row coordinate and `row` as the
#column coordinate
interaction_matrix = sparse.csr_matrix((values,(column,row)))
interaction_matrix

<6040x3679 sparse matrix of type '<class 'numpy.float64'>'
	with 800167 stored elements in Compressed Sparse Row format>

In [16]:
# Wrap the parallel lists in a Spotlight Interactions object:
# `column` supplies the user indices, `row` the movie indices, `values` the ratings.
# NOTE(review): the model fitted on this object is an implicit-feedback one, which
# presumably ignores the rating values - confirm against the Spotlight docs.
i = Interactions(user_ids = np.array(column), item_ids = np.array(row), ratings=np.array(values))

In [18]:
# Implicit-feedback factorization model from Spotlight.
# * use_cuda follows the hardware that is actually available, so the notebook also runs
#   on machines without a GPU instead of failing.
# * random_state is a seeded RandomState (same seed as the shuffle) so that repeated
#   runs are comparable.
model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                    embedding_dim=32,
                                    n_iter=10,
                                    batch_size=256,
                                    learning_rate=0.001,
                                    use_cuda=torch.cuda.is_available(),
                                    representation=None,
                                    sparse=False,
                                    random_state=np.random.RandomState(7),
                                    num_negative_samples=5)

In [19]:
# Train the model on the Interactions object `i`.
model.fit(i)

In [20]:
# Sanity check: scores for the user at internal index 0 (an index from uid_to_idx,
# not a raw user id). No item_ids are passed; the output is one score array.
model.predict(user_ids = 0)

array([ 1.4912128 , -0.07009716,  0.00463641, ..., -0.6851275 ,
       -0.39040613, -0.8757338 ], dtype=float32)

In [21]:
def get_predictions(user_id , n_recommendations):
    """Print a user's rated training movies, then the top unseen recommendations.

    Relies on the notebook-level objects ``uid_to_idx``, ``idx_to_cid``,
    ``interaction_dict``, ``idx_to_movie_names_dict`` and ``model``.

    Parameters
    ----------
    user_id : int
        Raw user id as found in the ratings file (a key of ``uid_to_idx``).
    n_recommendations : int
        Maximum number of recommended movies to print.

    Raises
    ------
    KeyError
        If ``user_id`` does not occur in the training set.
    """
    idx = uid_to_idx.get(user_id)
    if idx is None:
        raise KeyError('user_id {} is not present in the training set'.format(user_id))
    print('user_id : {}\t index : {}'.format(user_id , idx))

    print('\nWATCHED MOVIES :')
    watched_ratings = interaction_dict[user_id]
    for watched_cid, watched_rating in watched_ratings.items():
        print('movieId : {}\trating:{}\tname : {}'.format(
            watched_cid, watched_rating, idx_to_movie_names_dict.get(watched_cid)))
    # Set of raw movie ids, used below to keep already-rated movies out of the output.
    watched_movies = set(watched_ratings)

    scores = model.predict(user_ids = idx)

    print('\nRECOMMENDED MOVIES :')
    n_printed = 0
    # Walk the movie indices from highest to lowest score.
    for item_idx in np.argsort(scores)[::-1]:
        if n_printed >= n_recommendations:
            break
        cid = idx_to_cid.get(int(item_idx))
        if cid in watched_movies:
            continue  # never recommend something the user has already rated
        name = idx_to_movie_names_dict.get(cid)
        # The value printed as "rating" here is the model score, not a star rating.
        print('movieId : {}\trating:{}\tname : {}'.format(cid, scores[item_idx], name))
        n_printed += 1


get_predictions(user_id = 2 , n_recommendations = 20)

user_id : 2	 index : 4111

WATCHED MOVIES :
movieId : 368	rating:4.0	name : Maverick (1994)
movieId : 1084	rating:3.0	name : Bonnie and Clyde (1967)
movieId : 2852	rating:3.0	name : Soldier's Story, A (1984)
movieId : 1124	rating:5.0	name : On Golden Pond (1981)
movieId : 1244	rating:3.0	name : Manhattan (1979)
movieId : 480	rating:5.0	name : Jurassic Park (1993)
movieId : 3334	rating:4.0	name : Key Largo (1948)
movieId : 3071	rating:4.0	name : Stand and Deliver (1987)
movieId : 1784	rating:5.0	name : As Good As It Gets (1997)
movieId : 2002	rating:5.0	name : Lethal Weapon 3 (1992)
movieId : 647	rating:3.0	name : Courage Under Fire (1996)
movieId : 165	rating:3.0	name : Die Hard: With a Vengeance (1995)
movieId : 1552	rating:3.0	name : Con Air (1997)
movieId : 1259	rating:5.0	name : Stand by Me (1986)
movieId : 95	rating:2.0	name : Broken Arrow (1996)
movieId : 2028	rating:4.0	name : Saving Private Ryan (1998)
movieId : 110	rating:5.0	name : Braveheart (1995)
movieId : 434	rating:2.0	n

In [22]:
def get_predictions_files(user_id , n_recommendations):
    """Return the best-scoring movies for a user, excluding movies rated in training.

    Relies on the notebook-level objects ``uid_to_idx``, ``cid_to_idx``,
    ``idx_to_cid``, ``interaction_dict`` and ``model``.

    Parameters
    ----------
    user_id : int
        Raw user id as found in the ratings file (a key of ``uid_to_idx``).
    n_recommendations : int
        Maximum number of movie ids to return; 0 yields an empty set.

    Returns
    -------
    set
        Raw movie ids, at most ``n_recommendations`` of them.

    Raises
    ------
    KeyError
        If ``user_id`` does not occur in the training set.
    """
    idx = uid_to_idx.get(user_id)
    if idx is None:
        raise KeyError('user_id {} is not present in the training set'.format(user_id))

    # A set makes the "already rated?" check constant-time per candidate.
    watched_movies = set(interaction_dict[user_id])

    scores = model.predict(user_ids = idx , item_ids = np.arange(len(cid_to_idx)))

    recommended_cid = set()
    # Walk the movie indices from highest to lowest score. The limit is tested before
    # adding, so a limit of 0 returns nothing instead of every unseen movie.
    for item_idx in np.argsort(scores)[::-1]:
        if len(recommended_cid) >= n_recommendations:
            break
        cid = idx_to_cid.get(int(item_idx))
        if cid not in watched_movies:
            recommended_cid.add(cid)

    return recommended_cid

In [23]:
# Top-20 unseen recommendations (a set of raw movie ids) for every training user.
prediction_dict = {
    user_id: get_predictions_files(user_id = user_id , n_recommendations = 20 )
    for user_id in uid_to_idx
}

In [24]:
# Same structures as for the training set, built from the held-out ratings:
# interaction_dict_test             : raw user id -> {raw movie id -> rating}
# uid_to_idx_test / idx_to_uid_test : raw user id  <-> 0-based index (first-seen order)
# cid_to_idx_test / idx_to_cid_test : raw movie id <-> 0-based index (first-seen order)
interaction_dict_test = {}
cid_to_idx_test = {}
idx_to_cid_test = {}
uid_to_idx_test ={}
idx_to_uid_test = {}
# NOTE: these counters reuse the names from the training cell and overwrite its values.
cidx = 0
uidx = 0

input_file = filepath + 'test_set.csv'
# The context manager closes the file on exit, so no explicit close() is needed.
with open(input_file) as fp:
    next(fp, None)  # skip the header row; tolerate an empty file
    for line in fp:
        fields = line.split(',')
        uid = int(fields[0])
        cid = int(fields[1])
        rating = float(fields[2])

        # First time this user is seen: assign the next free user index.
        if uid not in uid_to_idx_test:
            uid_to_idx_test[uid] = uidx
            idx_to_uid_test[uidx] = uid
            interaction_dict_test[uid] = {}
            uidx += 1

        # First time this movie is seen: assign the next free movie index.
        if cid not in cid_to_idx_test:
            cid_to_idx_test[cid] = cidx
            idx_to_cid_test[cidx] = cid
            cidx += 1

        interaction_dict_test[uid][cid] = rating

In [25]:
# Compare the recommended sets with the held-out ratings.
# Only users that have at least one rating in the test set contribute to the totals.
hits = 0
actual_watched = 0
total_predicted = 0
for user_id, predicted in prediction_dict.items():
    actual_ratings = interaction_dict_test.get(user_id)
    if actual_ratings is None:
        continue  # nothing held out for this user
    actual = set(actual_ratings)
    total_predicted += len(predicted)
    actual_watched += len(actual)
    hits += len(predicted & actual)

misses = actual_watched - hits

# Cut-off used in the metric labels, taken from the predictions themselves so the
# labels cannot drift away from the number of recommendations actually generated.
top_k = max((len(predicted) for predicted in prediction_dict.values()), default=0)

# Ratios are computed once, after the loop, and guarded against empty denominators.
nan = float('nan')
precision = hits / total_predicted if total_predicted else nan
recall = hits / actual_watched if actual_watched else nan
# hits can never exceed total_predicted, so this ratio is the upper bound on recall.
max_recall = total_predicted / actual_watched if actual_watched else nan

print('Hits : {}\tMisses : {}\tactual_watched:{}\tactual_predicted:{}'.format(hits,misses,actual_watched,total_predicted))
print('Max Recall :{}\tPrecision@{}:{} , Recall@{}:{}'.format(max_recall, top_k, precision, top_k, recall))

Hits : 34032	Misses : 166010	actual_watched:200042	actual_predicted:120700
Max Prescision :0.6033732916087622	Precision@10:0.28195526097763046 , Recall@10:0.17012427390248047
