## collaborative filtering recommender system using Nearest-Neighbours

In [13]:
import pandas as pd  
import numpy as np

from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
movies = pd.read_csv('./movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
movies.shape

(9742, 3)

In [3]:
ratings = pd.read_csv('./ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [5]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [6]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [10]:
pivot = ratings.pivot_table(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# converting pivot table to sparse matrix
sparse_data = csr_matrix(pivot)

model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model.fit(sparse_data)

In [18]:
# testing
query_index = np.random.choice(pivot.shape[0])
print('index chosen for testing : ', query_index)

index chosen for testing :  8203


In [20]:
# recommendations for movieId = 8203
distances, indices = model.kneighbors(pivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)  # n_neighbors = n+1, for 'n'   number of movie recommendations

In [50]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('recommendations for movie = ', movies[movies.index == query_index]['title'].values[0], '[genre of the movie = {0}]'.format(movies[movies.index == query_index]['genres'].values[0]))
        
    else:
        title = movies[movies.index == indices.flatten()[i]]['title'].values[0]
        distance = distances.flatten()[i]
        genre = movies[movies.index == indices.flatten()[i]]['genres'].values[0]
        print(f'index = {i}\t movie recommended = {title} \t distance = {distance} \t genre = {genre}')

recommendations for movie =  Redemption (Hummingbird) (2013) [genre of the movie = Action|Crime|Thriller]
index = 1	 movie recommended = Woman, The (2011) 	 distance = 0.017527059927157085 	 genre = Horror
index = 2	 movie recommended = Iron Sky (2012) 	 distance = 0.11878613011873351 	 genre = Action|Comedy|Sci-Fi
index = 3	 movie recommended = 300: Rise of an Empire (2014) 	 distance = 0.15230471507873133 	 genre = Action|Drama|War|IMAX
index = 4	 movie recommended = World War Z (2013) 	 distance = 0.22976590568972566 	 genre = Action|Drama|Horror|IMAX
index = 5	 movie recommended = Ruby Sparks (2012) 	 distance = 0.23694254785112612 	 genre = Comedy|Fantasy|Romance


distance close to 0 means the movie is more similar as cos(0) = 1