In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load only the id and title columns of the movies file, with explicit dtypes.
movies_df = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

In [3]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Load the ratings file with pandas' default settings (all columns, inferred dtypes).
ratings_df=pd.read_csv('ratings.csv')

In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Merge Movies & Ratings

In [6]:
# Attach each rating's movie title by joining on the shared movieId column
# (merge's default inner join: ratings with no matching movie row are dropped).
movies_ratings = pd.merge(ratings_df, movies_df, on='movieId')

In [7]:
movies_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,5,1,4.0,847434962,Toy Story (1995)
2,7,1,4.5,1106635946,Toy Story (1995)
3,15,1,2.5,1510577970,Toy Story (1995)
4,17,1,4.5,1305696483,Toy Story (1995)


### Getting the total number of ratings for each movie

In [8]:
# Keep only rating rows that have a title, then count the ratings per title.
new_movies_rating = movies_ratings.dropna(subset=['title'], axis=0)
movies_ratings_count = (
    new_movies_rating
    .groupby('title')['rating']
    .count()
    .rename('totalRatingCount')  # name the count Series before it becomes a column
    .reset_index()
)

In [9]:
movies_ratings_count.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [10]:
#movies_ratings_count.sort_values(by='totalRatingCount',ascending=False).head()

In [11]:
new_movies_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,5,1,4.0,847434962,Toy Story (1995)
2,7,1,4.5,1106635946,Toy Story (1995)
3,15,1,2.5,1510577970,Toy Story (1995)
4,17,1,4.5,1305696483,Toy Story (1995)


### Merge Movies with their Rating Counts

In [12]:
# Add each title's totalRatingCount to every rating row (inner join on title).
combine_movies_ratingcount = pd.merge(
    new_movies_rating,
    movies_ratings_count,
    how='inner',
    on='title',
)

In [13]:
combine_movies_ratingcount.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,totalRatingCount
0,1,1,4.0,964982703,Toy Story (1995),215
1,5,1,4.0,847434962,Toy Story (1995),215
2,7,1,4.5,1106635946,Toy Story (1995),215
3,15,1,2.5,1510577970,Toy Story (1995),215
4,17,1,4.5,1305696483,Toy Story (1995),215


In [14]:
combine_movies_ratingcount['totalRatingCount'].describe()

count    100836.000000
mean         58.758777
std          61.965384
min           1.000000
25%          13.000000
50%          39.000000
75%          84.000000
max         329.000000
Name: totalRatingCount, dtype: float64

### Filter based on a popularity threshold. I have chosen the value 84, which is the 75% value of `totalRatingCount` in the summary above; only movies with at least 84 ratings are kept

In [15]:
# Minimum number of ratings a movie needs to be kept; 84 is the 75% value
# reported by the totalRatingCount describe() output.
popularity_threshold=84

In [16]:
# Keep only rating rows whose movie has at least popularity_threshold ratings.
popularity_movies = combine_movies_ratingcount.loc[
    lambda frame: frame['totalRatingCount'] >= popularity_threshold
]

In [17]:
popularity_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,totalRatingCount
0,1,1,4.0,964982703,Toy Story (1995),215
1,5,1,4.0,847434962,Toy Story (1995),215
2,7,1,4.5,1106635946,Toy Story (1995),215
3,15,1,2.5,1510577970,Toy Story (1995),215
4,17,1,4.5,1305696483,Toy Story (1995),215


### Create Pivot Table

In [18]:
# Build a title x userId table of ratings; (title, user) pairs without a
# rating are filled with 0.
movies_features_df = (
    popularity_movies
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)

In [19]:
movies_features_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0
Ace Ventura: When Nature Calls (1995),0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,1.0,0.0,2.0,0.0,2.0,0.0,0.0
Addams Family Values (1993),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,4.0,0.0,0.0,0.0,2.5,0.0,0.0
Airplane! (1980),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
While You Were Sleeping (1995),0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,...,0.0,3.0,3.0,3.0,0.0,0.0,0.0,2.5,3.0,0.0
Who Framed Roger Rabbit? (1988),5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0
Willy Wonka & the Chocolate Factory (1971),5.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,4.0,0.0,0.0,3.5,0.0,0.0
"Wizard of Oz, The (1939)",5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,2.0,0.0,5.0,2.5,0.0,3.5


In [20]:
from scipy.sparse import csr_matrix

In [21]:
# Store the title x userId rating table as a SciPy CSR sparse matrix.
movies_features_df_matrix = csr_matrix(movies_features_df.to_numpy())

In [22]:
movies_features_df_matrix

<194x601 sparse matrix of type '<class 'numpy.float64'>'
	with 25269 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.neighbors import NearestNeighbors

In [24]:
# Nearest-neighbour searcher that compares rows by cosine distance using an
# exhaustive (brute-force) search.
knn_model = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)

In [25]:
# Fit the neighbour model on the sparse title x userId rating matrix.
knn_model.fit(movies_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [26]:
movies_features_df.shape

(194, 601)

In [27]:
# Row position (in movies_features_df) of the movie to get recommendations for.
# Fixed at 1 here; to pick a random movie instead, use:
#query_index=np.random.choice(movies_features_df.shape[0])
query_index=1

In [28]:
print(query_index)

1


In [29]:
# Query the fitted model with the selected movie's rating row. Selecting with
# a one-element list keeps the row 2-D (1 x n_users), which is the shape
# kneighbors expects for a single sample. Ask for the 5 nearest rows.
distances, indices = knn_model.kneighbors(
    movies_features_df.iloc[[query_index]].values,
    n_neighbors=5,
)

In [30]:
distances

array([[1.11022302e-15, 3.29806450e-01, 3.79567590e-01, 3.81457146e-01,
        3.95039960e-01]])

In [31]:
indices

array([[ 1, 58, 25, 55,  2]], dtype=int64)

In [32]:
# Print the query movie's title, then its neighbours ranked by cosine distance.
query_title = movies_features_df.index[query_index]
print('Recommendation for Movie "{}" '.format(query_title))

# Exclude the query movie by its row index rather than by assuming it is the
# first result: when another movie has an identical (or tied) rating vector the
# query is not guaranteed to come back in position 0, and skipping position 0
# blindly would drop a real neighbour and list the query as its own
# recommendation.
neighbours = [
    (neighbour_index, neighbour_distance)
    for neighbour_index, neighbour_distance in zip(indices.flatten(), distances.flatten())
    if neighbour_index != query_index
]

for rank, (neighbour_index, neighbour_distance) in enumerate(neighbours, start=1):
    print('{0}: "{1}", with distance of {2} :'.format(rank, movies_features_df.index[neighbour_index], neighbour_distance))

Recommendation for Movie "Ace Ventura: Pet Detective (1994)" 
1: "Dumb & Dumber (Dumb and Dumber) (1994)", with distance of 0.3298064500857152 :
2: "Batman Forever (1995)", with distance of 0.3795675902998943 :
3: "Die Hard: With a Vengeance (1995)", with distance of 0.38145714569169953 :
4: "Ace Ventura: When Nature Calls (1995)", with distance of 0.3950399602416229 :
