In [1]:
#Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
import warnings
warnings.filterwarnings('ignore')

#Get data

In [2]:
# Column -> dtype maps; the keys double as the list of columns to read.
movie_dtypes = {'movieId': 'int32', 'title': 'str'}
rating_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}

movies_df = pd.read_csv('movies.csv', usecols=list(movie_dtypes), dtype=movie_dtypes)
rating_df = pd.read_csv('ratings.csv', usecols=list(rating_dtypes), dtype=rating_dtypes)

In [3]:
# Display the ratings frame (columns userId, movieId, rating) as the cell output.
rating_df

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
1022822,6881,589,2.0
1022823,6881,590,4.0
1022824,6881,592,3.0
1022825,6881,593,4.0


In [4]:
# Join every rating with its movie title through the shared movieId column
merged_df = rating_df.merge(movies_df, on='movieId')
merged_df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,296,5.0,Pulp Fiction (1994)
1,3,296,5.0,Pulp Fiction (1994)
2,4,296,4.0,Pulp Fiction (1994)
3,5,296,4.0,Pulp Fiction (1994)
4,7,296,4.0,Pulp Fiction (1994)


In [5]:
# Drop rows without a title, then count how many ratings each title received
combine_movie_rating = merged_df.dropna(subset=['title'])
ratings_per_title = combine_movie_rating.groupby('title')['rating'].count()
movie_ratingCount = ratings_per_title.rename('totalRatingCount').reset_index()
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,"""Great Performances"" Cats (1998)",12
1,#realityhigh (2017),1
2,$ (Dollars) (1971),1
3,$5 a Day (2008),3
4,$9.99 (2008),3


In [6]:
# Left-join the per-title rating count onto every individual rating row
rating_with_totalRatingCount = combine_movie_rating.merge(
    movie_ratingCount,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,296,5.0,Pulp Fiction (1994),3339
1,3,296,5.0,Pulp Fiction (1994),3339
2,4,296,4.0,Pulp Fiction (1994),3339
3,5,296,4.0,Pulp Fiction (1994),3339
4,7,296,4.0,Pulp Fiction (1994),3339


In [7]:
# Render floats with three decimals in pandas output from here on
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Summary statistics of the number of ratings per title
rating_count_summary = movie_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)


count   22069.000
mean       46.347
std       163.458
min         1.000
25%         1.000
50%         3.000
75%        20.000
max      3437.000
Name: totalRatingCount, dtype: float64


In [8]:
# Keep only the ratings of movies with at least `popularity_threshold` ratings
popularity_threshold = 50
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = rating_with_totalRatingCount[is_popular]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,296,5.0,Pulp Fiction (1994),3339
1,3,296,5.0,Pulp Fiction (1994),3339
2,4,296,4.0,Pulp Fiction (1994),3339
3,5,296,4.0,Pulp Fiction (1994),3339
4,7,296,4.0,Pulp Fiction (1994),3339


In [9]:
# Title x user matrix of ratings: one row per movie title, one column per userId.
# Cells with no rating come out as NaN from the pivot and are replaced by 0.
ratings_by_title = rating_popular_movie.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)
movie_features = ratings_by_title.fillna(0)
movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,6842,6843,6844,6845,6846,6847,6848,6849,6850,6851,6852,6853,6854,6855,6856,6857,6858,6859,6860,6861,6862,6863,6864,6865,6866,6867,6868,6869,6870,6871,6872,6873,6874,6875,6876,6877,6878,6879,6880,6881
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Plain Python list of the titles, in the same order as the rows of movie_features
movie_list = movie_features.index.tolist()

#Recommendations

In [13]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Store the pivot table's values as a SciPy CSR sparse matrix
movie_features_df_matrix = csr_matrix(movie_features.values)

# Brute-force nearest-neighbour search using cosine distance between movie rows
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [14]:
# Pick a random row position in movie_features; no seed is set in this cell,
# so the chosen movie can differ from run to run.
query_index = np.random.choice(len(movie_features))
print(query_index)

# kneighbors expects a 2-D input, so the single row is reshaped to (1, n_columns)
query_vector = movie_features.iloc[query_index].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

1692


In [15]:
# Flatten once instead of on every iteration
flat_distances = distances.flatten()
flat_indices = indices.flatten()

# Position 0 only triggers the header line; the remaining positions are listed as results
if len(flat_distances) > 0:
    print('Recommendations for {0}:\n'.format(movie_features.index[query_index]))
for rank in range(1, len(flat_distances)):
    neighbour_title = movie_features.index[flat_indices[rank]]
    print('{0}: {1}, with distance of {2}:'.format(rank, neighbour_title, flat_distances[rank]))

Recommendations for Laputa: Castle in the Sky (Tenkû no shiro Rapyuta) (1986):

1: Nausicaä of the Valley of the Wind (Kaze no tani no Naushika) (1984), with distance of 0.3758045434951782:
2: My Neighbor Totoro (Tonari no Totoro) (1988), with distance of 0.41556602716445923:
3: Howl's Moving Castle (Hauru no ugoku shiro) (2004), with distance of 0.46042972803115845:
4: Princess Mononoke (Mononoke-hime) (1997), with distance of 0.46517980098724365:
5: Kiki's Delivery Service (Majo no takkyûbin) (1989), with distance of 0.4704696536064148:


In [16]:
def recommend(movie, n_neighbors=6):
    """Query the fitted KNN model for the neighbours of one movie.

    Parameters
    ----------
    movie : str
        Title to look up. It is converted with ``str()`` and must match an
        entry of ``movie_list`` exactly.
    n_neighbors : int, default 6
        Number of neighbours requested from ``model_knn.kneighbors``.

    Returns
    -------
    tuple
        ``(distances, indices, index)``: the two arrays returned by
        ``model_knn.kneighbors`` and the position of ``movie`` in
        ``movie_list``.

    Raises
    ------
    ValueError
        If ``movie`` is not present in ``movie_list``. Previously this case
        failed later with an UnboundLocalError on ``index``.
    """
    title = str(movie)
    try:
        # movie_list is built from the pivot-table index, so a title occurs
        # at most once and the first match is the only match.
        index = movie_list.index(title)
    except ValueError:
        raise ValueError('Movie {0!r} was not found in movie_list'.format(title)) from None

    # kneighbors expects a 2-D input: reshape the single row to (1, n_columns)
    query_vector = movie_features.iloc[index, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query_vector, n_neighbors=n_neighbors)
    return distances, indices, index

In [17]:
# x = neighbour distances, y = neighbour row positions, index = row position of the queried title
x,y,index=recommend('101 Dalmatians (1996)')

In [18]:
# Flatten the (1, k) results of recommend() into 1-D sequences
neighbour_distances = x.flatten()
neighbour_positions = y.flatten()

# The first entry only produces the header; the rest are printed as ranked results
if neighbour_distances.size:
    print('Recommendations for {0}:\n'.format(movie_features.index[index]))
ranked_neighbours = zip(neighbour_positions[1:], neighbour_distances[1:])
for rank, (position, distance) in enumerate(ranked_neighbours, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features.index[position], distance))

Recommendations for 101 Dalmatians (1996):

1: Space Jam (1996), with distance of 0.6617869138717651:
2: 101 Dalmatians (One Hundred and One Dalmatians) (1961), with distance of 0.6631137132644653:
3: Hunchback of Notre Dame, The (1996), with distance of 0.6892439723014832:
4: Matilda (1996), with distance of 0.698569118976593:
5: 102 Dalmatians (2000), with distance of 0.7020951509475708:
