In [37]:
import pandas as pd
import numpy as np

In [38]:
# Movie catalogue: only the id and title columns are needed, with compact dtypes.
movies_df = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

# Ratings table: keep userId, movieId and rating, again with compact dtypes.
rating_df = pd.read_csv(
    'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [39]:
# Preview the first rows of the movie table to confirm the columns loaded as expected.
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [40]:
# (rows, columns) of the movie table.
movies_df.shape

(9742, 2)

In [41]:
# Preview the first rows of the ratings table.
rating_df.head()


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [42]:
# (rows, columns) of the ratings table.
rating_df.shape

(100836, 3)

In [43]:
# Attach a title to every rating by joining the two tables on their shared
# movieId key (inner join, the pandas default).
df = rating_df.merge(movies_df, on='movieId')
df.head()


Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [44]:
# Discard any rating row whose title is missing; other columns are not checked.
df1 = df.dropna(subset=['title'], how='any', axis='index')

In [45]:
# Count how many ratings each title received; the result is a Series indexed by title.
ratingCount = df1.groupby('title')['rating'].count()

In [46]:
# Move the title index back into a regular column and give the count column
# a descriptive name.
ratingCount = (
    ratingCount
    .reset_index()
    .rename(columns={'rating': 'totalRatingCount'})
)

In [47]:
# Preview the per-title rating counts.
ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [54]:
# Attach each title's total rating count to every one of its rating rows.
# A left join keeps every row of df1.
df2 = df1.merge(ratingCount, how='left', on='title')
df2.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [55]:
# Summary statistics of the numeric columns; the totalRatingCount distribution
# is shown here just before a popularity threshold is chosen.
df2.describe()

Unnamed: 0,userId,movieId,rating,totalRatingCount
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,58.758777
std,182.618491,35530.987199,1.042529,61.965384
min,1.0,1.0,0.5,1.0
25%,177.0,1199.0,3.0,13.0
50%,325.0,2991.0,3.5,39.0
75%,477.0,8122.0,4.0,84.0
max,610.0,193609.0,5.0,329.0


In [56]:
# Keep only the movies whose total rating count is at least the threshold
# (the comparison is inclusive: a movie with exactly `threshold` ratings is kept).
threshold = 60
top_movies = df2.loc[df2['totalRatingCount'] >= threshold]
top_movies.head()


Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [57]:
# (rows, columns) remaining after the popularity filter.
top_movies.shape


(35080, 5)

In [58]:
# Build the title x user rating matrix: one row per title, one column per userId.
# pivot_table aggregates repeated (title, userId) pairs with its default (mean);
# cells where the user gave no rating are filled with 0.
movie_features = (
    top_movies
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
"40-Year-Old Virgin, The (2005)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
"Abyss, The (1989)",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0


In [59]:
from scipy.sparse import csr_matrix

# Convert the dense title x user table into compressed sparse row (CSR) format;
# row order matches movie_features.index.
movie_features_matrix = csr_matrix(movie_features.to_numpy())

In [60]:
from sklearn.neighbors import NearestNeighbors

# Exhaustive (brute-force) nearest-neighbour search using cosine distance
# between rows of the rating matrix, i.e. between movies.
knn = NearestNeighbors(algorithm='brute', metric='cosine')
knn.fit(movie_features_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [61]:
# (number of titles, number of users) in the pivoted rating matrix.
movie_features.shape

(335, 603)

In [62]:
# Draw a random row position (one movie) and ask the fitted model for its
# 5 nearest rows.
# NOTE(review): no random seed is set, so the selected movie differs between runs.
index = np.random.choice(len(movie_features))
print(index)
distances, indices = knn.kneighbors(
    movie_features.iloc[[index]].to_numpy(),  # single query row, shape (1, n_users)
    n_neighbors=5,
)


317


In [63]:
# Title of the movie selected by the random draw. Use the `index` variable
# instead of a literal row position: the draw changes on every run, so a
# hard-coded number would show a title unrelated to the movie actually queried.
movie_features.index[index]

'Unbreakable (2000)'

In [65]:
# Report the neighbours returned by the KNN query. The first entry only triggers
# the header and is not listed as a recommendation; the remaining entries are
# listed in the order returned.
# NOTE(review): this presumes the first neighbour is the query movie itself — confirm.
neighbor_distances = distances.flatten()
neighbor_positions = indices.flatten()

for rank, (position, distance) in enumerate(zip(neighbor_positions, neighbor_distances)):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(movie_features.index[index]))
        continue
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features.index[position], distance))

Recommendations for Unbreakable (2000):

1: Spider-Man (2002), with distance of 0.4350356459617615:
2: X-Men (2000), with distance of 0.44706034660339355:
3: Signs (2002), with distance of 0.4644981026649475:
4: Gattaca (1997), with distance of 0.46653681993484497:
