In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw per-user ratings from the CSV in the working directory.
ratings_df = pd.read_csv(filepath_or_buffer="ratings.csv")

In [3]:
# Size of the freshly loaded ratings table, reported as (n_rows, n_columns).
ratings_df.shape # rows & columns

(100836, 4)

In [4]:
# Preview the first five rows to check the column layout
# (userId, movieId, rating, timestamp).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Load the movie catalogue (movieId, title, genres) from the working directory.
movies_df = pd.read_csv(filepath_or_buffer="movies.csv")

In [6]:
# Preview the first five catalogue rows; genres are stored as a single
# '|'-separated string per movie.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Size of the movie catalogue, reported as (n_rows, n_columns).
movies_df.shape

(9742, 3)

In [8]:
# Force the join key to 64-bit integers so it has a well-defined integer
# dtype before it is compared against / merged with the ratings table.
movies_df['movieId'] = movies_df['movieId'].astype(np.int64)

In [9]:
# Sanity-check the cast by inspecting the type of the element labelled 0
# in the movieId column.
type(movies_df.movieId[0])

numpy.int64

In [10]:
# Count how many ratings reference a movieId that exists in the catalogue.
# If this equals the number of rating rows, an inner merge on movieId will
# not drop any ratings.
ratings_df.movieId.isin(movies_df.movieId).sum()

100836

In [11]:
# Attach title and genres to every rating via an inner join on movieId.
# NOTE: this rebinds `ratings_df` to the merged frame, so the cell is meant to
# be executed once per kernel session; running it again would merge the
# already-merged frame with the catalogue a second time.
ratings_df = movies_df.merge(
    ratings_df,
    how='inner',
    on='movieId',
)
ratings_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [12]:
# The rating timestamp is not used by the recommender, so remove it.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell idempotent: re-running it after the column is already gone is a
# no-op rather than a KeyError.
ratings_df = ratings_df.drop(columns=['timestamp'], errors='ignore')

In [13]:
# Shape check after the merge and column drop: the row count should match the
# original number of ratings.
ratings_df.shape

(100836, 5)

In [14]:
# Spot-check five random rows of the merged table. No random_state is passed,
# so a different sample is shown on every run.
ratings_df.sample(5)

Unnamed: 0,movieId,title,genres,userId,rating
76080,8376,Napoleon Dynamite (2004),Comedy,63,2.5
91782,80489,"Town, The (2010)",Crime|Drama|Thriller,177,3.5
94577,95510,"Amazing Spider-Man, The (2012)",Action|Adventure|Sci-Fi|IMAX,21,3.5
25543,1204,Lawrence of Arabia (1962),Adventure|Drama|War,372,5.0
30541,1370,Die Hard 2 (1990),Action|Adventure|Thriller,418,4.0


In [15]:
# Number of missing values in each column of the merged table.
ratings_df.isnull().sum()

movieId    0
title      0
genres     0
userId     0
rating     0
dtype: int64

In [16]:
# Number of ratings each title received, as a two-column frame
# (title, totalRatings) with a fresh integer index.
# NOTE(review): the grouping key is the title string, not movieId, so distinct
# movies that share a title would be pooled together - confirm this is intended.
ratings_count = (
    ratings_df.groupby('title')['rating']
    .count()
    .rename('totalRatings')
    .reset_index()
)

In [17]:
# One row per distinct title, two columns (title, totalRatings).
ratings_count.shape

(9719, 2)

In [18]:
# Cross-check: the number of distinct titles in the merged ratings should equal
# the number of rows in ratings_count. dropna=False counts a missing title as
# its own value, matching what len(unique()) would report.
ratings_df['title'].nunique(dropna=False)

9719

In [19]:
# Spot-check five random titles and their rating counts (unseeded sample, so
# the rows differ between runs).
ratings_count.sample(5)

Unnamed: 0,title,totalRatings
2711,Elsa & Fred (2014),1
972,Beowulf (1999),1
4066,House Party 2 (1991),4
6599,Phil Spector (2013),1
7624,Shining Through (1992),1


In [20]:
# First five rows of the per-title counts; groupby returns titles in sorted
# order.
ratings_count.head()

Unnamed: 0,title,totalRatings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [21]:
# Re-display the merged ratings just before the per-title counts are joined on.
ratings_df.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


In [22]:
# Annotate every rating row with the total number of ratings its title
# received. A left join keeps every row of ratings_df.
ratings_total = ratings_df.merge(
    ratings_count,
    how='left',
    on='title',
)

In [23]:
# Shape check: the left join should preserve the row count of ratings_df and
# add exactly one column (totalRatings).
ratings_total.shape

(100836, 6)

In [24]:
# Preview the ratings with the per-title totalRatings column attached.
ratings_total.head()

Unnamed: 0,movieId,title,genres,userId,rating,totalRatings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,215


In [25]:
# Summary statistics of ratings-per-title, used to judge how unevenly ratings
# are spread across titles before choosing a popularity cutoff.
ratings_count['totalRatings'].describe()

count    9719.000000
mean       10.375141
std        22.406220
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       329.000000
Name: totalRatings, dtype: float64

In [26]:
# Upper-tail quantiles of ratings-per-title in 0.01 steps starting at 0.60
# (np.arange excludes the stop value 1), to help pick a popularity threshold.
ratings_count['totalRatings'].quantile(np.arange(.6,1,0.01))

0.60      4.00
0.61      4.00
0.62      4.00
0.63      5.00
0.64      5.00
0.65      5.00
0.66      5.00
0.67      6.00
0.68      6.00
0.69      6.00
0.70      7.00
0.71      7.00
0.72      7.00
0.73      8.00
0.74      8.00
0.75      9.00
0.76      9.00
0.77     10.00
0.78     10.00
0.79     11.00
0.80     12.00
0.81     13.00
0.82     13.00
0.83     14.00
0.84     16.00
0.85     17.00
0.86     18.00
0.87     20.00
0.88     22.00
0.89     24.00
0.90     27.00
0.91     30.00
0.92     33.56
0.93     38.00
0.94     42.00
0.95     47.00
0.96     55.00
0.97     64.46
0.98     83.00
0.99    114.64
Name: totalRatings, dtype: float64

In [27]:
# Popularity cutoff: only titles with MORE than this many ratings are kept for
# the neighbour model (the filter uses a strict '>'). In the quantile table
# above, 60 falls between the 0.96 quantile (55) and the 0.97 quantile (64.46),
# i.e. roughly the top 3-4% of titles survive.
votes_count_threshold = 60

In [28]:
# Keep only the ratings that belong to popular titles, i.e. titles whose
# rating count is strictly greater than the threshold.
ratings_top = ratings_total.loc[ratings_total['totalRatings'] > votes_count_threshold]

In [29]:
# How many rating rows remain after restricting to popular titles.
ratings_top.shape

(34660, 6)

In [30]:
# Preview the filtered ratings; the original row labels are preserved by the
# filter.
ratings_top.head()

Unnamed: 0,movieId,title,genres,userId,rating,totalRatings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,215


In [31]:
# Pivoting below needs each (userId, title) pair to appear at most once, so
# collapse any repeats to their first occurrence. drop_duplicates is already a
# no-op when nothing is duplicated, so no separate duplicated() pre-check
# (which would scan the frame a second time) is needed.
# NOTE(review): repeats could presumably arise when distinct movieIds share a
# title and one user rated both - confirm that keeping the first is acceptable.
ratings_top = ratings_top.drop_duplicates(subset=['userId', 'title'])

In [32]:
# Shape after de-duplication; an unchanged row count means no repeated
# (userId, title) pairs were removed.
ratings_top.shape

(34660, 6)

In [33]:
# Build the title x user rating matrix that the neighbour search runs on:
# one row per title, one column per userId, cell = that user's rating.
df_for_knn = (
    ratings_top
    .pivot(index='title', columns='userId', values='rating')
    # A user who never rated a title leaves a NaN; encode "no rating" as 0.
    .fillna(0)
)

In [34]:
# Preview the first five title rows of the rating matrix (0.0 marks "not rated").
df_for_knn.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
"40-Year-Old Virgin, The (2005)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
"Abyss, The (1989)",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0


In [35]:
# Matrix dimensions as (number of titles, number of users).
df_for_knn.shape

(328, 603)

In [36]:
from scipy.sparse import csr_matrix

In [37]:
# Store the rating matrix in compressed-sparse-row form for fitting; the cells
# that were filled with 0 ("not rated") are not stored explicitly.
df_for_knn_sparse = csr_matrix(df_for_knn.values)

In [38]:
from sklearn.neighbors import NearestNeighbors

In [39]:
# Item-based neighbour model: exhaustive (brute-force) search using cosine
# distance between title rating vectors.
model_knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)

In [40]:
# Fit the neighbour index on the sparse title x user matrix. fit() returns the
# estimator itself, which is why the cell output is the model's repr.
model_knn.fit(df_for_knn_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [41]:
# Pick the demo query movie as a random row position in df_for_knn.
# A dedicated fixed-seed RandomState makes the draw - and therefore the
# recommendations printed at the end of the notebook - identical on every
# "Restart & Run All", without touching NumPy's global random state.
# Change the seed to demo a different title.
query_index = np.random.RandomState(42).choice(df_for_knn.shape[0])

In [42]:
# Look up the 6 nearest neighbours of one specific, hard-coded title.
# NOTE(review): the next cell assigns `distances, indices` again from the
# random `query_index` before anything reads this result, and the printing
# cell labels its output with `query_index`. As written this lookup is
# effectively unused; if this cell were run without the next one, the printed
# heading would not match these neighbours.
distances, indices = model_knn.kneighbors(df_for_knn.loc['Star Wars: Episode VI - Return of the Jedi (1983)'].values.reshape(1,-1),n_neighbors=6)

In [43]:
# Query the model with the rating vector of the randomly chosen title.
# Selecting with a one-element list keeps the row two-dimensional (1 x n_users),
# which is the shape kneighbors expects. Six neighbours are requested because
# the printing loop treats the first hit as the query movie itself, leaving
# five recommendations.
distances, indices = model_knn.kneighbors(
    df_for_knn.iloc[[query_index]].values,
    n_neighbors=6,
)

In [44]:
# Walk the neighbours in rank order. Rank 0 is treated as the query movie
# itself, so it only triggers the heading (labelled via query_index); every
# later rank is printed as a recommendation with its cosine distance.
for rank, (row_pos, dist) in enumerate(zip(indices.flatten(), distances.flatten())):
    if rank == 0:
        print("Recommendations for movie: {0}\n".format(df_for_knn.index[query_index]))
        continue
    print("{0}: {1}, with distance of {2}".format(rank, df_for_knn.index[row_pos], dist))

Recommendations for movie: Crow, The (1994)

1: Batman (1989), with distance of 0.48159505435745986
2: Demolition Man (1993), with distance of 0.5719543247309082
3: Interview with the Vampire: The Vampire Chronicles (1994), with distance of 0.5750674207045795
4: Judge Dredd (1995), with distance of 0.5802161724932346
5: Speed (1994), with distance of 0.5817999994148515
