In [1]:
import pandas as pd
import numpy as np

In [2]:
# The ratings file is tab-separated and has no header row, so the column
# names are supplied explicitly.
col = ['userId', 'movieId', 'rating', 'timestamp']
df = pd.read_csv('u.data', names=col, header=None, sep='\t')

In [3]:
# Preview the first rows of `df` (at this point, the ratings read from 'u.data').
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [4]:
# Keep only the columns needed for recommendations; the timestamp is not used.
rating_df = df.loc[:, ['userId', 'movieId', 'rating']]
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [5]:
# Load the movie-id -> title lookup table.
movie_titles = pd.read_csv('Movie_Id_Titles')

# Rename the id column so it matches the key used in the ratings frame
# ('movieId'). Only the column label is renamed: the previous `index=str`
# argument also converted every row label to a string, which nothing
# downstream needs (the later merge joins on the 'movieId' column, not on
# the index).
movies_df = movie_titles.rename(columns={"item_id": "movieId"})
movies_df.head()




Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
# Attach each rating's movie title via an inner join on 'movieId'.
# Note that this rebinds `df`: from here on it is the merged frame,
# not the raw ratings read from disk.
df = rating_df.merge(movies_df, how='inner', on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,0,50,5,Star Wars (1977)
1,290,50,5,Star Wars (1977)
2,79,50,4,Star Wars (1977)
3,2,50,5,Star Wars (1977)
4,8,50,5,Star Wars (1977)


In [7]:
# Discard any rating rows whose title is missing.
combine_movie_rating = df.dropna(subset=['title'], axis=0)

# Count the (non-null) ratings each title received and expose the result as
# a two-column frame: 'title' and 'totalRatingCount'.
movie_ratingCount = (
    combine_movie_rating
    .groupby(by=['title'])['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'totalRatingCount'})
    .loc[:, ['title', 'totalRatingCount']]
)

In [8]:
# Preview the per-title rating counts.
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'Til There Was You (1997),9
1,1-900 (1994),5
2,101 Dalmatians (1996),109
3,12 Angry Men (1957),125
4,187 (1997),41


In [9]:
# Preview the ratings frame after rows with a missing title were dropped.
combine_movie_rating.head()

Unnamed: 0,userId,movieId,rating,title
0,0,50,5,Star Wars (1977)
1,290,50,5,Star Wars (1977)
2,79,50,4,Star Wars (1977)
3,2,50,5,Star Wars (1977)
4,8,50,5,Star Wars (1977)


In [10]:
# Left-join the per-title rating count onto every individual rating row,
# so each row carries the popularity of its movie.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    how='left',
    on='title',
)
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,0,50,5,Star Wars (1977),584
1,290,50,5,Star Wars (1977),584
2,79,50,4,Star Wars (1977),584
3,2,50,5,Star Wars (1977),584
4,8,50,5,Star Wars (1977),584


In [11]:
# Render floats with three decimals (this is a global pandas display option),
# then summarise how rating counts are distributed across titles.
pd.set_option('display.float_format', '{:.3f}'.format)
print(movie_ratingCount['totalRatingCount'].describe())

count   1664.000
mean      60.098
std       80.963
min        1.000
25%        7.000
50%       27.000
75%       80.250
max      584.000
Name: totalRatingCount, dtype: float64


In [12]:
# Minimum number of ratings a title needs to be kept.
popularity_threshold = 50

# Keep only the rating rows that belong to sufficiently rated titles.
rating_popular_movie = rating_with_totalRatingCount[
    rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,0,50,5,Star Wars (1977),584
1,290,50,5,Star Wars (1977),584
2,79,50,4,Star Wars (1977),584
3,2,50,5,Star Wars (1977),584
4,8,50,5,Star Wars (1977),584


In [13]:
# (rows, columns) of the ratings that survived the popularity filter.
rating_popular_movie.shape

(84072, 5)

In [14]:
# Create the pivot table: one row per title, one column per user, cells hold
# the rating. Duplicate (title, user) pairs are averaged (the pivot_table
# default), and title/user pairs with no rating are filled with 0.
movies_features_df = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId', aggfunc='mean')
    .fillna(0)
)

In [15]:
# Preview the title-by-user rating matrix.
movies_features_df.head()

userId,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2 Days in the Valley (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
"20,000 Leagues Under the Sea (1954)",0.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,4.0,0.0,0.0,0.0,4.0,5.0,5.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [16]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Store the title-by-user matrix in compressed sparse row form.
movies_features_df_matrix = csr_matrix(movies_features_df.values)

# Brute-force nearest-neighbour index over the title rows, compared by
# cosine distance between their rating vectors.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movies_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [17]:
# (number of titles, number of user columns) in the pivoted matrix.
movies_features_df.shape

(605, 944)

In [18]:
# Pick a random row position (i.e. a random title) to use as the query.
# No seed is set in the visible cells, so a different title may be drawn
# on each run.
query_index = np.random.choice(len(movies_features_df.index))

In [19]:
# Show which row position was drawn for the query title.
print(query_index)

303


In [20]:
# Ask for the 11 nearest titles to the query row, passed as a (1, n_users)
# array. The loop that prints the results treats the first hit as the query
# itself, which leaves 10 recommendations.
distances, indices = model_knn.kneighbors(
    X=movies_features_df.iloc[[query_index]].values,
    n_neighbors=11,
)

In [22]:
# Dump the raw cosine distances and the matching row positions, separated
# by a dotted line.
print(distances, ".......................................", indices, sep="\n")

[[0.         0.46545244 0.48507672 0.5042195  0.52694398 0.54315578
  0.54690781 0.54780098 0.54797709 0.54844139 0.55544896]]
.......................................
[[303 250 395  60 252 370 359 459 213 248 550]]


In [26]:
# Header names the query title. The neighbour at position 0 is treated as
# the query itself and is therefore not listed.
print('Recommendations for {0}:\n'.format(movies_features_df.index[query_index]))

# Remaining neighbours, numbered from 1, each with its cosine distance
# to the query title.
for rank, (title_pos, distance) in enumerate(
        zip(indices.flatten()[1:], distances.flatten()[1:]), start=1):
    print('{0}: {1}, with distance of {2}:'.format(
        rank, movies_features_df.index[title_pos], distance))
        
        

Recommendations for Kingpin (1996):

1: Happy Gilmore (1996), with distance of 0.46545244012866815:
2: Nutty Professor, The (1996), with distance of 0.4850767161427665:
3: Beavis and Butt-head Do America (1996), with distance of 0.5042194998421134:
4: Heat (1995), with distance of 0.5269439764175863:
5: Multiplicity (1996), with distance of 0.5431557796232076:
6: Mission: Impossible (1996), with distance of 0.5469078116147528:
7: Rock, The (1996), with distance of 0.5478009781369007:
8: Four Rooms (1995), with distance of 0.5479770908614205:
9: Grumpier Old Men (1995), with distance of 0.5484413944465074:
10: Time to Kill, A (1996), with distance of 0.555448962300474:
