#### Download dataset : https://grouplens.org/datasets/movielens/latest/

In [1]:
import pandas as pd
import numpy as np                          

In [4]:
# Folder holding the extracted MovieLens CSV files. Edit this single constant to
# point the notebook at another copy of the dataset.
DATA_DIR = 'D:/Data/new-movie-dataset'

# Read only the columns used later on, with compact 32-bit dtypes.
movies_df = pd.read_csv(
    DATA_DIR + '/movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

rating_df = pd.read_csv(
    DATA_DIR + '/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [5]:
# Preview the first rows of the movies table (movieId and title columns).
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [6]:
# Preview the first rows of the ratings table (userId, movieId and rating columns).
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
# Attach the movie title to every rating row by joining the two tables on 'movieId'.
# The default join is an inner join, so only movieIds present in both tables are kept.
df = rating_df.merge(movies_df, on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


#### Now count the ratings per movie, i.e. how many users rated each movie in the dataset.

In [8]:
# Discard rating rows that have no title.
combine_movie_rating = df.dropna(subset=['title'])

# Count the non-null ratings of each title. Naming the resulting Series before
# reset_index() turns it into a two-column frame ('title', 'totalRatingCount')
# with 'title' as an ordinary column instead of the index.
ratings_by_title = combine_movie_rating.groupby('title')['rating']
movie_ratingCount = (
    ratings_by_title
    .count()
    .rename('totalRatingCount')
    .reset_index()
)

movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [12]:
# Add each title's 'totalRatingCount' to every individual rating row.
# A left join keeps all rating rows of combine_movie_rating.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    how='left',
    on='title',
)
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [13]:
# Show floats with three decimals from here on, then summarise how many ratings
# a title typically receives (one row per title in movie_ratingCount).
pd.set_option('display.float_format', '{:.3f}'.format)
rating_count_summary = movie_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [16]:
# Keep only the ratings of "popular" movies: titles that received at least
# `popularity_threshold` ratings (the comparison is inclusive, so exactly 50 qualifies).
popularity_threshold = 50

is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = rating_with_totalRatingCount[is_popular]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [17]:
# (rows, columns) of the ratings that survived the popularity filter.
rating_popular_movie.shape

(41362, 5)

In [18]:
# Build the movie x user rating matrix: one row per title, one column per userId,
# each cell holding that user's rating of that movie. pivot_table aggregates with
# the mean by default, so repeated (title, userId) pairs would be averaged.
ratings_by_title_and_user = rating_popular_movie.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)

# A missing cell means the user did not rate the movie; encode that as 0.
movie_features_df = ratings_by_title_and_user.fillna(0)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [19]:
# Fit an item-based nearest-neighbour model on the movie x user rating matrix.
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors   # unsupervised neighbour search, not a KNN classifier/regressor

# Most cells of the pivot table are 0, so store it as a SciPy sparse (CSR) matrix.
# Row order is unchanged: row i of the matrix is row i of movie_features_df.
movie_features_df_matrix = csr_matrix(movie_features_df.values)

# Brute-force search using cosine distance between the movies' rating vectors.
# The p=2 shown in the fitted model's repr is the Minkowski parameter; it is not
# what measures distance here because the metric is 'cosine'.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

## Cosine Similarity

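Cosine similarity measures how similar two vectors are by the cosine of the angle between them. In general the score ranges from -1 to 1; because ratings are never negative, here it ranges from 0 to 1.
A score of 1 means the two vectors have the same orientation (regardless of their magnitude).
cos 0° = 1 (same direction) and cos 90° = 0 (nothing in common). The `cosine` metric used by the model above is a distance, i.e. 1 − cosine similarity, so a smaller value means a more similar movie.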

<img src='cosine-similarity.png'>

## KNN commonly works on Euclidean distance (the model above uses cosine distance instead)

In [20]:
# (rows, columns) of the pivot table: rows are movie titles, columns are userIds.
movie_features_df.shape

(450, 606)

In [22]:
# Pick one movie at random and look up its nearest neighbours in the fitted model.
# np.random.choice(n) draws a row position in [0, n); no seed is set, so a different
# movie is chosen on every run.
n_movies = movie_features_df.shape[0]
query_index = np.random.choice(n_movies)
print(query_index)  # row position of the randomly chosen movie

# kneighbors() expects a 2-D input, so the movie's rating row is reshaped to (1, n_users).
query_vector = movie_features_df.iloc[query_index, :].values.reshape(1, -1)

# 'distances' holds the cosine distances to the neighbours; 'indices' holds their row
# positions in the fitted matrix (look them up in movie_features_df.index to get titles).
# Six neighbours are requested; presumably one of them is the queried movie itself,
# which leaves five recommendations.
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

248


In [23]:
# Print the recommendations for the queried movie, nearest first.
# kneighbors() was called with a single query row, so flatten its (1, k) results.
neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))

# Exclude the queried movie by its row position instead of assuming it is always
# the first result: when another movie is at the same distance (e.g. an identical
# rating vector) the query is not guaranteed to come first, and it would otherwise
# be listed as a recommendation for itself.
recommendations = [
    (position, distance)
    for position, distance in zip(neighbor_positions, neighbor_distances)
    if position != query_index
]
# Keep one fewer than the number of neighbours requested, as before, even if the
# queried movie did not show up among the results.
recommendations = recommendations[:len(neighbor_positions) - 1]

for rank, (position, distance) in enumerate(recommendations, start=1):
    # {rank}: {movie title}, with its cosine distance to the queried movie
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features_df.index[position], distance))

Recommendations for Little Miss Sunshine (2006):

1: Juno (2007), with distance of 0.46614229679107666:
2: Eternal Sunshine of the Spotless Mind (2004), with distance of 0.4941146969795227:
3: Royal Tenenbaums, The (2001), with distance of 0.495472252368927:
4: Charlie and the Chocolate Factory (2005), with distance of 0.5090280771255493:
5: Donnie Darko (2001), with distance of 0.514565110206604:
