In [25]:
import pandas as pd

In [26]:
# i'm only going to use movieid and title
movies = pd.read_csv(r'C:\Users\user\code\datasets\Movies Dataset\movies.csv', usecols = ['movieId', 'title'] )

In [27]:
movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [28]:
# i am only considering userId, movieId and rating
ratings = pd.read_csv(r'C:\Users\user\code\datasets\Movies Dataset\ratings.csv', usecols = ['userId','movieId', 'rating'])
ratings
ratings.shape

(100836, 3)

In [29]:
# i want to create a feature matrix of (movieId * userId) where
# i want to use userId as column and movieId as index of the rows
# and the matrix should be filled by user ratings
# and fill NaN values with 0
movies_users = ratings.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
movies_users.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
from scipy.sparse import csr_matrix

In [31]:
# creating a sparse matrix
mat_movies = csr_matrix(movies_users.values)
mat_movies

<9724x610 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in Compressed Sparse Row format>

In [32]:
# implementing ml algorithm
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20 )
model.fit(mat_movies)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=20)

In [33]:
# importing fuzzywuzzy to speedup movie search process
from fuzzywuzzy import process

In [36]:
# method to print recommendation system
def recommend(movie_name, data, n):
    idx = process.extractOne(movie_name, movies['title'])[2] # i want only the movie index
    print('You have selected the movie: ', movies['title'][idx], 'of index: ', idx)
    print('Searching for recommendation...........................')
    distance, indices = model.kneighbors(data[idx], n_neighbors = n) # this searches for cosine similarity of given index in the sparse matrix and gives back 10 nearest cosine similarity of data[idx]
    # printing the movies
    for i in indices:
        print(movies['title'][i].where(i != idx)) # doing this to eliminate same movie matching

In [37]:
recommend('toy story', mat_movies, 10) # recommending based on user rating

You have selected the movie:  Toy Story (1995) of index:  0
Searching for recommendation...........................
0                                                     NaN
2353                                 'night Mother (1986)
418                                  Jurassic Park (1993)
615                  Independence Day (a.k.a. ID4) (1996)
224             Star Wars: Episode IV - A New Hope (1977)
314                                   Forrest Gump (1994)
322                                 Lion King, The (1994)
910     Once Upon a Time in the West (C'era una volta ...
546                            Mission: Impossible (1996)
963                                           Diva (1981)
Name: title, dtype: object
