In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


In [2]:
# Data source: MovieLens "latest" dataset
# https://grouplens.org/datasets/movielens/latest/
# Only the columns needed below are read, each with an explicit compact dtype.
movie_dtypes = {'movieId': 'int32', 'title': 'str'}
rating_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}

movies_df = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title'],
    dtype=movie_dtypes,
)
rating_df = pd.read_csv(
    'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype=rating_dtypes,
)

In [3]:
# Preview the first rows of the movies table (columns: movieId, title).
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Preview the first rows of the ratings table (columns: userId, movieId, rating).
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


**Merge both datasets on `movieId`**

In [5]:
# Inner-join the two tables on the shared movieId key, so every rating row
# also carries the title of the movie it refers to.
df = movies_df.merge(rating_df, how='inner', on='movieId')
df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5




1.   Remove missing values.
2.   Group by movie title and count ratings.<br>
    Why group by movie title and count the ratings?
*   Identify popular movies:

By counting the number of ratings each movie has received, we can identify which movies are more popular or widely rated. This information is useful for various analytical purposes, such as recommending popular movies to new users.






In [8]:
# Drop rows that have no movie title, then count how many ratings each title
# received. The result has one row per title with the columns
# 'title' and 'totalRatingCount'.
combine_movie_rating = df.dropna(subset=['title'], axis=0)

ratings_per_title = combine_movie_rating.groupby('title')['rating'].count()
movie_rating = (
    ratings_per_title
    .rename('totalRatingCount')
    .reset_index()
    .loc[:, ['title', 'totalRatingCount']]
)
movie_rating.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [9]:
# Attach each title's total rating count to every individual rating row.
# A left join keeps all rows of combine_movie_rating.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_rating,
    how='left',
    on='title',
)
rating_with_totalRatingCount.head()

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [10]:
# Render floats with three decimals, then summarise the distribution of
# ratings-per-title counts.
pd.options.display.float_format = '{:.3f}'.format
print(movie_rating['totalRatingCount'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [11]:
# Keep only the ratings of movies that have at least `popularity_threshold`
# ratings; titles rated less often are left out of the recommender.
popularity_threshold = 50
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = rating_with_totalRatingCount[is_popular]
rating_popular_movie.head()


Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [12]:
# (rows, columns) of the ratings that passed the popularity filter.
rating_popular_movie.shape

(41362, 5)

**Make the pivot table**<br>
**Why do we build it?**<br>
**Create an item–user matrix:** Organize the data so that rows are movies (items), columns are users, and cells contain ratings or interactions.<br>
**Handle sparse data:** Efficiently manage large datasets with many missing values.<br>
**Aggregate data:** Summarize data for easy computation of statistics like mean ratings.<br>
**Prepare for algorithms:** Essential for collaborative filtering and matrix factorization techniques.

In [14]:
# Build the item-user matrix: one row per title, one column per userId.
# Each cell holds the rating (pivot_table averages duplicates by default);
# combinations with no rating are filled with 0.
title_by_user = rating_popular_movie.pivot_table(
    values='rating',
    index='title',
    columns='userId',
    aggfunc='mean',
)
movie_features_df = title_by_user.fillna(0)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


Converting the DataFrame to a CSR (Compressed Sparse Row) matrix before fitting the NearestNeighbors model is important for the following reason:

1. **Memory efficiency**
   - **Sparse data handling:** Many recommendation systems deal with sparse data where most of the entries are zeros. Storing and processing such data in a dense format (standard DataFrame or NumPy array) would consume a lot of memory unnecessarily.
   - **CSR matrix:** This format only stores non-zero entries and their indices, reducing memory usage significantly.

In [19]:
# csr_matrix and NearestNeighbors are imported in the notebook's top import
# cell so that all dependencies are declared in one place.

# Store the item-user ratings as a CSR sparse matrix, which keeps only the
# non-zero cells of the (mostly zero) pivot table.
movie_features_df_matrix = csr_matrix(movie_features_df.to_numpy())

# Brute-force nearest-neighbour index using cosine distance between the
# movies' rating vectors.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(movie_features_df_matrix)


In [20]:

 # Shape of the pivoted matrix: (number of titles, number of users).
 movie_features_df.shape

(450, 606)

In [24]:
# Seed the draw so the "random" query movie, and therefore the printed
# recommendations, are the same on every Restart & Run All.
RANDOM_SEED = 42
# Number of recommendations to show for the query movie.
N_RECOMMENDATIONS = 5

rng = np.random.default_rng(RANDOM_SEED)
query_index = int(rng.integers(movie_features_df.shape[0]))
print("Query_index:", query_index)

# Request one extra neighbour: the closest match is normally the query movie
# itself (distance 0), which the reporting cell treats as the heading.
query_vector = movie_features_df.iloc[query_index, :].to_numpy().reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=N_RECOMMENDATIONS + 1)
print("Distances:", distances)
print("Indices:", indices)

Query_index: 167
Distances: [[0.         0.4449482  0.4809363  0.48249435 0.4923843  0.49345386]]
Indices: [[167 339 231 232 412 342]]


In [25]:
# Report the neighbours found above. Position 0 is used as the heading
# (the query movie); every later position is printed as a numbered
# recommendation with its cosine distance.
neighbour_distances = distances.flatten()
neighbour_positions = indices.flatten()

for rank, neighbour_distance in enumerate(neighbour_distances):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))
        continue
    neighbour_title = movie_features_df.index[neighbour_positions[rank]]
    print('{0}: {1}, with distance of {2}:'.format(rank, neighbour_title, neighbour_distance))

Recommendations for Gangs of New York (2002):

1: Scarface (1983), with distance of 0.4449481964111328:
2: Kill Bill: Vol. 1 (2003), with distance of 0.48093628883361816:
3: Kill Bill: Vol. 2 (2004), with distance of 0.4824943542480469:
4: Training Day (2001), with distance of 0.49238431453704834:
5: School of Rock (2003), with distance of 0.49345386028289795:
