In [None]:
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler , LabelEncoder
from scipy.sparse import csr_matrix

In [None]:
# Directory holding the MovieLens CSV files. "/content" is the Colab upload
# location; change this single constant when running the notebook elsewhere.
DATA_DIR = "/content"

# Movies metadata (movieId, title, genres) and user ratings (userId, movieId, rating, timestamp).
df_movie = pd.read_csv(f"{DATA_DIR}/movies.csv")
df_rating = pd.read_csv(f"{DATA_DIR}/ratings.csv")


In [None]:
# Show the last rows of the movies table as a quick sanity check on the load.
df_movie.tail()

Unnamed: 0,movieId,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [None]:
# Show the first rows of the ratings table as a quick sanity check on the load.
df_rating.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:

# The rating timestamp is not used by the recommender. Reassigning (rather than
# inplace=True) with errors="ignore" keeps this cell safe to re-run after the
# column has already been removed.
df_rating = df_rating.drop(columns="timestamp", errors="ignore")


In [None]:
# Confirm the timestamp column is gone.
df_rating.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
import re

def clean_words(x):
    """Return ``x`` with every character removed that is not an ASCII letter, digit or space."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", x)
def no_the(x):
    """Move a "the" that directly precedes "number" to the front of the string.

    Every case-insensitive occurrence of "the" followed by whitespace and
    "number" has the "the " part removed. If at least one such occurrence
    existed in ``x``, the result is left-stripped and prefixed with "The ".
    Strings without that pattern are returned unchanged.
    """
    had_the_before_number = re.search(r'\bthe\s+number', x, re.IGNORECASE) is not None
    stripped = re.sub(r'\bthe\s+(?=number)', '', x, flags=re.IGNORECASE)

    if not had_the_before_number:
        return stripped

    # Re-attach the article at the very beginning of the cleaned text.
    return "The " + stripped.lstrip()

In [None]:
# Genres arrive as a '|'-separated string; turn the separator into a space.
# regex=False is explicit because '|' is a regex metacharacter and the default
# value of `regex` differs between pandas versions.
df_movie['genres'] = df_movie['genres'].str.replace('|', ' ', regex=False)

# Strip punctuation from titles, then normalise a "the number" phrase if present.
df_movie['title'] = df_movie['title'].apply(clean_words).apply(no_the)

# Keep only the columns used downstream.
df_movie = df_movie[['movieId', 'title', 'genres']]
print(df_movie.head())

   movieId                             title  \
0        1                    Toy Story 1995   
1        2                      Jumanji 1995   
2        3             Grumpier Old Men 1995   
3        4            Waiting to Exhale 1995   
4        5  Father of the Bride Part II 1995   

                                        genres  
0  Adventure Animation Children Comedy Fantasy  
1                   Adventure Children Fantasy  
2                               Comedy Romance  
3                         Comedy Drama Romance  
4                                       Comedy  


In [None]:
# Attach title and genres to every rating row (default inner join on movieId).
merged_df = pd.merge(df_rating, df_movie, on='movieId')

# Preprocessing to reduce data points

In [None]:
# Build the title x user rating matrix. pivot_table aggregates with the mean by
# default, so ratings of rows that share a cleaned title are averaged per user.
# Missing (user, title) pairs are filled with 0, which stands for "not rated".
pivot_table_1 = merged_df.pivot_table(
    index=["title"],
    columns=["userId"],
    values="rating",
).fillna(0)
pivot_table_1.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
nous la libert Freedom for Us 1931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00 Schneider Jagd auf Nihil Baxter 1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 1979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cent Pistol 2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane 2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [None]:
# Count how many ratings each movie title received, and how many ratings each
# user submitted.
no_user_voted = merged_df.groupby('title')['rating'].count()
no_movies_voted = df_rating.groupby('userId')['rating'].count()



In [None]:
# Number of ratings per title, first few titles in index order.
no_user_voted.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
nous la libert Freedom for Us 1931,1
00 Schneider Jagd auf Nihil Baxter 1994,1
10 1979,4
10 Cent Pistol 2015,2
10 Cloverfield Lane 2016,14


In [None]:
# Keep only the movies (rows) that were rated by more than 5 users.
frequently_rated = no_user_voted[no_user_voted > 5]
pivot_table_1 = pivot_table_1.loc[frequently_rated.index, :]



In [None]:
# Keep only the users (columns) who rated more than one movie. On larger data
# sets a higher threshold reduces the data points with minimal effect on output.
import pickle
active_users = no_movies_voted[no_movies_voted > 1]
pivot_table_1 = pivot_table_1.loc[:, active_users.index]

# Persist the filtered title x user matrix so it can be reused without recomputing.
with open('pivot_table.pkl', 'wb') as pkl_out:
    pickle.dump(pivot_table_1, pkl_out)
pivot_table_1.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Cloverfield Lane 2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
10 Things I Hate About You 1999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
10000 BC 2008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians 1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians One Hundred and One Dalmatians 1961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Download the pickled pivot table to the local machine. `google.colab` only
# exists inside Colab, so the import is guarded to keep "Run All" working in
# other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Download the file
    files.download('pivot_table.pkl')
else:
    print("Not running on Colab; 'pivot_table.pkl' was left in the working directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Use a CSR sparse matrix for model training: it is much more efficient than the
# dense table in both time and space.
ratings_dense = pivot_table_1.values
sparse_matrix = csr_matrix(ratings_dense)

# Save the sparse matrix next to the pickled pivot table.
with open('sparse_matrix.pkl', 'wb') as pkl_out:
    pickle.dump(sparse_matrix, pkl_out)


# If you don't want to remove any data points, use this instead

In [None]:
# Preview the merged ratings + movie metadata table.
merged_df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story 1995,Adventure Animation Children Comedy Fantasy
1,1,3,4.0,Grumpier Old Men 1995,Comedy Romance
2,1,6,4.0,Heat 1995,Action Crime Thriller
3,1,47,5.0,Seven aka Se7en 1995,Mystery Thriller
4,1,50,5.0,Usual Suspects 1995,Crime Mystery Thriller


In [None]:
#pivot_table = merged_df.pivot_table(index = ["title"],columns = ["userId"],values = "rating").fillna(0)

In [None]:
#pivot_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
nous la libert Freedom for Us 1931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00 Schneider Jagd auf Nihil Baxter 1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 1979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cent Pistol 2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane 2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [None]:
#sparse_matrix = csr_matrix(pivot_table.values)

NameError: name 'pivot_table' is not defined

In [None]:
# Define a nearest-neighbours model that compares movies by cosine distance,
# using a brute-force search over all rows.
from sklearn.neighbors import NearestNeighbors

knn_settings = dict(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
cf_knn_model = NearestNeighbors(**knn_settings)

# Fit on the sparse title x user rating matrix (one row per movie).
cf_knn_model.fit(sparse_matrix)

In [None]:
# Titles that survived the filtering, in the same order as the matrix rows.
movie_list = list(pivot_table_1.index)


In [None]:
word = "Matrix"

# Collect every title that contains the search word (case-sensitive substring match).
matching_sentences = list(filter(lambda title: word in title, movie_list))

# Show the matching titles
print(matching_sentences)

['Matrix Reloaded The 2003', 'Matrix Revolutions The 2003', 'Matrix The 1999']


In [None]:
# Title to get recommendations for; it must match a cleaned title in pivot_table_1's index exactly.
selected_movie='Shrek 2 2004'


In [None]:
# Row position of the selected title in pivot_table_1 (get_loc raises KeyError if the title is absent).
selected_movie_id = pivot_table_1.index.get_loc(selected_movie)


In [None]:
# kneighbors expects a 2-D input, so the selected movie's rating row is reshaped
# to a single-row array before querying its 10 nearest neighbours.
query_vector = pivot_table_1.iloc[selected_movie_id, :].values.reshape(1, -1)
distances, indices = cf_knn_model.kneighbors(query_vector, n_neighbors=10)

In [None]:
# Flatten the (1, k) result arrays once and walk them in parallel.
flat_positions = indices.flatten()
flat_distances = distances.flatten()

for rank, (row_pos, dist) in enumerate(zip(flat_positions, flat_distances)):
    if rank == 0:
        # The first slot is used for the header naming the selected movie.
        print('Recommendations for {0}:\n'.format(pivot_table_1.index[selected_movie_id]))
        continue
    print('{0}: {1}, with distance of {2}:'.format(rank, pivot_table_1.index[row_pos], dist))

Recommendations for Shrek 2 2004:

1: Shrek 2001, with distance of 0.32552780239620394:
2: Pirates of the Caribbean The Curse of the Black Pearl 2003, with distance of 0.3679491936680761:
3: Incredibles The 2004, with distance of 0.3730114290942368:
4: Finding Nemo 2003, with distance of 0.3749259945672617:
5: Monsters Inc 2001, with distance of 0.3771347665816299:
6: Ice Age 2002, with distance of 0.37882067305050915:
7: SpiderMan 2002, with distance of 0.42920595645456205:
8: SpiderMan 2 2004, with distance of 0.43568300472290766:
9: Star Wars Episode III  Revenge of the Sith 2005, with distance of 0.4442188463560732:


In [None]:
def get_movie_recommendation(movie_name):
    n_movies_to_reccomend = 10
    movie_list = df_movie[df_movie['title'].str.contains(movie_name)]
    if len(movie_list):
        movie_idx= movie_list.iloc[0]['movieId']
        movie_idx = merged_df[merged_df['movieId'] == movie_idx].index[0]
        distances , indices = knn.kneighbors(sparse_matrix[movie_idx],n_neighbors=n_movies_to_reccomend+1)
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[1:]
        recommend_frame = []
        for val in rec_movie_indices:
            movie_idx = merged_df.iloc[val[0]]['movieId']
            idx = df_movie[df_movie['movieId'] == movie_idx].index
            recommend_frame.append({'Title':df_movie.iloc[idx]['title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,n_movies_to_reccomend+1))
        return df
    else:
        return "No movies found. Please check your input"