In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer # The TfidfVectorizer is used to convert textual data into numerical vectors using tf-idf
from sklearn.metrics.pairwise import cosine_similarity # it is used to compute the similarity between documents or text representation in a vector space.

In [None]:
'''1:Vectors are identical in direction.
0:Vectors are orthogonal (no similarity)
-1:Vectors are dimetrically opposite.'''

In [3]:
df = pd.read_csv("movies (1).csv")
df

Unnamed: 0,MovieID,MovieName,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
df.head()

Unnamed: 0,MovieID,MovieName,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
df.shape

(3883, 3)

In [6]:
df.isnull()

Unnamed: 0,MovieID,MovieName,Genre
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
3878,False,False,False
3879,False,False,False
3880,False,False,False
3881,False,False,False


In [7]:
df.dtypes

MovieID       int64
MovieName    object
Genre        object
dtype: object

In [8]:
df['Genre'] = df['Genre'].astype(str)

In [9]:
#Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words = 'english')

#transform the genres into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df['Genre'])

#Display the shapes of the resulting matrix
print(tfidf_matrix.shape)

(3883, 420)


In [10]:
#compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

#Display the cosine similarity matrix
print(cosine_sim)

[[1.         0.31455749 0.20398629 ... 0.         0.         0.        ]
 [0.31455749 1.         0.         ... 0.         0.         0.        ]
 [0.20398629 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [21]:
def recommend_movies(movie_name,cosine_sim,df,top_n=3):
    #Get the index of the movie that matches the title
    idx=df[df['MovieName'] ==movie_name].index[0]
    #Get the pairwise similarity score for all movies with the chosen movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    #sort the movies based on similarity scores in desending order
    sim_scores = sorted(sim_scores,key=lambda x:x[1],reverse=True)
    #get the top n most similar movies
    sim_scores = sim_scores[1:top_n+1]
    #get the movie indices and corresponding similarity scores
    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1] for i in sim_scores]
    #return the top recommended movies
    return df['MovieName'].iloc[movie_indices].tolist(),movie_scores

recommended_movies,scores = recommend_movies(input("enter movie name"),cosine_sim,df,top_n=10)

print("Recommended Movies:")
for movie,score in zip (recommended_movies , scores):
    print(f"{movie} (Score: {score} )")

enter movie name Grumpier Old Men (1995)


Recommended Movies:
Sabrina (1995) (Score: 1.0 )
Clueless (1995) (Score: 1.0 )
Two if by Sea (1996) (Score: 1.0 )
French Twist (Gazon maudit) (1995) (Score: 1.0 )
Vampire in Brooklyn (1995) (Score: 1.0 )
If Lucy Fell (1996) (Score: 1.0 )
Boomerang (1992) (Score: 1.0 )
Pie in the Sky (1995) (Score: 1.0 )
French Kiss (1995) (Score: 1.0 )
Forget Paris (1995) (Score: 1.0 )
