In [38]:
# import libraries :
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
'''
cosine_similarity: Measures the similarity between two vectors, used here to compute similarity between movie genres.
TfidfVectorizer: Converts textual data into numerical features
'''

'\ncosine_similarity: Measures the similarity between two vectors, used here to compute similarity between movie genres.\nTfidfVectorizer: Converts textual data into numerical features\n'

In [40]:
data = pd.read_csv("movies.csv")  # this dataset is taken from kaggle
df = pd.DataFrame(data)

In [41]:
#get all column names :
df.columns

Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime'],
      dtype='object')

In [42]:
df["name"] = df["name"].str.strip()
df["genre"] = df["genre"].str.strip()
# string method removes leading and trailing whitespace from each string.

print(df)

                                                name rating      genre  year  \
0                                        The Shining      R      Drama  1980   
1                                    The Blue Lagoon      R  Adventure  1980   
2     Star Wars: Episode V - The Empire Strikes Back     PG     Action  1980   
3                                          Airplane!     PG     Comedy  1980   
4                                         Caddyshack      R     Comedy  1980   
...                                              ...    ...        ...   ...   
7663                                    More to Life    NaN      Drama  2020   
7664                                     Dream Round    NaN     Comedy  2020   
7665                                   Saving Mbango    NaN      Drama  2020   
7666                                    It's Just Us    NaN      Drama  2020   
7667                                       Tee em el    NaN     Horror  2020   

                              released 

In [43]:
# now remove the all column except name and genre
df = df[["name" , "genre"]]
print(df)

                                                name      genre
0                                        The Shining      Drama
1                                    The Blue Lagoon  Adventure
2     Star Wars: Episode V - The Empire Strikes Back     Action
3                                          Airplane!     Comedy
4                                         Caddyshack     Comedy
...                                              ...        ...
7663                                    More to Life      Drama
7664                                     Dream Round     Comedy
7665                                   Saving Mbango      Drama
7666                                    It's Just Us      Drama
7667                                       Tee em el     Horror

[7668 rows x 2 columns]


#####define TF-IDF vectorizer to transform the genre text into the vector

In [44]:
tfidf = TfidfVectorizer(stop_words="english")  #(e.g., "the", "and") are ignored.

###### fit and transform the genre column into a matrix of TF-IDF feature

In [45]:
tfidf_matrix = tfidf.fit_transform(df["genre"])
'''
each row represents a movie, and each column represents a unique term from the genre column.
'''

'\neach row represents a movie, and each column represents a unique term from the genre column.\n'

#####compute the cosine similarity matrix :

In [46]:
cosine_sim = cosine_similarity(tfidf_matrix , tfidf_matrix)  #Computes pairwise cosine similarity between rows in the TF-IDF matrix.

####function to recommend movie based on cosine similarity :

In [47]:
def get_recommendations(title , cosine_sim = cosine_sim, df=df):

  title = title.lower()

  # Check if the movie title exists in the DataFrame
  if title not in df['name'].str.lower().values:
    print(f"Movie '{title}' not found in the dataset.")
    return []  # Return an empty list if movie not found


  # now get the index of movie that matches the title :
  idx = df[df["name"].str.lower() == title].index[0]


  # get the pairwise similarity source of all movie with that movie,
  #enumerate(): Converts the similarity scores into a list of tuples (index, score)
  sim_scores = list(enumerate(cosine_sim[idx]))


  # sort the movie based on similarity score
  sim_scores = sorted(sim_scores , key = lambda x : x[1], reverse = True)

  # get the indices of 2 most similar movie :
  sim_scores = sim_scores[1:3] # if 10 movie then sim_scores[1,11]

  # get the movie indices
  movie_indices = [i[0] for i in sim_scores]

  #return the title of the movie :
  return df["name"].iloc[movie_indices]

###test the recommendatioon system with an example :

In [48]:
movie_title = "More to life"
recommendations = get_recommendations(movie_title)
print(f"Recommended movie for {movie_title} : ")

for movie in recommendations:
  print(movie)

Recommended movie for More to life : 
Ordinary People
Somewhere in Time


In [49]:
movie_title = "Krish"
recommendations = get_recommendations(movie_title)
print(f"Recommended movie for {movie_title} : ")

for movie in recommendations:
  print(movie)

Movie 'krish' not found in the dataset.
Recommended movie for Krish : 
