In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


movies = pd.read_csv("imdb_movies.csv")


In [2]:
movies.head(1)


Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616668.0,AU


In [3]:
movies['orig_lang'].value_counts()

orig_lang
 English                                7417
 Japanese                                714
 Spanish, Castilian                      397
 Korean                                  388
 French                                  285
 Chinese                                 153
 Cantonese                               145
 Italian                                 142
 German                                   93
 Russian                                  66
 Tagalog                                  43
 Portuguese                               35
 Thai                                     34
 Norwegian                                29
 Hindi                                    26
 Polish                                   26
 Danish                                   23
 Dutch, Flemish                           22
 Swedish                                  22
 Turkish                                  22
 Indonesian                               11
 Malayalam                                 7


In [4]:

duplicate=movies[movies.duplicated("orig_title")]
duplicate["orig_title"].value_counts()

orig_title
Pinocchio    11
Halloween     8
King Kong     8
Scream        5
After         5
             ..
 Hayat        1
Revenge       1
Cars          1
Cashback      1
Payback       1
Name: count, Length: 268, dtype: int64

In [5]:
# drop that duplicate values row
movies.drop_duplicates(subset="orig_title",inplace=True)


In [6]:
movies.isnull().sum()

names          0
date_x         0
score          0
genre         81
overview       0
crew          53
orig_title     0
status         0
orig_lang      0
budget_x       0
revenue        0
country        0
dtype: int64

In [7]:
movies.dropna(inplace=True)

In [8]:
movies.iloc[5].overview

'Inspired by a true story, an oddball group of cops, criminals, tourists and teens converge in a Georgia forest where a 500-pound black bear goes on a murderous rampage after unintentionally ingesting cocaine.'

In [9]:
# Split the 'genre' column by commas if it's a string of genres
movies['genre'] = movies['genre'].apply(lambda x: ', '.join(x.split(', ')) if pd.notnull(x) else '')

# Combine 'overview' and 'genre' into a single 'combined_features' column
movies['tags'] = movies['overview'] + ' ' + movies['genre'] + ' ' + movies['orig_title'] +' ' + movies['crew'] 

# Check the new column
movies[['tags']].head()


Unnamed: 0,tags
0,"After dominating the boxing world, Adonis Cree..."
1,Set more than a decade after the events of the...
2,"While working underground to fix a water main,..."
3,"Through a series of unfortunate events, three ..."
4,Good-hearted teenager William always lived in ...


In [10]:
movies.iloc[1].tags

"Set more than a decade after the events of the first film, learn the story of the Sully family (Jake, Neytiri, and their kids), the trouble that follows them, the lengths they go to keep each other safe, the battles they fight to stay alive, and the tragedies they endure. Science Fiction,\xa0Adventure,\xa0Action Avatar: The Way of Water Sam Worthington, Jake Sully, Zoe Saldaña, Neytiri, Sigourney Weaver, Kiri / Dr. Grace Augustine, Stephen Lang, Colonel Miles Quaritch, Kate Winslet, Ronal, Cliff Curtis, Tonowari, Joel David Moore, Norm Spellman, CCH Pounder, Mo'at, Edie Falco, General Frances Ardmore"

In [11]:
# Initialize the TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the data
tfidf_matrix = tfidf.fit_transform(movies['tags'])

# Check the shape of the matrix (number of movies, number of features)
tfidf_matrix.shape


(9614, 73192)

In [12]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Display the similarity matrix
cosine_sim.shape


(9614, 9614)

In [13]:
cosine_sim[1]

array([0.00784648, 1.        , 0.01436091, ..., 0.01285654, 0.01895282,
       0.00205972])

In [14]:
movies['index'] = range(0, len(movies))     # creating new column "index" 

movies = movies.set_index('index').reset_index() # move the column to begining of the dataset

In [15]:
def recommend(movie):
    movie = movie.strip().lower()
    movies['orig_title'] = movies['orig_title'].str.strip().str.lower()
    
    if movie not in movies['orig_title'].values:
        print(f"Movie '{movie}' not found in the DataFrame.")
        return

    movie_idx = movies[movies['orig_title'] == movie].index[0]
    
    similarity_scores = list(enumerate(cosine_sim[movie_idx]))
    sorted_scores = sorted(similarity_scores, reverse=True, key=lambda x: x[1])[1:11]
    
    print(f"Recommendations for '{movie}':")
    for i in sorted_scores:
        print(f"{movies.iloc[i[0]].orig_title} (Similarity Score: {i[1]:.4f})")

In [16]:
recommend("Harry Potter: A History Of Magic")

Recommendations for 'harry potter: a history of magic':
fantastic beasts: a natural history (Similarity Score: 0.1424)
50 greatest harry potter moments (Similarity Score: 0.1307)
harry potter and the chamber of secrets (Similarity Score: 0.1237)
harry potter and the order of the phoenix (Similarity Score: 0.1151)
harry potter and the deathly hallows: part 2 (Similarity Score: 0.1105)
harry potter and the goblet of fire (Similarity Score: 0.1087)
harry potter and the philosopher's stone (Similarity Score: 0.1025)
harry potter and the prisoner of azkaban (Similarity Score: 0.0918)
harry potter 20th anniversary: return to hogwarts (Similarity Score: 0.0841)
harry potter and the deathly hallows: part 1 (Similarity Score: 0.0783)


In [17]:
import pickle


In [18]:
pickle.dump(movies.to_dict(),open('movies.pkl','wb'))

In [19]:
pickle.dump(cosine_sim,open('similarity.pkl','wb'))