# Movie Recommendation System (TMDB Dataset)

This notebook builds a content-based movie recommender using:
- tag generation (genres, keywords, cast)
- TF-IDF / CountVectorizer
- dimensionality reduction (Truncated SVD)
- KNN similarity search


In [211]:
import numpy as np
import pandas as pd

In [213]:
movies = pd.read_csv("TMDB  IMDB Movies Dataset.csv")

In [214]:
movies.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",Christopher Nolan,Christopher Nolan,8.8,2662142,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."


In [217]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433394 entries, 0 to 433393
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    433394 non-null  int64  
 1   title                 433394 non-null  object 
 2   vote_average          433394 non-null  float64
 3   vote_count            433394 non-null  int64  
 4   status                433394 non-null  object 
 5   release_date          414851 non-null  object 
 6   revenue               433394 non-null  int64  
 7   runtime               433394 non-null  int64  
 8   adult                 433394 non-null  bool   
 9   backdrop_path         183916 non-null  object 
 10  budget                433394 non-null  int64  
 11  homepage              54230 non-null   object 
 12  tconst                433394 non-null  object 
 13  original_language     433394 non-null  object 
 14  original_title        433394 non-null  object 
 15  

In [219]:
#id
#title
#overview
#genres
#production_countries
#keywords
#director
#writer
#cast
#spoken_languages


In [221]:
movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'spoken_languages', 'production_countries', 'cast', 'directors', 'writers']]

In [223]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,spoken_languages,production_countries,cast,directors,writers
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...","English, French, Japanese, Swahili","United Kingdom, United States of America","Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan,Christopher Nolan


In [225]:
movies.size

4333940

In [227]:
movies.isnull().sum()

id                           0
title                        0
overview                 41524
genres                   77620
keywords                262713
spoken_languages        101733
production_countries    111974
cast                     68452
directors                10292
writers                  66688
dtype: int64

In [229]:
movies.dropna(inplace= True)

In [231]:
movies.shape

(114073, 10)

In [233]:
movies.duplicated().sum()

0

In [235]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,spoken_languages,production_countries,cast,directors,writers
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...","English, French, Japanese, Swahili","United Kingdom, United States of America","Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan,Christopher Nolan


In [237]:
movies.shape

(114073, 10)

In [239]:
print(movies.dtypes)

id                       int64
title                   object
overview                object
genres                  object
keywords                object
spoken_languages        object
production_countries    object
cast                    object
directors               object
writers                 object
dtype: object


In [241]:
import ast
def convert(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 2:
            L.append(i)
            counter += 1
        else:
            break
    return L

In [243]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['genres'] = movies['genres'].apply(lambda x: [i.strip() for i in x.split(",")])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.strip() for i in x.split(",")])
movies['spoken_languages'] = movies['spoken_languages'].apply(lambda x: [i.strip() for i in x.split(",")])
movies['production_countries'] = movies['production_countries'].apply(lambda x: [i.strip() for i in x.split(",")])
movies['cast'] = movies['cast'].apply(lambda x: [i.strip() for i in x.split(",")])
movies['directors'] = movies['directors'].apply(lambda x: [i.strip() for i in x.split(",")])
movies['writers'] = movies['writers'].apply(lambda x: [i.strip() for i in x.split(",")])

In [244]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,spoken_languages,production_countries,cast,directors,writers
0,27205,Inception,"[Cobb,, a, skilled, thief, who, commits, corpo...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...","[English, French, Japanese, Swahili]","[United Kingdom, United States of America]","[Leonardo DiCaprio, Joseph Gordon-Levitt, Ken ...",[Christopher Nolan],[Christopher Nolan]


In [247]:
movies["cast"] = movies["cast"].apply(lambda x: x[:2])

In [249]:
movies.cast[0]

['Leonardo DiCaprio', 'Joseph Gordon-Levitt']

In [251]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['spoken_languages'] = movies['spoken_languages'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['production_countries'] = movies['production_countries'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['directors'] = movies['directors'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['writers'] = movies['writers'].apply(lambda x:[i.replace(" ", "") for i in x])

In [252]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,spoken_languages,production_countries,cast,directors,writers
0,27205,Inception,"[Cobb,, a, skilled, thief, who, commits, corpo...","[Action, ScienceFiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...","[English, French, Japanese, Swahili]","[UnitedKingdom, UnitedStatesofAmerica]","[LeonardoDiCaprio, JosephGordon-Levitt]",[ChristopherNolan],[ChristopherNolan]


In [255]:
# movies.to_csv("preprocessed_movies.csv", index = False)

In [257]:
movies['tags'] =  movies['overview'] + movies['genres'] + movies['keywords'] + movies['spoken_languages'] + movies['production_countries'] +  movies['cast'] + movies['directors'] + movies['writers']

In [259]:
movies.tags[1]

['The',
 'adventures',
 'of',
 'a',
 'group',
 'of',
 'explorers',
 'who',
 'make',
 'use',
 'of',
 'a',
 'newly',
 'discovered',
 'wormhole',
 'to',
 'surpass',
 'the',
 'limitations',
 'on',
 'human',
 'space',
 'travel',
 'and',
 'conquer',
 'the',
 'vast',
 'distances',
 'involved',
 'in',
 'an',
 'interstellar',
 'voyage.',
 'Adventure',
 'Drama',
 'ScienceFiction',
 'rescue',
 'future',
 'spacecraft',
 'raceagainsttime',
 'artificialintelligence(a.i.)',
 'nasa',
 'timewarp',
 'dystopia',
 'expedition',
 'spacetravel',
 'wormhole',
 'famine',
 'blackhole',
 'quantummechanics',
 'familyrelationships',
 'space',
 'robot',
 'astronaut',
 'scientist',
 'singlefather',
 'farmer',
 'spacestation',
 'curious',
 'spaceadventure',
 'timeparadox',
 'thoughtful',
 'time-manipulation',
 'fatherdaughterrelationship',
 '2060s',
 'cornfield',
 'timemanipulation',
 'complicated',
 'English',
 'UnitedKingdom',
 'UnitedStatesofAmerica',
 'MatthewMcConaughey',
 'AnneHathaway',
 'ChristopherNolan',
 

In [267]:
new_df = movies[['id', 'title', 'tags']]

In [269]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())


In [271]:
new_df['tags'][0]

'cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person\'s idea into a target\'s subconscious. action sciencefiction adventure rescue mission dream airplane paris france virtualreality kidnapping philosophy spy allegory manipulation carcrash heist memory architecture losangeles california dreamworld subconscious english french japanese swahili unitedkingdom unitedstatesofamerica leonardodicaprio josephgordon-levitt christophernolan christophernolan'

In [273]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [275]:
#stemming
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [277]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [279]:
new_df['tags'][0]

'cobb, a skill thief who commit corpor espionag by infiltr the subconsci of hi target is offer a chanc to regain hi old life as payment for a task consid to be impossible: "inception", the implant of anoth person\' idea into a target\' subconscious. action sciencefict adventur rescu mission dream airplan pari franc virtualr kidnap philosophi spi allegori manipul carcrash heist memori architectur losangel california dreamworld subconsci english french japanes swahili unitedkingdom unitedstatesofamerica leonardodicaprio josephgordon-levitt christophernolan christophernolan'

In [281]:
new_df.size

342219

In [283]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(max_features = 3000, stop_words= 'english')
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
vectors = tfidf.fit_transform(new_df['tags'])


In [284]:
#feature extraction
# vectors = cv.fit_transform(new_df['tags'])

In [285]:
from scipy.sparse import csr_matrix
vectors = csr_matrix(vectors)

In [289]:
# len(cv.get_feature_names_out())
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=1200)
reduced_vectors = svd.fit_transform(vectors)

In [291]:
# from sklearn.metrics.pairwise import cosine_similarity
# similarity = cosine_similarity(vectors, dense_output=False)
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=15)
knn.fit(reduced_vectors)

In [293]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import TruncatedSVD
# from scipy.sparse import csr_matrix

# # Step 1: Use TF-IDF instead of CountVectorizer
# tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
# vectors = tfidf.fit_transform(new_df['tags'])

# # Step 2: Convert to sparse matrix (saves memory)
# vectors = csr_matrix(vectors)

# # Step 3: Reduce dimensions using TruncatedSVD (PCA)
# svd = TruncatedSVD(n_components=500)
# reduced_vectors = svd.fit_transform(vectors)

# # Step 4: Compute cosine similarity
# from sklearn.metrics.pairwise import cosine_similarity
# similarity = cosine_similarity(reduced_vectors)


In [295]:
# knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
# knn.fit(reduced_vectors)

In [298]:

# def recommend(movie):
#     if movie not in new_df['title'].values:
#         print("Movie not found in the dataset.")
#         return

#     movie_index = new_df.loc[new_df['title'] == movie].index[0]
#     distances, indices = knn.kneighbors([reduced_vectors[movie_index]])

#     print("\nRecommended Movies:")
#     for idx in indices[0][1:]:
#         rec_title = new_df.iloc[idx]['title']
#         print(rec_title)


In [300]:
def recommend(movie, top_n=10):
    """
    Return a list of top_n recommended movie titles similar to `movie`.
    """
    if not isinstance(movie, str):
        return []

    # robust matching: use lowercase stripped titles
    movie_clean = movie.strip().lower()
    titles_lower = new_df['title'].str.strip().str.lower()

    if movie_clean not in titles_lower.values:
        # return empty list instead of printing so it's API-friendly
        return []

    movie_index = titles_lower[titles_lower == movie_clean].index[0]

    # Ensure knn and reduced_vectors are defined
    if 'knn' not in globals() or 'reduced_vectors' not in globals():
        raise RuntimeError("knn or reduced_vectors not found. Run the preprocessing cells first.")

    distances, indices = knn.kneighbors([reduced_vectors[movie_index]])
    rec_indices = indices[0][1: top_n + 1]
    recommendations = [new_df.iloc[idx]['title'] for idx in rec_indices]
    return recommendations

# example usage
# print(recommend("The Dark Knight", top_n=5))


In [302]:
recommend('Tenet')

['Damascus Cover',
 'Spy Kids: All the Time in the World',
 'Love Sonia',
 'Spy Capital',
 'The Doll Squad',
 'Double O Blonde',
 'Our Man Flint',
 'Sixteen',
 'Beyond the Call to Duty',
 'Secret Agent']

In [306]:
# ensure required objects exist
# print("knn present:", 'knn' in globals())
# print("reduced_vectors present:", 'reduced_vectors' in globals())

# heck a sample title exists
# print(new_df['title'].iloc[:10].tolist())

# test recommended function
recommend("The Dark Knight", top_n=5)


['Batman',
 'Batman Forever',
 'Batman: Under the Red Hood',
 'Batman Begins',
 'Batman: Mask of the Phantasm']