In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from surprise import Dataset, SVD

In [2]:
df = pd.read_csv('tmdb_5000_movies.csv')
df.sample(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
2018,23000000,"[{""id"": 10749, ""name"": ""Romance""}, {""id"": 35, ...",http://www.aboutmary.com/,544,"[{""id"": 3262, ""name"": ""surgeon""}, {""id"": 6844,...",en,There's Something About Mary,Having never fully recovered from a prom date ...,57.110486,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1998-07-15,369884651,119.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Love Is In The Hair.,There's Something About Mary,6.5,1590
1313,38000000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 9648, ""n...",,622,"[{""id"": 242, ""name"": ""new york""}, {""id"": 1523,...",en,The Ninth Gate,An all-expenses-paid international search for ...,30.359164,"[{""name"": ""Bac Films"", ""id"": 208}, {""name"": ""K...","[{""iso_3166_1"": ""FR"", ""name"": ""France""}, {""iso...",1999-08-24,58401898,133.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Every book has a life of its own.,The Ninth Gate,6.3,756
2328,18000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 36, ""name...",,279,"[{""id"": 131, ""name"": ""italy""}, {""id"": 1233, ""n...",en,Amadeus,The incredible story of genius musician Wolfga...,31.82675,"[{""name"": ""Warner Bros."", ""id"": 6194}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1984-10-26,51973029,160.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,...Everything you've heard is true,Amadeus,7.8,1076
1321,0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 37, ""nam...",,13503,"[{""id"": 3508, ""name"": ""underwear""}, {""id"": 480...",en,Texas Rangers,"Ten years after the Civil War has ended, the G...",5.641978,"[{""name"": ""Dimension Films"", ""id"": 7405}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2001-11-30,0,110.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Texas Rangers,5.4,19
2933,11,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 28, ""name...",,28932,[],en,F.I.S.T.,Johnny Kovak joins the Teamsters trade-union i...,3.375208,"[{""name"": ""United Artists"", ""id"": 60}, {""name""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1978-04-26,11,145.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,F.I.S.T.,6.4,29


In [3]:
df.shape

(4803, 20)

In [4]:
df.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
import ast

In [7]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L
        

In [8]:
df['genres'] = df['genres'].apply(convert)
df['keywords'] = df['keywords'].apply(convert)

In [9]:
df['genres'] = df['genres'].apply(lambda x: " ".join(x))
df['keywords'] = df['keywords'].apply(lambda x: " ".join(x))

In [10]:
df['overview'] = df['overview'].fillna('')
df['title'] = df['title'].fillna('')

# Now combine everything into 'tags'
df['tags'] = df['title'] + " " + df['overview'] + " " + df['genres'] + " " + df['keywords']

# Convert to lowercase
df['tags'] = df['tags'].apply(lambda x: x.lower())

In [11]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = vectorizer.fit_transform(df['tags']).toarray()

In [12]:
similar = cosine_similarity(vectors)

In [20]:
import pickle

In [24]:
pickle.dump(df, open('movies.pkl', 'wb'))
pickle.dump(similar, open('similarity.pkl', 'wb'))


In [38]:
def recommend(movie):
    movie = movie.lower()
    if movie not in df['title'].str.lower().values:
        print("❌ Movie not found in dataset.")
        return
    index = df[df['title'].str.lower() == movie].index[0]
    distances = similar[index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    print(f"\n🎥 Top 5 recommendations for: {df.iloc[index].title}")
    for i in movie_list:
        print(df.iloc[i[0]].title)

# Example use
recommend("Avatar")



🎥 Top 5 recommendations for: Avatar
Alien
Moonraker
Spaceballs
Lockout
Space Chimps
