In [68]:
import numpy as np
import pandas as pd
import ast

import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from typing import List


In [69]:
# 1. Load and merge data
movies = pd.read_csv('TMDB_movie_dataset_v11.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = movies.merge(credits, on='title')

In [70]:
movies.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,movie_id,cast,crew
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",27205,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [71]:
movies.shape

(16294, 27)

In [72]:
# 2. Keep relevant columns
movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'tagline', 'cast', 'crew']]


In [73]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [74]:
# 3. Handle missing values FIRST
movies['overview'] = movies['overview'].fillna('')
movies['tagline'] = movies['tagline'].fillna('')
movies['genres'] = movies['genres'].fillna('')
movies['keywords'] = movies['keywords'].fillna('')

In [75]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [76]:
movies.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
tagline     0
cast        0
crew        0
dtype: int64

In [77]:

#movies.dropna(inplace=True)

In [78]:
movies.drop_duplicates(subset='title', inplace=True)


In [79]:
#movies.duplicated().sum()

In [80]:
movies.head(1)


Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [81]:
movies.iloc[0].genres

'Action, Science Fiction, Adventure'

In [82]:
# 4. Parse JSON-like columns
def parse_json_column(obj: str) -> List[str]:
    try:
        if isinstance(obj, str) and obj.startswith('['):
            return [item["name"].strip() for item in ast.literal_eval(obj)]
        elif isinstance(obj, str):
            return [s.strip() for s in obj.split(",") if s.strip()]
        return []
    except:
        return []

movies['genres'] = movies['genres'].apply(parse_json_column)
movies['keywords'] = movies['keywords'].apply(parse_json_column)

In [83]:

# 5. Process cast and crew
def convert_cast(obj: str) -> List[str]:
    try:
        return [i['name'] for i in ast.literal_eval(obj)[:3]]
    except:
        return []

def get_director(obj: str) -> List[str]:
    try:
        return [i['name'] for i in ast.literal_eval(obj) if i['job'] == 'Director']
    except:
        return []

In [84]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...",Your mind is the scene of the crime.,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [85]:
movies['cast'] = movies['cast'].apply(convert_cast)
movies['crew'] = movies['crew'].apply(get_director)


In [86]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...",Your mind is the scene of the crime.,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...",[Christopher Nolan]


In [87]:
# 6. Split overview/tagline into lists AFTER filling missing values
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tagline'] = movies['tagline'].apply(lambda x: x.split())

# 7. Clean text (remove spaces in names)
for col in ['cast', 'crew']:
    movies[col] = movies[col].apply(lambda x: [i.replace(" ", "") for i in x])

# 8. Create tags column
movies["tags"] = (
    movies["overview"] 
    + movies["tagline"] 
    + movies["genres"] 
    + movies["keywords"] 
    + movies["cast"] 
    + movies["crew"]
)

# 9. Convert tags to lowercase strings
movies['tags'] = movies['tags'].apply(lambda x: ' '.join(x).lower())

# 10. Final DataFrame
new_df = movies[['id', 'title', 'tags']]

In [88]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew,tags
0,27205,Inception,"[Cobb,, a, skilled, thief, who, commits, corpo...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...","[Your, mind, is, the, scene, of, the, crime.]","[LeonardoDiCaprio, JosephGordon-Levitt, EllenP...",[ChristopherNolan],"cobb, a skilled thief who commits corporate es..."


In [89]:
# Fill NA for overview/tagline, no splitting needed
movies['overview'] = movies['overview'].fillna('')
movies['tagline'] = movies['tagline'].fillna('')

In [90]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew,tags
0,27205,Inception,"[Cobb,, a, skilled, thief, who, commits, corpo...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...","[Your, mind, is, the, scene, of, the, crime.]","[LeonardoDiCaprio, JosephGordon-Levitt, EllenP...",[ChristopherNolan],"cobb, a skilled thief who commits corporate es..."


In [91]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew,tags
0,27205,Inception,"[Cobb,, a, skilled, thief, who, commits, corpo...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...","[Your, mind, is, the, scene, of, the, crime.]","[LeonardoDiCaprio, JosephGordon-Levitt, EllenP...",[ChristopherNolan],"cobb, a skilled thief who commits corporate es..."


In [92]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew,tags
0,27205,Inception,"[Cobb,, a, skilled, thief, who, commits, corpo...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...","[Your, mind, is, the, scene, of, the, crime.]","[LeonardoDiCaprio, JosephGordon-Levitt, EllenP...",[ChristopherNolan],"cobb, a skilled thief who commits corporate es..."


In [95]:

# Final dataframe
new_df = movies[['id', 'title', 'tags']]


In [96]:
new_df.head(1)

Unnamed: 0,id,title,tags
0,27205,Inception,"cobb, a skilled thief who commits corporate es..."


In [97]:
new_df.head(1)

Unnamed: 0,id,title,tags
0,27205,Inception,"cobb, a skilled thief who commits corporate es..."


In [98]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [99]:
new_df.head()

Unnamed: 0,id,title,tags
0,27205,Inception,"cobb, a skilled thief who commits corporate es..."
3,157336,Interstellar,the adventures of a group of explorers who mak...
4,155,The Dark Knight,batman raises the stakes in his war on crime. ...
6,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
11,24428,The Avengers,when an unexpected enemy emerges and threatens...


In [100]:
# Vectorization
cv = CountVectorizer(max_features=1000000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

In [101]:
# Similarity matrix
similarity = cosine_similarity(vectors)

In [102]:
def recommend(movie):
    movie = movie.strip()
    if movie not in new_df['title'].values:
        return []

    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = list(enumerate(similarity[movie_index]))
    recommended = sorted(distances, key=lambda x: x[1], reverse=True)[1:6]
    return [new_df.iloc[i[0]].title for i in recommended]


In [106]:
    print(f"\nTop 5 movies similar to ")
    
        
        
        
        
    recommendations = recommend("Interstellar")
    print(recommendations)



Top 5 movies similar to 
['Space Chimps', 'Aliens', 'Alien', 'Soldier', 'Mission to Mars']


In [107]:
# Save files
pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))