In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Read the two CSV files from the current working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')  # note: shadows the interactive `credits` builtin

# Combine both tables on their shared 'title' column (pandas' default inner join).
# NOTE(review): if titles are not unique in either file, every duplicate pairing
# produces an extra row in `merged` -- confirm whether a stricter key is available.
merged = pd.merge(movies, credits, on='title')


In [11]:
import ast

def safe_parse(x):
    """Turn a stringified Python literal into the object it represents.

    Parameters
    ----------
    x : Any
        Usually a string such as ``"[{'id': 1, 'name': 'Action'}]"``.
        Non-string values are accepted as well.

    Returns
    -------
    Any
        * For a string: the result of ``ast.literal_eval(x)``, or ``[]`` when
          the string is not a valid literal.
        * For ``None`` or a float NaN (how pandas represents a missing cell):
          ``[]``, so callers can iterate or slice the result without a check.
        * For anything else: ``x`` itself, unchanged.
    """
    # Missing cells: NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return []

    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        # Only the errors literal_eval is documented to raise on malformed
        # input; a bare `except:` would also hide KeyboardInterrupt/SystemExit
        # and genuine programming errors.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    return x


In [12]:
def extract_names(x):
    """Collect the normalised 'name' value of every dict in ``x``.

    ``x`` is first passed through ``safe_parse``. Each item that is a dict
    containing a 'name' key contributes that name with spaces removed and
    converted to lower case; all other items are skipped.

    Returns a list of strings in the same order as the items.
    """
    records = safe_parse(x)
    return [
        record['name'].replace(" ", "").lower()
        for record in records
        if isinstance(record, dict) and 'name' in record
    ]


In [14]:
def extract_cast(x):
    """Return normalised names taken from the first three entries of ``x``.

    ``x`` is passed through ``safe_parse`` and truncated to its first three
    items *before* filtering, so fewer than three names come back when one of
    those items is not a dict with a 'name' key. Names have their spaces
    removed and are lower-cased.
    """
    leading_entries = safe_parse(x)[:3]
    return [
        member['name'].replace(" ", "").lower()
        for member in leading_entries
        if isinstance(member, dict) and 'name' in member
    ]


In [15]:
def extract_director(x):
    """Return the normalised name of the first crew entry whose job is "Director".

    ``x`` is passed through ``safe_parse``. The first dict item with
    ``job == "Director"`` supplies the name, which is returned with spaces
    removed and lower-cased. An empty string is returned when no such item
    exists.
    """
    crew = safe_parse(x)
    directors = (
        member for member in crew
        if isinstance(member, dict) and member.get('job') == "Director"
    )
    first_director = next(directors, None)
    if first_director is None:
        return ""
    return first_director['name'].replace(" ", "").lower()


In [16]:
# Replace the raw columns with lists of normalised tokens.
# These assignments overwrite the columns in place, so this cell is meant to
# run once on freshly loaded data: re-running it on its own output would not
# give the same result, because the values are no longer in their raw form.
for column, extractor in [
    ('genres', extract_names),
    ('keywords', extract_names),
    ('cast', extract_cast),
]:
    merged[column] = merged[column].apply(extractor)

# The director is derived from the 'crew' column and stored as a single string.
merged['director'] = merged['crew'].apply(extract_director)

# Missing overviews become "", then each overview is lower-cased and split on whitespace.
merged['overview'] = merged['overview'].fillna("").apply(lambda text: text.lower().split())


In [17]:
# Wrap each director string in a one-element list so it can be concatenated
# with the other list-valued columns.
director_tokens = merged['director'].apply(lambda name: [name])

# Row-wise list concatenation: overview words first, then genres, keywords,
# cast and finally the director.
merged['tags'] = (
    merged['overview']
    + merged['genres']
    + merged['keywords']
    + merged['cast']
    + director_tokens
)


In [18]:
# Collapse each row's token list into one space-separated string for the vectorizer.
merged['tags'] = merged['tags'].apply(" ".join)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Vectorise the tag strings with TF-IDF, dropping scikit-learn's built-in
# English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=merged['tags'])

# Pairwise cosine similarity between all rows: similarity[i][j] is the score
# between row i and row j of `merged` (same row order as merged['tags']).
similarity = cosine_similarity(X=tfidf_matrix)


In [21]:
def recommend(movie_title, top_n=10):
    """Print the titles most similar to ``movie_title``.

    Reads the module-level ``merged`` DataFrame (for titles) and the
    ``similarity`` matrix, whose row/column ``i`` corresponds to the ``i``-th
    row (by position) of ``merged``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``merged['title']``. If several rows share
        the title, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to print. Values below zero are
        treated as zero.

    Returns
    -------
    str or None
        The message ``"'<title>' not found."`` when the title is absent;
        otherwise ``None`` (the recommendations are printed, not returned).
    """
    titles = merged['title']

    # One boolean mask serves both the existence check and the row lookup.
    matches = (titles == movie_title).values
    if not matches.any():
        return f"'{movie_title}' not found."

    # Positional row of the first match. `similarity` is addressed by
    # position, so an index *label* would be wrong whenever the frame's index
    # is not a plain 0..n-1 range.
    position = int(matches.argmax())

    # Similarity of the query row against every row.
    distances = similarity[position]

    # Exclude the query row explicitly rather than assuming it sorts first:
    # a row with an identical score (e.g. a duplicate) or an all-zero tag
    # vector would otherwise let the movie recommend itself.
    ranked = sorted(
        ((row, score) for row, score in enumerate(distances) if row != position),
        key=lambda pair: pair[1],
        reverse=True,
    )[:max(top_n, 0)]

    print(f"\nTop recommendations for '{movie_title}':\n")
    for row, score in ranked:
        print(titles.iloc[row], "-- score:", round(score, 3))


In [None]:
# Example query: print the recommendations for a single title.
recommend("Inception")


In [23]:
import pickle

# Persist the processed DataFrame and the similarity matrix to the working
# directory. The `with` blocks guarantee each file is flushed and closed even
# if pickling fails part-way; a bare `open(...)` passed to `pickle.dump`
# leaves the handle open until garbage collection.
# NOTE: pickle files can execute arbitrary code when loaded -- only load these
# files from a trusted location.
with open('movies_df.pkl', 'wb') as movies_file:
    pickle.dump(merged, movies_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
