# In [1]:
import pandas as pd
import numpy as np
import ast
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the movie metadata and the credits tables from disk, then join them
# into a single frame keyed on the shared "title" column.
# NOTE(review): joining on "title" yields extra rows whenever a title occurs
# more than once in both files -- confirm titles are unique or that this is OK.
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")
movies = pd.merge(movies, credits, on="title")

# Clean functions
def convert(obj):
    """Return the ``"name"`` value of every entry in a stringified list.

    Args:
        obj: A string containing a Python literal for a sequence of dicts,
            each of which has a ``"name"`` key.

    Returns:
        A list with the ``"name"`` value of each entry, in original order.
        Errors from ``ast.literal_eval`` (invalid literal) and ``KeyError``
        (entry without ``"name"``) propagate to the caller.
    """
    return [entry["name"] for entry in ast.literal_eval(obj)]

def get_director(obj):
    """Return the name of the first crew entry whose job is ``"Director"``.

    Args:
        obj: A string containing a Python literal for a sequence of dicts,
            each with ``"job"`` and ``"name"`` keys.

    Returns:
        A one-element list holding the first director's name, or an empty
        list when no entry has the job ``"Director"``.
    """
    for member in ast.literal_eval(obj):
        if member["job"] != "Director":
            continue
        # Only the first match is wanted, so stop scanning here.
        return [member["name"]]
    return []

# Apply preprocessing
# Parse the stringified list columns into plain lists of names.
for _col in ("genres", "keywords"):
    movies[_col] = movies[_col].apply(convert)
# Keep only the first three cast names for each movie.
movies["cast"] = movies["cast"].apply(
    lambda raw: [member["name"] for member in ast.literal_eval(raw)][:3]
)
# Reduce the crew column to whatever get_director extracts from it.
movies["crew"] = movies["crew"].apply(get_director)

# Missing overviews become empty strings so the concatenation below never
# produces NaN.
movies["overview"] = movies["overview"].fillna("")

# Build one lowercase text field per movie: the overview followed by the
# string form of each list column, separated by single spaces.
_tags = movies["overview"]
for _col in ("genres", "keywords", "cast", "crew"):
    _tags = _tags + " " + movies[_col].astype(str)
movies["tags"] = _tags.str.lower()

# Vectorize
# Token counts over the "tags" text, with English stop words removed and the
# vocabulary capped at 3000 features.
cv = CountVectorizer(stop_words="english", max_features=3000)
_tag_counts = cv.fit_transform(movies["tags"])
vectors = _tag_counts.toarray()

# Cosine similarity
# Pairwise similarity between every pair of rows in ``vectors``.
similarity = cosine_similarity(vectors)

# Save as pickle
# Replace the existing index with a fresh 0..n-1 range before saving.
# NOTE(review): presumably so row labels line up with positions in
# ``similarity`` -- confirm against whatever loads these files.
movies.reset_index(drop=True, inplace=True)

# Use context managers so each file is flushed and closed as soon as its
# dump finishes, instead of leaving the handle open until garbage collection.
with open("movies.pkl", "wb") as movies_file:
    pickle.dump(movies, movies_file)
with open("similarity.pkl", "wb") as similarity_file:
    pickle.dump(similarity, similarity_file)
