In [5]:
# 1️⃣ Import Libraries
import pandas as pd
import numpy as np
import ast
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 2️⃣ Load Data
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# 3️⃣ Merge on title
movies = movies.merge(credits, on='title')

# 4️⃣ Keep useful columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# 5️⃣ Drop missing
movies.dropna(inplace=True)

# 6️⃣ Helper function to convert stringified lists
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name'])
    return L

# 7️⃣ Process cast (take top 3 actors)
def convert_cast(text):
    L = []
    count = 0
    for i in ast.literal_eval(text):
        if count != 3:
            L.append(i['name'])
            count += 1
        else:
            break
    return L

# 8️⃣ Process director
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

# 9️⃣ Apply the processing
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert_cast)
movies['crew'] = movies['crew'].apply(fetch_director)

# 1️⃣0️⃣ Split overview into words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# 1️⃣1️⃣ Remove spaces in names
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

# 1️⃣2️⃣ Create tags column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# 1️⃣3️⃣ Convert tags list to string
new = movies[['movie_id', 'title', 'tags']]
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new['tags'] = new['tags'].apply(lambda x: x.lower())

# 1️⃣4️⃣ Vectorize tags
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new['tags']).toarray()

# 1️⃣5️⃣ Compute similarity
similarity = cosine_similarity(vectors)

# 1️⃣6️⃣ Save movie_list.pkl (id + title)
pickle.dump(new[['movie_id', 'title']], open('movie_list.pkl', 'wb'))

# 1️⃣7️⃣ Save similarity.pkl
pickle.dump(similarity, open('similarity.pkl', 'wb'))

print("✅ Files saved: movie_list.pkl & similarity.pkl")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['tags'] = new['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['tags'] = new['tags'].apply(lambda x: x.lower())


✅ Files saved: movie_list.pkl & similarity.pkl
