In [214]:
import numpy as np
import pandas as pd
import ast
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

# Load datasets
movies = pd.read_csv('TMDB_movie_dataset_v11.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge datasets
movies = movies.merge(credits, on='title')

# Select relevant columns
movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'tagline', 'cast', 'crew']]

# Drop missing values and duplicate titles
movies.dropna(inplace=True)
movies.drop_duplicates(subset='title', inplace=True)

# Utility functions
def convert_json(obj):
    try:
        return [i['name'] for i in ast.literal_eval(obj)]
    except:
        return []

def convert_cast(obj):
    try:
        return [i['name'] for i in ast.literal_eval(obj)[:3]]
    except:
        return []

def get_director(obj):
    try:
        for i in ast.literal_eval(obj):
            if i['job'] == 'Director':
                return [i['name']]
        return []
    except:
        return []

# Apply transformations
movies['genres'] = movies['genres'].apply(convert_json)
movies['keywords'] = movies['keywords'].apply(convert_json)
movies['cast'] = movies['cast'].apply(convert_cast)
movies['crew'] = movies['crew'].apply(get_director)

# Fill NA for overview/tagline, no splitting needed
movies['overview'] = movies['overview'].fillna('')
movies['tagline'] = movies['tagline'].fillna('')

# Clean text
for col in ['genres', 'keywords', 'cast', 'crew']:
    movies[col] = movies[col].apply(lambda x: [i.replace(" ", "") for i in x])

# Combine all tags
movies['tags'] = movies['overview'] + ' ' + movies['tagline'] + ' ' + \
                  movies['genres'].apply(lambda x: " ".join(x)) + ' ' + \
                  movies['keywords'].apply(lambda x: " ".join(x)) + ' ' + \
                  movies['cast'].apply(lambda x: " ".join(x)) + ' ' + \
                  movies['crew'].apply(lambda x: " ".join(x))

# Final dataframe
new_df = movies[['id', 'title', 'tags']]
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

# Stemming
def stem(text):
    ps = PorterStemmer()
    return " ".join([ps.stem(word) for word in text.split()])

new_df['tags'] = new_df['tags'].apply(stem)

# Vectorization
cv = CountVectorizer(max_features=10000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

# Similarity matrix
similarity = cosine_similarity(vectors)

# Recommendation function
def recommend(movie):
    movie = movie.strip()
    if movie not in new_df['title'].values:
        print(f"Movie '{movie}' not found in dataset.")
        return

    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = list(enumerate(similarity[movie_index]))
    recommended = sorted(distances, key=lambda x: x[1], reverse=True)[1:6]

    print(f"\nTop 5 movies similar to '{movie}':")
    for i in recommended:
        print(new_df.iloc[i[0]].title)

# Example usage
recommend('Batman Begins')

# Save files
pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)



Top 5 movies similar to 'Batman Begins':
Maze Runner: The Scorch Trials
Dark City
We Are Your Friends
Zombie Hunter
The Village


movies.head()