In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import pickle

# Data Preprocessing

# Read the movie metadata and the credits (cast/crew) tables from disk.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Left-join credits onto movies by title so every movie row survives the merge.
# NOTE(review): the join key is the title, so any title that occurs more than
# once in both files yields extra combined rows - confirm whether an id-based
# join would be more appropriate for this dataset.
movies = movies.merge(credits, on='title', how='left')

# Keep only the columns that the rest of the pipeline reads.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows lacking a title, overview or genres, then renumber the rows 0..n-1.
# Resetting the index keeps row labels equal to row positions, which is what
# the similarity matrix (indexed by position) and the pickled dict rely on.
movies = movies.dropna(subset=['title', 'overview', 'genres']).reset_index(drop=True)

# Turn the string form of a list of dicts into the list of their 'name' values
def convert(obj):
    """Parse *obj* as a Python literal and return each entry's 'name' value, in order."""
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Parse the 'genres' and 'keywords' columns into lists of names;
# any value that is not a string becomes an empty list.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(
        lambda raw: convert(raw) if isinstance(raw, str) else []
    )

# Parse the 'cast' column and keep only the leading cast members
def convert2(obj, limit=3):
    """Return the 'name' values of the first *limit* entries encoded in *obj*.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key.
        limit: Maximum number of names to return. Defaults to 3, matching
            the previous hard-coded behaviour. Values below zero are
            treated as zero.

    Returns:
        A list with at most *limit* names, in their original order.
    """
    # Clamp so a negative limit yields no names instead of slicing from the end.
    leading = ast.literal_eval(obj)[:max(limit, 0)]
    return [entry['name'] for entry in leading]

# Replace each raw 'cast' string with its leading cast names; non-strings become [].
def _leading_cast(raw):
    return convert2(raw) if isinstance(raw, str) else []

movies['cast'] = movies['cast'].apply(_leading_cast)

# Pull the director's name out of a raw 'crew' value
def fetch_director(obj):
    """Return a one-element list with the first 'Director' name in *obj*, or [] if none."""
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first matching crew entry is reported.
            return [member['name']]
    return []

# Reduce each raw 'crew' string to a list holding the director's name (or [])
movies['crew'] = movies['crew'].apply(
    lambda raw: fetch_director(raw) if isinstance(raw, str) else []
)

# Split each overview into a list of whitespace-separated words
movies['overview'] = movies['overview'].apply(
    lambda text: text.split() if isinstance(text, str) else []
)

# Remove the spaces inside every genre, keyword, cast and crew name so a
# multi-word name becomes a single token; non-list values become [].
def _strip_spaces(names):
    if not isinstance(names, list):
        return []
    return [name.replace(" ", "") for name in names]

for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(_strip_spaces)

# Build a 'tags' column by concatenating the genres, keywords, cast and crew lists
tag_columns = ['genres', 'keywords', 'cast', 'crew']
combined_tags = movies[tag_columns[0]]
for column in tag_columns[1:]:
    combined_tags = combined_tags + movies[column]
movies['tags'] = combined_tags

# Work on an independent copy holding only what the recommender needs
new_df = movies[['movie_id', 'title', 'tags']].copy()

# Flatten each tag list into one lowercase, space-separated string;
# anything that is not a list becomes an empty string.
new_df['tags'] = new_df['tags'].apply(
    lambda tokens: " ".join(tokens).lower() if isinstance(tokens, list) else ""
)

# Stem the tags once, right after the 'tags' column has been built
ps = PorterStemmer()

# Reduce every whitespace-separated word of a text to its Porter stem
def stem(text):
    """Return *text* with each whitespace-separated word replaced by ``ps.stem(word)``."""
    return " ".join(ps.stem(word) for word in text.split())

# Overwrite the 'tags' column in place with the stemmed strings
stemmed_tags = new_df['tags'].apply(stem)
new_df.loc[:, 'tags'] = stemmed_tags

# Vectorization: bag-of-words counts over the tags, capped at 5000 features
# and with English stop words removed
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()  # dense (movies x features) count matrix

# Pairwise cosine similarity: entry [i, j] scores row i of new_df against row j
similarity = cosine_similarity(vectors)

# Recommend movies similar to a given movie title
def recommend(movie):
    """Return the titles of the five movies most similar to *movie*.

    Args:
        movie: Title to look up in ``new_df['title']``. When several rows
            share the title, the first one is used.

    Returns:
        A list of up to five titles ordered from most to least similar, or
        ``["Movie not found or title missing."]`` when no row has that title.
    """
    # Look the title up by row POSITION: `similarity` is indexed by position,
    # which differs from the DataFrame's index labels whenever rows were
    # dropped without resetting the index.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        return ["Movie not found or title missing."]

    movie_pos = int(matches[0])
    distances = similarity[movie_pos]

    # Rank every row by descending similarity to the chosen movie.
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the query row by position instead of assuming it sorts first;
    # ties (e.g. rows with identical tags) can otherwise push it out of slot 0.
    top_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]

    titles = new_df['title']
    return [titles.iloc[pos] for pos in top_positions]

# Persist the movie table (as a plain dict) and the similarity matrix for later use.
# The context managers make sure each file is flushed and closed once written.
with open('movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(new_df.to_dict(), movie_dict_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

# Recommendation interaction
# Title to query; change this value to try a different movie.
selected_movie = 'Avatar'

# Look up the recommendations for the chosen title
recommended_movies = recommend(selected_movie)

# Print one line per recommended title
for title in recommended_movies:
    print(f"Recommended Movie: {title}")


Recommended Movie: Star Trek Into Darkness
Recommended Movie: Megaforce
Recommended Movie: Jupiter Ascending
Recommended Movie: The Lovers
Recommended Movie: Aliens
