In [3]:
import pandas as pd
import numpy as np

# Load the datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge them on the 'title' column
movies = movies.merge(credits, on='title')

# Select the features we will use for recommendation
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values
movies.dropna(inplace=True)

In [4]:
import ast # To convert string-of-list to an actual list

def convert(obj):
    """Extracts names from a stringified list of dictionaries."""
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def get_director(obj):
    """Finds the director's name from the crew list."""
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break # We only need one director
    return L

# Apply the functions
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
# For cast, let's just take the top 3 actors
movies['cast'] = movies['cast'].apply(lambda x: convert(x)[:3])
movies['crew'] = movies['crew'].apply(get_director)

# The 'overview' is a string, let's convert it to a list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [5]:
# Remove spaces between names to treat "James Cameron" as one entity
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

# Create the 'tags' column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new dataframe with just the necessary columns
new_df = movies[['movie_id', 'title', 'tags']]

# Convert the list of tags back into a single string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower()) # Convert to lowercase

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower()) # Convert to lowercase


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words='english')

# Fit and transform the tags
vectors = cv.fit_transform(new_df['tags']).toarray()

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity matrix
similarity = cosine_similarity(vectors)

In [9]:
def recommend(movie):
    # Find the index of the movie
    movie_index = new_df[new_df['title'] == movie].index[0]
    
    # Get the similarity scores for that movie
    distances = similarity[movie_index]
    
    # Sort the movies based on similarity, get top 5
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:11]
    
    # Print the titles of the recommended movies
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

# Example usage:
recommend('Avatar')

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem
Lifeforce
Battle: Los Angeles
Predators
Aliens
Falcon Rising


In [10]:
recommend('Tangled')

Out of Inferno
Aladdin
Toy Story 3
The Princess and the Frog
Frozen
The Smurfs
Despicable Me 2
Dinosaur
Monsters vs Aliens
Shrek


In [11]:
import pickle

# Save the dataframe
pickle.dump(new_df.to_dict(), open('movie_dict.pkl', 'wb'))

# Save the similarity matrix
pickle.dump(similarity, open('similarity.pkl', 'wb'))