In [1]:
# import libraries

import pandas as pd
import numpy as np
import ast



In [2]:
credits = pd.read_csv("tmdb_5000_credits.csv")
movies = pd.read_csv("tmdb_5000_movies.csv")


In [6]:
credits.head()
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [9]:
# merge the movies with credits data set
movies = movies.merge(credits, left_on='title', right_on='title')

In [10]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [11]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [12]:
# to append the genres, keywords, cast and crew columns
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
        return L

In [13]:
movies['genrees'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(lambda x: [i['name']for i in ast.literal_eval(x)[:3]])  # taking only top 3 cast members


In [14]:
movies['crew'] = movies['crew'].apply(lambda x: [i['name'] for i in ast.literal_eval(x) if i['job'] == 'Director'])  # taking only directors

In [16]:
# tagging the movie with all the columns
movies['tags'] = movies['genrees'] + movies['keywords'] + movies['cast'] + movies['crew']

In [24]:
movies['tags']

0       [Action, culture clash, Sam Worthington, Zoe S...
1       [Adventure, ocean, Johnny Depp, Orlando Bloom,...
2       [Action, spy, Daniel Craig, Christoph Waltz, L...
3       [Action, dc comics, Christian Bale, Michael Ca...
4       [Action, based on novel, Taylor Kitsch, Lynn C...
                              ...                        
4804    [Action, united states–mexico barrier, Carlos ...
4805                                                  NaN
4806    [Comedy, date, Eric Mabius, Kristin Booth, Cry...
4807                                                  NaN
4808    [Documentary, obsession, Drew Barrymore, Brian...
Name: tags, Length: 4809, dtype: object

In [26]:
# romeve unnecessary columns because tag column is enough
movies = movies[['movie_id', 'title', 'overview', 'tags']]

In [28]:
movies['tags'] = movies['tags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '') # joining the tags into a single string

In [29]:
movies['tags'] = movies['tags'].apply(lambda x: x.lower())  # converting list to string

In [30]:
movies['tags']

0       action culture clash sam worthington zoe salda...
1       adventure ocean johnny depp orlando bloom keir...
2       action spy daniel craig christoph waltz léa se...
3       action dc comics christian bale michael caine ...
4       action based on novel taylor kitsch lynn colli...
                              ...                        
4804    action united states–mexico barrier carlos gal...
4805                                                     
4806    comedy date eric mabius kristin booth crystal ...
4807                                                     
4808    documentary obsession drew barrymore brian her...
Name: tags, Length: 4809, dtype: object

In [31]:
movies.head()

Unnamed: 0,movie_id,title,overview,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",action culture clash sam worthington zoe salda...
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",adventure ocean johnny depp orlando bloom keir...
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,action spy daniel craig christoph waltz léa se...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,action dc comics christian bale michael caine ...
4,49529,John Carter,"John Carter is a war-weary, former military ca...",action based on novel taylor kitsch lynn colli...


In [32]:
# cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['tags'])



In [33]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)



In [34]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = movies.index[movies['title'] == title][0]  # get the index of the movie
    sim_scores = list(enumerate(cosine_sim[idx]))  # get the similarity scores for the movie
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # sort the scores in descending order
    sim_scores = sim_scores[1:11]  # get the top 10 similar movies
    movie_indices = [i[0] for i in sim_scores]  # get the indices of the similar movies
    return movies['title'].iloc[movie_indices]  # return the titles of the similar movies

In [35]:
print(get_recommendations('The Dark Knight Rises'))  # example usage

65                              The Dark Knight
1197                               The Prestige
119                               Batman Begins
72                                Suicide Squad
41                                Green Lantern
3859    Batman: The Dark Knight Returns, Part 2
9            Batman v Superman: Dawn of Justice
163                                    Watchmen
224                                     RoboCop
873                                       Shaft
Name: title, dtype: object


In [37]:
# to create streamlit app we need to import pickle
import pickle
with open('movies.pkl', 'wb') as file:
     # save the movies data
     pickle.dump((movies, cosine_sim), file)  # save the movies data