In [18]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer as count
%matplotlib inline

In [19]:
# Load keywords and credits
credits = pd.read_csv('D:/Data_Science/Recommender systems/the-movies-dataset/credits.csv')
keywords = pd.read_csv('D:/Data_Science/Recommender systems/the-movies-dataset/keywords.csv')
metadata = pd.read_csv('D:/Data_Science/Recommender systems/the-movies-dataset/movies_metadata.csv',low_memory=False)

In [20]:
# Remove rows with bad IDs.
metadata = metadata.drop([19730, 29503, 35587])
vote_average = metadata.vote_average.mean()
m_90 = metadata.vote_count.quantile(0.90)
q_movies = metadata.copy().loc[metadata['vote_count'] >= m_90]
q_movies = q_movies.reset_index()

In [21]:

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
q_movies['id'] = q_movies['id'].astype('int')


In [22]:
# Merge keywords and credits into your main metadata dataframe
q_movies = q_movies.merge(credits, on='id')
q_movies = q_movies.merge(keywords, on='id')

In [23]:
# Print the first two movies of your newly merged metadata
q_movies.head(2)

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [24]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    q_movies[feature] = q_movies[feature].apply(literal_eval)

In [25]:
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [26]:
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [27]:
# Define new director, cast, genres and keywords features that are in a suitable form.
q_movies['director'] = q_movies['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    q_movies[feature] = q_movies[feature].apply(get_list)

In [28]:
# Print the new features of the first 3 films
q_movies[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[Comedy]


In [29]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [30]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    q_movies[feature] = q_movies[feature].apply(clean_data)

In [31]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
# Create a new soup feature

In [32]:
# Create a new soup feature
q_movies['soup'] = q_movies.apply(create_soup, axis=1)

In [33]:
q_movies.head()

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director,soup
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[animation, comedy, family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,,Toy Story,False,7.7,5415.0,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy]",johnlasseter,jealousy toy boy tomhanks timallen donrickles ...
1,1,False,,65000000,"[adventure, fantasy, family]",,8844,tt0113497,en,Jumanji,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgame, disappearance, basedonchildren'sbook]",joejohnston,boardgame disappearance basedonchildren'sbook ...
2,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[comedy],,11862,tt0113041,en,Father of the Bride Part II,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlifecrisis, confidence]",charlesshyer,baby midlifecrisis confidence stevemartin dian...
3,5,False,,60000000,"[action, crime, drama]",,949,tt0113277,en,Heat,...,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,"[alpacino, robertdeniro, valkilmer]","[{'credit_id': '52fe4292c3a36847f802916d', 'de...","[robbery, detective, bank]",michaelmann,robbery detective bank alpacino robertdeniro v...
4,8,False,,35000000,"[action, adventure, thriller]",,9091,tt0114576,en,Sudden Death,...,Terror goes into overtime.,Sudden Death,False,5.5,174.0,"[jean-claudevandamme, powersboothe, dorianhare...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...","[terrorist, hostage, explosive]",peterhyams,terrorist hostage explosive jean-claudevandamm...


In [26]:
count.get_feature_names()

['adventure',
 'animation',
 'baby',
 'basedonchildren',
 'boardgame',
 'boy',
 'charlesshyer',
 'comedy',
 'confidence',
 'dianekeaton',
 'disappearance',
 'donrickles',
 'family',
 'fantasy',
 'jealousy',
 'joejohnston',
 'johnlasseter',
 'jonathanhyde',
 'kirstendunst',
 'martinshort',
 'midlifecrisis',
 'robinwilliams',
 'sbook',
 'stevemartin',
 'timallen',
 'tomhanks',
 'toy']

In [27]:
cosine_similarity(count.fit_transform(q_movies['soup'][:3]), count.fit_transform(q_movies['soup'][:3]))

array([[1.        , 0.09534626, 0.1118034 ],
       [0.09534626, 1.        , 0.        ],
       [0.1118034 , 0.        , 1.        ]])

In [35]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(q_movies['soup'])

In [36]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [37]:
# Reset index of your main DataFrame and construct reverse mapping as before
q_movies = q_movies.reset_index()
indices = pd.Series(q_movies.index, index=q_movies['title'])

In [106]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim2):
    # Get the index of the movie that matches the title
    idx = indices[title]
    #print("idx",idx)
    feature_names = []
    for k in set(np.nonzero(np.array(count_matrix[idx].todense())[0])[0]):
        feature_names.append(count.get_feature_names()[k])
    print('INPUT MOVIE \nindex : {}\t name : {}\nfeatures :{}'.format(idx,title,str(feature_names)))
    print('*****************************************************************************************')
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    #print("sim_scores1",sim_scores)

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #print("sim_scores2",sim_scores)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    #print("sim_scores3",sim_scores)
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    #print(" top 10 similar movie_indices",movie_indices)
    
    # Return the top 10 most similar movies
    #return q_movies['title'].iloc[movie_indices]
    print('RECOMMENDED MOVIES')
    for i in movie_indices:
        current_movie_meta_idx = set(np.nonzero(np.array(count_matrix[idx].todense())[0])[0])
        #print(current_movie_meta_idx)
        sim_movie_meta_idx = set(np.nonzero(np.array(count_matrix[i].todense())[0])[0])
        #print(sim_movie_meta_idx)
        common_feature_idx = current_movie_meta_idx.intersection(sim_movie_meta_idx)
        #print(common_feature_idx)
        feature_names = []
        for j in common_feature_idx:
            feature_names.append(count.get_feature_names()[j])
        print ('Index : {} \t name : {}\t common_features : {}'.format(i,q_movies['title'].iloc[i],str(feature_names)))
        #return ('Index : ',i,'name : ',title,'features : ',str(feature_names))

In [107]:
get_recommendations('GoldenEye', cosine_sim2)

INPUT MOVIE 
index : 5	 name : GoldenEye
features :['action', 'thriller', 'cuba', 'piercebrosnan', 'izabellascorupco', 'secretidentity', 'adventure', 'seanbean', 'falselyaccused', 'martincampbell']
*****************************************************************************************
RECOMMENDED MOVIES
Index : 462 	 name : Dante's Peak	 common_features : ['action', 'adventure', 'piercebrosnan', 'thriller']
Index : 535 	 name : Tomorrow Never Dies	 common_features : ['action', 'adventure', 'piercebrosnan', 'thriller']
Index : 761 	 name : First Blood	 common_features : ['action', 'adventure', 'falselyaccused', 'thriller']
Index : 925 	 name : Licence to Kill	 common_features : ['action', 'adventure', 'secretidentity', 'thriller']
Index : 948 	 name : The World Is Not Enough	 common_features : ['action', 'adventure', 'piercebrosnan', 'thriller']
Index : 1460 	 name : Die Another Day	 common_features : ['action', 'adventure', 'piercebrosnan', 'thriller']
Index : 2184 	 name : Casino Ro

In [99]:
get_recommendations('Ace Ventura: When Nature Calls', cosine_sim2)

INPUT MOVIE 
index : 12	 name : Ace Ventura: When Nature Calls
features :['africa', 'steveoedekerk', 'simoncallow', 'humananimalrelationship', 'indigenous', 'crime', 'ianmcneice', 'comedy', 'adventure', 'jimcarrey']
*****************************************************************************************
RECOMMENDED MOVIES
Index : 3679 	 name : Muppets Most Wanted	 common_features : ['crime', 'comedy', 'adventure']
Index : 119 	 name : The Mask	 common_features : ['crime', 'comedy', 'jimcarrey']
Index : 491 	 name : George of the Jungle	 common_features : ['africa', 'adventure', 'comedy']
Index : 763 	 name : The Jewel of the Nile	 common_features : ['africa', 'adventure', 'comedy']
Index : 838 	 name : Austin Powers: The Spy Who Shagged Me	 common_features : ['crime', 'comedy', 'adventure']
Index : 956 	 name : Midnight Run	 common_features : ['crime', 'comedy', 'adventure']
Index : 1875 	 name : Lemony Snicket's A Series of Unfortunate Events	 common_features : ['comedy', 'jimcarrey'

In [100]:
get_recommendations('Transporter 3', cosine_sim2)

INPUT MOVIE 
index : 2527	 name : Transporter 3
features :['action', 'natalyarudakova', 'oliviermegaton', 'thriller', 'jasonstatham', 'adventure', 'françoisberléand']
*****************************************************************************************
RECOMMENDED MOVIES
Index : 3074 	 name : Killer Elite	 common_features : ['action', 'adventure', 'thriller', 'jasonstatham']
Index : 1432 	 name : The Transporter	 common_features : ['action', 'françoisberléand', 'thriller', 'jasonstatham']
Index : 3128 	 name : The Expendables 2	 common_features : ['action', 'adventure', 'thriller', 'jasonstatham']
Index : 3770 	 name : The Expendables 3	 common_features : ['action', 'adventure', 'thriller', 'jasonstatham']
Index : 2848 	 name : The Expendables	 common_features : ['action', 'adventure', 'thriller', 'jasonstatham']
Index : 1318 	 name : The One	 common_features : ['action', 'thriller', 'jasonstatham']
Index : 2270 	 name : The Librarian: Quest for the Spear	 common_features : ['actio

In [101]:
get_recommendations('The Prestige', cosine_sim2)

INPUT MOVIE 
index : 2176	 name : The Prestige
features :['drama', 'thriller', 'michaelcaine', 'christianbale', 'mystery', 'obsession', 'hughjackman', 'secret', 'christophernolan', 'competition']
*****************************************************************************************
RECOMMENDED MOVIES
Index : 888 	 name : Stir of Echoes	 common_features : ['secret', 'obsession', 'mystery', 'thriller']
Index : 1507 	 name : Spider	 common_features : ['drama', 'mystery', 'thriller', 'secret']
Index : 1785 	 name : The Village	 common_features : ['drama', 'mystery', 'thriller', 'secret']
Index : 1956 	 name : Batman Begins	 common_features : ['drama', 'christophernolan', 'christianbale', 'michaelcaine']
Index : 2411 	 name : The Dark Knight	 common_features : ['drama', 'christophernolan', 'christianbale', 'michaelcaine']
Index : 3130 	 name : The Dark Knight Rises	 common_features : ['drama', 'christophernolan', 'christianbale', 'michaelcaine']
Index : 1792 	 name : Exorcist: The Beginn

In [None]:
set1.intersection(set2)