## Simple content based recomendation system##


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
dataset = pd.read_csv('movies_metadata.csv', low_memory = False)

In [3]:
dataset.head(7)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
6,False,,58000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,...,1995-12-15,0.0,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0


In [4]:
C = dataset['vote_average'].mean()
print(C)

5.618207215133889


In [8]:
m = dataset['vote_count'].quantile(0.70)
print(m)

25.0


In [9]:
possible_movies = dataset.copy().loc[dataset['vote_count'] >= m]
possible_movies.size

331440

In [10]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return ((v/(v+m)) * R) + ((m/(m+v)) * C)

In [11]:
possible_movies['score'] = possible_movies.apply(weighted_rating, axis=1)

In [12]:
possible_movies.size

345250

In [13]:
#Sort movies based on score calculated above
possible_movies = possible_movies.sort_values('score', ascending=False)

#Print the top 15 movies
possible_movies[['title', 'vote_count', 'vote_average', 'score']].head(1500)

Unnamed: 0,title,vote_count,vote_average,score
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.973113
314,The Shawshank Redemption,8358.0,8.5,8.491406
834,The Godfather,6024.0,8.5,8.488090
40251,Your Name.,1030.0,8.5,8.431711
39085,Planet Earth,176.0,8.8,8.404255
12481,The Dark Knight,12269.0,8.3,8.294547
2843,Fight Club,9678.0,8.3,8.293090
292,Pulp Fiction,8670.0,8.3,8.292289
522,Schindler's List,4436.0,8.3,8.284971
23673,Whiplash,4376.0,8.3,8.284766


In [14]:
dataset['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [15]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words ='english')

#Replace NaN with an empty string
possible_movies['overview'] = possible_movies['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(possible_movies['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(13810, 36773)

In [16]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape


(13810, 13810)

In [17]:
indices = pd.Series(possible_movies.index, index=possible_movies['title']).drop_duplicates()


In [18]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return possible_movies['title'].iloc[movie_indices]

In [19]:
get_recommendations('Toy Story')

30283              I Hate Luv Storys
15020                  Chalte Chalte
35701                        Dilwale
1417     Kama Sutra - A Tale of Love
6935              A Passage to India
21256             Bhaag Milkha Bhaag
14814                   Love Stories
18782               Paan Singh Tomar
12059                Rang De Basanti
10246       Mr. Canton and Lady Rose
Name: title, dtype: object

In [20]:
get_recommendations('The Godfather')

7938           The Bourne Supremacy
5284            The Bourne Identity
18255             The Bourne Legacy
39287                  Jason Bourne
4560          The Cat o' Nine Tails
13880                         Brüno
249      Interview with the Vampire
12320                         [REC]
2125                    Simon Birch
1999                     L.A. Story
Name: title, dtype: object

In [21]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

# Remove rows with bad IDs.
#possible_movies = possible_movies.drop([19730, 29503, 35587])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
possible_movies['id'] = possible_movies['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
possible_movies = possible_movies.merge(credits, on='id')
possible_movies = possible_movies.merge(keywords, on='id')



In [22]:
possible_movies.head(2)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,score,cast,crew,keywords
0,False,,13200000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,19404,tt0112870,hi,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",...,Released,Come... Fall In Love,Dilwale Dulhania Le Jayenge,False,9.1,661.0,8.973113,"[{'cast_id': 1, 'character': 'Raj Malhotra', '...","[{'credit_id': '57a3054a9251417c57000d7a', 'de...","[{'id': 4344, 'name': 'musical'}]"
1,False,,25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,278,tt0111161,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,...,Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,False,8.5,8358.0,8.491406,"[{'cast_id': 3, 'character': 'Andy Dufresne', ...","[{'credit_id': '52fe4231c3a36847f800b127', 'de...","[{'id': 378, 'name': 'prison'}, {'id': 417, 'n..."


In [23]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    possible_movies[feature] = possible_movies[feature].apply(literal_eval)

In [24]:
# Import Numpy 
import numpy as np
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [25]:
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [26]:
# Define new director, cast, genres and keywords features that are in a suitable form.
possible_movies['director'] = possible_movies['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    possible_movies[feature] = possible_movies[feature].apply(get_list)

In [27]:
possible_movies[['title', 'cast', 'director', 'keywords', 'genres']].head(15)

Unnamed: 0,title,cast,director,keywords,genres
0,Dilwale Dulhania Le Jayenge,"[Shah Rukh Khan, Kajol, Amrish Puri]",Aditya Chopra,[musical],"[Comedy, Drama, Romance]"
1,The Shawshank Redemption,"[Tim Robbins, Morgan Freeman, Bob Gunton]",Frank Darabont,"[prison, corruption, police brutality]","[Drama, Crime]"
2,The Godfather,"[Marlon Brando, Al Pacino, James Caan]",Francis Ford Coppola,"[italy, love at first sight, loss of father]","[Drama, Crime]"
3,Your Name.,"[Ryunosuke Kamiki, Mone Kamishiraishi, Masami ...",Makoto Shinkai,"[supernatural, romance, school]","[Romance, Animation, Drama]"
4,Planet Earth,[David Attenborough],Alastair Fothergill,"[miniseries, great cinematpgraphy]",[Documentary]
5,The Dark Knight,"[Christian Bale, Michael Caine, Heath Ledger]",Christopher Nolan,"[dc comics, crime fighter, secret identity]","[Drama, Action, Crime]"
6,Fight Club,"[Edward Norton, Brad Pitt, Meat Loaf]",David Fincher,"[support group, dual identity, nihilism]",[Drama]
7,Pulp Fiction,"[John Travolta, Samuel L. Jackson, Uma Thurman]",Quentin Tarantino,"[transporter, brothel, drug dealer]","[Thriller, Crime]"
8,Schindler's List,"[Liam Neeson, Ben Kingsley, Ralph Fiennes]",Steven Spielberg,"[factory, concentration camp, hero]","[Drama, History, War]"
9,Whiplash,"[Miles Teller, J.K. Simmons, Melissa Benoist]",Damien Chazelle,"[jazz, obsession, conservatory]",[Drama]


In [28]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    possible_movies[feature] = possible_movies[feature].apply(clean_data)

In [29]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
# Create a new soup feature
possible_movies['soup'] = possible_movies.apply(create_soup, axis=1)
possible_movies[['soup']].head(15)

Unnamed: 0,soup
0,musical shahrukhkhan kajol amrishpuri adityach...
1,prison corruption policebrutality timrobbins m...
2,italy loveatfirstsight lossoffather marlonbran...
3,supernatural romance school ryunosukekamiki mo...
4,miniseries greatcinematpgraphy davidattenborou...
5,dccomics crimefighter secretidentity christian...
6,supportgroup dualidentity nihilism edwardnorto...
7,transporter brothel drugdealer johntravolta sa...
8,factory concentrationcamp hero liamneeson benk...
9,jazz obsession conservatory milesteller j.k.si...


In [30]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(possible_movies['soup'])

# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


In [31]:
# Reset index of your main DataFrame and construct reverse mapping as before
possible_movies = possible_movies.reset_index()
indices = pd.Series(possible_movies.index, index=possible_movies['title'])

In [32]:
get_recommendations('Toy Story', cosine_sim2)

346                          Toy Story 3
832                          Toy Story 2
3056          Toy Story That Time Forgot
1229                Toy Story of Terror!
4102                   Creature Comforts
1776                     Partysaurus Rex
1864                              Banana
6538    The Bugs Bunny/Road Runner Movie
5903                       Monster House
9662                              Cars 2
Name: title, dtype: object

In [33]:
get_recommendations('The Godfather', cosine_sim2)

1445      The Godfather: Part III
12         The Godfather: Part II
7356            The Good Neighbor
13286           The Son of No One
70                 Apocalypse Now
896      The Consequences of Love
11128            Gardens of Stone
69                       Scarface
131             On the Waterfront
265                          Heat
Name: title, dtype: object

In [34]:
get_recommendations('Heat', cosine_sim2)

12278           No Good Deed
11419                Armored
10134        Lost in the Sun
6139               Dobermann
11334            Tiger House
69                  Scarface
298            Carlito's Way
427        Dog Day Afternoon
683      Glengarry Glen Ross
788        Shadow of a Doubt
Name: title, dtype: object