# Get movies recommendations with TfidfVectorizer

In [58]:
import pandas as pd
movies_metadata = pd.read_csv('data/movies_metadata_sample.csv', index_col='id')

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Drop NaN with an empty string
movies_metadata = movies_metadata.dropna(subset='overview')
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies_metadata['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(986, 9770)

In [60]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#Construct a reverse map of indices and movie titles
indices = pd.Series(movies_metadata.index, index=movies_metadata['title']).drop_duplicates()

In [61]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_metadata['title'].iloc[movie_indices]

In [62]:
indices.head()

title
Night of the Zombies              29072
Pecoross' Mother and Her Days    255391
Mad Dog Morgan                    60269
Race Street                       73983
Wooden Crosses                    32859
Name: id, dtype: int64

Check for movies titles

Display recommendations for a title

In [65]:
get_recommendations('To Catch a Thief')


id
9526      A Prairie Home Companion
46503              All Good Things
390357        King of the Belgians
12521                      Shocker
36964                       Pucked
126555         Princess Goldilocks
126777              If I Were King
84420         Magic Christmas Tree
134372           Marriage Material
51735             The Dust of Time
Name: title, dtype: object

# Another example to get predictions for a title
CountVectorizer
cosine_similarity 

In [66]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies_metadata['title'])

In [67]:
count_matrix.shape

(986, 1522)

In [68]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [69]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = movies_metadata['title'].reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

## Print some recommendations for a movie title with cosine similarity

In [71]:
get_recommendations('Princess Goldilocks', cosine_sim2)

id
37345                  Princess Caraboo
149832         The Princess and the Pea
29072              Night of the Zombies
255391    Pecoross' Mother and Her Days
60269                    Mad Dog Morgan
73983                       Race Street
32859                    Wooden Crosses
26030                   Bob le Flambeur
148284                         Enthiran
447061            Red Nose Day Actually
Name: title, dtype: object

In [72]:
get_recommendations('Tycoon', cosine_sim2)

id
29072              Night of the Zombies
255391    Pecoross' Mother and Her Days
60269                    Mad Dog Morgan
73983                       Race Street
32859                    Wooden Crosses
26030                   Bob le Flambeur
148284                         Enthiran
447061            Red Nose Day Actually
80853                 Tennessee Johnson
24553        All the Invisible Children
Name: title, dtype: object

In [None]:
get_recommendations('tycoon', cosine_sim2)

id
29072             nightofthezombies
255391    pecoross'motherandherdays
60269                  maddogmorgan
73983                    racestreet
32859                 woodencrosses
26030                 bobleflambeur
148284                     enthiran
447061           rednosedayactually
80853              tennesseejohnson
24553       alltheinvisiblechildren
Name: title, dtype: object