In [1]:
# this content based filtering system 
# this is based ONLY ON movie overviews and taglines 

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity

In [3]:
ls=pd.read_csv("links_small.csv")

In [4]:
df_movies=pd.read_csv("movies_metadata.csv",low_memory=False)

In [5]:
# to see how we came up with these numbers see the EDA notebook 
# we have dropped these rows because the data is not correctly entered which may cause problems while analysis
df_movies=df_movies.drop([19730,29503,35587])


In [6]:
# converted the id to an integer value (from a string)
df_movies["id"]=df_movies["id"].astype('int')

In [7]:
# we get all the non null "tmdbid ids" and convert them to integers
ls=ls[ls["tmdbId"].notnull()]["tmdbId"].astype('int')

In [8]:
ls

0          862
1         8844
2        15602
3        31357
4        11862
         ...  
9120    402672
9121    315011
9122    391698
9123    137608
9124    410803
Name: tmdbId, Length: 9112, dtype: int32

In [9]:
# we create a dataframe where all the id's (common to both the dataframes) are present   
df_common_id=df_movies[df_movies["id"].isin(ls)]

In [10]:
# we have around 9100 common movie id's
df_common_id.shape

(9099, 24)

In [20]:
# created a new all words colum which has all the text (the tagline text plus the overview text)
df_common_id["allwords"]=df_common_id["overview"]+df_common_id["tagline"]

In [13]:
df_common_id["allwords"]

0                                                      NaN
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
40224    From the mind behind Evangelion comes a hit la...
40503    The band stormed Europe in 1963, and, in 1964,...
44821    When Molly Hale's sadness of her father's disa...
44826                                                  NaN
45265                                                  NaN
Name: allwords, Length: 9099, dtype: object

In [21]:
# replaced the NaN values with an empty string
df_common_id["allwords"]=df_common_id["allwords"].fillna('')

In [15]:
# now basically we need to suggest the movies based on taglies and overview , so firstly we need to convert the the words 
# into some kind of quantative values

# so we use the TF-IDF (term frequency-inverse document frequency)
# purpose of TF-IDF is to highlight words which are frequent in a document(in this case a particular entry in the "allwords") but not across documents 


In [16]:
# An example of a feature vector you might be familiar with is RGB (red-green-blue) color description
#A color can be described by how much red, blue, and green there is in it
#A feature vector for this would be color = [R, G, B]

In [17]:
# then we are fitting the document on the Tfidvectorizer model
tf=TfidfVectorizer(analyzer="word",ngram_range=(1,2),min_df=0,stop_words="english")
tfid_mat=tf.fit_transform(df_common_id["allwords"])

In [18]:
tf

TfidfVectorizer(min_df=0, ngram_range=(1, 2), stop_words='english')

In [33]:
print(tf.idf_)

[8.50659178 9.0174174  9.42288251 ... 9.42288251 9.42288251 9.42288251]


In [None]:
# cosine similarity can be used to predict the similarity between 2 movies

In [None]:
#Since we have used the TF-IDF Vectorizer, computing Dot Product will give us the Cosine Similarity Score. 
#we used sklearn's linear_kernel instead of cosine_similarities as it is faster

In [22]:
cos_sim=linear_kernel(tfid_mat,tfid_mat)

In [37]:
# this is bascially the pairwise similarity between each and every movie 
cos_sim

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.01604733, ..., 0.00390818, 0.        ,
        0.        ],
       [0.        , 0.01604733, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.00390818, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [24]:
df_common_id=df_common_id.reset_index()

In [25]:
movie_names=df_common_id["title"]

In [26]:
name_series=pd.Series(df_common_id.index,index=df_common_id["title"])

In [27]:
name_series

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
Shin Godzilla                                         9094
The Beatles: Eight Days a Week - The Touring Years    9095
Pokémon: Spell of the Unknown                         9096
Pokémon 4Ever: Celebi - Voice of the Forest           9097
Force Majeure                                         9098
Length: 9099, dtype: int64

In [38]:
# this function returns 20 most similar movies to the based on cosine similarity score 
def recommend_related_to(title):
    index=name_series[title]
    s_score=list(enumerate(cos_sim[index]))
    s_score=sorted(s_score,key= lambda x:x[1],reverse=True)
    s_score=s_score[1:21]
    movie_ind=[i[0] for i in s_score]
    return movie_names.iloc[movie_ind]
    

In [39]:
recommend_related_to("Star Wars").head(10)

949                          The Empire Strikes Back
8755                    Star Wars: The Force Awakens
962                               Return of the Jedi
6690                                 Shrek the Third
6125    Star Wars: Episode III - Revenge of the Sith
4815                               Where Eagles Dare
7539                             Shrek Forever After
2896                 On Her Majesty's Secret Service
515                                 Princess Caraboo
5805                                 The Ice Pirates
Name: title, dtype: object

In [40]:
recommend_related_to("Life Is Beautiful").head(10)

472        Schindler's List
6971     The Rape of Europa
3765        The Big Red One
823              Cinderella
4370      Escape to Victory
8660    Challenge to Lassie
8129               Sinister
1495           Mrs. Miniver
4624        Mr & Mrs Bridge
2135           Notting Hill
Name: title, dtype: object