In [1]:
import pandas as pd
import numpy as np


In [2]:
from google.colab import files

In [3]:
# Open Colab's file-upload dialog; the result is indexed by file name in the
# next cell to get each uploaded file's raw bytes.
uploaded=files.upload()

Saving IMDB-Movie-Data.csv to IMDB-Movie-Data.csv


In [4]:
import io

# Wrap the uploaded CSV bytes in an in-memory, file-like buffer for pandas.
csv_buffer = io.BytesIO(uploaded['IMDB-Movie-Data.csv'])
df = pd.read_csv(csv_buffer)

# Preview the first two rows.
df.head(2)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0


In order to compare the words in the movie descriptions, we need to vectorize them. I used Term Frequency-Inverse Document Frequency (TF-IDF), which is available in the scikit-learn library, to produce a word-vector matrix.


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

Define the vectorizer and configure it to remove English stop words such as 'and', 'the', 'a', 'an', etc.

In [6]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

Replace missing descriptions (NaN) with an empty string ''.

In [7]:
# Fill missing descriptions with '' so that no NaN reaches the vectorizer.
df['Description'] = df['Description'].fillna('')


Fit and transform the data in order to build the matrix.

In [8]:
# Learn the vocabulary from the descriptions and encode each description as a
# TF-IDF row vector in a single step.
tfidf_matrix = tfidf.fit_transform(df['Description'])
# Display (number of descriptions, number of vocabulary terms).
tfidf_matrix.shape

(1000, 5667)

In [9]:
# Peek at ten vocabulary terms. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is its
# replacement. It returns a NumPy array, so .tolist() keeps the plain-list display.
tfidf.get_feature_names_out()[5000:5010].tolist()

['temporal',
 'temporary',
 'temptation',
 'temptations',
 'tenacious',
 'tenant',
 'tenants',
 'tensions',
 'tenure',
 'tepes']

In [10]:
# 5667 distinct words (after stop-word removal) from 1000 movie descriptions

In [11]:
# I used sklearn's linear_kernel() to get a similarity score between the word vectors

In [12]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between every pair of TF-IDF rows.
# NOTE(review): with TfidfVectorizer's default L2 row normalization this
# should equal cosine similarity — confirm against the vectorizer settings.
sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
# The similarity matrix is square: one row and one column per description.
sim.shape

(1000, 1000)

In [14]:
# Similarity of the movie in row 1 to every movie; entry 1 is its similarity to itself.
sim[1]

array([0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.07710322,
       0.        , 0.04299795, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.08556823,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.05245538, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.02955961, 0.06336434, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

I created a reverse map from each movie title to its row index in the DataFrame.

In [15]:
# Reverse lookup: movie title -> row index in df.
# Series.drop_duplicates() compares the *values* (row indices, which are
# already unique), so it never removes a repeated title. De-duplicate on the
# title index instead, keeping the first occurrence, so that indices[title]
# always yields a single row index rather than a Series.
indices = pd.Series(df.index, index=df['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [16]:
# Peek at the first ten title -> row-index entries.
indices[:10]

Title
Guardians of the Galaxy    0
Prometheus                 1
Split                      2
Sing                       3
Suicide Squad              4
The Great Wall             5
La La Land                 6
Mindhorn                   7
The Lost City of Z         8
Passengers                 9
dtype: int64

I created a function that takes a movie title as input and returns its most similar movies (up to 10); the example call below keeps the top 5 recommendations.


In [17]:
def get_recommendations(Title, sim=sim):
  """Return the titles of the movies most similar to ``Title``.

  Args:
    Title: Movie title to look up in the module-level ``indices`` map.
    sim: Square similarity matrix whose row/column ``i`` corresponds to
      row ``i`` of ``df``. Defaults to the module-level ``sim``.

  Returns:
    A pandas Series of up to 10 titles taken from ``df['Title']``, ordered
    from most to least similar. The queried movie itself is never included.

  Raises:
    KeyError: If ``Title`` is not present in ``indices``.
  """
  idx = indices[Title]
  # A title that occurs more than once makes the lookup return a Series of
  # row indices; fall back to the first occurrence so sim[idx] is one row.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]
  # Drop the queried movie by position instead of assuming it sorts first:
  # a tie at the top score (or an all-zero vector from an empty description)
  # can place a different movie at position 0.
  sim_scores = [(i, score) for i, score in enumerate(sim[idx]) if i != idx]
  # sorted() is stable, so equally similar movies keep their row order.
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[:10]
  movie_indices = [i[0] for i in sim_scores]
  return df['Title'].iloc[movie_indices]
  

In [18]:
# Example query: keep only the five highest-ranked recommendations.
get_recommendations('Kingsman: The Secret Service').head(5)

166    Now You See Me 2
4         Suicide Squad
383     Bridge of Spies
597             Grimsby
466                Salt
Name: Title, dtype: object