In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = pd.read_csv("../../dataset/movies/IMDB.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,...,averageRating,numVotes,ordering,title,region,language,types,attributes,isOriginalTitle,Description
0,0,0,tt0102926,movie,The Silence of the Lambs,The Silence of the Lambs,0,1991,\N,118,...,8.6,1473918,50,The Silence of the Lambs,US,en,\N,\N,0,"Jodie Foster stars as Clarice Starling, a top ..."
1,1,1,tt0103064,movie,Terminator 2: Judgment Day,Terminator 2: Judgment Day,0,1991,\N,137,...,8.6,1128166,17,Terminator 2: Judgment Day,US,en,dvd,\N,0,"In this sequel set eleven years after ""The Ter..."
2,2,3,tt0110357,movie,The Lion King,The Lion King,0,1994,\N,88,...,8.5,1090882,18,The Lion King 3D,US,en,\N,3-D version,0,This Disney animated feature follows the adven...
3,3,4,tt0110912,movie,Pulp Fiction,Pulp Fiction,0,1994,\N,154,...,8.9,2118762,22,Pulp Fiction,US,en,\N,\N,0,Vincent Vega (John Travolta) and Jules Winnfie...
4,4,5,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0,1994,\N,142,...,9.3,2759621,2,The Shawshank Redemption,US,en,\N,\N,0,Andy Dufresne (Tim Robbins) is sentenced to tw...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7850 entries, 0 to 7849
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       7850 non-null   int64  
 1   index            7850 non-null   int64  
 2   tconst           7850 non-null   object 
 3   titleType        7850 non-null   object 
 4   primaryTitle     7850 non-null   object 
 5   originalTitle    7850 non-null   object 
 6   isAdult          7850 non-null   int64  
 7   startYear        7850 non-null   int64  
 8   endYear          7850 non-null   object 
 9   runtimeMinutes   7850 non-null   object 
 10  genres           7850 non-null   object 
 11  averageRating    7850 non-null   float64
 12  numVotes         7850 non-null   int64  
 13  ordering         7850 non-null   int64  
 14  title            7850 non-null   object 
 15  region           7850 non-null   object 
 16  language         7850 non-null   object 
 17  types         

In [5]:
data.describe()

Unnamed: 0.1,Unnamed: 0,index,isAdult,startYear,averageRating,numVotes,ordering,isOriginalTitle
count,7850.0,7850.0,7850.0,7850.0,7850.0,7850.0,7850.0,7850.0
mean,3924.5,5286.804076,0.0,2013.204331,7.367605,75278.45,16.440382,0.0
std,2266.24414,2851.482333,0.0,6.987697,0.605137,177979.4,12.87129,0.0
min,0.0,0.0,0.0,1990.0,6.5,6.0,1.0,0.0
25%,1962.25,2847.25,0.0,2008.0,6.9,6035.0,6.0,0.0
50%,3924.5,5284.5,0.0,2015.0,7.3,14841.5,13.0,0.0
75%,5886.75,7602.75,0.0,2019.0,7.8,56605.0,24.0,0.0
max,7849.0,10274.0,0.0,2023.0,9.5,2759621.0,119.0,0.0


In [6]:
data.drop(["Unnamed: 0", "index"], axis=1, inplace=True)

In [7]:
y = data.title
X = data.genres

In [8]:
X.head()

0         Crime,Drama,Thriller
1                Action,Sci-Fi
2    Adventure,Animation,Drama
3                  Crime,Drama
4                        Drama
Name: genres, dtype: object

In [9]:
import re

def genres(row):
    categories = re.sub("-","",row).lower().split(",")
    genres = " ".join(categories)
    return genres

X = X.apply(genres)

In [10]:
X.head()

0         crime drama thriller
1                 action scifi
2    adventure animation drama
3                  crime drama
4                        drama
Name: genres, dtype: object

In [11]:
X.replace("\\N", np.nan, inplace=True)

In [12]:
mov_idx = pd.Series(y.index, index=y)
mov_idx

title
The Silence of the Lambs                     0
Terminator 2: Judgment Day                   1
The Lion King 3D                             2
Pulp Fiction                                 3
The Shawshank Redemption                     4
                                          ... 
The Blonde One                            7845
Trailer Park Boys: The Animated Series    7846
Two of Us                                 7847
Fin De Siglo                              7848
Blown Away                                7849
Length: 7850, dtype: int64

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X)
X

<7850x26 sparse matrix of type '<class 'numpy.float64'>'
	with 19314 stored elements in Compressed Sparse Row format>

In [16]:
def recommend(title):
    """Returns the top 5 recommended films for the user
    inputs:
        - title: a string of the film watched.
    output:
        - returns a list of 5 most recommended films related to the title.
    """
    # locate the index of the movie title in the dataset
    idx = mov_idx[title]
    # if title appears multiple times, use the first occurence
    if type(idx) == pd.Series:
        idx = idx.iloc[0]
    # use the idx to get hold of the location of the movie in the vector space
    query = X[idx]
    # calculate the distance with neighboring points
    scores = cosine_similarity(query, X)
    # align the distances
    scores = scores.flatten()
    # sort out the first 5 shortest distances
    recommended_idx = (-scores).argsort()[1:6]
    # return these shortest distances
    return y.iloc[recommended_idx]

In [17]:
recommend("The Silence of the Lambs")

2526                       The Silence
5940                          Beartown
2502                           Dhamaka
2479                             Sunny
5958    Beasts That Cling to the Straw
Name: title, dtype: object