TMDB Movies Dataset

In [3]:
import pandas as pd

In [4]:
movies_df = pd.read_csv('/content/top10K-TMDB-movies.csv')
movies_df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [5]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [6]:
movies_df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [7]:
movies_df.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

Feature Selection

In [8]:
movies_df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [9]:
movies_df = movies_df[['id', 'title', 'genre', 'overview', 'vote_average']]
movies_df.head()

Unnamed: 0,id,title,genre,overview,vote_average
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,8.7
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...",8.7
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...",8.7
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,8.6
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,8.6


In [10]:
# replacing null values with null string
for _ in movies_df:
  movies_df[_] = movies_df[_].fillna('')

In [11]:
# combining the overview, title and genre columns to form a new df
combined_df = movies_df['overview'] + movies_df['genre'] + movies_df['title']
combined_df.head()

0    Framed in the 1940s for the double murder of h...
1    Raj is a rich, carefree, happy-go-lucky second...
2    Spanning the years 1945 to 1955, a chronicle o...
3    The true story of how businessman Oskar Schind...
4    In the continuing saga of the Corleone crime f...
dtype: object

In [12]:
# converting the textual data into feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_df)
print(feature_vectors)

  (0, 25604)	0.1344622584085683
  (0, 7804)	0.11058487773720287
  (0, 9581)	0.04480962086184883
  (0, 16103)	0.10984642387744979
  (0, 28196)	0.12295291382440583
  (0, 33275)	0.18165317519060367
  (0, 17380)	0.1620993878936768
  (0, 25599)	0.11774934877252444
  (0, 21963)	0.08794451694671376
  (0, 24638)	0.1332536474741827
  (0, 22732)	0.11319329272819904
  (0, 17024)	0.1086165024388156
  (0, 17255)	0.15094290163300397
  (0, 22966)	0.08035042568809286
  (0, 4453)	0.04898759633361948
  (0, 983)	0.16783461044804937
  (0, 3091)	0.06481268857734351
  (0, 6664)	0.09129138579746301
  (0, 30029)	0.15242031630028768
  (0, 19411)	0.09136296926692676
  (0, 10811)	0.08140997230289097
  (0, 34107)	0.15242031630028768
  (0, 1700)	0.17146936841458296
  (0, 1727)	0.08727394959090479
  (0, 34823)	0.09072778713814175
  :	:
  (9999, 1312)	0.13526717200598667
  (9999, 18402)	0.1335811869117469
  (9999, 15703)	0.11271296978179708
  (9999, 28107)	0.17351265349203915
  (9999, 14110)	0.14012917527860708
  (9

In [13]:
feature_vectors.shape

(10000, 35251)

In [14]:
# getting the simmilarities using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.05647763 0.06289493 ... 0.06334534 0.06980392 0.05548292]
 [0.05647763 1.         0.03866993 ... 0.02727597 0.02893695 0.02663007]
 [0.06289493 0.03866993 1.         ... 0.0329877  0.05030671 0.01466751]
 ...
 [0.06334534 0.02727597 0.0329877  ... 1.         0.03700292 0.0156901 ]
 [0.06980392 0.02893695 0.05030671 ... 0.03700292 1.         0.03487149]
 [0.05548292 0.02663007 0.01466751 ... 0.0156901  0.03487149 1.        ]]


In [15]:
similarity.shape

(10000, 10000)

In [16]:
# creating a list of all the movie names given in the dataset
list_titles = movies_df['title'].tolist()
print(list_titles)



In [17]:
# getting the movie names from the user
movie_name = input('Enter your favourite movie name : ')

Enter your favourite movie name : Spirited Away


In [18]:
# getting the close matches of the movies entered by the users
import difflib
close_match = difflib.get_close_matches(movie_name, list_titles)
print(close_match)

['Spirited Away', 'Swept Away', 'Swept Away']


In [19]:
close_match = close_match[0]
print(close_match)

Spirited Away


In [20]:
# index of the movie with title
index_movie = movies_df[movies_df.title == close_match]['id'].values[0]
print(index_movie)

129


In [21]:
# getting a list of simmilar movies
similarity_score = list(enumerate(similarity[index_movie]))
print(similarity_score)

[(0, 0.0485016286759431), (1, 0.03106957643891788), (2, 0.0324871004271606), (3, 0.05635073359183375), (4, 0.03742140116807384), (5, 0.06392219505980752), (6, 0.024359103006145216), (7, 0.02143417321991079), (8, 0.015050082704198539), (9, 0.03498109352874779), (10, 0.057484766371881796), (11, 0.03391488500036302), (12, 0.016470315374493373), (13, 0.01865248570916251), (14, 0.028465731083205208), (15, 0.044762096027359755), (16, 0.059435806500850265), (17, 0.022441985663460028), (18, 0.056449611860628295), (19, 0.06148804596589426), (20, 0.04058462037221319), (21, 0.033748009907782484), (22, 0.02405986967751925), (23, 0.03875345683030067), (24, 0.03965303953159251), (25, 0.051164555787486854), (26, 0.03566444282094403), (27, 0.04368478444218653), (28, 0.02520510581453527), (29, 0.03976602842041982), (30, 0.08563283340038444), (31, 0.03272379288044401), (32, 0.028591155550752295), (33, 0.017743998898427187), (34, 0.049371008683001615), (35, 0.022594132600643323), (36, 0.02827796245901757

In [22]:
# sorting the movies based on their simmilarity scores
sorted_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_movies)

[(129, 1.0000000000000004), (4547, 0.24366801831615678), (517, 0.22546299928075667), (364, 0.2148752046288188), (8368, 0.17846129577776595), (1635, 0.17832828365286477), (3983, 0.16682096471422705), (6585, 0.1637098347663317), (1179, 0.15952387316318198), (9969, 0.1579843646086591), (1678, 0.1569468383562954), (5921, 0.15592475180768275), (3985, 0.15260102439518508), (1739, 0.14915127491604302), (8271, 0.1458745459441369), (4665, 0.14506384904091543), (8788, 0.13767094370158992), (5470, 0.13429637690776797), (5569, 0.1339080267324762), (4071, 0.132828527023455), (4298, 0.1324109699613377), (541, 0.1313938478315216), (9224, 0.13132450214555003), (9772, 0.13055762362921045), (1929, 0.13051872603997833), (2175, 0.13039888597886898), (3351, 0.12939609703666285), (4715, 0.12805410988810442), (5794, 0.1271689917454471), (287, 0.12683970937338035), (1759, 0.12359038995707297), (351, 0.12325831563414104), (960, 0.12325491584353669), (1928, 0.12280349797403971), (3060, 0.12177021682346251), (52

In [23]:
# print the name of simmilar movies based on the index
print('Movies suggested for you : \n')

i = 1

for movie in sorted_movies:
  index = movie[0]
  title_from_index = movies_df[movies_df.index==index]['title'].values[0]
  if (i<10):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Children of Paradise
2 . Days of Glory
3 . Hiroshima Mon Amour
4 . The Battle of Algiers
5 . Nothing to Declare
6 . Frantz
7 . Escape to Victory
8 . Rifkin's Festival
9 . The French Connection


In [25]:
movie_name = input('Enter your favourite movie name : ')

list_titles = movies_df['title'].tolist()

close_match = difflib.get_close_matches(movie_name, list_titles)

close_match = close_match[0]

index_movie = movies_df[movies_df.title == close_match]['id'].values[0]

similarity_score = list(enumerate(similarity[index_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_df[movies_df.index==index]['title'].values[0]
  if (i<10):
    print(i, '.',title_from_index)
    i+=1

Enter your favourite movie name : Spirited Away
Movies suggested for you : 

1 . Children of Paradise
2 . Days of Glory
3 . Hiroshima Mon Amour
4 . The Battle of Algiers
5 . Nothing to Declare
6 . Frantz
7 . Escape to Victory
8 . Rifkin's Festival
9 . The French Connection
