In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.compose import make_column_selector

In [2]:
ratings = pd.read_csv('https://datasets.imdbws.com/title.ratings.tsv.gz')

In [3]:
basics = pd.read_csv('https://datasets.imdbws.com/title.basics.tsv.gz', sep='\t', low_memory=False)

In [4]:
akas = pd.read_csv('https://datasets.imdbws.com/title.akas.tsv.gz', sep='\t', low_memory=False)

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366672 entries, 0 to 1366671
Data columns (total 1 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   tconst	averageRating	numVotes  1366672 non-null  object
dtypes: object(1)
memory usage: 10.4+ MB


In [6]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10293183 entries, 0 to 10293182
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 706.8+ MB


In [7]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37692942 entries, 0 to 37692941
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [8]:
#BASIC
basics.replace({'\\N':np.nan}, inplace =True)

In [9]:
basics.dropna(subset=['runtimeMinutes'], inplace = True)

In [10]:
basics.dropna(subset=['genres'], inplace = True)

In [11]:
basics.dropna(subset=['startYear'], inplace = True)

In [12]:
#basics['startYear'] = basics['startYear'].astype(str)
#print(basics.dtypes)

In [12]:
cat_cols = make_column_selector(dtype_include='object')(basics)
for col in cat_cols:
  print(f'Value Counts for {col}')
  print(basics[col].value_counts(normalize=True))
  print('\n')

Value Counts for tconst
tconst
tt0000001     3.535280e-07
tt3009538     3.535280e-07
tt3009498     3.535280e-07
tt3009500     3.535280e-07
tt3009508     3.535280e-07
                  ...     
tt1357037     3.535280e-07
tt13570382    3.535280e-07
tt13570384    3.535280e-07
tt13570388    3.535280e-07
tt9916880     3.535280e-07
Name: proportion, Length: 2828630, dtype: float64


Value Counts for titleType
titleType
tvEpisode       0.500217
short           0.216247
movie           0.136191
video           0.065660
tvMovie         0.032773
tvSeries        0.032474
tvSpecial       0.006878
tvMiniSeries    0.006372
tvShort         0.003069
videoGame       0.000119
Name: proportion, dtype: float64


Value Counts for primaryTitle
primaryTitle
Episode #1.1                                                  2.914489e-03
Episode #1.2                                                  2.658181e-03
Episode #1.3                                                  2.426973e-03
Episode #1.4                  

In [13]:
basics=basics.query("titleType == 'movie'")

In [14]:
basics = basics[(basics['startYear'] >= '2000') & (basics['startYear'] <= '2021')]

In [15]:
basics = basics.drop(basics[basics['genres'] == 'Documentary'].index)

In [16]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [17]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
86790,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [18]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [19]:
#AKAS
akas.replace({'\\N':np.nan}, inplace =True)

In [20]:
akas=akas.query("region == 'US'")

In [21]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [22]:
ratings.head()

Unnamed: 0,tconst\taverageRating\tnumVotes
0,tt0000001\t5.7\t2005
1,tt0000002\t5.8\t269
2,tt0000003\t6.5\t1907
3,tt0000004\t5.5\t178
4,tt0000005\t6.2\t2687


In [23]:
#Ratings
ratings.replace({'\\N':np.nan}, inplace =True)

In [24]:
# Filter one dataframe based on another
keepers = basics['tconst'].isin(akas['titleId'])
keepers

34800        True
61111        True
67663        True
80548        True
86790        True
            ...  
10292865     True
10292904    False
10292949     True
10293033    False
10293123    False
Name: tconst, Length: 139323, dtype: bool

In [25]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
86790,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
10292322,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
10292716,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019,,97,"Comedy,Drama,Fantasy"
10292856,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
10292865,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [26]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82191 entries, 34800 to 10292949
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          82191 non-null  object
 1   titleType       82191 non-null  object
 2   primaryTitle    82190 non-null  object
 3   originalTitle   82190 non-null  object
 4   isAdult         82191 non-null  object
 5   startYear       82191 non-null  object
 6   endYear         0 non-null      object
 7   runtimeMinutes  82191 non-null  object
 8   genres          82191 non-null  object
dtypes: object(9)
memory usage: 6.3+ MB


In [27]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1475723 entries, 5 to 37692686
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1475723 non-null  object
 1   ordering         1475723 non-null  int64 
 2   title            1475721 non-null  object
 3   region           1475723 non-null  object
 4   language         4195 non-null     object
 5   types            985306 non-null   object
 6   attributes       47693 non-null    object
 7   isOriginalTitle  1474382 non-null  object
dtypes: int64(1), object(7)
memory usage: 101.3+ MB


In [28]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366672 entries, 0 to 1366671
Data columns (total 1 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   tconst	averageRating	numVotes  1366672 non-null  object
dtypes: object(1)
memory usage: 10.4+ MB


In [29]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


['akas.csv.gz', 'ratings.csv.gz', 'title_basics.csv.gz']

In [30]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [31]:
## Save current dataframe to file.
akas.to_csv("Data/akas.csv.gz",compression='gzip',index=False)

In [32]:
## Save current dataframe to file.
ratings.to_csv("Data/ratings.csv.gz",compression='gzip',index=False)

In [33]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
