In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.compose import make_column_selector

In [2]:
ratings = pd.read_csv('https://datasets.imdbws.com/title.ratings.tsv.gz')

In [3]:
basics = pd.read_csv('https://datasets.imdbws.com/title.basics.tsv.gz', sep='\t', low_memory=False)

In [4]:
akas = pd.read_csv('https://datasets.imdbws.com/title.akas.tsv.gz', sep='\t', low_memory=False)

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366349 entries, 0 to 1366348
Data columns (total 1 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   tconst	averageRating	numVotes  1366349 non-null  object
dtypes: object(1)
memory usage: 10.4+ MB


In [6]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10288060 entries, 0 to 10288059
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 706.4+ MB


In [7]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37674437 entries, 0 to 37674436
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [8]:
#BASIC
basics.replace({'\\N':np.nan}, inplace =True)

In [9]:
basics.dropna(subset=['runtimeMinutes'], inplace = True)

In [10]:
basics.dropna(subset=['genres'], inplace = True)

In [23]:
basics.dropna(subset=['startYear'], inplace = True)

In [26]:
basics['startYear'] = basics['startYear'].astype(str)
print(basics.dtypes)

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object


In [12]:
cat_cols = make_column_selector(dtype_include='object')(basics)
for col in cat_cols:
  print(f'Value Counts for {col}')
  print(basics[col].value_counts(normalize=True))
  print('\n')

Value Counts for tconst
tconst
tt0000001     3.312776e-07
tt29090231    3.312776e-07
tt29090218    3.312776e-07
tt29090219    3.312776e-07
tt29090220    3.312776e-07
                  ...     
tt13517030    3.312776e-07
tt13517038    3.312776e-07
tt13517062    3.312776e-07
tt13517078    3.312776e-07
tt9916880     3.312776e-07
Name: proportion, Length: 3018616, dtype: float64


Value Counts for titleType
titleType
tvEpisode       0.526447
short           0.205036
movie           0.129795
video           0.061700
tvMovie         0.030821
tvSeries        0.030713
tvSpecial       0.006454
tvMiniSeries    0.006017
tvShort         0.002904
videoGame       0.000113
Name: proportion, dtype: float64


Value Counts for primaryTitle
primaryTitle
Episode #1.1                                                                      2.765841e-03
Episode #1.2                                                                      2.549185e-03
Episode #1.3                                                     

In [13]:
basics=basics.query("titleType == 'movie'")

In [29]:
basics = basics[(basics['startYear'] > '2000') & (basics['startYear'] <= '2021')]

In [14]:
#basics = basics.drop(basics[basics['startYear'] > '2021'].index)

In [30]:
basics = basics.drop(basics[basics['genres'] == 'Documentary'].index)

In [37]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [38]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
86790,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93927,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
101030,tt0103340,movie,Life for Life: Maximilian Kolbe,Zycie za zycie. Maksymilian Kolbe,0,2006,,90,"Biography,Drama"


In [39]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [40]:
#AKAS
akas.replace({'\\N':np.nan}, inplace =True)

In [41]:
akas=akas.query("region == 'US'")

In [42]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [43]:
ratings.head()

Unnamed: 0,tconst\taverageRating\tnumVotes
0,tt0000001\t5.7\t2004
1,tt0000002\t5.8\t269
2,tt0000003\t6.5\t1903
3,tt0000004\t5.5\t178
4,tt0000005\t6.2\t2685


In [44]:
#Ratings
ratings.replace({'\\N':np.nan}, inplace =True)

In [45]:
# Filter one dataframe based on another
keepers = basics['tconst'].isin(akas['titleId'])
keepers

34800        True
80548        True
86790        True
93927        True
101030       True
            ...  
10271668    False
10272879     True
10280514     True
10282922     True
10284837    False
Name: tconst, Length: 37061, dtype: bool

In [46]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
86790,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93927,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
101030,tt0103340,movie,Life for Life: Maximilian Kolbe,Zycie za zycie. Maksymilian Kolbe,0,2006,,90,"Biography,Drama"
...,...,...,...,...,...,...,...,...,...
10269386,tt9876412,movie,Star Trek: The Final Darkness,Star Trek: The Final Darkness,0,2004,,85,Sci-Fi
10270492,tt9878852,movie,The Prodigy: Music in Review,The Prodigy: Music in Review,0,2007,,60,Music
10272879,tt9884086,movie,Flashback,Flashback,0,2009,,80,Thriller
10280514,tt9900614,movie,Lady Boss,Lady Boss,0,2005,,111,"Comedy,Family,Romance"


In [47]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20544 entries, 34800 to 10282922
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          20544 non-null  object
 1   titleType       20544 non-null  object
 2   primaryTitle    20544 non-null  object
 3   originalTitle   20544 non-null  object
 4   isAdult         20544 non-null  object
 5   startYear       20544 non-null  object
 6   endYear         0 non-null      object
 7   runtimeMinutes  20544 non-null  object
 8   genres          20544 non-null  object
dtypes: object(9)
memory usage: 1.6+ MB


In [48]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1475342 entries, 5 to 37674181
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1475342 non-null  object
 1   ordering         1475342 non-null  int64 
 2   title            1475340 non-null  object
 3   region           1475342 non-null  object
 4   language         4194 non-null     object
 5   types            985194 non-null   object
 6   attributes       47688 non-null    object
 7   isOriginalTitle  1474001 non-null  object
dtypes: int64(1), object(7)
memory usage: 101.3+ MB


In [49]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366349 entries, 0 to 1366348
Data columns (total 1 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   tconst	averageRating	numVotes  1366349 non-null  object
dtypes: object(1)
memory usage: 10.4+ MB


In [51]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


[]

In [52]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [54]:
## Save current dataframe to file.
akas.to_csv("Data/akas.csv.gz",compression='gzip',index=False)

In [55]:
## Save current dataframe to file.
ratings.to_csv("Data/ratings.csv.gz",compression='gzip',index=False)

In [53]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
2,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
3,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
4,tt0103340,movie,Life for Life: Maximilian Kolbe,Zycie za zycie. Maksymilian Kolbe,0,2006,,90,"Biography,Drama"
