In [19]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.compose import make_column_selector

In [56]:
ratings = pd.read_csv('https://datasets.imdbws.com/title.ratings.tsv.gz')

In [7]:
basics = pd.read_csv('https://datasets.imdbws.com/title.basics.tsv.gz', sep='\t', low_memory=False)

In [8]:
akas = pd.read_csv('https://datasets.imdbws.com/title.akas.tsv.gz', sep='\t', low_memory=False)

In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360168 entries, 0 to 1360167
Data columns (total 1 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   tconst	averageRating	numVotes  1360168 non-null  object
dtypes: object(1)
memory usage: 10.4+ MB


In [9]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10246265 entries, 0 to 10246264
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 703.6+ MB


In [10]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37517220 entries, 0 to 37517219
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [48]:
#BASIC
basics.replace({'\\N':np.nan}, inplace =True)

In [15]:
basics.dropna(subset=['runtimeMinutes'], inplace = True)

In [17]:
basics.dropna(subset=['genres'], inplace = True)

In [20]:
cat_cols = make_column_selector(dtype_include='object')(basics)
for col in cat_cols:
  print(f'Value Counts for {col}')
  print(basics[col].value_counts(normalize=True))
  print('\n')

Value Counts for tconst
tt0000001     9.759671e-08
tt28379155    9.759671e-08
tt28379149    9.759671e-08
tt2837914     9.759671e-08
tt28379150    9.759671e-08
                  ...     
tt14478062    9.759671e-08
tt14478066    9.759671e-08
tt14478068    9.759671e-08
tt1447806     9.759671e-08
tt9916880     9.759671e-08
Name: tconst, Length: 10246247, dtype: float64


Value Counts for titleType
tvEpisode       7.624369e-01
short           9.342006e-02
movie           6.443354e-02
video           2.748958e-02
tvSeries        2.447960e-02
tvMovie         1.397936e-02
tvMiniSeries    4.962207e-03
tvSpecial       4.282348e-03
videoGame       3.536319e-03
tvShort         9.799686e-04
tvPilot         9.759671e-08
Name: titleType, dtype: float64


Value Counts for primaryTitle
Episode #1.1                 4.827236e-03
Episode #1.2                 4.360626e-03
Episode #1.3                 4.129126e-03
Episode #1.4                 3.822867e-03
Episode #1.5                 3.499919e-03
          

In [27]:
basics=basics.query("titleType == 'movie'")

In [99]:
basics = basics.drop(basics[basics['startYear'] < '2000'].index)

In [101]:
basics = basics.drop(basics[basics['startYear'] > '2021'].index)

In [84]:
basics = basics.drop(basics[basics['genres'] == 'Documentary'].index)

In [100]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
69150,tt0070596,movie,Socialist Realism,El realismo socialista,0,2023,,78,Drama


In [86]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [51]:
#AKAS
akas.replace({'\\N':np.nan}, inplace =True)

In [52]:
akas=akas.query("region == 'US'")

In [53]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [57]:
ratings.head()

Unnamed: 0,tconst\taverageRating\tnumVotes
0,tt0000001\t5.7\t2002
1,tt0000002\t5.8\t269
2,tt0000003\t6.5\t1893
3,tt0000004\t5.5\t178
4,tt0000005\t6.2\t2678


In [58]:
#Ratings
ratings.replace({'\\N':np.nan}, inplace =True)

In [105]:
# Filter one dataframe based on another
keepers = basics['tconst'].isin(akas['titleId'])
keepers

34800       True
61111       True
67485       True
67663       True
80548       True
            ... 
10245946    True
10245947    True
10246031    True
10246063    True
10246155    True
Name: tconst, Length: 166229, dtype: bool

In [106]:
basics = basics[keepers]

In [108]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 166229 entries, 34800 to 10246155
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          166229 non-null  object 
 1   titleType       166229 non-null  object 
 2   primaryTitle    166229 non-null  object 
 3   originalTitle   166229 non-null  object 
 4   isAdult         166229 non-null  object 
 5   startYear       110769 non-null  object 
 6   endYear         0 non-null       float64
 7   runtimeMinutes  99184 non-null   object 
 8   genres          161792 non-null  object 
dtypes: float64(1), object(8)
memory usage: 12.7+ MB


In [109]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1472163 entries, 5 to 37516964
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1472163 non-null  object
 1   ordering         1472163 non-null  int64 
 2   title            1472163 non-null  object
 3   region           1472163 non-null  object
 4   language         4168 non-null     object
 5   types            984662 non-null   object
 6   attributes       47623 non-null    object
 7   isOriginalTitle  1470822 non-null  object
dtypes: int64(1), object(7)
memory usage: 101.1+ MB


In [64]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360168 entries, 0 to 1360167
Data columns (total 1 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   tconst	averageRating	numVotes  1360168 non-null  object
dtypes: object(1)
memory usage: 10.4+ MB


In [110]:
basics.to_csv("basics.csv", index = False)

In [103]:
akas.to_csv("akas.csv", index = False)

In [104]:
ratings.to_csv("ratings.csv", index = False)