In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.compose import make_column_selector

In [2]:
ratings = pd.read_csv('https://datasets.imdbws.com/title.ratings.tsv.gz')

In [3]:
basics = pd.read_csv('https://datasets.imdbws.com/title.basics.tsv.gz', sep='\t', low_memory=False)

In [4]:
akas = pd.read_csv('https://datasets.imdbws.com/title.akas.tsv.gz', sep='\t', low_memory=False)

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366349 entries, 0 to 1366348
Data columns (total 1 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   tconst	averageRating	numVotes  1366349 non-null  object
dtypes: object(1)
memory usage: 10.4+ MB


In [6]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10288060 entries, 0 to 10288059
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 706.4+ MB


In [7]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37674437 entries, 0 to 37674436
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [8]:
#BASIC
basics.replace({'\\N':np.nan}, inplace =True)

In [9]:
basics.dropna(subset=['runtimeMinutes'], inplace = True)

In [10]:
basics.dropna(subset=['genres'], inplace = True)

In [11]:
basics.dropna(subset=['startYear'], inplace = True)

In [26]:
#basics['startYear'] = basics['startYear'].astype(str)
#print(basics.dtypes)

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object


In [12]:
cat_cols = make_column_selector(dtype_include='object')(basics)
for col in cat_cols:
  print(f'Value Counts for {col}')
  print(basics[col].value_counts(normalize=True))
  print('\n')

Value Counts for tconst
tconst
tt0000001     3.536579e-07
tt3010844     3.536579e-07
tt3010758     3.536579e-07
tt3010766     3.536579e-07
tt3010776     3.536579e-07
                  ...     
tt13568848    3.536579e-07
tt13568854    3.536579e-07
tt13568858    3.536579e-07
tt1356885     3.536579e-07
tt9916880     3.536579e-07
Name: proportion, Length: 2827591, dtype: float64


Value Counts for titleType
titleType
tvEpisode       0.500158
short           0.216267
movie           0.136211
video           0.065669
tvMovie         0.032784
tvSeries        0.032475
tvSpecial       0.006877
tvMiniSeries    0.006370
tvShort         0.003069
videoGame       0.000120
Name: proportion, dtype: float64


Value Counts for primaryTitle
primaryTitle
Episode #1.1                 2.914146e-03
Episode #1.2                 2.658450e-03
Episode #1.3                 2.427158e-03
Episode #1.4                 2.224158e-03
Episode #1.5                 1.997109e-03
                                 ...     
Mir

In [13]:
basics=basics.query("titleType == 'movie'")

In [14]:
basics = basics[(basics['startYear'] == '2000')]# & (basics['startYear'] <= '2021')]

In [14]:
#basics = basics.drop(basics[basics['startYear'] > '2021'].index)

In [15]:
basics = basics.drop(basics[basics['genres'] == 'Documentary'].index)

In [16]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [17]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
100065,tt0102362,movie,Istota,Istota,0,2000,,80,"Drama,Romance"
110465,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000,,86,"Musical,Romance"
110528,tt0113092,movie,For the Cause,For the Cause,0,2000,,100,"Action,Adventure,Drama"
113272,tt0115937,movie,Consequence,Consequence,0,2000,,91,Drama
113711,tt0116391,movie,Gang,Gang,0,2000,,167,"Action,Crime,Drama"


In [18]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [27]:
#AKAS
akas.replace({'\\N':np.nan}, inplace =True)

In [19]:
akas=akas.query("region == 'US'")

In [20]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [21]:
ratings.head()

Unnamed: 0,tconst\taverageRating\tnumVotes
0,tt0000001\t5.7\t2004
1,tt0000002\t5.8\t269
2,tt0000003\t6.5\t1903
3,tt0000004\t5.5\t178
4,tt0000005\t6.2\t2685


In [22]:
#Ratings
ratings.replace({'\\N':np.nan}, inplace =True)

In [23]:
# Filter one dataframe based on another
keepers = basics['tconst'].isin(akas['titleId'])
keepers

100065      False
110465       True
110528       True
113272       True
113711       True
            ...  
9880284     False
9903677     False
9959916     False
10054275     True
10088732    False
Name: tconst, Length: 2760, dtype: bool

In [24]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
110465,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000,,86,"Musical,Romance"
110528,tt0113092,movie,For the Cause,For the Cause,0,2000,,100,"Action,Adventure,Drama"
113272,tt0115937,movie,Consequence,Consequence,0,2000,,91,Drama
113711,tt0116391,movie,Gang,Gang,0,2000,,167,"Action,Crime,Drama"
113937,tt0116628,movie,The Incorporated,The Incorporated,0,2000,,86,"Action,Thriller"
...,...,...,...,...,...,...,...,...,...
9557287,tt8327752,movie,Unknown the Great: The Life & Times of Buddy S...,Unknown the Great: The Life & Times of Buddy S...,0,2000,,77,"Comedy,Music"
9661855,tt8553964,movie,Cotton Fleece,Cotton Fleece,0,2000,,90,Drama
9821254,tt8907070,movie,Lost in the Wilderness,Lost in the Wilderness,0,2000,,77,Comedy
9843228,tt8954964,movie,Good Luck,Good Luck,0,2000,,142,Drama


In [25]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1464 entries, 110465 to 10054275
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          1464 non-null   object
 1   titleType       1464 non-null   object
 2   primaryTitle    1464 non-null   object
 3   originalTitle   1464 non-null   object
 4   isAdult         1464 non-null   object
 5   startYear       1464 non-null   object
 6   endYear         0 non-null      object
 7   runtimeMinutes  1464 non-null   object
 8   genres          1464 non-null   object
dtypes: object(9)
memory usage: 114.4+ KB


In [26]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1475342 entries, 5 to 37674181
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1475342 non-null  object
 1   ordering         1475342 non-null  int64 
 2   title            1475340 non-null  object
 3   region           1475342 non-null  object
 4   language         1475342 non-null  object
 5   types            1475342 non-null  object
 6   attributes       1475342 non-null  object
 7   isOriginalTitle  1475342 non-null  object
dtypes: int64(1), object(7)
memory usage: 101.3+ MB


In [27]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366349 entries, 0 to 1366348
Data columns (total 1 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   tconst	averageRating	numVotes  1366349 non-null  object
dtypes: object(1)
memory usage: 10.4+ MB


In [28]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


['akas.csv.gz', 'ratings.csv.gz', 'title_basics.csv.gz']

In [29]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [30]:
## Save current dataframe to file.
akas.to_csv("Data/akas.csv.gz",compression='gzip',index=False)

In [31]:
## Save current dataframe to file.
ratings.to_csv("Data/ratings.csv.gz",compression='gzip',index=False)

In [32]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000,,86,"Musical,Romance"
1,tt0113092,movie,For the Cause,For the Cause,0,2000,,100,"Action,Adventure,Drama"
2,tt0115937,movie,Consequence,Consequence,0,2000,,91,Drama
3,tt0116391,movie,Gang,Gang,0,2000,,167,"Action,Crime,Drama"
4,tt0116628,movie,The Incorporated,The Incorporated,0,2000,,86,"Action,Thriller"
