# Imports

In [1]:
import os
os.makedirs("Data/",exist_ok=True)
os.listdir("Data/")

['title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'title-akas-us-only.csv',
 '.ipynb_checkpoints']

In [2]:
import pandas as pd
import numpy as np

# Load in Data

In [3]:
basics = pd.read_csv('Data/title.basics.tsv.gz', sep='\t', low_memory=False)


In [4]:
aka = pd.read_csv('Data/title-akas-us-only.csv', low_memory=False)


In [5]:
ratings = pd.read_csv('Data/title.ratings.tsv.gz', sep='\t', low_memory=False)


In [6]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1989
1,tt0000002,5.8,264
2,tt0000003,6.5,1866
3,tt0000004,5.5,177
4,tt0000005,6.2,2648


In [7]:
aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [8]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Preprocessing

## Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)


In [9]:
filtered_aka = aka[aka['region'] == "US"]
filtered_aka

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
1452559,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,\N,imdbDisplay,\N,0
1452560,tt9916620,1,The Copeland Case,US,\N,imdbDisplay,\N,0
1452561,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
1452562,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


## Replace "\N" with np.nan


In [10]:
# Replacing "\N" with np.nan
filtered_aka = filtered_aka.replace({'\\N':np.nan})
filtered_aka

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
1452559,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
1452560,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0
1452561,tt9916702,1,Loving London: The Playground,US,,,,0
1452562,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


## Eliminate movies that are null for runtimeMinutes and genres


In [11]:
# Eliminating movies with null runtimeMinutes and genres
basics_filtered = basics.copy()
basics_filtered.dropna(subset=['runtimeMinutes','genres'], inplace=True)
basics_filtered.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [12]:
basics_filtered['titleType'].nunique()

11

In [13]:
basics_filtered['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

## Keep only titleType==Movie


In [14]:
basics_filtered = basics_filtered[basics_filtered['titleType'] == "movie"]
basics_filtered.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


# Include only movies that were released 2000 - 2021 (include 2000 and 2021)


In [15]:
# Replacing "\N" with np.nan
basics_filtered['startYear'] = basics_filtered['startYear'].replace({'\\N':np.nan})
basics_filtered.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


In [16]:
basics_filtered['startYear']=basics_filtered['startYear'].astype(float)
basics_filtered.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905.0,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,\N,90,Drama


In [17]:
basics_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 655372 entries, 8 to 10113234
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          655372 non-null  object 
 1   titleType       655372 non-null  object 
 2   primaryTitle    655372 non-null  object 
 3   originalTitle   655372 non-null  object 
 4   isAdult         655372 non-null  object 
 5   startYear       562903 non-null  float64
 6   endYear         655372 non-null  object 
 7   runtimeMinutes  655372 non-null  object 
 8   genres          655372 non-null  object 
dtypes: float64(1), object(8)
memory usage: 50.0+ MB


In [18]:
basics_filtered = basics_filtered[(basics_filtered['startYear'] >= 2000) & (basics_filtered['startYear'] <= 2021)]
basics_filtered.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019.0,\N,\N,"Action,Crime"
13081,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,\N,94,Documentary
15176,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000.0,\N,60,\N
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,\N,118,"Comedy,Fantasy,Romance"
61112,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,\N,70,Drama


## Include only fictional movies (not from the Documentary genre)


In [19]:
# Exclude movies that are included in the documentary category.
is_documentary = basics_filtered['genres'].str.contains('Documentary',case=False)
basics_filtered = basics_filtered[~is_documentary]
basics_filtered.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019.0,\N,\N,"Action,Crime"
15176,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000.0,\N,60,\N
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,\N,118,"Comedy,Fantasy,Romance"
61112,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,\N,70,Drama
67486,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,\N,90,Drama


In [20]:
basics_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187532 entries, 11636 to 10113224
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          187532 non-null  object 
 1   titleType       187532 non-null  object 
 2   primaryTitle    187532 non-null  object 
 3   originalTitle   187532 non-null  object 
 4   isAdult         187532 non-null  object 
 5   startYear       187532 non-null  float64
 6   endYear         187532 non-null  object 
 7   runtimeMinutes  187532 non-null  object 
 8   genres          187532 non-null  object 
dtypes: float64(1), object(8)
memory usage: 14.3+ MB


In [21]:
filtered_aka.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         4018 non-null     object
 5   types            981678 non-null   object
 6   attributes       47016 non-null    object
 7   isOriginalTitle  1451222 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.7+ MB


# Saving file

In [22]:
## Save current dataframe to file.
basics_filtered.to_csv("Data/title_basics_filtered.csv.gz",compression='gzip',index=False)

In [23]:
filtered_aka.to_csv("Data/title-akas-us-only_filtered.csv",index=False)

In [25]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics_filtered.csv.gz", low_memory = False)
basics.head()


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019.0,\N,\N,"Action,Crime"
1,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000.0,\N,60,\N
2,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,\N,118,"Comedy,Fantasy,Romance"
3,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,\N,70,Drama
4,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,\N,90,Drama


In [26]:
aka = pd.read_csv("Data/title-akas-us-only_filtered.csv", low_memory = False)
aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
