# Imports

In [2]:
# Imports
import glob
import os

import numpy as np
import pandas as pd

# Data Dictionary

**IMDb Datasets**

Subsets of IMDb data are available for access to customers for personal and non-commercial use. You can hold local copies of this data, and it is subject to our terms and conditions. Please refer to the Non-Commercial Licensing and copyright/license and verify compliance.

**Data Location**

The dataset files can be accessed and downloaded from https://datasets.imdbws.com/. The data is refreshed daily.

**IMDb Dataset Details**

Each dataset is contained in a gzipped, tab-separated-values (TSV) formatted file in the UTF-8 character set. The first line in each file contains headers that describe what is in each column. A ‘\N’ is used to denote that a particular field is missing or null for that title/name. The available datasets are as follows:

**title.akas.tsv.gz** - Contains the following information for titles:

- titleId (string) - a tconst, an alphanumeric unique identifier of the title

- ordering (integer) – a number to uniquely identify rows for a given titleId

- title (string) – the localized title

- region (string) - the region for this version of the title

- language (string) - the language of the title

- types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning

- attributes (array) - Additional terms to describe this alternative title, not enumerated

- isOriginalTitle (boolean) – 0: not original title; 1: original title

**title.basics.tsv.gz** - Contains the following information for titles:

- tconst (string) - alphanumeric unique identifier of the title

- titleType (string) – the type/format of the title (e.g. movie, short, tvSeries, tvEpisode, video, etc.)

- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release

- originalTitle (string) - original title, in the original language

- isAdult (boolean) - 0: non-adult title; 1: adult title

- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year

- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types

- runtimeMinutes – primary runtime of the title, in minutes

- genres (string array) – includes up to three genres associated with the title

**title.ratings.tsv.gz** – Contains the IMDb rating and votes information for titles
- tconst (string) - alphanumeric unique identifier of the title
- averageRating – weighted average of all the individual user ratings
- numVotes - number of votes the title has received

# Loading the Data

In [20]:
# Source locations of the three IMDb dumps used in this notebook
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

# title.basics is tab-separated; read_table uses '\t' as its default delimiter
basics = pd.read_table(basics_url, low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [21]:
# title.ratings is tab-separated as well; load it and preview the first rows
ratings = pd.read_table(ratings_url, low_memory=True)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
2,tt0000003,6.5,1811
3,tt0000004,5.6,178
4,tt0000005,6.2,2609


# Loading/Cleaning Large Akas Dataset 

In [22]:
# Open the akas file as an iterator that yields 100,000-row dataframes
# instead of reading the whole file at once.
df_reader = pd.read_table(akas_url, chunksize=100_000, low_memory=False)
# Counter used to number the chunk files as they are written.
chunk_num = 1

In [23]:
# The akas file is too large to load in one piece, so it is streamed in
# chunks; each chunk is reduced to its US rows and written to its own file.
os.makedirs('Data', exist_ok=True)  # to_csv will not create a missing folder

try:
    # enumerate numbers the chunks from 1 within this cell, so the file names
    # do not depend on a counter left over from a previous run
    for chunk_num, temp_df in enumerate(df_reader, start=1):
        # Keep only rows whose region is the US, then swap IMDb's '\N'
        # placeholder for np.nan (on a new frame rather than in place)
        temp_df = temp_df[temp_df['region'] == 'US'].replace({'\\N': np.nan})

        ### Saving chunk to disk
        fname = f'Data/title_akas_chunk_{chunk_num:03d}.csv.gz'
        temp_df.to_csv(fname, compression='gzip')
        print(f"- Saved {fname}")
finally:
    ## Close the reader even if a chunk fails part-way through
    df_reader.close()

- Saved Data/title_akas_chunk_001.csv.gz
- Saved Data/title_akas_chunk_002.csv.gz
- Saved Data/title_akas_chunk_003.csv.gz
- Saved Data/title_akas_chunk_004.csv.gz
- Saved Data/title_akas_chunk_005.csv.gz
- Saved Data/title_akas_chunk_006.csv.gz
- Saved Data/title_akas_chunk_007.csv.gz
- Saved Data/title_akas_chunk_008.csv.gz
- Saved Data/title_akas_chunk_009.csv.gz
- Saved Data/title_akas_chunk_010.csv.gz
- Saved Data/title_akas_chunk_011.csv.gz
- Saved Data/title_akas_chunk_012.csv.gz
- Saved Data/title_akas_chunk_013.csv.gz
- Saved Data/title_akas_chunk_014.csv.gz
- Saved Data/title_akas_chunk_015.csv.gz
- Saved Data/title_akas_chunk_016.csv.gz
- Saved Data/title_akas_chunk_017.csv.gz
- Saved Data/title_akas_chunk_018.csv.gz
- Saved Data/title_akas_chunk_019.csv.gz
- Saved Data/title_akas_chunk_020.csv.gz
- Saved Data/title_akas_chunk_021.csv.gz
- Saved Data/title_akas_chunk_022.csv.gz
- Saved Data/title_akas_chunk_023.csv.gz
- Saved Data/title_akas_chunk_024.csv.gz
- Saved Data/tit

- Saved Data/title_akas_chunk_201.csv.gz
- Saved Data/title_akas_chunk_202.csv.gz
- Saved Data/title_akas_chunk_203.csv.gz
- Saved Data/title_akas_chunk_204.csv.gz
- Saved Data/title_akas_chunk_205.csv.gz
- Saved Data/title_akas_chunk_206.csv.gz
- Saved Data/title_akas_chunk_207.csv.gz
- Saved Data/title_akas_chunk_208.csv.gz
- Saved Data/title_akas_chunk_209.csv.gz
- Saved Data/title_akas_chunk_210.csv.gz
- Saved Data/title_akas_chunk_211.csv.gz
- Saved Data/title_akas_chunk_212.csv.gz
- Saved Data/title_akas_chunk_213.csv.gz
- Saved Data/title_akas_chunk_214.csv.gz
- Saved Data/title_akas_chunk_215.csv.gz
- Saved Data/title_akas_chunk_216.csv.gz
- Saved Data/title_akas_chunk_217.csv.gz
- Saved Data/title_akas_chunk_218.csv.gz
- Saved Data/title_akas_chunk_219.csv.gz
- Saved Data/title_akas_chunk_220.csv.gz
- Saved Data/title_akas_chunk_221.csv.gz
- Saved Data/title_akas_chunk_222.csv.gz
- Saved Data/title_akas_chunk_223.csv.gz
- Saved Data/title_akas_chunk_224.csv.gz
- Saved Data/tit

In [25]:
# Collect the per-chunk files written above so they can be recombined.
# glob returns matches in arbitrary (OS-dependent) order, so sort them; the
# zero-padded chunk number in each file name makes the sorted order the
# order in which the chunks were written.
q = "Data/title_akas_chunk*.csv.gz"
chunked_files = sorted(glob.glob(q))
# Showing the first 5
chunked_files[:5]

['Data\\title_akas_chunk_001.csv.gz',
 'Data\\title_akas_chunk_002.csv.gz',
 'Data\\title_akas_chunk_003.csv.gz',
 'Data\\title_akas_chunk_004.csv.gz',
 'Data\\title_akas_chunk_005.csv.gz']

In [26]:
## Read every chunk file back in, using the saved first column as the index
df_list = [pd.read_csv(file, index_col=0) for file in chunked_files]

## Stack the per-chunk frames into a single dataframe and display it
df_combined = pd.concat(df_list)
df_combined

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
...,...,...,...,...,...,...,...,...
35771432,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0.0
35771502,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0.0
35771591,tt9916702,1,Loving London: The Playground,US,,,,0.0
35771634,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0.0


In [27]:
## Write the combined US-only akas table to one gzipped CSV (index dropped)
final_fname = 'Data/title_akas_combined.csv.gz'
df_combined.to_csv(final_fname, index=False, compression='gzip')

# Data Cleaning

## Cleaning the Basics Dataset

In [28]:
# Inspect the column dtypes and memory usage of the raw basics table
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9818209 entries, 0 to 9818208
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 674.2+ MB


In [29]:
# IMDb marks missing fields with the literal string '\N'; convert those to
# real NaN values so pandas' missing-data tools recognise them
basics = basics.replace({'\\N': np.nan})

In [30]:
# Count the missing values in each column
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1329210
endYear           9712152
runtimeMinutes    6923559
genres             442546
dtype: int64

In [31]:
# Discard titles that have no 'runtimeMinutes' value
basics = basics.dropna(subset=['runtimeMinutes'])
# Re-count the missing values to confirm the rows are gone
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear          170059
endYear           2843380
runtimeMinutes          0
genres              76570
dtype: int64

In [32]:
# Discard titles that have no 'genres' value
basics = basics.dropna(subset=['genres'])
# Re-count the missing values to confirm the rows are gone
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          165114
endYear           2768397
runtimeMinutes          0
genres                  0
dtype: int64

In [33]:
# Tally the remaining titles by titleType
basics['titleType'].value_counts()

tvEpisode       1430199
short            599806
movie            381716
video            180253
tvMovie           91493
tvSeries          90280
tvSpecial         18073
tvMiniSeries      17143
tvShort            8795
videoGame           322
Name: titleType, dtype: int64

In [34]:
# Restrict the table to rows whose titleType is exactly 'movie'
is_movie = basics['titleType'] == 'movie'
basics = basics.loc[is_movie]
# Confirm that 'movie' is the only type left
basics['titleType'].value_counts()

movie    381716
Name: titleType, dtype: int64

In [35]:
# Tally the movies by startYear
basics['startYear'].value_counts()

2017    14365
2018    14322
2019    14056
2016    13951
2015    13477
        ...  
1904        1
1897        1
1896        1
2026        1
1894        1
Name: startYear, Length: 130, dtype: int64

In [36]:
# Discard movies that have no 'startYear', then re-count the missing values
basics = basics.dropna(subset=['startYear'])
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           375289
runtimeMinutes         0
genres                 0
dtype: int64

In [37]:
# Cast 'startYear' to float so it can be compared numerically
basics = basics.astype({'startYear': 'float'})
# Confirm the new dtype
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375289 entries, 8 to 9818159
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          375289 non-null  object 
 1   titleType       375289 non-null  object 
 2   primaryTitle    375289 non-null  object 
 3   originalTitle   375289 non-null  object 
 4   isAdult         375289 non-null  object 
 5   startYear       375289 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  375289 non-null  object 
 8   genres          375289 non-null  object 
dtypes: float64(1), object(8)
memory usage: 28.6+ MB


In [38]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [39]:
# Keep movies whose startYear is at least 2000 and below 2023
in_year_range = basics['startYear'].ge(2000) & basics['startYear'].lt(2023)
basics = basics.loc[in_year_range]
# Movies per year after filtering
basics['startYear'].value_counts()

2018.0    9693
2017.0    9492
2019.0    9446
2016.0    9046
2022.0    8996
2015.0    8632
2021.0    8324
2014.0    8206
2013.0    7824
2020.0    7638
2012.0    7323
2011.0    6790
2010.0    6385
2009.0    5990
2008.0    5230
2007.0    4639
2006.0    4409
2005.0    3919
2004.0    3556
2003.0    3247
2002.0    2995
2001.0    2873
2000.0    2741
Name: startYear, dtype: int64

## Cleaning the Ratings Dataset

In [40]:
# Convert IMDb's '\N' missing-value marker to NaN in the ratings table
ratings = ratings.replace({'\\N': np.nan})

## Removing Non-US Movies

In [43]:
# Build a boolean mask over basics: True where the movie's tconst appears as
# a titleId in the US-only akas dataframe
us_title_ids = df_combined['titleId'].unique()
keepers = basics['tconst'].isin(us_title_ids)
keepers

34803       True
61116       True
67669       True
77964      False
86801       True
           ...  
9817882     True
9817891     True
9817930    False
9817975     True
9818059    False
Name: tconst, Length: 147394, dtype: bool

In [44]:
# Apply the mask so only movies with a US akas entry remain, then display
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
9817347,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9817742,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9817882,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9817891,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [45]:
# Build a boolean mask over ratings: True where the rated title's tconst
# appears as a titleId in the US-only akas dataframe
# NOTE(review): this keeps ratings for every US title, not only the movies
# retained in `basics` - confirm that is intended.
us_title_ids = df_combined['titleId'].unique()
keepers2 = ratings['tconst'].isin(us_title_ids)
keepers2

0           True
1           True
2          False
3          False
4           True
           ...  
1307208    False
1307209    False
1307210    False
1307211    False
1307212    False
Name: tconst, Length: 1307213, dtype: bool

In [46]:
# Apply the mask so only ratings of titles with a US akas entry remain
ratings = ratings.loc[keepers2]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
4,tt0000005,6.2,2609
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1307174,tt9916200,8.1,229
1307175,tt9916204,8.1,262
1307182,tt9916348,8.3,18
1307183,tt9916362,6.4,5312


# Saving New Datasets

In [47]:
# Final look at the filtered basics table before it is written to disk
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
9817347,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9817742,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9817882,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9817891,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [48]:
## Write the cleaned basics table to a gzipped CSV, leaving out the row index
basics.to_csv("Data/title_basics.csv.gz", index=False, compression='gzip')

In [49]:
# Final look at the filtered ratings table before it is written to disk
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
4,tt0000005,6.2,2609
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1307174,tt9916200,8.1,229
1307175,tt9916204,8.1,262
1307182,tt9916348,8.3,18
1307183,tt9916362,6.4,5312


In [50]:
## Write the filtered ratings table to a gzipped CSV, leaving out the row index
ratings.to_csv("Data/title_ratings.csv.gz", index=False, compression='gzip')

# New Datasets

In [2]:
# Basics
# Read the saved file back in and preview it to verify the round trip
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [5]:
# Check row count, null counts and dtypes of the reloaded basics table
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86594 entries, 0 to 86593
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86594 non-null  object 
 1   titleType       86594 non-null  object 
 2   primaryTitle    86594 non-null  object 
 3   originalTitle   86594 non-null  object 
 4   isAdult         86594 non-null  int64  
 5   startYear       86594 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86594 non-null  int64  
 8   genres          86594 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 5.9+ MB


In [3]:
# Ratings
# Read the saved file back in and preview it to verify the round trip
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
2,tt0000005,6.2,2609
3,tt0000006,5.2,181
4,tt0000007,5.4,816


In [6]:
# Check row count, null counts and dtypes of the reloaded ratings table
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497209 entries, 0 to 497208
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         497209 non-null  object 
 1   averageRating  497209 non-null  float64
 2   numVotes       497209 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.4+ MB


In [3]:
# Akas
# Read the combined US-only file back in and preview it
akas = pd.read_csv("Data/title_akas_combined.csv.gz", low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [4]:
# Check row count, null counts and dtypes of the reloaded akas table
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1434136 entries, 0 to 1434135
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1434136 non-null  object 
 1   ordering         1434136 non-null  int64  
 2   title            1434136 non-null  object 
 3   region           1434136 non-null  object 
 4   language         3907 non-null     object 
 5   types            978363 non-null   object 
 6   attributes       46506 non-null    object 
 7   isOriginalTitle  1432791 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 87.5+ MB
