# Imports

In [2]:
# Imports 
import pandas as pd
import numpy as np

# Data Dictionary

**IMDb Datasets**

Subsets of IMDb data are available for access to customers for personal and non-commercial use. You can hold local copies of this data, and it is subject to our terms and conditions. Please refer to the Non-Commercial Licensing and copyright/license and verify compliance.

**Data Location**

The dataset files can be accessed and downloaded from https://datasets.imdbws.com/. The data is refreshed daily.

**IMDb Dataset Details**

Each dataset is contained in a gzipped, tab-separated-values (TSV) formatted file in the UTF-8 character set. The first line in each file contains headers that describe what is in each column. A ‘\N’ is used to denote that a particular field is missing or null for that title/name. The available datasets are as follows:

**title.akas.tsv.gz** - Contains the following information for titles:

- titleId (string) - a tconst, an alphanumeric unique identifier of the title

- ordering (integer) – a number to uniquely identify rows for a given titleId

- title (string) – the localized title

- region (string) - the region for this version of the title

- language (string) - the language of the title

- types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning

- attributes (array) - Additional terms to describe this alternative title, not enumerated

- isOriginalTitle (boolean) – 0: not original title; 1: original title

**title.basics.tsv.gz** - Contains the following information for titles:

- tconst (string) - alphanumeric unique identifier of the title

- titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)

- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release

- originalTitle (string) - original title, in the original language

- isAdult (boolean) - 0: non-adult title; 1: adult title

- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year

- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types

- runtimeMinutes – primary runtime of the title, in minutes

- genres (string array) – includes up to three genres associated with the title

**title.ratings.tsv.gz** – Contains the IMDb rating and votes information for titles
- tconst (string) - alphanumeric unique identifier of the title
- averageRating – weighted average of all the individual user ratings
- numVotes - number of votes the title has received

# Loading the Data

In [3]:
# URLs for each dataframe needed
#basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
#akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
#ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
# Loading the data for basics
# basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [4]:
# Loading the Data for akas
# akas = pd.read_csv(akas_url, sep='\t', low_memory=True)
# Open saved file and preview again
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [5]:
# Loading the data for ratings
# ratings = pd.read_csv(ratings_url, sep='\t', low_memory=True)
# Open saved file and preview again
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
2,tt0000003,6.5,1810
3,tt0000004,5.6,178
4,tt0000005,6.2,2610


# Data Cleaning

## Cleaning the Basics Dataset

In [5]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9811401 entries, 0 to 9811400
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 673.7+ MB


In [6]:
# Replacing the '\N' with np.nan
basics.replace({'\\N':np.nan}, inplace=True)

In [7]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1328975
endYear           9705443
runtimeMinutes    6920490
genres             442231
dtype: int64

In [8]:
# Dropping movies with missing data from 'runtimeMinutes'
basics.dropna(subset = ['runtimeMinutes'], inplace=True)
# Checking if the change took place
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear          169689
endYear           2839666
runtimeMinutes          0
genres              76580
dtype: int64

In [9]:
# Dropping the movies that are missing values from 'genres'
basics.dropna(subset = ['genres'], inplace=True)
# Checking to see if the change took place
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          164742
endYear           2764674
runtimeMinutes          0
genres                  0
dtype: int64

In [10]:
basics['titleType'].value_counts()

tvEpisode       1427044
short            599511
movie            381585
video            180194
tvMovie           91444
tvSeries          90243
tvSpecial         18062
tvMiniSeries      17134
tvShort            8792
videoGame           322
Name: titleType, dtype: int64

In [11]:
# Keeping only the type 'movie'
basics.query("titleType == 'movie'", inplace=True)
# Checking if the change took place
basics['titleType'].value_counts()

movie    381585
Name: titleType, dtype: int64

In [12]:
basics['startYear'].value_counts()

2017    14367
2018    14322
2019    14056
2016    13950
2015    13476
        ...  
1904        1
1897        1
1896        1
2026        1
1894        1
Name: startYear, Length: 130, dtype: int64

In [13]:
# Dropping nan values for 'startYear'
basics.dropna(subset = ['startYear'], inplace=True)
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           375163
runtimeMinutes         0
genres                 0
dtype: int64

In [14]:
# Changing the type to 'int' in order to process it
basics['startYear'] = basics['startYear'].astype('float')
# Checking to see if the change took place
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375163 entries, 8 to 9811351
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          375163 non-null  object 
 1   titleType       375163 non-null  object 
 2   primaryTitle    375163 non-null  object 
 3   originalTitle   375163 non-null  object 
 4   isAdult         375163 non-null  object 
 5   startYear       375163 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  375163 non-null  object 
 8   genres          375163 non-null  object 
dtypes: float64(1), object(8)
memory usage: 28.6+ MB


In [15]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

8          True
144        True
570        True
587        True
672        True
           ... 
9811167    True
9811251    True
9811292    True
9811319    True
9811351    True
Name: tconst, Length: 375163, dtype: bool

In [16]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908.0,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
9811167,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"
9811251,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019.0,,123,Drama
9811292,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015.0,,57,Documentary
9811319,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007.0,,100,Documentary


In [17]:
basics = basics[(basics['startYear'] >= 2000) & (basics['startYear'] < 2023)]
basics['startYear'].value_counts()

2017.0    14284
2018.0    14259
2019.0    13988
2016.0    13845
2015.0    13316
2014.0    12858
2022.0    12688
2013.0    12340
2021.0    12272
2012.0    11596
2020.0    11515
2011.0    10758
2010.0    10177
2009.0     9329
2008.0     8130
2007.0     6940
2006.0     6497
2005.0     5815
2004.0     5193
2003.0     4577
2002.0     4126
2001.0     3854
2000.0     3634
Name: startYear, dtype: int64

In [18]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

## Cleaning the Ratings Dataset

In [19]:
# Replacing the '\N' with np.nan
ratings.replace({'\\N':np.nan}, inplace=True)

In [20]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers2 =ratings['tconst'].isin(akas['titleId'])
keepers2

0           True
1           True
2           True
3           True
4           True
           ...  
1306108     True
1306109     True
1306110    False
1306111    False
1306112    False
Name: tconst, Length: 1306113, dtype: bool

In [21]:
ratings = ratings[keepers2]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
2,tt0000003,6.5,1810
3,tt0000004,5.6,178
4,tt0000005,6.2,2610
...,...,...,...
1306090,tt9916460,9.4,18
1306094,tt9916538,8.6,7
1306095,tt9916544,6.9,62
1306108,tt9916730,8.3,10


## Cleaning the Akas Dataset

In [22]:
# Replacing the '\N' with np.nan
akas.replace({'\\N':np.nan}, inplace=True)

In [23]:
# Checking the values of 'region'
akas['region'].value_counts()

DE    4280021
FR    4275714
JP    4274508
IN    4215414
ES    4195962
       ...   
JE          2
NU          1
TV          1
PW          1
NR          1
Name: region, Length: 247, dtype: int64

In [24]:
# Limiting to just movies in the US
akas = akas[(akas['region'] == 'US')]
akas['region'].value_counts()

US    1433173
Name: region, dtype: int64

# Saving New Datasets

In [25]:
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9811074,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9811083,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9811122,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9811167,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


In [26]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [27]:
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
2,tt0000003,6.5,1810
3,tt0000004,5.6,178
4,tt0000005,6.2,2610
...,...,...,...
1306090,tt9916460,9.4,18
1306094,tt9916538,8.6,7
1306095,tt9916544,6.9,62
1306108,tt9916730,8.3,10


In [28]:
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [29]:
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
35743767,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
35743837,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0
35743926,tt9916702,1,Loving London: The Playground,US,,,,0
35743969,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [30]:
## Save current dataframe to file.
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)