#### In this notebook, the movie data is reworked and new data about crew, director, writers, and actors is included. Then this data will be prepared as SQL tables that can be used in Tableau and Power BI. 

# Create Project

## Load Libraries and Functions

In [1]:
# import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


## Load Data

In [2]:
# imdb urls for datasets
names_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"


In [3]:
# loading the data
names = pd.read_csv(names_url, sep='\t', low_memory=False)
crew = pd.read_csv(crew_url, sep='\t', low_memory=False)
principals = pd.read_csv(principals_url, sep='\t', low_memory=False)

# viewing the data
display(names.head(), crew.head(), principals.head())


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0037382,tt0038355,tt0075213"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0077975,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


## Get previous data 

### AKAs

In [6]:
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

akas_df = pd.read_csv(akas_url, sep = "\t", low_memory = False)
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [7]:
# get US movies
akas_filter = akas_df["region"] == "US"

akas_df = akas_df[akas_filter]

akas_df["region"].value_counts()

US    1450671
Name: region, dtype: int64

In [8]:
#Removing all \N values
akas_df = akas_df.replace({"\\N":np.nan})

akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


### Ratings

In [9]:
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

ratings_df = pd.read_csv(ratings_url, sep = "\t", low_memory = False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
2,tt0000003,6.5,1845
3,tt0000004,5.5,178
4,tt0000005,6.2,2627


In [10]:
#Filtering out non-US ratings
ratings_in_US_filter = ratings_df["tconst"].isin(akas_df["titleId"])

ratings_df = ratings_df[ratings_in_US_filter]
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
4,tt0000005,6.2,2627
5,tt0000006,5.1,182
6,tt0000007,5.4,820


In [11]:
#Removing all \N values
ratings_df = ratings_df.replace({"\\N":np.nan})

ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
4,tt0000005,6.2,2627
5,tt0000006,5.1,182
6,tt0000007,5.4,820


In [12]:
#Checking for duplicated values
ratings_df.duplicated().sum()

0

In [4]:
# also need the basics to work this new data
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

basics_df = pd.read_csv(basics_url, sep = "\t", low_memory = False)
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


### Basics

In [13]:
#Filtering out non-US movies
movies_in_US_filter = basics_df["tconst"].isin(akas_df["titleId"])

basics_df = basics_df[movies_in_US_filter]
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"


In [14]:
#Checking for duplicated values
ratings_df.duplicated().sum()

0

In [18]:
# Removing all \N values
basics_df = basics_df.replace({"\\N": np.nan})

basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45.0,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100.0,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70.0,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90.0,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,,,Drama


In [15]:
#Filtering out non-movies
isMovie = basics_df["titleType"] == "movie"
basics_df = basics_df[isMovie]

basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [19]:
#Changing startYear to be an int
basics_df["startYear"] = basics_df["startYear"].astype(float)

#Filters for movies from 2000-2021 inclusive
isOlderThan2000 = basics_df["startYear"] >= 2000
isYoungerThan2022 = basics_df["startYear"] <= 2021

basics_df = basics_df[isOlderThan2000 & isYoungerThan2022]

#Checking to make sure filters work
basics_df.describe()

Unnamed: 0,startYear,endYear
count,135056.0,0.0
mean,2013.186515,
std,5.608006,
min,2000.0,
25%,2009.0,
50%,2014.0,
75%,2018.0,
max,2021.0,


### Crew

In [21]:
#Filtering out crew that are not in basics
movies_in_basics_filter = crew["tconst"].isin(basics_df["tconst"])

crew = crew[movies_in_basics_filter]
crew.head()

Unnamed: 0,tconst,directors,writers
34803,tt0035423,nm0003506,"nm0737216,nm0003506"
61115,tt0062336,"nm0749914,nm0765384","nm0749914,nm1146177"
67667,tt0069049,nm0000080,"nm0000080,nm0462648"
86794,tt0088751,"nm0078540,nm0628399",nm0628399
93931,tt0096056,nm0324875,"nm0234502,nm0324875"


In [22]:
# Removing all \N values
crew = crew.replace({"\\N": np.nan})

crew.head()

Unnamed: 0,tconst,directors,writers
34803,tt0035423,nm0003506,"nm0737216,nm0003506"
61115,tt0062336,"nm0749914,nm0765384","nm0749914,nm1146177"
67667,tt0069049,nm0000080,"nm0000080,nm0462648"
86794,tt0088751,"nm0078540,nm0628399",nm0628399
93931,tt0096056,nm0324875,"nm0234502,nm0324875"


In [24]:
#check for duplicates
crew.duplicated().sum()

0

In [25]:
#Splitting writers and directors into lists
crew["directors_split"] = crew["directors"].str.split(',')
crew["writers_split"] = crew["writers"].str.split(',')
crew.head()

Unnamed: 0,tconst,directors,writers,directors_split,writers_split
34803,tt0035423,nm0003506,"nm0737216,nm0003506",[nm0003506],"[nm0737216, nm0003506]"
61115,tt0062336,"nm0749914,nm0765384","nm0749914,nm1146177","[nm0749914, nm0765384]","[nm0749914, nm1146177]"
67667,tt0069049,nm0000080,"nm0000080,nm0462648",[nm0000080],"[nm0000080, nm0462648]"
86794,tt0088751,"nm0078540,nm0628399",nm0628399,"[nm0078540, nm0628399]",[nm0628399]
93931,tt0096056,nm0324875,"nm0234502,nm0324875",[nm0324875],"[nm0234502, nm0324875]"


In [26]:
#Removing unnecessary directors and writers columns
crew = crew.drop(columns = ["directors", "writers"])
crew.head()

Unnamed: 0,tconst,directors_split,writers_split
34803,tt0035423,[nm0003506],"[nm0737216, nm0003506]"
61115,tt0062336,"[nm0749914, nm0765384]","[nm0749914, nm1146177]"
67667,tt0069049,[nm0000080],"[nm0000080, nm0462648]"
86794,tt0088751,"[nm0078540, nm0628399]",[nm0628399]
93931,tt0096056,[nm0324875],"[nm0234502, nm0324875]"


In [28]:
#Exploding directors and writers
crew = crew.explode("directors_split")
crew = crew.explode("writers_split")
crew.head()

Unnamed: 0,tconst,directors_split,writers_split
34803,tt0035423,nm0003506,nm0737216
34803,tt0035423,nm0003506,nm0003506
61115,tt0062336,nm0749914,nm0749914
61115,tt0062336,nm0749914,nm1146177
61115,tt0062336,nm0765384,nm0749914


In [29]:
#Renaming directors and writers columns
crew = crew .rename(columns = {"directors_split": "director", "writers_split": "writer"})
crew.head()

Unnamed: 0,tconst,director,writer
34803,tt0035423,nm0003506,nm0737216
34803,tt0035423,nm0003506,nm0003506
61115,tt0062336,nm0749914,nm0749914
61115,tt0062336,nm0749914,nm1146177
61115,tt0062336,nm0765384,nm0749914


In [35]:
#create unique writers and directors
unique_director = crew["director"].unique()
unique_director

unique_writer = crew["writer"].unique()
unique_writer

array(['nm0737216', 'nm0003506', 'nm0749914', ..., 'nm5412267',
       'nm6743460', 'nm3471432'], dtype=object)

### Principals

In [32]:
#Filtering out principals that are not in basics
movies_in_basics_filter = principals["tconst"].isin(basics_df["tconst"])

principals = principals[movies_in_basics_filter]
principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
283491,tt0035423,10,nm0107463,editor,\N,\N
283492,tt0035423,1,nm0000212,actress,\N,"[""Kate McKay""]"
283493,tt0035423,2,nm0413168,actor,\N,"[""Leopold""]"
283494,tt0035423,3,nm0000630,actor,\N,"[""Stuart Besser""]"
283495,tt0035423,4,nm0005227,actor,\N,"[""Charlie McKay""]"


In [33]:
# Removing all \N values
principals= principals_df.replace({"\\N": np.nan})

principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
283491,tt0035423,10,nm0107463,editor,,
283492,tt0035423,1,nm0000212,actress,,"[""Kate McKay""]"
283493,tt0035423,2,nm0413168,actor,,"[""Leopold""]"
283494,tt0035423,3,nm0000630,actor,,"[""Stuart Besser""]"
283495,tt0035423,4,nm0005227,actor,,"[""Charlie McKay""]"


In [34]:
#Checking for duplicates
principals_df.duplicated().sum()

0

### Names

In [36]:
#Filtering out crew
names_in_principals_filter = names["nconst"].isin(principals["nconst"])
names_in_directors_filter = names["nconst"].isin(unique_director)
names_in_writers_filter = names["nconst"].isin(unique_writer)

names = names[names_in_principals_filter | 
                    names_in_directors_filter |
                    names_in_writers_filter]
names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0037382,tt0038355,tt0075213"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0077975,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [38]:
# Removing all \N values
names = names.replace({"\\N": np.nan})

names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987.0,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014.0,"actress,soundtrack","tt0117057,tt0037382,tt0038355,tt0075213"
2,nm0000003,Brigitte Bardot,1934,,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982.0,"actor,soundtrack,writer","tt0072562,tt0077975,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007.0,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [39]:
#Checking for duplicates
names.duplicated().sum()

0

# Save Data to file

In [46]:
#rename akas and basics
akas=akas_df
basics=basics_df
ratings=ratings_df

In [47]:
# viewing the data
display(names.info(), crew.info(), principals.info(), akas.info(), 
        ratings.info(), basics.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 592641 entries, 0 to 12675796
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   nconst             592641 non-null  object
 1   primaryName        592641 non-null  object
 2   birthYear          101013 non-null  object
 3   deathYear          13356 non-null   object
 4   primaryProfession  549839 non-null  object
 5   knownForTitles     591626 non-null  object
dtypes: object(6)
memory usage: 31.7+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 301845 entries, 34803 to 9997605
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tconst    301845 non-null  object
 1   director  297604 non-null  object
 2   writer    272743 non-null  object
dtypes: object(3)
memory usage: 9.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1061963 entries, 283491 to 57095674
Data columns (total 6 colum

None

None

None

None

None

None

In [30]:
#Making data folder if one does not already exist
FOLDER = "New_Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

[]

In [None]:
## Save current dataframes to file.
akas.to_csv("New_Data/title_akas.csv.gz",compression='gzip',index=False)
ratings.to_csv("New_Data/title_ratings.csv.gz",compression='gzip',index=False)
basics.to_csv("New_Data/title_basics.csv.gz",compression='gzip',index=False)
crew.to_csv("New_Data/title_crew.csv.gz",compression='gzip',index=False)
principals.to_csv("New_Data/title_principals.csv.gz",compression='gzip',index=False)
names.to_csv("New_Data/title_names.csv.gz",compression='gzip',index=False)

## Cleaning the Data

In [4]:
# checking changes
display(basics.head(3), akas.head(3))

# ratings did not appear to have any NaN values
ratings.isna().sum()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0


tconst           0
averageRating    0
numVotes         0
dtype: int64

### Basics: Prepare to Guidelines and Save

#### Guidelines
* [x] Eliminate movies that are null for runtimeMinutes
* [x] Eliminate movies that are null for genre
* [x] keep only titleType==Movie
* [x] keep startYear 2000-2022
* [x] Eliminate movies that include "Documentary" in genre (see tip below)


In [5]:
#load basics
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)


In [6]:
# look at info
basics.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9930711 entries, 0 to 9930710
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 681.9+ MB


In [7]:
# replace null values
basics.replace({'\\N':np.nan}, inplace = True)

In [8]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9930711 entries, 0 to 9930710
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 681.9+ MB


In [9]:
# 'startYear' is a string column, changing it to an float (to account for NaNs)
basics['startYear'] = basics['startYear'].astype(float)

# confirming
basics.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object

In [10]:
# keep only 'Movie'
basics = basics.loc[ basics['titleType']=='movie']

In [11]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 648205 entries, 8 to 9930661
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          648205 non-null  object 
 1   titleType       648205 non-null  object 
 2   primaryTitle    648205 non-null  object 
 3   originalTitle   648205 non-null  object 
 4   isAdult         648205 non-null  object 
 5   startYear       556822 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  408247 non-null  object 
 8   genres          575898 non-null  object 
dtypes: float64(1), object(8)
memory usage: 49.5+ MB


In [12]:
# filtering out nulls in genres and runtimeMinutes
basics = basics[(basics['genres'].notnull()) &
               (basics['runtimeMinutes'].notnull())
        ].copy()

In [13]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383901 entries, 8 to 9930661
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          383901 non-null  object 
 1   titleType       383901 non-null  object 
 2   primaryTitle    383901 non-null  object 
 3   originalTitle   383901 non-null  object 
 4   isAdult         383901 non-null  object 
 5   startYear       377415 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  383901 non-null  object 
 8   genres          383901 non-null  object 
dtypes: float64(1), object(8)
memory usage: 29.3+ MB


In [14]:
# keep only startYear '2000-2022'
basics = basics [(basics['startYear']>=2000) & (basics['startYear']<=2021)]

In [15]:
# keep only documentary
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [16]:
# creating folder in directory
os.makedirs('Data/',exist_ok=True) 
# confirming folder creation
os.listdir("Data/")

['title_basics.csv.gz']

In [17]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

## AKA load and process
#### Guidelines
- Years from 2020-2022
- Only US movies

In [18]:
# create the pandas df
akas = pd.read_csv(title_akas, sep='\t', low_memory=True)

  akas = pd.read_csv(title_akas, sep='\t', low_memory=True)


In [19]:
# checking info again
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36220558 entries, 0 to 36220557
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [20]:
#  checking region values for inconsistencies
akas['region'].sort_values().unique()

array(['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AN', 'AO', 'AQ', 'AR',
       'AS', 'AT', 'AU', 'AW', 'AZ', 'BA', 'BB', 'BD', 'BE', 'BF', 'BG',
       'BH', 'BI', 'BJ', 'BM', 'BN', 'BO', 'BR', 'BS', 'BT', 'BUMM', 'BW',
       'BY', 'BZ', 'CA', 'CC', 'CD', 'CF', 'CG', 'CH', 'CI', 'CK', 'CL',
       'CM', 'CN', 'CO', 'CR', 'CSHH', 'CSXX', 'CU', 'CV', 'CW', 'CY',
       'CZ', 'DDDE', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG',
       'EH', 'ER', 'ES', 'ET', 'FI', 'FJ', 'FM', 'FO', 'FR', 'GA', 'GB',
       'GD', 'GE', 'GF', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR',
       'GT', 'GU', 'GW', 'GY', 'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE',
       'IL', 'IM', 'IN', 'IQ', 'IR', 'IS', 'IT', 'JE', 'JM', 'JO', 'JP',
       'KE', 'KG', 'KH', 'KI', 'KM', 'KN', 'KP', 'KR', 'KW', 'KY', 'KZ',
       'LA', 'LB', 'LC', 'LI', 'LK', 'LR', 'LS', 'LT', 'LU', 'LV', 'LY',
       'MA', 'MC', 'MD', 'ME', 'MG', 'MH', 'MK', 'ML', 'MM', 'MN', 'MO',
       'MP', 'MQ', 'MR', 'MS', 'MT', 'MU', 'MV', 

- All abbreviations appear consistent

In [21]:
# getting length of US films to compare against for confirmation
display(f"Number akas entries: {len(akas)}  \
        Number of aka US films: {len(akas[akas['region'] == 'US'])}")

'Number akas entries: 36220558          Number of aka US films: 1444985'

In [22]:
#keep only US movies
akas = akas[akas['region'] == 'US'].copy()

# confirming changes
len(akas)

1444985

In [23]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34803       True
42384       True
61115       True
67668       True
86800       True
           ...  
9930384     True
9930393     True
9930432    False
9930477     True
9930561    False
Name: tconst, Length: 138626, dtype: bool

In [24]:
# filter 
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
42384,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013.0,,120,"Drama,History"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67668,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86800,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9929849,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9930244,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9930384,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9930393,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [26]:
# replace null values
akas.replace({'\\N':np.nan}, inplace = True)

## Load Ratings and process

In [27]:
# create the pandas df
rating = pd.read_csv(title_rating, sep='\t', low_memory=False)

In [28]:
# replace null values
rating.replace({'\\N':np.nan}, inplace = True)

In [29]:
#look at info
rating.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319592 entries, 0 to 1319591
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1319592 non-null  object 
 1   averageRating  1319592 non-null  float64
 2   numVotes       1319592 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.2+ MB


In [31]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =rating['tconst'].isin(akas['titleId'])
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1319587    False
1319588    False
1319589    False
1319590    False
1319591    False
Name: tconst, Length: 1319592, dtype: bool

# Save the Data

## Basics

In [32]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [33]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013.0,,120,"Drama,History"
2,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


## AKAs

In [36]:
## Save current dataframe to file.
akas.to_csv("Data/title_aka.csv.gz",compression='gzip',index=False)

In [38]:
# Open saved file and preview again
akas = pd.read_csv("Data/title_aka.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


## Ratings

In [39]:
## Save current dataframe to file.
rating.to_csv("Data/title_rating.csv.gz",compression='gzip',index=False)

In [40]:
# Open saved file and preview again
rating = pd.read_csv("Data/title_rating.csv.gz", low_memory = False)
rating.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1980
1,tt0000002,5.8,265
2,tt0000003,6.5,1835
3,tt0000004,5.6,179
4,tt0000005,6.2,2624


## Review info

In [41]:
# viewing the data
display(basics.info(), akas.info(), rating.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81762 entries, 0 to 81761
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81762 non-null  object 
 1   titleType       81762 non-null  object 
 2   primaryTitle    81762 non-null  object 
 3   originalTitle   81762 non-null  object 
 4   isAdult         81762 non-null  int64  
 5   startYear       81762 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81762 non-null  int64  
 8   genres          81762 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 5.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444985 entries, 0 to 1444984
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1444985 non-null  object 
 1   ordering         1444985 non-null  int64  
 2   title            1444985 n

None

None

None