# IMDB Project

## Part 1: Clean up the data

### Links to the data

title akas - https://datasets.imdbws.com/title.akas.tsv.gz

title basics - https://datasets.imdbws.com/title.basics.tsv.gz

title ratings - https://datasets.imdbws.com/title.ratings.tsv.gz

title crew - https://datasets.imdbws.com/title.crew.tsv.gz

title principals - https://datasets.imdbws.com/title.principals.tsv.gz

name basics - https://datasets.imdbws.com/name.basics.tsv.gz

In [1]:
#load necessary libraries
import pandas as pd
import numpy as np

In [2]:
# One named variable per IMDb download link so each table can be loaded by
# name below. Listed in the same order as the links at the top of the notebook.

# title-level tables
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
title_crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"

# person-level table
name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"

In [3]:
# Download every table straight from its URL. All six files are gzipped,
# tab-separated dumps, so they share the same reader options; low_memory=False
# makes pandas read each file in one pass rather than in chunks.
tsv_options = {'sep': '\t', 'low_memory': False}

basics = pd.read_csv(basics_url, **tsv_options)
akas = pd.read_csv(akas_url, **tsv_options)
ratings = pd.read_csv(ratings_url, **tsv_options)
crew = pd.read_csv(title_crew_url, **tsv_options)
principals = pd.read_csv(title_principals_url, **tsv_options)
name_basics = pd.read_csv(name_basics_url, **tsv_options)

In [4]:
# preview the first five rows of title basics to confirm the download worked
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
# preview the first five rows of title akas to confirm the download worked
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [6]:
# preview the first five rows of title ratings to confirm the download worked
ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1934
1,tt0000002,5.8,262
2,tt0000003,6.5,1756
3,tt0000004,5.6,177
4,tt0000005,6.2,2566


In [7]:
# preview the first five rows of title crew to confirm the download worked
crew.head(n=5)

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [8]:
# preview the first five rows of title principals to confirm the download worked
principals.head(n=5)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [9]:
# preview the first five rows of name basics to confirm the download worked
name_basics.head(n=5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0045537,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0038355,tt0037382,tt0071877"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0056404,tt0054452,tt0049189,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0072562,tt0077975,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0060827,tt0083922,tt0050986,tt0050976"


#### Title Basics

In [10]:
# The raw file marks missing values with the literal string \N (capital N,
# not a newline); turn every occurrence into np.nan so pandas sees real nulls.
basics = basics.replace(to_replace={'\\N': np.nan})

# preview to confirm the placeholders are gone
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [11]:
# drop titles that are missing any of the fields used below:
# runtimeMinutes, genres, or startYear
required_columns = ['runtimeMinutes', 'genres', 'startYear']
basics = basics.dropna(subset=required_columns)

In [12]:
# restrict the table to full-length movies (titleType == 'movie')
is_movie = basics['titleType'].eq('movie')
basics = basics.loc[is_movie]

In [13]:
# keep startYear 2000-2022 (both ends included)
# basics is the result of earlier row filters, so assigning a column into it
# with basics['startYear'] = ... raises SettingWithCopyWarning. assign()
# returns a new frame with the converted column instead of writing into a slice.
basics = basics.assign(startYear=basics['startYear'].astype(int))

# between() is inclusive on both bounds by default, matching >= 2000 and <= 2022
basics = basics.loc[basics['startYear'].between(2000, 2022)]

In [14]:
# eliminate movies whose genres string mentions 'Documentary' (case-insensitive)
# na=False makes a missing genres value count as "not a documentary", so the
# mask is always strictly boolean and the ~ negation below cannot fail on NaN,
# even if this cell is run without the earlier dropna on genres.
is_documentary = basics['genres'].str.contains('documentary', case=False, na=False)
basics = basics[~is_documentary]

#### Ratings

In [15]:
# Turn any literal \N placeholder (capital N, not a newline) into np.nan so
# missing values in ratings are real nulls.
ratings = ratings.replace(to_replace={'\\N': np.nan})

# preview to confirm the table still looks right
ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1934
1,tt0000002,5.8,262
2,tt0000003,6.5,1756
3,tt0000004,5.6,177
4,tt0000005,6.2,2566


#### Akas

In [16]:
# The raw file marks missing values with the literal string \N (capital N,
# not a newline); turn every occurrence into np.nan so pandas sees real nulls.
akas = akas.replace(to_replace={'\\N': np.nan})

# preview to confirm the placeholders are gone
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [17]:
# keep only the akas rows whose region is the United States
is_us_release = akas['region'].eq('US')
akas = akas.loc[is_us_release]

In [18]:
# display the filtered akas table to check that only region == 'US' rows remain
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
34466602,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
34466672,tt9916620,1,The Copeland Case,US,,,,0
34466760,tt9916702,1,Loving London: The Playground,US,,,,0
34466803,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


#### Title Crew

In [19]:
# The raw file marks missing values with the literal string \N (capital N,
# not a newline); turn every occurrence into np.nan so pandas sees real nulls.
crew = crew.replace(to_replace={'\\N': np.nan})

# preview to confirm the placeholders are gone
crew.head(n=5)

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,


#### Title Principals

In [20]:
# The raw file marks missing values with the literal string \N (capital N,
# not a newline); turn every occurrence into np.nan so pandas sees real nulls.
principals = principals.replace(to_replace={'\\N': np.nan})

# preview to confirm the placeholders are gone
principals.head(n=5)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,


#### Name Basics

In [21]:
# The raw file marks missing values with the literal string \N (capital N,
# not a newline); turn every occurrence into np.nan so pandas sees real nulls.
name_basics = name_basics.replace(to_replace={'\\N': np.nan})

# preview to confirm the placeholders are gone
name_basics.head(n=5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987.0,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0045537,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014.0,"actress,soundtrack","tt0117057,tt0038355,tt0037382,tt0071877"
2,nm0000003,Brigitte Bardot,1934,,"actress,soundtrack,music_department","tt0056404,tt0054452,tt0049189,tt0057345"
3,nm0000004,John Belushi,1949,1982.0,"actor,soundtrack,writer","tt0078723,tt0072562,tt0077975,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007.0,"writer,director,actor","tt0060827,tt0083922,tt0050986,tt0050976"


### Filter Non-US Movies

In [22]:
# akas now holds only US rows, so its titleId column is the list of titles
# with a US entry; flag each basics row whose tconst appears in it
us_title_ids = akas['titleId']
keepers = basics['tconst'].isin(us_title_ids)
keepers

34803       True
61116       True
67669       True
77964      False
86801       True
           ...  
9509313     True
9509322     True
9509361    False
9509406     True
9509490    False
Name: tconst, Length: 146005, dtype: bool

In [23]:
# keep only the basics rows flagged True in keepers, then display the result
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
9508777,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9509173,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9509313,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9509322,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [24]:
# flag each ratings row whose tconst appears among the akas titleId values
us_title_ids = akas['titleId']
keepers = ratings['tconst'].isin(us_title_ids)
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1264211    False
1264212    False
1264213    False
1264214    False
1264215    False
Name: tconst, Length: 1264216, dtype: bool

In [25]:
# keep only the ratings rows flagged True in keepers, then display the result
ratings = ratings.loc[keepers]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1934
1,tt0000002,5.8,262
4,tt0000005,6.2,2566
5,tt0000006,5.1,176
6,tt0000007,5.4,805
...,...,...,...
1264190,tt9916200,8.2,220
1264191,tt9916204,8.2,251
1264197,tt9916348,8.5,17
1264198,tt9916362,6.4,5100


In [26]:
# flag each title crew row whose tconst appears among the akas titleId values
us_title_ids = akas['titleId']
keepers = crew['tconst'].isin(us_title_ids)
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
9509635    False
9509636    False
9509637    False
9509638    False
9509639    False
Name: tconst, Length: 9509640, dtype: bool

In [27]:
# keep only the title crew rows flagged True in keepers, then display the result
crew = crew.loc[keepers]
crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
4,tt0000005,nm0005690,
5,tt0000006,nm0005690,
6,tt0000007,"nm0005690,nm0374658",
...,...,...,...
9509501,tt9916560,nm0232902,nm0103043
9509530,tt9916620,,nm7311709
9509568,tt9916702,nm3038589,nm10538223
9509591,tt9916756,nm10538639,nm10538639


In [28]:
# flag each title principals row whose tconst appears among the akas titleId values
us_title_ids = akas['titleId']
keepers = principals['tconst'].isin(us_title_ids)
keepers

0            True
1            True
2            True
3            True
4            True
            ...  
53995160    False
53995161    False
53995162    False
53995163    False
53995164    False
Name: tconst, Length: 53995165, dtype: bool

In [29]:
# keep only the title principals rows flagged True in keepers, then display the result
principals = principals.loc[keepers]
principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,
...,...,...,...,...,...,...
53994755,tt9916764,5,nm6685122,director,,
53994756,tt9916764,6,nm6687687,writer,written by,
53994757,tt9916764,7,nm10538642,writer,written by,
53994758,tt9916764,8,nm9641593,writer,developed by,


## Creating a 'Data' folder and saving the files

In [30]:
# os provides the folder helpers used here
import os

# create the output folder; exist_ok=True makes this a no-op if it already exists
data_dir = 'Data/'
os.makedirs(data_dir, exist_ok=True)

# list the folder contents to confirm it was created
os.listdir(data_dir)

['.DS_Store', '.ipynb_checkpoints']

In [31]:
# write each table to the Data folder as a gzip-compressed CSV,
# leaving the row index out of the file
frames_by_path = {
    "Data/title_basics.csv.gz": basics,
    "Data/title_akas.csv.gz": akas,
    "Data/title_ratings.csv.gz": ratings,
    "Data/title_crew.csv.gz": crew,
    "Data/title_principals.csv.gz": principals,
    "Data/name_basics.csv.gz": name_basics,
}
for out_path, frame in frames_by_path.items():
    frame.to_csv(out_path, compression='gzip', index=False)

In [32]:
# read the saved title basics file back in and preview it to verify the round trip
basics = pd.read_csv(filepath_or_buffer='Data/title_basics.csv.gz', low_memory=False)
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [33]:
# read the saved title akas file back in and preview it to verify the round trip
akas = pd.read_csv(filepath_or_buffer='Data/title_akas.csv.gz', low_memory=False)
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [34]:
# read the saved title ratings file back in and preview it to verify the round trip
ratings = pd.read_csv(filepath_or_buffer='Data/title_ratings.csv.gz', low_memory=False)
ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1934
1,tt0000002,5.8,262
2,tt0000005,6.2,2566
3,tt0000006,5.1,176
4,tt0000007,5.4,805


In [35]:
# read the saved title crew file back in and preview it to verify the round trip
crew = pd.read_csv(filepath_or_buffer="Data/title_crew.csv.gz", low_memory=False)
crew.head(n=5)

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000005,nm0005690,
3,tt0000006,nm0005690,
4,tt0000007,"nm0005690,nm0374658",


In [36]:
# read the saved title principals file back in and preview it to verify the round trip
principals = pd.read_csv(filepath_or_buffer="Data/title_principals.csv.gz", low_memory=False)
principals.head(n=5)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,


In [38]:
# read the saved name basics file back in and preview it to verify the round trip
name_basics = pd.read_csv(filepath_or_buffer="Data/name_basics.csv.gz", low_memory=False)
name_basics.head(n=5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0045537,tt0072308"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0117057,tt0038355,tt0037382,tt0071877"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,music_department","tt0056404,tt0054452,tt0049189,tt0057345"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0078723,tt0072562,tt0077975,tt0080455"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0060827,tt0083922,tt0050986,tt0050976"
