# Create Project 3

## Load files

In [41]:
title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz'

In [42]:
title_akas = 'https://datasets.imdbws.com/title.akas.tsv.gz'

In [43]:
title_rating = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

## import libraries

In [44]:
# import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# import SQL

## Read basics and preprocess

In [45]:
# create the pandas df
basics = pd.read_csv(title_basics, sep='\t', low_memory=False)


In [46]:
# look at info
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9722656 entries, 0 to 9722655
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 667.6+ MB


In [47]:
#check the head
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [48]:
# replace null values
basics.replace({'\\N':np.nan}, inplace = True)

In [49]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9722656 entries, 0 to 9722655
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 667.6+ MB


In [50]:
# replace O values
basics.dropna(subset= ['runtimeMinutes', 'genres'], inplace=True)

In [51]:
# keep only 'Movie'
basics = basics.loc[ basics['titleType']=='movie']


In [52]:
# change startYear to a float and check
basics['startYear'] = basics['startYear'].astype(float)
print(basics.dtypes)

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object


In [12]:
# keep only startyear = 2000-2023
#basics.drop(basics[basics['startYear'] < 2000].index, inplace = True)
#basics.head()

In [13]:
#basics.drop(basics[basics['startYear'] > 2001].index, inplace = True)
#basics.head()

In [53]:
# keep only startYear '2000-2022'
basics = basics [(basics['startYear']>=2000) & (basics['startYear']<=2021)]

In [54]:
# keep only documentary
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [55]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138195 entries, 34803 to 9722506
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          138195 non-null  object 
 1   titleType       138195 non-null  object 
 2   primaryTitle    138195 non-null  object 
 3   originalTitle   138195 non-null  object 
 4   isAdult         138195 non-null  object 
 5   startYear       138195 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  138195 non-null  object 
 8   genres          138195 non-null  object 
dtypes: float64(1), object(8)
memory usage: 10.5+ MB


In [56]:
# example making new folder with os
import os
os.makedirs('Data3/',exist_ok=True) 
# Confirm folder created
os.listdir("Data3/")

['title_aka.csv.gz', 'title_basics.csv.gz', 'title_rating.csv.gz']

In [57]:
## Save current dataframe to file.
basics.to_csv("Data3/title_basics.csv.gz",compression='gzip',index=False)

In [58]:
# Open saved file and preview again
basics = pd.read_csv("Data3/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


## Load AKA and preprocess

In [59]:
# create the pandas df
aka = pd.read_csv(title_akas, sep='\t', low_memory=True)


  aka = pd.read_csv(title_akas, sep='\t', low_memory=True)


In [60]:
#look at the column names
aka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35368672 entries, 0 to 35368671
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [61]:
aka = aka[aka['region'] == "US"]

In [62]:
aka.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1424761 entries, 5 to 35368416
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1424761 non-null  object
 1   ordering         1424761 non-null  int64 
 2   title            1424761 non-null  object
 3   region           1424761 non-null  object
 4   language         1424761 non-null  object
 5   types            1424761 non-null  object
 6   attributes       1424761 non-null  object
 7   isOriginalTitle  1424761 non-null  object
dtypes: int64(1), object(7)
memory usage: 97.8+ MB


In [63]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(aka['titleId'])
keepers

0          True
1          True
2          True
3         False
4          True
          ...  
138190     True
138191     True
138192    False
138193     True
138194    False
Name: tconst, Length: 138195, dtype: bool

In [64]:
# filter 
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
5,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
138187,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
138189,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
138190,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
138191,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [65]:
# replace null values
aka.replace({'\\N':np.nan}, inplace = True)

In [66]:
## Save current dataframe to file.
aka.to_csv("Data3/title_aka.csv.gz",compression='gzip',index=False)

In [67]:
# Open saved file and preview again
aka = pd.read_csv("Data3/title_aka.csv.gz", low_memory = False)
aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


## Load ratings and preprocess


In [68]:
# create the pandas df
rating = pd.read_csv(title_rating, sep='\t', low_memory=False)


In [69]:
# replace null values
rating.replace({'\\N':np.nan}, inplace = True)

In [70]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1293889 entries, 0 to 1293888
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1293889 non-null  object 
 1   averageRating  1293889 non-null  float64
 2   numVotes       1293889 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.6+ MB


In [71]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =rating['tconst'].isin(aka['titleId'])
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1293884    False
1293885    False
1293886    False
1293887    False
1293888    False
Name: tconst, Length: 1293889, dtype: bool

In [72]:
# filter 
basics = basics[keepers]
basics

  basics = basics[keepers]


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
5,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
6,tt0100275,movie,The Wandering Soap Opera,La Telenovela Errante,0,2017.0,,80,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...,...,...,...
138182,tt9913872,movie,De la piel del Diablo,De la piel del Diablo,0,2019.0,,75,Thriller
138186,tt9914828,movie,The War of Godzilla,The War of Godzilla,0,2015.0,,102,"Action,Comedy,Family"
138187,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
138189,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"


In [73]:
## Save current dataframe to file.
rating.to_csv("Data3/title_rating.csv.gz",compression='gzip',index=False)

In [74]:
# Open saved file and preview again
rating = pd.read_csv("Data3/title_rating.csv.gz", low_memory = False)
rating.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,263
2,tt0000003,6.5,1803
3,tt0000004,5.6,179
4,tt0000005,6.2,2603


# Show the info on each dataframe

In [76]:
#check the info on the basics
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51728 entries, 0 to 138190
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          51728 non-null  object 
 1   titleType       51728 non-null  object 
 2   primaryTitle    51728 non-null  object 
 3   originalTitle   51728 non-null  object 
 4   isAdult         51728 non-null  int64  
 5   startYear       51728 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  51728 non-null  int64  
 8   genres          51728 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 3.9+ MB


In [78]:
#check the aka info
aka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1424761 entries, 0 to 1424760
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1424761 non-null  object 
 1   ordering         1424761 non-null  int64  
 2   title            1424761 non-null  object 
 3   region           1424761 non-null  object 
 4   language         3870 non-null     object 
 5   types            976349 non-null   object 
 6   attributes       46260 non-null    object 
 7   isOriginalTitle  1423416 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 87.0+ MB


In [79]:
# check the rartings info
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1293889 entries, 0 to 1293888
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1293889 non-null  object 
 1   averageRating  1293889 non-null  float64
 2   numVotes       1293889 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.6+ MB
