In [1]:
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load Datasets

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [4]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [6]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [7]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1962
1,tt0000002,5.8,263
2,tt0000003,6.5,1800
3,tt0000004,5.6,179
4,tt0000005,6.2,2601


In [8]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [9]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [10]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


# Data Cleaning

## Basics Dataset

In [11]:
basics = basics.replace({'\\N':np.nan})

In [12]:
basics = basics.dropna(subset=['runtimeMinutes','genres','startYear'])

In [13]:
basics = basics.loc[basics['titleType'] == 'movie']

In [14]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


In [15]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

## Akas Dataset

In [16]:
akas = akas.replace({'\\N':np.nan})

In [17]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

8          True
570        True
587        True
672        True
930        True
           ... 
9714085    True
9714094    True
9714133    True
9714178    True
9714262    True
Name: tconst, Length: 283279, dtype: bool

In [18]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
930,tt0000941,movie,Locura de amor,Locura de amor,0,1909,,45,Drama
...,...,...,...,...,...,...,...,...,...
9714085,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9714094,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9714133,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9714178,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


## Ratings Dataset

In [19]:
ratings = ratings.replace({'\\N':np.nan})

In [20]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(ratings['tconst'])
keepers

8          True
570        True
587        True
672        True
930        True
           ... 
9714085    True
9714094    True
9714133    True
9714178    True
9714262    True
Name: tconst, Length: 282530, dtype: bool

In [21]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
930,tt0000941,movie,Locura de amor,Locura de amor,0,1909,,45,Drama
...,...,...,...,...,...,...,...,...,...
9714085,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9714094,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9714133,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9714178,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
