In [3]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus as urlquote

# ## Change username and password to match your personal MySQL Server settings
# username = 'root' # default username for MySQL db is root
# password = 'YOUR_PASSWORD' # whatever password you chose during MySQL installation.

# connection = f'mysql+pymysql://{username}:{password}@localhost/sakila'
# engine = create_engine(connection)

In [4]:
import json
from pathlib import Path

# MySQL credentials are read from a JSON file kept outside the repository.
# The location is built from the current user's home directory instead of a
# hardcoded absolute path so the cell works for any user account.
CREDENTIALS_PATH = Path.home() / ".secret" / "mysql.json"
with open(CREDENTIALS_PATH) as f:
    login = json.load(f)
# Display only the key names so the secret values never appear in the output.
login.keys()

dict_keys(['username', 'password'])

In [5]:
# Build the SQLAlchemy connection URL for the local `moviedb` database.
# Both credentials are URL-escaped: an unescaped '@', ':' or '/' in either the
# username or the password would otherwise be parsed as URL structure.
connection = (
    f"mysql+pymysql://{urlquote(login['username'])}:{urlquote(login['password'])}"
    "@localhost/moviedb"
)
engine = create_engine(connection)

In [6]:
## Create the database at `connection` only when it is not already present
if not database_exists(connection):
    create_database(connection)
    print('Database created!')
else:
    print('It exists!')

It exists!


In [7]:
# List the tables currently present in the connected database.
q = 'SHOW TABLES'
pd.read_sql(sql=q, con=engine)

Unnamed: 0,Tables_in_moviedb
0,title-akas-us-only


In [32]:
import pandas as pd
import numpy as np

### READ IN TABLES

# File locations for the three source tables (akas, basics, ratings).
akas_url = "/Users/Phuong/Documents/GitHub/MySQL-Movie-Database/data/title-akas-us-only.csv"
basics_url = "/Users/Phuong/Documents/GitHub/MySQL-Movie-Database/data/title.basics.tsv.gz"
title_ratings_url = "/Users/Phuong/Documents/GitHub/MySQL-Movie-Database/data/title.ratings.tsv.gz"

# US-only akas table (comma-separated).
akas = pd.read_csv(akas_url, low_memory=False)

# Title basics table (tab-separated).
basics_df = pd.read_csv(basics_url, sep="\t", low_memory=False)

# NOTE: the ratings file path is defined here but the file is not read in this cell.


In [34]:
### FILTER BASICS TABLES

# Keep only the basics rows whose `tconst` appears among the `titleId`
# values of the US-only akas dataframe.
filter_us_titles = basics_df['tconst'].isin(akas['titleId'])
basics_df = basics_df.loc[filter_us_titles]

# Turn the literal '\N' placeholder string into a real missing value.
basics_df = basics_df.replace(to_replace='\\N', value=np.nan)

basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"


In [18]:
# Drop rows that are missing either `runtimeMinutes` or `genres`.
# The result is reassigned rather than using `inplace=True`: `basics_df` was
# produced by boolean filtering, and mutating such a frame in place can raise
# SettingWithCopyWarning.
basics_df = basics_df.dropna(subset=["runtimeMinutes", "genres"])
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"


In [28]:
# Restrict the table to rows whose `titleType` is exactly 'movie'.
filter_movies = basics_df['titleType'].eq('movie')
basics_df = basics_df.loc[filter_movies]
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [30]:
# Count how many rows carry each distinct `startYear` value.
basics_df['startYear'].value_counts()

\N      59246
2019     9620
2018     9428
2017     9326
2021     9240
        ...  
1897        2
1894        1
2030        1
1904        1
2031        1
Name: startYear, Length: 136, dtype: int64

In [55]:
# Convert `startYear` to float so it can be compared numerically. `.assign`
# returns a new frame, which avoids writing a column into a filtered slice
# (a SettingWithCopyWarning source).
basics_df = basics_df.assign(startYear=basics_df['startYear'].astype(float))

# Keep only rows whose `startYear` lies between 2000 and 2022 (both bounds
# inclusive). The selection must be assigned back: evaluating
# `basics_df.loc[start_year_filter]` on its own discards the result and
# leaves `basics_df` unfiltered.
start_year_filter = basics_df["startYear"].between(2000, 2022)
basics_df = basics_df.loc[start_year_filter]
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33802,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001.0,,20,Short
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
39544,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021.0,,6,Short
43546,tt0044326,short,Abstronic,Abstronic,0,2021.0,,6,Short
49493,tt0050396,short,Final Curtain,Final Curtain,0,2012.0,,20,"Horror,Short"


In [80]:
# Exclude every row whose `genres` string mentions 'Documentary'.
# `na=False` makes rows with a missing `genres` value count as non-matching,
# so they are kept.
filter_doc = basics_df['genres'].str.contains('Documentary', na=False)
basics_df = basics_df.loc[~filter_doc]

In [81]:
# Print a summary of `basics_df`: column dtypes, non-null counts and memory usage.
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790266 entries, 33802 to 10016966
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          790266 non-null  object 
 1   titleType       790266 non-null  object 
 2   primaryTitle    790266 non-null  object 
 3   originalTitle   790266 non-null  object 
 4   isAdult         790266 non-null  object 
 5   startYear       790266 non-null  float64
 6   endYear         24107 non-null   object 
 7   runtimeMinutes  520209 non-null  object 
 8   genres          775498 non-null  object 
dtypes: float64(1), object(8)
memory usage: 60.3+ MB


In [82]:
# Preview the first rows of `basics_df` after the filters above.
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33802,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001.0,,20,Short
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
39544,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021.0,,6,Short
43546,tt0044326,short,Abstronic,Abstronic,0,2021.0,,6,Short
49493,tt0050396,short,Final Curtain,Final Curtain,0,2012.0,,20,"Horror,Short"
