# Movies - Part 3 -- MySQL Database
- Amber Kutscher
- Data Enrichment

In [1]:
# Imports
import pandas as pd
import json
from sqlalchemy import create_engine, String, Text, Float, Integer, text
from urllib.parse import quote_plus as urlquote

In [2]:
with open (r"\Users\amber\.secret\tmdb_api.json") as f:
    login = json.load(f)

# Display the keys of the loaded dict
login.keys()

dict_keys(['API Key'])

In [3]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['API Key']

In [4]:
# Load cleaned datasets
df_basics = pd.read_csv('Data/title_basics.csv.gz', encoding='utf-8', low_memory=False)
df_ratings = pd.read_csv('Data/title_ratings.csv.gz', encoding='utf-8', low_memory=False)

In [5]:
# Load the JSON files into a Pandas dataframe
df_api_2000 = pd.read_json('Data/tmdb_api_results_2000.json')
df_api_2001 = pd.read_json('Data/tmdb_api_results_2001.json')

In [30]:
df_api_2000.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.45,10.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,...,14204632.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.109,2331.0,PG


## Normalize Genres

In [6]:
# Create a new column with the single-string genres as a list of strings
df_basics['genres_split'] = df_basics['genres'].str.split(',')
df_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance","[Comedy, Fantasy, Romance]"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama,[Drama]
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama,[Drama]
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama,[Drama]
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi","[Comedy, Horror, Sci-Fi]"
...,...,...,...,...,...,...,...,...,...,...
114443,tt9915436,movie,Vida em Movimento,Vida em Movimento,0,2019.0,,70,Documentary,[Documentary]
114444,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019.0,,97,"Comedy,Drama,Fantasy","[Comedy, Drama, Fantasy]"
114445,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama,[Drama]
114446,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller","[Action, Adventure, Thriller]"


In [7]:
# Use .explode() to separate the list of genres into new rows
exploded_genres = df_basics.explode('genres_split')
exploded_genres

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",Comedy
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",Fantasy
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",Romance
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama,Drama
...,...,...,...,...,...,...,...,...,...,...
114446,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller",Action
114446,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller",Adventure
114446,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller",Thriller
114447,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History",Drama


In [8]:
# Use .unique() to get the unique genres from the genres_split column
unique_genres = sorted(exploded_genres['genres_split'].unique())
unique_genres

['Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western']

In [9]:
# Save just tconst and genres_split as a new dataframe
title_genres = exploded_genres[['tconst', 'genres_split']].copy()
title_genres.head()

Unnamed: 0,tconst,genres_split
0,tt0035423,Comedy
0,tt0035423,Fantasy
0,tt0035423,Romance
1,tt0062336,Drama
2,tt0068865,Drama


In [10]:
# Make a dictionary with list of unique genres as the key and the new integer id as values
genre_ints = range(len(unique_genres))
genre_map = dict(zip(unique_genres, genre_ints))
genre_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Documentary': 7,
 'Drama': 8,
 'Family': 9,
 'Fantasy': 10,
 'Game-Show': 11,
 'History': 12,
 'Horror': 13,
 'Music': 14,
 'Musical': 15,
 'Mystery': 16,
 'News': 17,
 'Reality-TV': 18,
 'Romance': 19,
 'Sci-Fi': 20,
 'Sport': 21,
 'Talk-Show': 22,
 'Thriller': 23,
 'War': 24,
 'Western': 25}

In [11]:
# Replace the string genres in title_genres with the new integer ids
title_genres['genre_id'] = title_genres['genres_split'].map(genre_map)
title_genres = title_genres.drop(columns='genres_split')

In [12]:
# Verifying that the above code was applied correctly
title_genres.head()

Unnamed: 0,tconst,genre_id
0,tt0035423,5
0,tt0035423,10
0,tt0035423,19
1,tt0062336,8
2,tt0068865,8


In [13]:
# Create a genres dataframe
genres = pd.DataFrame({'Genre_Name': genre_map.keys(), 
                       'Genre_ID': genre_map.values()})
genres.head()

Unnamed: 0,Genre_Name,Genre_ID
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4


In [14]:
# Check the dtypes of the df_basics dataframe
df_basics.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult             int64
startYear         float64
endYear           float64
runtimeMinutes      int64
genres             object
genres_split       object
dtype: object

In [15]:
# Get the max string length for each column
max_str_len = {}

for column in df_basics.columns:
    max_len = df_basics[column].fillna('').astype(str).apply(len).max()
    max_str_len[column] = max_len

print(max_str_len)

{'tconst': 10, 'titleType': 5, 'primaryTitle': 242, 'originalTitle': 242, 'isAdult': 1, 'startYear': 6, 'endYear': 0, 'runtimeMinutes': 5, 'genres': 32, 'genres_split': 42}


In [16]:
# Calculate max string lengths for object columns
key_len = df_basics['tconst'].fillna('').map(len).max()
title_len = df_basics['primaryTitle'].fillna('').map(len).max()

In [17]:
from sqlalchemy.types import *

In [18]:
# Create a schema dictionary using SQLAlchemy datatype objects
df_schema = {
    "tconst": String(key_len+1),
    "primaryTitle": Text(title_len+1),
    'startYear': Float(),
    'endYear': Float(),
    'runtimeMinutes': Integer()}

In [19]:
# Drop unnecessary columns from df_basics
df_basics.drop(columns=['originalTitle', 'isAdult', 'titleType', 'genres', 'genres_split'], inplace=True)

In [20]:
# Create sqlite engine
engine = create_engine('sqlite:///movies.db', echo=True)
# Create connection to engine
conn = engine.connect()

In [21]:
# Save to sql with dtype and index=False
df_basics.to_sql('title_basics', conn, if_exists='replace', index=False, dtype=df_schema)

2023-10-03 13:05:27,798 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("title_basics")
2023-10-03 13:05:27,800 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-10-03 13:05:27,801 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("title_basics")
2023-10-03 13:05:27,802 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-10-03 13:05:27,803 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2023-10-03 13:05:27,803 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-10-03 13:05:27,804 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("title_basics")
2023-10-03 13:05:27,804 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-10-03 13:05:27,806 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2023-10-03 13:05:27,807 INFO sqlalchemy.engine.Engine [raw sql] ('title_basics',)
2023-10-03 13:05:27,808 INFO sqlalchemy.engine.Engine PRAGMA main.foreig

114448

In [22]:
# Create "title_ratings" table with tconst as primary key
create_table_sql_ratings = """
CREATE TABLE title_ratings (
    tconst TEXT PRIMARY KEY,
    averageRating FLOAT,
    numVotes INTEGER
);
"""

# Execute the SQL statement to create the "title_ratings" table
conn.execute(create_table_sql_ratings)

2023-10-03 13:05:28,382 INFO sqlalchemy.engine.Engine 
CREATE TABLE title_ratings (
    tconst TEXT PRIMARY KEY,
    averageRating FLOAT,
    numVotes INTEGER
);

2023-10-03 13:05:28,383 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-10-03 13:05:28,383 INFO sqlalchemy.engine.Engine ROLLBACK


OperationalError: (sqlite3.OperationalError) table title_ratings already exists
[SQL: 
CREATE TABLE title_ratings (
    tconst TEXT PRIMARY KEY,
    averageRating FLOAT,
    numVotes INTEGER
);
]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [23]:
# Show the first 5 rows of title_basics
query_show_basics = """
SELECT *
FROM title_basics
LIMIT 5;"""

result_basics = conn.execute(query_show_basics)
for row in result_basics:
    print(row)

2023-10-03 13:06:25,770 INFO sqlalchemy.engine.Engine 
SELECT *
FROM title_basics
LIMIT 5;
2023-10-03 13:06:25,771 INFO sqlalchemy.engine.Engine [raw sql] ()
('tt0035423', 'Kate & Leopold', 2001.0, None, 118)
('tt0062336', 'The Tango of the Widower and Its Distorting Mirror', 2020.0, None, 70)
('tt0068865', 'Lives of Performers', 2016.0, None, 90)
('tt0069049', 'The Other Side of the Wind', 2018.0, None, 122)
('tt0088751', 'The Naked Monster', 2005.0, None, 100)


In [26]:
# Show the first 5 rows of title_ratings
query_show_ratings = """
SELECT *
FROM title_ratings
LIMIT 5;
"""

result_ratings = conn.execute(query_show_ratings)
for row in result_ratings:
    print(row)

2023-10-03 13:09:12,341 INFO sqlalchemy.engine.Engine 
SELECT *
FROM title_ratings
LIMIT 5;

2023-10-03 13:09:12,341 INFO sqlalchemy.engine.Engine [raw sql] ()


In [27]:
# SQL query to list all tables in the database
query_show_tables = """
SELECT name
FROM sqlite_master
WHERE type='table';
"""

In [28]:
# Execute the SQL query to list tables
result_tables = conn.execute(query_show_tables)

# Display the table names
for row in result_tables:
    print(row[0])

2023-10-03 13:10:14,117 INFO sqlalchemy.engine.Engine 
SELECT name
FROM sqlite_master
WHERE type='table';

2023-10-03 13:10:14,119 INFO sqlalchemy.engine.Engine [raw sql] ()
title_ratings
title_basics
