# SQL Database: IMDB Movie Data

By Jeffrey Prichard

## SQL visual



## Imports/Settings

In [1]:
import json
from pathlib import Path
from urllib.parse import quote_plus

import numpy as np
import pandas as pd

import pymysql
pymysql.install_as_MySQLdb()

from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy_utils import database_exists, create_database

## MYSQL Connection

In [2]:
# Load the MySQL credentials from a JSON file kept outside the repository.
# The path is resolved from the current user's home directory so the notebook
# is not tied to one machine's absolute path.
credentials_path = Path.home() / '.secret' / 'mysql.json'
with open(credentials_path) as f:
    login = json.load(f)
# Display only the key names so no secret value ends up in the cell output.
login.keys()

dict_keys(['username', 'password'])

In [3]:
# Build the SQLAlchemy URL for the IMDB database on localhost.
# Username and password are percent-encoded so that characters such as '@',
# ':' or '/' inside the credentials cannot be misread as URL delimiters.
connection = (
    f"mysql+pymysql://{quote_plus(login['username'])}"
    f":{quote_plus(login['password'])}@localhost/IMDB"
)

engine = create_engine(connection)

In [4]:
# Open a connection from the engine; later cells pass `conn` to pandas.
# NOTE(review): the URL already targets the IMDB schema and this connect runs
# before the database_exists() check in the next cell -- presumably it would
# fail first on a server without that database; confirm the intended order.
conn = engine.connect()

In [5]:
# Make sure the target database is present, creating it when it is missing
if not database_exists(connection):
    create_database(connection)
    print("New Database Created.")
else:
    print("It Exists!")

It Exists!


# Database

In [6]:
# List the tables currently defined in the connected database
q = "SHOW TABLES;"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Tables_in_imdb
0,genres
1,ratings
2,title_basics
3,title_genres


## Data to Import: CSVs

In [7]:
# Load the title-basics CSV and preview its first five rows
basics = pd.read_csv(filepath_or_buffer='Data/FINAL_title_basics.csv')

basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [8]:
# Summarise the basics frame: column names, non-null counts and dtypes
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86979 entries, 0 to 86978
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  int64  
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86979 non-null  int64  
 8   genres          86979 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.0+ MB


In [9]:
# Load the title-ratings CSV and preview its first five rows
ratings = pd.read_csv(filepath_or_buffer='Data/FINAL_title_ratings.csv')

ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000005,6.2,2632
3,tt0000006,5.1,182
4,tt0000007,5.4,825


In [10]:
# Summarise the ratings frame: column names, non-null counts and dtypes
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504000 entries, 0 to 503999
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         504000 non-null  object 
 1   averageRating  504000 non-null  float64
 2   numVotes       504000 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.5+ MB


## title_basics

In [11]:
# Describe the title_basics table to see the column names and types it expects
q = " DESC title_basics"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(12),NO,PRI,,
1,primary_title,varchar(255),YES,,,
2,start_year,float,YES,,,
3,runtime,int,YES,,,


In [17]:
# Keep only the CSV columns that have a counterpart in the title_basics table
unused_columns = ['titleType', 'originalTitle', 'isAdult', 'endYear', 'genres']
title_basics = basics.drop(columns=unused_columns)
title_basics.columns

Index(['tconst', 'primaryTitle', 'startYear', 'runtimeMinutes'], dtype='object')

In [18]:
# Map the CSV headers onto the column names used by the SQL table
new_names = {
    'primaryTitle': 'primary_title',
    'startYear': 'start_year',
    'runtimeMinutes': 'runtime',
}
title_basics = title_basics.rename(columns=new_names)

title_basics.columns

Index(['tconst', 'primary_title', 'start_year', 'runtime'], dtype='object')

### Inject Data

In [20]:
# Append the prepared rows to the existing title_basics table
title_basics.to_sql(name="title_basics", con=conn, if_exists='append', index=False)

86979

### Confirmation

In [22]:
# Read title_basics back from the database to confirm the rows arrived
q = '''SELECT *
        FROM title_basics;'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,tconst,primary_title,start_year,runtime
0,tt0035423,Kate & Leopold,2001.0,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70
2,tt0069049,The Other Side of the Wind,2018.0,122
3,tt0088751,The Naked Monster,2005.0,100
4,tt0096056,Crime and Punishment,2002.0,126
...,...,...,...,...
86974,tt9914942,Life Without Sara Amat,2019.0,74
86975,tt9915872,The Last White Witch,2019.0,97
86976,tt9916170,The Rehearsal,2019.0,51
86977,tt9916190,Safeguard,2020.0,95


## ratings

In [14]:
# Describe the ratings table to see the column names and types it expects
q = " DESC ratings"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(12),NO,PRI,,
1,average_rating,float,YES,,,
2,number_of_votes,int,YES,,,


In [23]:
# Column names of the ratings frame as loaded from the CSV, before renaming
ratings.columns

Index(['tconst', 'averageRating', 'numVotes'], dtype='object')

In [24]:
# Map the CSV headers onto the column names used by the SQL table
new_names = {
    'averageRating': 'average_rating',
    'numVotes': 'number_of_votes',
}
ratings = ratings.rename(columns=new_names)

ratings.columns

Index(['tconst', 'average_rating', 'number_of_votes'], dtype='object')

### Inject Data

In [26]:
# Read the current value of the FOREIGN_KEY_CHECKS setting
q = "SELECT @@FOREIGN_KEY_CHECKS"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,@@FOREIGN_KEY_CHECKS
0,1


In [28]:
# Turn foreign-key checks off on this connection before the ratings load.
# The statement is wrapped in text() because passing a plain string to
# Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
q = '''SET @@FOREIGN_KEY_CHECKS=0'''
conn.execute(text(q))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x13ed0cd30>

In [29]:
# Re-read FOREIGN_KEY_CHECKS to confirm the value after the SET statement
q = "SELECT @@FOREIGN_KEY_CHECKS"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,@@FOREIGN_KEY_CHECKS
0,0


In [30]:
# Append the ratings rows, then switch foreign-key checks back on even if the
# insert fails, so the connection is not left with the checks disabled by the
# earlier SET statement.
try:
    rows_inserted = ratings.to_sql("ratings", conn, index=False, if_exists='append')
finally:
    conn.execute(text('SET @@FOREIGN_KEY_CHECKS=1'))
# Keep the value returned by to_sql as the cell's displayed result.
rows_inserted

504000

### Confirmation

In [31]:
# Read ratings back from the database to confirm the rows arrived
q = '''SELECT *
        FROM ratings;'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,tconst,average_rating,number_of_votes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000005,6.2,2632
3,tt0000006,5.1,182
4,tt0000007,5.4,825
...,...,...,...
503995,tt9916200,8.1,231
503996,tt9916204,8.2,264
503997,tt9916348,8.3,18
503998,tt9916362,6.4,5422


## genres

In [15]:
# Describe the genres table to see its column names and types
q = " DESC genres"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,genre_id,int,NO,PRI,,auto_increment
1,genre_name,varchar(255),YES,,,


## title_genres

In [16]:
# Describe the title_genres table to see its column names and types
q = " DESC title_genres"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(12),NO,PRI,,
1,genre_id,int,NO,PRI,,


# Final Confirmations

In [32]:
# List the tables once more as a final review
q = "SHOW TABLES;"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Tables_in_imdb
0,genres
1,ratings
2,title_basics
3,title_genres


In [33]:
# First five rows of the ratings table
q = '''SELECT *
        FROM ratings
        LIMIT 5;'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,tconst,average_rating,number_of_votes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000005,6.2,2632
3,tt0000006,5.1,182
4,tt0000007,5.4,825


In [34]:
# First five rows of the title_basics table
q = '''SELECT *
        FROM title_basics
        LIMIT 5;'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,tconst,primary_title,start_year,runtime
0,tt0035423,Kate & Leopold,2001.0,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70
2,tt0069049,The Other Side of the Wind,2018.0,122
3,tt0088751,The Naked Monster,2005.0,100
4,tt0096056,Crime and Punishment,2002.0,126
