# Entity Relationship Diagram (ERD)

![png](Data/Movies_ERD.png)

# Imports

In [3]:
import json
from pathlib import Path
from urllib.parse import quote_plus as urlquote

import pandas as pd
import pymysql
from sqlalchemy import create_engine, text
from sqlalchemy_utils import database_exists, create_database
# Star import kept last so the names it provides resolve exactly as before.
from sqlalchemy.types import *

pymysql.install_as_MySQLdb()

# Create the Connection

In [4]:
# Read the MySQL credentials from a JSON file kept outside the repository.
# The path is built from the current user's home directory so the notebook
# is not tied to one machine's absolute path.
credentials_path = Path.home() / ".secret" / "mysql.json"
with open(credentials_path, encoding="utf-8") as f:
    login = json.load(f)
# Show only the key names so the credential values never reach the output.
login.keys()

dict_keys(['username', 'password'])

In [5]:
# Build the SQLAlchemy URL for the local MySQL server. The database name
# comes from db_name (it was previously defined but never used), and both
# the username and the password are URL-escaped so special characters
# cannot break the connection string.
db_name = "movies"
connection = (
    f"mysql+pymysql://{urlquote(login['username'])}:{urlquote(login['password'])}"
    f"@localhost/{db_name}"
)
engine = create_engine(connection)
# Keep one open connection so session-level settings apply to later cells.
conn = engine.connect()

In [6]:
# List the tables that exist before any data is loaded
show_tables_query = "SHOW TABLES;"
pd.read_sql(sql=show_tables_query, con=conn)

Unnamed: 0,Tables_in_movies
0,genres
1,ratings
2,title_basics
3,title_genres


## Foreign Key Checks

In [7]:
# Turn off FOREIGN_KEY_CHECKS for this connection's session.
# The statement is wrapped in text(): passing a raw string to
# Connection.execute() is legacy behaviour in SQLAlchemy 1.4 and is
# rejected by SQLAlchemy 2.0.
q = """SET @@FOREIGN_KEY_CHECKS=0"""
conn.execute(text(q))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1f6763399c0>

In [8]:
# Read the setting back; a value of 0 means the checks are deactivated
fk_checks_query = "SELECT @@FOREIGN_KEY_CHECKS"
pd.read_sql(sql=fk_checks_query, con=conn)

Unnamed: 0,@@FOREIGN_KEY_CHECKS
0,0


# Import Title Basics

In [9]:
# Load the title basics CSV, then show its structure and first rows
title_basics_csv = 'Data/title_basics.csv'
title_basics = pd.read_csv(title_basics_csv, low_memory=False)
title_basics.info()
title_basics.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86979 entries, 0 to 86978
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  int64  
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86979 non-null  int64  
 8   genres          86979 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.0+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [10]:
# Longest tconst string (missing values are treated as empty strings)
tconst_lengths = title_basics['tconst'].fillna('').map(len)
tconst_length = tconst_lengths.max()
tconst_length

10

In [11]:
# Longest primaryTitle string (missing values are treated as empty strings)
title_lengths = title_basics['primaryTitle'].fillna('').map(len)
title_length = title_lengths.max()
title_length

242

In [16]:
# List the DataFrame's column names so they can be compared with the
# field names of the SQL table
title_basics.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [12]:
# Show the fields and types the title_basics SQL table defines
describe = pd.read_sql(sql="DESCRIBE title_basics;", con=conn)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(12),NO,PRI,,
1,primary_title,varchar(255),YES,,,
2,start_year,float,YES,,,
3,runtime,int,YES,,,
4,created_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
5,updated_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP


In [21]:
# Map the CSV's camelCase headers onto the SQL table's snake_case fields
rename_map = dict(
    primaryTitle="primary_title",
    startYear="start_year",
    runtimeMinutes="runtime",
)
title_basics = title_basics.rename(columns=rename_map)
title_basics.head(5)

Unnamed: 0,tconst,titleType,primary_title,originalTitle,isAdult,start_year,endYear,runtime,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [22]:
# Discard the columns that are not inserted into the title_basics table
unused_columns = ['titleType', 'originalTitle', 'isAdult', 'endYear', 'genres']
titles_table = title_basics.drop(unused_columns, axis=1)

In [23]:
# Insert only the titles that are not stored yet. tconst is the table's
# primary key, so appending the full frame a second time (for example on
# Restart & Run All) would fail with a duplicate-key error. On an empty
# table every row is new, so the first run inserts the whole frame.
existing_tconst = pd.read_sql("SELECT tconst FROM title_basics;", conn)['tconst']
new_titles = titles_table[~titles_table['tconst'].isin(existing_tconst)]
# The displayed result is the value returned by to_sql for this insert.
new_titles.to_sql("title_basics", conn, index=False, if_exists='append')

86979

In [31]:
# Read the table back to verify the rows are now stored
all_titles_query = "SELECT * FROM title_basics;"
pd.read_sql(sql=all_titles_query, con=conn)

Unnamed: 0,tconst,primary_title,start_year,runtime,created_at,updated_at
0,tt0035423,Kate & Leopold,2001.0,118,2023-10-26 22:59:07,2023-10-26 22:59:07
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,2023-10-26 22:59:07,2023-10-26 22:59:07
2,tt0069049,The Other Side of the Wind,2018.0,122,2023-10-26 22:59:07,2023-10-26 22:59:07
3,tt0088751,The Naked Monster,2005.0,100,2023-10-26 22:59:07,2023-10-26 22:59:07
4,tt0096056,Crime and Punishment,2002.0,126,2023-10-26 22:59:07,2023-10-26 22:59:07
...,...,...,...,...,...,...
86974,tt9914942,Life Without Sara Amat,2019.0,74,2023-10-26 22:59:11,2023-10-26 22:59:11
86975,tt9915872,The Last White Witch,2019.0,97,2023-10-26 22:59:11,2023-10-26 22:59:11
86976,tt9916170,The Rehearsal,2019.0,51,2023-10-26 22:59:11,2023-10-26 22:59:11
86977,tt9916190,Safeguard,2020.0,95,2023-10-26 22:59:11,2023-10-26 22:59:11


# Import Ratings

In [24]:
# Load the ratings CSV, then show its structure and first rows
ratings_csv = 'Data/titles_ratings.csv'
ratings = pd.read_csv(ratings_csv, low_memory=False)
ratings.info()
ratings.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71900 entries, 0 to 71899
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         71900 non-null  object 
 1   averageRating  71900 non-null  float64
 2   numVotes       71900 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175
2,tt0069049,6.7,7754
3,tt0088751,5.2,336
4,tt0096056,5.6,846


In [25]:
# List the DataFrame's column names so they can be compared with the
# field names of the SQL table
ratings.columns

Index(['tconst', 'averageRating', 'numVotes'], dtype='object')

In [26]:
# Show the fields and types the ratings SQL table defines
describe = pd.read_sql(sql="DESCRIBE ratings;", con=conn)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(12),NO,PRI,,
1,average_rating,float,YES,,,
2,number_of_votes,int,YES,,,
3,updated_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP
4,created_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED


In [27]:
# Map the CSV's camelCase headers onto the SQL table's snake_case fields
rename_map = dict(
    averageRating="average_rating",
    numVotes="number_of_votes",
)
ratings = ratings.rename(columns=rename_map)
ratings.head(5)

Unnamed: 0,tconst,average_rating,number_of_votes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175
2,tt0069049,6.7,7754
3,tt0088751,5.2,336
4,tt0096056,5.6,846


In [28]:
# Insert only the ratings that are not stored yet. tconst is the table's
# primary key, so appending the full frame a second time (for example on
# Restart & Run All) would fail with a duplicate-key error. On an empty
# table every row is new, so the first run inserts the whole frame.
existing_tconst = pd.read_sql("SELECT tconst FROM ratings;", conn)['tconst']
new_ratings = ratings[~ratings['tconst'].isin(existing_tconst)]
# The displayed result is the value returned by to_sql for this insert.
new_ratings.to_sql("ratings", conn, index=False, if_exists='append')

71900

In [30]:
# Read the table back to verify the rows are now stored
all_ratings_query = "SELECT * FROM ratings;"
pd.read_sql(sql=all_ratings_query, con=conn)

Unnamed: 0,tconst,average_rating,number_of_votes,updated_at,created_at
0,tt0035423,6.4,87153,2023-10-26 23:03:35,2023-10-26 23:03:35
1,tt0062336,6.4,175,2023-10-26 23:03:35,2023-10-26 23:03:35
2,tt0069049,6.7,7754,2023-10-26 23:03:35,2023-10-26 23:03:35
3,tt0088751,5.2,336,2023-10-26 23:03:35,2023-10-26 23:03:35
4,tt0096056,5.6,846,2023-10-26 23:03:35,2023-10-26 23:03:35
...,...,...,...,...,...
71895,tt9914942,6.6,178,2023-10-26 23:03:38,2023-10-26 23:03:38
71896,tt9915872,6.4,9,2023-10-26 23:03:38,2023-10-26 23:03:38
71897,tt9916170,7.0,7,2023-10-26 23:03:38,2023-10-26 23:03:38
71898,tt9916190,3.7,243,2023-10-26 23:03:38,2023-10-26 23:03:38


# Confirm the Database has Been Updated

In [29]:
# List the database's tables once more after the inserts
show_tables_query = "SHOW TABLES;"
pd.read_sql(sql=show_tables_query, con=conn)

Unnamed: 0,Tables_in_movies
0,genres
1,ratings
2,title_basics
3,title_genres


In [32]:
# Schema of the title_basics table, queried through the engine
describe = pd.read_sql(sql="DESCRIBE title_basics;", con=engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(12),NO,PRI,,
1,primary_title,varchar(255),YES,,,
2,start_year,float,YES,,,
3,runtime,int,YES,,,
4,created_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
5,updated_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP


In [33]:
# First five rows stored in title_basics
titles_preview_query = "SELECT * FROM title_basics LIMIT 5;"
pd.read_sql(sql=titles_preview_query, con=engine)

Unnamed: 0,tconst,primary_title,start_year,runtime,created_at,updated_at
0,tt0035423,Kate & Leopold,2001.0,118,2023-10-26 22:59:07,2023-10-26 22:59:07
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,2023-10-26 22:59:07,2023-10-26 22:59:07
2,tt0069049,The Other Side of the Wind,2018.0,122,2023-10-26 22:59:07,2023-10-26 22:59:07
3,tt0088751,The Naked Monster,2005.0,100,2023-10-26 22:59:07,2023-10-26 22:59:07
4,tt0096056,Crime and Punishment,2002.0,126,2023-10-26 22:59:07,2023-10-26 22:59:07


In [34]:
# Schema of the ratings table, queried through the engine
describe = pd.read_sql(sql="DESCRIBE ratings;", con=engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(12),NO,PRI,,
1,average_rating,float,YES,,,
2,number_of_votes,int,YES,,,
3,updated_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP
4,created_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED


In [35]:
# First five rows stored in ratings
ratings_preview_query = "SELECT * FROM ratings\nLIMIT 5"
pd.read_sql(sql=ratings_preview_query, con=engine)

Unnamed: 0,tconst,average_rating,number_of_votes,updated_at,created_at
0,tt0035423,6.4,87153,2023-10-26 23:03:35,2023-10-26 23:03:35
1,tt0062336,6.4,175,2023-10-26 23:03:35,2023-10-26 23:03:35
2,tt0069049,6.7,7754,2023-10-26 23:03:35,2023-10-26 23:03:35
3,tt0088751,5.2,336,2023-10-26 23:03:35,2023-10-26 23:03:35
4,tt0096056,5.6,846,2023-10-26 23:03:35,2023-10-26 23:03:35


In [36]:
# Schema of the genres table, queried through the engine
describe = pd.read_sql(sql="DESCRIBE genres;", con=engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,genre_id,int,NO,PRI,,auto_increment
1,genre_name,varchar(45),YES,,,
2,created_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
3,updated_at,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP


In [37]:
# First five rows stored in genres
genres_preview_query = "SELECT * FROM genres LIMIT 5;"
pd.read_sql(sql=genres_preview_query, con=engine)

Unnamed: 0,genre_id,genre_name,created_at,updated_at


In [38]:
# Schema of the title_genres table, queried through the engine
describe = pd.read_sql(sql="DESCRIBE title_genres;", con=engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(45),NO,,,
1,genre_id,int,NO,,,


In [39]:
# First five rows stored in title_genres
title_genres_preview_query = "SELECT * FROM title_genres LIMIT 5;"
pd.read_sql(sql=title_genres_preview_query, con=engine)

Unnamed: 0,tconst,genre_id
