# AWS Relational Database Service with Python 

##### Author: Alex Sherman | alsherman@deloitte.com

In [48]:
import os
from IPython.core.display import display, HTML
from configparser import ConfigParser, ExtendedInterpolation

# Load project settings. ExtendedInterpolation enables ${section:option}
# references between values inside config.ini.
config = ConfigParser(interpolation=ExtendedInterpolation())

# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so check that list here instead of failing further down
# with an opaque KeyError on the 'DATABASES' section.
if not config.read('../../config.ini'):
    raise FileNotFoundError('could not read config file: ../../config.ini')

# Dataset locations come from the [DATABASES] section of config.ini.
STACKOVERFLOW_ZIP_NAME = config['DATABASES']['STACKOVERFLOW_ZIP_NAME']
STACKOVERFLOW_CSV_NAME = config['DATABASES']['STACKOVERFLOW_CSV_NAME']

# Connection URL template: USERNAME, PASSWORD, AWS_ENDPOINT and DB_NAME are
# placeholders to be replaced with the values of the target RDS instance.
DB_PATH = 'mysql://USERNAME:PASSWORD@AWS_ENDPOINT/DB_NAME'

In [49]:
# sqlalchemy imports
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Text, Integer
from sqlalchemy.sql import text
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.engine import reflection
import pandas as pd

### AWS RDS (MySQL)

##### Installation:
- `pip install mysqlclient` (the MySQL driver that SQLAlchemy uses by default for `mysql://` URLs)

In [50]:
# Echo the connection URL currently in use.
# NOTE(review): the URL embeds the password, so with real credentials this
# cell would write them into the saved notebook output.
DB_PATH

'mysql://USERNAME:PASSWORD@AWS_ENDPOINT/DB_NAME'

In [30]:
# declarative base that collects the metadata of the models declared on it
Base = declarative_base()


class Sections(Base):
    """ORM model mapped to the SECTIONS table."""

    __tablename__ = 'SECTIONS'

    # auto-incrementing integer primary key
    section_id = Column(Integer, primary_key=True, autoincrement=True)
    section_name = Column(Text)
    section_text = Column(Text)


# connect to the database and create the tables registered on Base
engine = create_engine(DB_PATH)
Base.metadata.create_all(bind=engine)

In [43]:
# create a session - connect to db
engine = create_engine(DB_PATH)
Session = sessionmaker(bind=engine)
session = Session()

# create a record; section_id is omitted so the autoincrement key assigns it
section = Sections(
    section_name='first section',
    section_text='text from the first section',
)

# add and commit the record into the database
# (each run of this cell inserts another identical row)
session.add(section)
try:
    session.commit()
except Exception:
    # a failed commit leaves the session unusable until it is rolled back;
    # roll back so later cells that reuse `session` still work, then re-raise
    session.rollback()
    raise

# query database to check if data was added
pd.read_sql('SELECT * FROM SECTIONS', con=engine)

Unnamed: 0,section_id,section_name,section_text
0,5,first section,text from the first section
1,6,first section,text from the first section


In [32]:
# Optional - delete every row from SECTIONS (the table itself is kept)
# The SQL is wrapped in text(), matching the other raw statements in this
# notebook; SQLAlchemy 2.0 no longer accepts a plain string in execute().
session.execute(text('DELETE FROM SECTIONS'))
session.commit()

### Load Existing Dataset to AWS RDS (MySQL)

In [44]:
# show the configured dataset names (this only prints them; it does not
# check that the files exist)
print(
    'ZIP NAME: {} \n'.format(STACKOVERFLOW_ZIP_NAME),
    'CSV NAME: {}'.format(STACKOVERFLOW_CSV_NAME),
    sep='\n',
)

ZIP NAME: C:\Users\alsherman\Desktop\PycharmProjects\firm_initiatives\ml_guild\raw_data\stackoverflow.zip 

CSV NAME: Train.csv


In [45]:
# a new Base is created here, so only the model below is registered on it
Base = declarative_base()


class StackOverflow(Base):
    """ORM model mapped to the STACKOVERFLOW table."""

    __tablename__ = 'STACKOVERFLOW'

    Id = Column(Integer, primary_key=True, nullable=False)
    Title = Column(Text, nullable=True)
    Body = Column(Text, nullable=True)
    Tags = Column(Text, nullable=True)


# connect to the database and create the tables registered on Base
engine = create_engine(DB_PATH)
Base.metadata.create_all(bind=engine)

In [35]:
import pandas as pd
import datetime as dt
import zipfile

start = dt.datetime.now()  # set start time
chunksize = 10000  # set number of rows to load at a time
engine = create_engine(DB_PATH)  # connect to database

# open the archive and the CSV member with context managers so both handles
# are closed even if a chunk fails to load
with zipfile.ZipFile(STACKOVERFLOW_ZIP_NAME) as zf, zf.open(STACKOVERFLOW_CSV_NAME) as csv_file:
    chunks = pd.read_csv(
        csv_file,
        chunksize=chunksize,
        iterator=True,
        encoding='latin-1',
        nrows=50000,
    )

    for ind, df in enumerate(chunks):
        # print metrics (time/row) for populating database; total_seconds()
        # is the full elapsed time, whereas timedelta.seconds drops whole days
        elapsed = int((dt.datetime.now() - start).total_seconds())
        print('{} seconds: completed {} rows'.format(elapsed, ind * chunksize))

        # incrementally load the database
        df.to_sql('STACKOVERFLOW', con=engine, if_exists='append', index=False)

0 seconds: completed 0 rows
20 seconds: completed 10000 rows
39 seconds: completed 20000 rows
57 seconds: completed 30000 rows
77 seconds: completed 40000 rows


In [46]:
# open an engine and a session for the queries that follow
engine = create_engine(DB_PATH)
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()

In [47]:
# read the whole STACKOVERFLOW table into a DataFrame and preview five rows
df = pd.read_sql(sql='SELECT * FROM STACKOVERFLOW', con=engine)
df.head(n=5)

Unnamed: 0,Id,Title,Body,Tags
0,1,How to check if an uploaded file is an image w...,<p>I'd like to check if an uploaded file is an...,php image-processing file-upload upload mime-t...
1,2,How can I prevent firefox from closing when I ...,"<p>In my favorite editor (vim), I regularly us...",firefox
2,3,R Error Invalid type (list) for variable,<p>I am import matlab file and construct a dat...,r matlab machine-learning
3,4,How do I replace special characters in a URL?,"<p>This is probably very simple, but I simply ...",c# url encoding
4,5,How to modify whois contact details?,<pre><code>function modify(.......)\n{\n $mco...,php api file-get-contents


In [22]:
# Optional - drop the STACKOVERFLOW table (destructive: removes the table and its rows)
# IF EXISTS keeps the statement from failing when the table is already gone.
query = text('DROP TABLE IF EXISTS STACKOVERFLOW')

# Explicit opt-in switch instead of a commented-out call, so running every
# cell never drops the table by accident; set to True to execute the drop.
DROP_STACKOVERFLOW = False
if DROP_STACKOVERFLOW:
    session.execute(query)
    session.commit()

<sqlalchemy.engine.result.ResultProxy at 0x2d18a425668>

#### Determine the schema of an existing database

In [38]:
# list every table that exists in the connected database
query = text('SHOW TABLES')
tables = session.execute(query).fetchall()
print('TABLES: {} \n'.format(tables))

# describe each column of the STACKOVERFLOW table
insp = reflection.Inspector.from_engine(engine)
row_template = 'name: {} | type: {} | nullable: {}'
for col in insp.get_columns('STACKOVERFLOW'):
    print(row_template.format(col['name'], col['type'], col['nullable']))

TABLES: [('SECTIONS',), ('STACKOVERFLOW',)] 

name: Id | type: INTEGER(11) | nullable: False
name: Title | type: TEXT | nullable: True
name: Body | type: TEXT | nullable: True
name: Tags | type: TEXT | nullable: True
