                Data Engineering for "percentage by state -> deaths and incidents count"

In [2]:
# Dependancies
import pandas as pd
import os
import csv

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

# Import and establish Base for which classes will be constructed 
from sqlalchemy.ext.declarative import declarative_base

# Define Declarative Base 
Base = declarative_base()

In [3]:
# Read in CSV File
df=pd.read_csv('db/percentage_by_state.csv', dtype={'Zip': 'str'})
df.head(2)

Unnamed: 0,state,abr,lat,lng,CancerType,incidence,population,percentage_incident,prevalence,alive,year,death_count,prevalence_1_year,percentage_deaths,diff
0,Alabama,AL,32.601011,-86.680736,Female Breast,18227,12434320,0.146586,15,1865148,2011-2015,3394,150272954,0.027295,0
1,Alaska,AK,61.302501,-158.77502,Female Breast,2100,1751388,0.119905,12,21016656,2011-2015,310,120177254,0.0177,0


In [6]:
# Inspecting the Data
df.shape

(50, 15)

In [7]:
# Check columns for missing data
df.isnull().sum()

state                  0
abr                    0
lat                    0
lng                    0
CancerType             0
incidence              0
population             0
percentage_incident    0
prevalence             0
alive                  0
year                   0
death_count            0
prevalence_1_year      0
percentage_deaths      0
diff                   0
dtype: int64

In [8]:
# Get min and max (for leaflet part)
print(df['percentage_deaths'].min())
print(df['percentage_deaths'].max())

0.017700247
0.031030538


In [9]:
# Drop unnessesary columns. NOTE: data has no Missing/NaN values
dropped_df = df.drop(['CancerType', 'prevalence', 'alive', 'year', 'prevalence_1_year', 'diff'], axis=1)
dropped_df.head(2)

Unnamed: 0,state,abr,lat,lng,incidence,population,percentage_incident,death_count,percentage_deaths
0,Alabama,AL,32.601011,-86.680736,18227,12434320,0.146586,3394,0.027295
1,Alaska,AK,61.302501,-158.77502,2100,1751388,0.119905,310,0.0177


In [10]:
# Creating an engine, sqlite database and connection to the engine
engine = create_engine("sqlite:///breast_cancer.sqlite", echo=False)
conn = engine.connect()

In [11]:
# Creating a Class & table 
class Percentage(Base):
    __tablename__ = 'states_percentage_table'
    id = Column(Integer, primary_key=True)
    state = Column(String(255))
    abr = Column(String(255))
    lat = Column(Float)
    lng = Column(Float)
    incidence = Column(Float)
    population = Column(Float)
    percentage_incident = Column(Float)
    death_count = Column(Float)
    percentage_deaths = Column(Float)   

In [12]:
Base.metadata.create_all(conn)

In [11]:
# Transferring the data from dataframe to the sqlite db
dropped_df.to_sql(name='states_percentage_table', con=conn, if_exists='append', index=False)

In [12]:
# Inspecting the Table to make sure the table is there
inspector = inspect(engine)
inspector.get_table_names()

['states_percentage_table']

In [13]:
# Checking the data 
engine.execute('SELECT * FROM states_percentage_table LIMIT 20').fetchall()

[(1, 'Alabama', 'AL', 32.601011199999995, -86.6807365, 18227.0, 12434320.0, 0.14658622300000002, 3394.0, 0.027295421),
 (2, 'Alaska', 'AK', 61.3025006, -158.7750198, 2100.0, 1751388.0, 0.11990489800000001, 310.0, 0.017700247),
 (3, 'Arizona', 'AZ', 34.1682185, -111.930907, 22214.0, 16686987.0, 0.133121695, 3957.0, 0.023713089),
 (4, 'Arkansas', 'AR', 34.7519275, -92.13137840000002, 10399.0, 7527040.0, 0.138155238, 2050.0, 0.027235142),
 (5, 'California', 'CA', 37.2718745, -119.2704153, 128452.0, 96473279.0, 0.13314775, 22033.0, 0.022838447999999997),
 (6, 'Colorado', 'CO', 38.9979339, -105.55056699999999, 17933.0, 13143731.0, 0.136437668, 2796.0, 0.021272499),
 (7, 'Connecticut', 'CT', 41.518783500000005, -72.75750699999999, 16000.0, 9201326.0, 0.17388798100000002, 2288.0, 0.024865981000000002),
 (8, 'Delaware', 'DE', 39.145251, -75.41892059999999, 3994.0, 2387966.0, 0.16725531300000002, 672.0, 0.028141104),
 (9, 'District of C', 'DC', 38.899348700000004, -77.0145666, 2350.0, 1700582.0

In [18]:
# Data Engineering for d3 Chart
# Read in CSV File
df_d3=pd.read_csv('db/usa_rates.csv', dtype={'Zip': 'str'})
df_d3.head(2)

Unnamed: 0,year,incidents,deaths
0,1990,1625524.87,43500
1,1991,1697072.08,43800


In [19]:
# Creating a Class & table 
class Trend(Base):
    __tablename__ = 'trend'
    id = Column(Integer, primary_key=True)
    year = Column(Float)
    incidents = Column(Float)
    deaths = Column(Float)

In [21]:
# Transferring the data from dataframe to the sqlite db
df_d3.to_sql(name='trend', con=conn, if_exists='replace', index=False)

In [25]:
# Inspecting the Table to make sure the table is there
inspector = inspect(engine)
inspector.get_table_names()

['d3_table', 'states_percentage_table', 'trend']

In [24]:
# Check the data
engine.execute('SELECT * FROM trend LIMIT 20').fetchall()

[(1990, 1625524.87, 43500),
 (1991, 1697072.08, 43800),
 (1992, 1763082.42, 44000),
 (1993, 1827096.92, 44600),
 (1994, 1875580.7, 44900),
 (1995, 1915769.55, 45100),
 (1996, 1937757.11, 44700),
 (1997, 1949758.44, 44100),
 (1998, 1956063.33, 43900),
 (1999, 1963266.09, 43800),
 (2000, 1976208.22, 43800),
 (2001, 1990702.93, 43900),
 (2002, 2000845.49, 43900),
 (2003, 2010417.97, 43800),
 (2004, 2017281.06, 43500),
 (2005, 2034279.72, 43200),
 (2006, 2046374.24, 42900),
 (2007, 2055840.48, 42800),
 (2008, 2067915.64, 43100),
 (2009, 2083667.48, 43500)]