In [1]:
# Import dependencies
import pandas as pd
import os

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

# Import and establish Base for which classes will be constructed 
from sqlalchemy.ext.declarative import declarative_base

In [4]:
# Pull in original cancer data csv from https://www.gapminder.org/data/ and convert to dataframes
cancer_mortality_df = pd.read_csv(os.path.join('db', 'breast_mortality_global_crude.csv'))
cancer_new_df = pd.read_csv(os.path.join('db', 'breast_new_cases_global_crude.csv'))
cancer_mortality_df

Unnamed: 0,country,1950,1951,1952,1953,1954,1955,1956,1957,1958,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Afghanistan,,,,,,,,,,...,36.40,36.60,36.50,36.40,36.20,36.00,35.80,35.60,35.50,35.40
1,Albania,,,,,,,,,,...,13.40,13.50,13.30,13.00,12.80,12.60,12.40,12.30,12.20,12.00
2,Algeria,,,,,,,,,,...,14.10,14.10,14.20,14.20,14.20,14.30,14.30,14.40,14.40,14.40
3,Andorra,,,,,,,,,,...,18.40,18.50,18.90,18.90,19.10,19.20,19.20,19.30,19.40,19.40
4,Angola,,,,,,,,,,...,20.70,20.60,20.40,20.10,19.90,19.90,19.90,19.80,19.80,19.80
5,Antigua and Barbuda,,,,,,,,,,...,22.00,21.70,21.50,21.70,21.90,22.10,21.90,22.60,22.50,22.50
6,Argentina,,,,,,,,,,...,26.50,25.90,25.60,25.40,25.00,24.80,24.60,24.20,24.00,23.80
7,Armenia,,,,,,,,,,...,29.20,30.20,29.90,29.50,27.90,26.70,24.10,22.80,23.40,22.60
8,Australia,20.1,19.40,21.0,21.0,20.5,20.40,19.70,19.10,19.00,...,19.10,18.80,18.60,18.30,18.00,17.80,17.50,17.50,17.60,17.60
9,Austria,,,,,,15.40,15.00,14.90,15.40,...,21.10,20.80,20.70,20.20,19.90,19.80,19.50,19.20,19.00,18.80


## Clean Mortality Data

In [3]:
# Check columns for missing data
cancer_mortality_df.isnull().sum()

country      0
1950       177
1951       170
1952       168
1953       167
1954       167
1955       159
1956       159
1957       158
1958       158
1959       157
1960       155
1961       151
1962       152
1963       147
1964       146
1965       148
1966       145
1967       145
1968       141
1969       142
1970       143
1971       144
1972       142
1973       142
1974       141
1975       139
1976       141
1977       138
1978       140
          ... 
1987       119
1988       121
1989       121
1990         3
1991         3
1992         2
1993         1
1994         1
1995         1
1996         0
1997         0
1998         0
1999         0
2000         0
2001         0
2002         0
2003         0
2004         0
2005         0
2006         0
2007         0
2008         0
2009         0
2010         0
2011         0
2012         0
2013         0
2014         0
2015         0
2016         0
Length: 68, dtype: int64

In [8]:
# Delete columns for years we have missing data for
cancer_mortality_df = cancer_mortality_df.drop(cancer_mortality_df.loc[:, '1950':'1995'].columns, axis = 1)


KeyError: '1950'

In [5]:
# Verify data are of correct type
cancer_mortality_df.dtypes

country     object
1996       float64
1997       float64
1998       float64
1999       float64
2000       float64
2001       float64
2002       float64
2003       float64
2004       float64
2005       float64
2006       float64
2007       float64
2008       float64
2009       float64
2010       float64
2011       float64
2012       float64
2013       float64
2014       float64
2015       float64
2016       float64
dtype: object

## Clean New Cases data

In [6]:
# Check columns for missing data
cancer_new_df.isnull().sum()

country      1
1953       185
1954       185
1955       185
1956       185
1957       185
1958       183
1959       183
1960       183
1961       183
1962       183
1963       182
1964       182
1965       182
1966       182
1967       182
1968       180
1969       180
1970       180
1971       180
1972       180
1973       180
1974       180
1975       180
1976       180
1977       180
1978       180
1979       180
1980       179
1981       179
          ... 
1987       176
1988       176
1989       176
1990         0
1991         0
1992         0
1993         0
1994         0
1995         0
1996         0
1997         0
1998         0
1999         0
2000         0
2001         0
2002         0
2003         0
2004         0
2005         0
2006         0
2007         0
2008         0
2009         0
2010         0
2011         0
2012         0
2013         0
2014         0
2015         0
2016         0
Length: 65, dtype: int64

In [7]:
# For consistency, delete columns that have missing mortality data
cancer_new_df = cancer_new_df.drop(cancer_new_df.loc[:, '1953':'1995'].columns, axis = 1)

In [8]:
# Verify data are of correct type
cancer_new_df.dtypes

country     object
1996       float64
1997       float64
1998       float64
1999       float64
2000       float64
2001       float64
2002       float64
2003       float64
2004       float64
2005       float64
2006       float64
2007       float64
2008       float64
2009       float64
2010       float64
2011       float64
2012       float64
2013       float64
2014       float64
2015       float64
2016       float64
dtype: object

## Enter data into database

In [9]:
# Define Declarative Base 
Base = declarative_base()

In [10]:
# Creating an engine, sqlite database and connection to the engine
engine = create_engine("sqlite:///cancer.sqlite", echo=False)
conn = engine.connect()

In [11]:
# Create Mortality and Cases Classes
class Mortality(Base):
    __tablename__ = 'world_mortality'
    id = Column(Integer, primary_key=True)
    country = Column(String(255))
    mortality_crude = Column(Float)

class Cases(Base):
    __tablename__ = 'world_cases_new'
    id = Column(Integer, primary_key=True)
    country = Column(String(255))
    new_cases_crude = Column(Float) 

In [12]:
# Create metadata from classes
Base.metadata.create_all(conn)

In [14]:
# Export data from dataframes to tables in sqlite database
cancer_mortality_df.to_sql(name='world_mortality', con=conn, if_exists='replace', index=False)
cancer_new_df.to_sql(name='world_cases_new', con=conn, if_exists='replace', index=False)

In [15]:
# Inspect table to verify existance
inspector = inspect(engine)
inspector.get_table_names()

['world_cases_new', 'world_mortality']

In [16]:
# Inspect data for world_mortality table
engine.execute('SELECT * FROM world_mortality LIMIT 10').fetchall()

[('Afghanistan', 32.1, 32.4, 32.8, 33.3, 33.9, 34.5, 34.7, 35.0, 35.5, 35.9, 36.2, 36.4, 36.6, 36.5, 36.4, 36.2, 36.0, 35.8, 35.6, 35.5, 35.4),
 ('Albania', 9.7, 10.1, 10.6, 11.2, 11.6, 11.9, 12.3, 12.8, 13.1, 13.4, 13.5, 13.4, 13.5, 13.3, 13.0, 12.8, 12.6, 12.4, 12.3, 12.2, 12.0),
 ('Algeria', 12.8, 12.6, 12.4, 12.4, 12.4, 12.4, 12.7, 12.9, 13.3, 13.9, 14.2, 14.1, 14.1, 14.2, 14.2, 14.2, 14.3, 14.3, 14.4, 14.4, 14.4),
 ('Andorra', 23.7, 22.9, 22.1, 21.4, 20.8, 20.1, 19.6, 19.1, 19.3, 18.8, 18.6, 18.4, 18.5, 18.9, 18.9, 19.1, 19.2, 19.2, 19.3, 19.4, 19.4),
 ('Angola', 19.3, 19.1, 20.1, 19.9, 19.8, 19.9, 20.0, 20.4, 20.7, 20.7, 20.9, 20.7, 20.6, 20.4, 20.1, 19.9, 19.9, 19.9, 19.8, 19.8, 19.8),
 ('Antigua and Barbuda', 22.9, 22.8, 22.5, 22.5, 23.2, 23.1, 23.1, 22.9, 22.8, 22.6, 22.3, 22.0, 21.7, 21.5, 21.7, 21.9, 22.1, 21.9, 22.6, 22.5, 22.5),
 ('Argentina', 27.9, 28.0, 27.9, 27.8, 27.7, 27.7, 27.7, 27.8, 27.4, 27.0, 26.7, 26.5, 25.9, 25.6, 25.4, 25.0, 24.8, 24.6, 24.2, 24.0, 23.8),
 ('A

In [17]:
# Inspect data for world_cases_new table
engine.execute('SELECT * FROM world_cases_new LIMIT 10').fetchall()

[('Afghanistan', 30.9, 31.3, 31.6, 32.0, 32.4, 32.7, 33.0, 33.3, 33.6, 34.0, 34.2, 34.3, 34.5, 34.7, 35.0, 34.8, 34.7, 34.6, 34.5, 34.4, 34.4),
 ('Albania', 16.4, 17.0, 17.8, 18.8, 21.1, 21.9, 22.9, 24.1, 25.4, 26.9, 27.3, 27.6, 27.9, 28.3, 28.7, 28.6, 28.6, 28.6, 28.6, 28.6, 28.6),
 ('Algeria', 18.2, 18.1, 18.1, 18.0, 18.0, 18.7, 19.5, 20.3, 21.3, 22.3, 22.9, 23.6, 24.4, 25.2, 26.0, 26.8, 27.6, 28.4, 29.3, 30.2, 31.1),
 ('Andorra', 93.3, 93.1, 92.9, 92.8, 92.7, 92.5, 92.5, 92.4, 92.3, 92.4, 93.0, 93.6, 94.2, 94.9, 95.7, 96.4, 97.3, 98.1, 99.0, 99.9, 101.0),
 ('Angola', 21.8, 21.9, 21.9, 22.0, 22.0, 22.0, 21.9, 21.9, 21.9, 21.9, 21.8, 21.7, 21.6, 21.5, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0),
 ('Antigua and Barbuda', 53.2, 53.6, 54.2, 54.7, 55.3, 55.4, 55.6, 55.8, 56.1, 56.4, 56.9, 57.4, 57.9, 58.4, 59.0, 59.8, 60.7, 61.6, 62.5, 63.4, 64.4),
 ('Argentina', 56.8, 57.5, 58.3, 59.0, 59.8, 60.0, 60.3, 60.6, 60.9, 61.1, 61.3, 61.5, 61.7, 61.9, 62.1, 62.7, 63.3, 63.8, 64.4, 65.0, 65.7),
 (