# Analysis for household income in Alaska, Florida & Kentuky (example)

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import sqlalchemy as db

The variables we're going to use are: 

* b19001_002: Less than 10,000
* b19001_003: 10,000 to 14,999
* b19001_004: 15,000 to 19,999
* b19001_005: 20,000 to 24,999
* b19001_006: 25,000 to 29,999
* b19001_007: 30,000 to 34,999
* b19001_008: 35,000 to 39,999
* b19001_009: 40,000 to 44,999
* b19001_010: 45,000 to 49,999
* b19001_011: 50,000 to 59,999
* b19001_012: 60,000 to 74,999
* b19001_013: 75,000 to 99,999
* b19001_014: 100,000 to 124,999
* b19001_015: 125,000 to 149,999
* b19001_016: 150,000 to 199,999
* b19001_017: 200,000 or more

<i>All of them span HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) for Households and are in USD</i>

In [3]:
alaska_raw = pd.read_csv('AK.csv')
alaska_raw.head(2)

Unnamed: 0,SummaryLevel,State,StateFIPS,CountyFIPS,PlaceFIPS,CBSACode,CongressDistrict,GEOID,AreaName,B19001_001,...,B19325_093,B19325_094,B19325_095,B19326_001,B19326_002,B19326_003,B19326_004,B19326_005,B19326_006,B19326_007
0,40,AK,2,,,,,04000US02,Alaska,250235,...,2953,3527,2450,31944,40197,61240,19058,25706,47792,13310
1,40,AK,2,,,,,04001US02,Alaska -- Urban,168119,...,1956,2310,1627,33836,40946,59243,19045,27811,48469,14038


For this example, we're going to use the first row for each state and the variables:

* State
* b19001_002
* b19001_003
* b19001_004
* b19001_005
* b19001_006
* b19001_007
* b19001_008
* b19001_009
* b19001_010
* b19001_011
* b19001_012
* b19001_013
* b19001_014
* b19001_015
* b19001_016
* b19001_017

In [4]:
income_cols = ['State', 'B19001_002', 'B19001_003', 'B19001_004', 'B19001_005', 'B19001_006', 'B19001_007', 'B19001_008', 'B19001_009', 'B19001_010', 'B19001_011', 'B19001_012', 'B19001_013', 'B19001_014', 'B19001_015', 'B19001_016', 'B19001_017']

In [5]:
alaska = alaska_raw.loc[[0, 1], income_cols]
alaska.drop([1], inplace=True)
alaska.head()

Unnamed: 0,State,B19001_002,B19001_003,B19001_004,B19001_005,B19001_006,B19001_007,B19001_008,B19001_009,B19001_010,B19001_011,B19001_012,B19001_013,B19001_014,B19001_015,B19001_016,B19001_017
0,AK,9216,8418,8345,9349,8311,9295,9504,10355,8585,17878,26891,36934,28526,19613,22037,16978


Rename cols for better understanding

In [6]:
cols_rename = {
                'B19001_002': '<10k',
                'B19001_003': '10k ~ 14,999',
                'B19001_004': '15k ~ 19,999',
                'B19001_005': '20k ~ 24,999',
                'B19001_006': '25k ~ 29,999',
                'B19001_007': '30k ~ 34,999',
                'B19001_008': '35k ~ 39,999',
                'B19001_009': '40k ~ 44,999',
                'B19001_010': '45k ~ 49,999',
                'B19001_011': '50k ~ 59,999',
                'B19001_012': '60k ~ 74,999',
                'B19001_013': '75k ~ 99,999',
                'B19001_014': '100k ~ 124,999',
                'B19001_015': '125k ~ 149,999',
                'B19001_016': '150k ~ 199,999',
                'B19001_017': '200k+'
                }

In [7]:
alaska.rename(columns = cols_rename, inplace = True)

In [8]:
alaska.head(5)

Unnamed: 0,State,<10k,"10k ~ 14,999","15k ~ 19,999","20k ~ 24,999","25k ~ 29,999","30k ~ 34,999","35k ~ 39,999","40k ~ 44,999","45k ~ 49,999","50k ~ 59,999","60k ~ 74,999","75k ~ 99,999","100k ~ 124,999","125k ~ 149,999","150k ~ 199,999",200k+
0,AK,9216,8418,8345,9349,8311,9295,9504,10355,8585,17878,26891,36934,28526,19613,22037,16978


Apply the transformations to the other tables

In [9]:
# Florida
florida_raw = pd.read_csv('FL.csv')
florida = florida_raw.loc[[0, 1], income_cols]
florida.drop([1], inplace=True)
florida.rename(columns = cols_rename, inplace = True)
florida.head()

Unnamed: 0,State,<10k,"10k ~ 14,999","15k ~ 19,999","20k ~ 24,999","25k ~ 29,999","30k ~ 34,999","35k ~ 39,999","40k ~ 44,999","45k ~ 49,999","50k ~ 59,999","60k ~ 74,999","75k ~ 99,999","100k ~ 124,999","125k ~ 149,999","150k ~ 199,999",200k+
0,FL,556637,398394,423592,445928,421568,416468,391972,379530,331287,612464,738333,832126,527915,297921,297029,322098


In [10]:
# Kentuky
kentuky_raw = pd.read_csv('KY.csv')
kentuky = kentuky_raw.loc[[0, 1], income_cols]
kentuky.drop([1], inplace=True)
kentuky.rename(columns = cols_rename, inplace = True)
kentuky.head()

Unnamed: 0,State,<10k,"10k ~ 14,999","15k ~ 19,999","20k ~ 24,999","25k ~ 29,999","30k ~ 34,999","35k ~ 39,999","40k ~ 44,999","45k ~ 49,999","50k ~ 59,999","60k ~ 74,999","75k ~ 99,999","100k ~ 124,999","125k ~ 149,999","150k ~ 199,999",200k+
0,KY,166777,114927,109055,103977,97375,95769,87724,86355,73833,137027,169183,190731,118183,64316,53630,49355


Concat the tables

In [11]:
states = pd.concat([alaska, florida, kentuky])
states.head()

Unnamed: 0,State,<10k,"10k ~ 14,999","15k ~ 19,999","20k ~ 24,999","25k ~ 29,999","30k ~ 34,999","35k ~ 39,999","40k ~ 44,999","45k ~ 49,999","50k ~ 59,999","60k ~ 74,999","75k ~ 99,999","100k ~ 124,999","125k ~ 149,999","150k ~ 199,999",200k+
0,AK,9216,8418,8345,9349,8311,9295,9504,10355,8585,17878,26891,36934,28526,19613,22037,16978
0,FL,556637,398394,423592,445928,421568,416468,391972,379530,331287,612464,738333,832126,527915,297921,297029,322098
0,KY,166777,114927,109055,103977,97375,95769,87724,86355,73833,137027,169183,190731,118183,64316,53630,49355


Load to the DB

In [None]:
engine = db.create_engine("") # this is not included due to security reasons
engine.connect()

In [None]:
 with engine.begin() as connection:
        states.to_sql('states', con=connection, schema='nVY8WCmoZ0', if_exists='replace', index=False)

Check if everything is ok

In [None]:
states_check = pd.read_sql_table("states", engine)
states_check.head()