In [40]:
# Dependencies
import pandas as pd
import numpy as np


In [41]:
# Path to the cleaned primary-results CSV.
# Forward slashes are used so the literal contains no backslash escape
# sequences ('\e', '\p') and the path resolves on every operating system.
primary_results = 'data/election/primary_results_clean.csv'

In [42]:
# Load the election results with an explicit ISO-8859-1 encoding.
# The 'fips' column is run through a converter that left-pads each value with
# zeros to a width of five characters:
# https://stackoverflow.com/questions/23836277/add-leading-zeros-to-strings-in-pandas-dataframe
df_election = pd.read_csv(
    primary_results,
    encoding="ISO-8859-1",
    converters=dict(fips='{:0>5}'.format),
)

In [43]:
# Preview the first rows of the election DataFrame (head() with no argument)
df_election.head() 

Unnamed: 0,state,state_abbreviation,county,fips,party,candidate,votes,fraction_votes
0,Alabama,AL,Autauga,1001,Democrat,Bernie Sanders,544,0.182
1,Alabama,AL,Autauga,1001,Democrat,Hillary Clinton,2387,0.8
2,Alabama,AL,Autauga,1001,Republican,Ben Carson,1764,0.146
3,Alabama,AL,Autauga,1001,Republican,Donald Trump,5387,0.445
4,Alabama,AL,Autauga,1001,Republican,John Kasich,421,0.035


In [27]:
import pandas as pd
import sqlalchemy
import psycopg2
import matplotlib as plt
%matplotlib inline

from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres:password@localhost:5432/etlproject')
connection = engine.connect()

In [28]:
# Read every row and column of the public."FIPS" table into a DataFrame.
fips_query = '''SELECT * FROM public."FIPS";'''
df_FIPS = pd.read_sql_query(sql=fips_query, con=engine)

In [29]:
# Give the FIPS lookup columns the lowercase names used by the rest of the
# notebook. CLASSFP is renamed to 'classfips': that is the column name shown
# in the displayed frames and the one the later drop() on df_merge removes
# (renaming it to 'classfp' would make that drop raise a KeyError).
df_FIPS = df_FIPS.rename(
    columns={
        'ZIP': 'zipcode',
        'COUNTYNAME': 'county',
        'STATE': 'state',
        'STCOUNTYFP': 'fips',
        'CLASSFP': 'classfips',
    }
)

In [44]:
# Show a bounded preview instead of rendering the whole lookup table.
df_FIPS.head(10)

Unnamed: 0,zipcode,county,state,fips,classfips
0,36003,Autauga County,AL,01001,H1
1,36006,Autauga County,AL,01001,H1
2,36067,Autauga County,AL,01001,H1
3,36066,Autauga County,AL,01001,H1
4,36703,Autauga County,AL,01001,H1
5,36701,Autauga County,AL,01001,H1
6,36091,Autauga County,AL,01001,H1
7,36051,Autauga County,AL,01001,H1
8,36068,Autauga County,AL,01001,H1
9,36008,Autauga County,AL,01001,H1


In [45]:
# Join the FIPS lookup onto the election results by county FIPS code.
# how="left" keeps every election row, including any whose FIPS code has no
# match in df_FIPS. Each election row is repeated once per matching lookup row
# (one per ZIP code in the county).
# The join key is passed once: listing the same label twice adds nothing.
df_merge = pd.merge(df_election, df_FIPS, how="left", on="fips")

In [54]:
# Preview only: rename() returns a new frame and the result is not assigned,
# so df_merge itself still carries the 'county_y' column after this cell.
# head(10) keeps the rendered output small.
df_merge.rename(columns={"county_y":"county"}).head(10)

Unnamed: 0,state_x,state_abbreviation,county_x,fips,party,candidate,votes,fraction_votes,zipcode,county,state_y,classfips
0,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,AL,H1
1,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36006,Autauga County,AL,H1
2,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36067,Autauga County,AL,H1
3,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36066,Autauga County,AL,H1
4,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36703,Autauga County,AL,H1
5,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36701,Autauga County,AL,H1
6,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36091,Autauga County,AL,H1
7,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36051,Autauga County,AL,H1
8,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36068,Autauga County,AL,H1
9,Alabama,AL,Autauga,01001,Democrat,Bernie Sanders,544,0.182,36008,Autauga County,AL,H1


In [58]:
# Remove, in place, the columns that are not carried into the final frame.
# Running this cell a second time raises a KeyError because the columns are
# already gone.
unwanted_columns = ['state_x','county_x','state_y','classfips']
df_merge.drop(columns=unwanted_columns, inplace=True)


In [62]:
# Rename the remaining merge-suffixed county column to plain 'county', in place.
df_merge.rename(columns=dict(county_y="county"), inplace=True)

In [63]:
# Preview the first rows of the election/FIPS merge after the column clean-up
df_merge.head()

Unnamed: 0,state_abbreviation,fips,party,candidate,votes,fraction_votes,zipcode,county
0,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County
1,AL,1001,Democrat,Bernie Sanders,544,0.182,36006,Autauga County
2,AL,1001,Democrat,Bernie Sanders,544,0.182,36067,Autauga County
3,AL,1001,Democrat,Bernie Sanders,544,0.182,36066,Autauga County
4,AL,1001,Democrat,Bernie Sanders,544,0.182,36703,Autauga County


In [64]:
# Relative path of the 2014 tax CSV
tax_file = "data/tax/2014.csv"


In [65]:
# Load the tax data with an explicit ISO-8859-1 encoding.
# The 'zipcode' column is run through a converter that left-pads each value
# with zeros to a width of five characters.
df_tax = pd.read_csv(
    tax_file,
    encoding="ISO-8859-1",
    converters=dict(zipcode='{:0>5}'.format),
)

In [66]:
# Preview the first rows of the tax DataFrame
df_tax.head()

Unnamed: 0,statefips,state,zipcode,agi_stub,n1,mars1,mars2,mars4,prep,n2,...,a10300,n85530,a85530,n85300,a85300,n11901,a11901,n11902,a11902,year
0,25,MA,1001,1,2890,2300,290,260,1470,3130,...,1161,0,0,0,0,270,220,2340,3334,2014
1,25,MA,1001,2,2210,1360,490,320,1210,3620,...,5873,0,0,0,0,280,385,1900,4163,2014
2,25,MA,1001,3,1550,750,590,170,930,2940,...,9075,0,0,0,0,300,565,1230,3297,2014
3,25,MA,1001,4,920,240,600,70,560,2130,...,8240,0,0,0,0,200,501,710,2202,2014
4,25,MA,1001,5,1160,190,920,40,770,3000,...,20807,0,0,0,0,380,1548,750,2929,2014


In [70]:
# Keep only the two tax columns of interest plus the join key. copy() makes
# the subset an independent frame rather than a slice of df_tax.
# a00200 and a00100 are later renamed to "Salaries" and "Adjusted Gross Income".
tax_columns = ['a00200', 'a00100', 'zipcode']
df_merge_tx_clean = df_tax[tax_columns].copy()
df_merge_tx_clean.head()

Unnamed: 0,a00200,a00100,zipcode
0,23034,33609,1001
1,61608,82123,1001
2,73264,95040,1001
3,60764,78961,1001
4,119808,151278,1001


In [71]:
# Attach the tax columns to the election/ZIP rows by ZIP code.
# how="left" keeps every row of df_merge even when no tax row matches.
# The join key is passed once: listing the same label twice adds nothing.
# NOTE(review): df_merge_tx_clean holds several rows per zipcode (its preview
# shows the same zipcode on all five rows), so each election/ZIP row is
# repeated once per tax row. Confirm this fan-out is intended, or aggregate
# the tax data by zipcode before merging.
df_final = pd.merge(df_merge, df_merge_tx_clean, how="left", on="zipcode")

In [72]:
# Preview the first rows of the merged election/ZIP/tax frame
df_final.head()

Unnamed: 0,state_abbreviation,fips,party,candidate,votes,fraction_votes,zipcode,county,a00200,a00100
0,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,4955.0,5526.0
1,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,5477.0,6288.0
2,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,3990.0,5467.0
3,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,2992.0,4383.0
4,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,5055.0,8231.0


In [73]:
# Swap the tax column codes for readable labels, modifying df_final in place.
readable_labels = {
    "a00200": "Salaries",
    "a00100": "Adjusted Gross Income",
}
df_final.rename(columns=readable_labels, inplace=True)

In [74]:
# Preview the final frame again to check the renamed tax columns
df_final.head()

Unnamed: 0,state_abbreviation,fips,party,candidate,votes,fraction_votes,zipcode,county,Salaries,Adjusted Gross Income
0,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,4955.0,5526.0
1,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,5477.0,6288.0
2,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,3990.0,5467.0
3,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,2992.0,4383.0
4,AL,1001,Democrat,Bernie Sanders,544,0.182,36003,Autauga County,5055.0,8231.0


In [None]:
# Write the final frame to an Excel workbook in the working directory
# (default sheet name, index column included).
# NOTE(review): an .xlsx sheet holds at most 1,048,576 rows and pandas raises
# a ValueError beyond that. Confirm df_final fits, given the row fan-out of
# the two merges.
df_final.to_excel(excel_writer="output.xlsx")

In [None]:
# (rows, columns) of the final merged frame
df_final.shape

In [None]:
# Number of rows in the final merged frame
df_final.shape[0]