# Air Quality

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Shared path prefix of the per-year daily CSV files; the four-digit year and
# '.csv' are appended when the files are loaded.
# NOTE(review): 42602 is presumably the pollutant parameter code — confirm.
fpath = '../../data/airqual/daily_42602_'

# Only these columns are read from each yearly file.
cols = [
    'Date Local',
    'Arithmetic Mean',
    'State Name',
    'County Name',
    'City Name',
]

In [3]:
# Load one frame per year (2010-2020), reading only the columns listed in `cols`.
# The generator yields the frames in year order and is unpacked straight into
# the per-year names that the following cells use.
(
    air2010, air2011, air2012, air2013, air2014, air2015,
    air2016, air2017, air2018, air2019, air2020,
) = (
    pd.read_csv(fpath + str(yr) + '.csv', usecols=cols)
    for yr in range(2010, 2021)
)

In [5]:
def getMonthly(air_df,year):
    '''
    Aggregate a frame of daily measurements into monthly means.

    Rows are grouped by 'State Name', 'County Name', 'City Name' and the
    calendar month taken from 'Date Local'; 'Arithmetic Mean' is averaged
    within each group. The caller's frame is not modified.

    Parameters
    ----------
    air_df : pandas.DataFrame
        Daily data with the columns 'Date Local' (anything pd.to_datetime
        accepts), 'Arithmetic Mean', 'State Name', 'County Name', 'City Name'.
    year : str or int
        Year label used as the prefix of 'Meas_Date'. It is not checked
        against the dates actually present in air_df.

    Returns
    -------
    pandas.DataFrame
        One row per (state, county, city, month) with the columns
        'State Name', 'County Name', 'City Name', 'Arithmetic Mean' and
        'Meas_Date' (a 'YYYY-MM' string).
    '''
    # assign() works on a copy, so the parsed month never leaks into air_df.
    monthly = (
        air_df
        .assign(Month=pd.to_datetime(air_df['Date Local']).dt.month)
        .groupby(['State Name','County Name', 'City Name','Month'])[['Arithmetic Mean']]
        .mean()
        .reset_index()
    )
    # Zero-pad the month so the labels sort correctly as text ('2010-02' < '2010-10').
    # str(year) lets callers pass the year as either a string or an int.
    monthly['Meas_Date'] = str(year) + '-' + monthly['Month'].astype(str).str.zfill(2)
    return monthly.drop(columns='Month')

In [6]:
# Collapse each yearly frame of daily readings to monthly means, tagging the
# rows with the matching year label. The (frame, label) pairs are built from
# the daily frames before any of the names is rebound.
(
    air2010, air2011, air2012, air2013, air2014, air2015,
    air2016, air2017, air2018, air2019, air2020,
) = (
    getMonthly(daily, label)
    for daily, label in (
        (air2010, '2010'),
        (air2011, '2011'),
        (air2012, '2012'),
        (air2013, '2013'),
        (air2014, '2014'),
        (air2015, '2015'),
        (air2016, '2016'),
        (air2017, '2017'),
        (air2018, '2018'),
        (air2019, '2019'),
        (air2020, '2020'),
    )
)

In [15]:
# Stack the eleven yearly frames in chronological order. Each frame keeps its
# own row labels, so index values repeat across years in the combined frame.
air_quality = pd.concat(
    [
        air2010, air2011, air2012, air2013, air2014, air2015,
        air2016, air2017, air2018, air2019, air2020,
    ]
)

In [16]:
# Shorten the location column names and relabel the measurement column.
air_quality = air_quality.rename(
    columns={
        'State Name': 'State',
        'County Name': 'County',
        'City Name': 'City',
        'Arithmetic Mean': 'mean ppb',
    }
)

In [156]:
# Spot-check one randomly chosen row of the combined table.
# NOTE(review): the saved output below includes a `year` column that none of
# the cells shown above create — confirm which step adds it.
air_quality.sample()

Unnamed: 0,State,County,City,mean ppb,Meas_Date,year
2552,New York,Bronx,New York,12.315873,2018-07,2018


# Personal Income - Bureau of Economic Analysis
#### 1/ Real personal income for metropolitan areas is personal income divided by the RPPs and the national PCE price index. The result is a chained dollar estimate of real personal income, in thousands of chained 2012 dollars. Calculations are performed on unrounded data.
#### 2/ Real per capita personal income is total real personal income divided by total midyear population.

In [124]:
# Load the personal-income table and keep only the rows whose LineCode is 2.
# NOTE(review): LineCode 2 is presumably the per-capita series — confirm against the source file.
RPI = (
    pd.read_csv('../../data/real_personal_income.csv')
    .loc[lambda income: income.LineCode == 2]
)

In [125]:
# Remove the identifier columns that are not needed after the LineCode filter.
RPI = RPI.drop(columns=['GeoFips', 'LineCode'])

In [126]:
# Give every remaining row the same Description value.
RPI = RPI.assign(Description='Real per capita personal income')

In [127]:
# Reshape wide -> long: every column other than MetroArea/Description becomes
# a (year, Personal Income) row, with `year` holding the former column label.
# Description only serves as an id column during the melt and is dropped after.
RPI = (
    RPI
    .melt(
        id_vars=['MetroArea', 'Description'],
        var_name='year',
        value_name='Personal Income',
    )
    .drop(columns='Description')
)

In [132]:
# Use the same `City` column label as the air-quality table
# (presumably so the two tables can later be joined on it — confirm).
RPI = RPI.rename(columns=dict(MetroArea='City'))

In [133]:
# Inspect the long-format result: one row per (City, year) with its Personal Income value.
RPI

Unnamed: 0,City,year,Personal Income
0,Austin,2008,44521
1,Miami,2008,44447
2,New York,2008,48585
3,San Francisco,2008,54862
4,Austin,2009,42103
5,Miami,2009,40469
6,New York,2009,46606
7,San Francisco,2009,52251
8,Austin,2010,43390
9,Miami,2010,42729


# CC origination - Consumer Financial Protection Bureau (CFPB)

In [75]:
# Load the origination volumes and discard the `month` column.
CC_orig = (
    pd.read_csv('../../data/volume_data_Income_Level_CRC.csv')
    .drop(columns='month')
)
# Exclude the "High" and "Middle" income groups; every other row is kept.
CC_orig = CC_orig[~CC_orig['income_level_group'].isin(["High", "Middle"])]
# Optional year filter, currently disabled:
# CC_orig.date = pd.to_datetime(CC_orig.date)
# CC_orig['Year'] = CC_orig.date.dt.year
# CC_orig = CC_orig[CC_orig['Year'] > 2007]

In [155]:
# Inspect the filtered table; the original row labels are kept, so the index has gaps.
CC_orig

Unnamed: 0,date,vol,vol_unadj,income_level_group
2,2005-01,4.578106e+09,3.978007e+09,Moderate
3,2005-01,7.635135e+08,6.517910e+08,Low
6,2005-02,4.563908e+09,3.900504e+09,Moderate
7,2005-02,7.558710e+08,6.549358e+08,Low
10,2005-03,4.762458e+09,4.775794e+09,Moderate
...,...,...,...,...
643,2018-05,8.904970e+08,9.429447e+08,Low
646,2018-06,4.398129e+09,4.513570e+09,Moderate
647,2018-06,8.982546e+08,9.167395e+08,Low
650,2018-07,4.220966e+09,4.625639e+09,Moderate


# Census Data

SQL query for retrieval:

```sql
WITH acs_2018 AS (
  SELECT *
  FROM `bigquery-public-data.census_bureau_acs.zip_codes_2018_5yr`
),

acs_zip AS (
SELECT zip_code
FROM `bigquery-public-data.geo_us_boundaries.zip_codes`
),

acs_zipcode AS (
  SELECT *
  FROM acs_2018 a18
  JOIN acs_zip geo
  ON a18.geo_id = geo.zip_code
) 

SELECT *
FROM acs_zipcode
```

In [164]:
# Read zip_code as text so that leading zeros are preserved.
# NOTE(review): geo_id is joined to zip_code in the retrieval query but is read
# here with an inferred dtype, so it may lose leading zeros — confirm it is not
# used as a key downstream.
acs = pd.read_csv(
    '../../data/census-query.csv',
    dtype={'zip_code': str},
)

In [166]:
# Preview the first five rows; the frame is wide, so pandas elides the middle columns.
acs.head()

Unnamed: 0,geo_id,do_date,total_pop,households,male_pop,female_pop,median_age,male_under_5,male_5_to_9,male_10_to_14,...,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english,zip_code
0,87537,2014-01-01,2510,856,1283,1227,42.1,58,75,87,...,,304,171,80,593,20,,,,87537
1,87017,2014-01-01,346,112,150,196,50.9,0,0,12,...,,0,27,15,96,40,,,,87017
2,87528,2014-01-01,3505,727,1758,1747,27.9,260,194,159,...,,293,266,298,1237,138,,,,87528
3,87533,2014-01-01,133,58,49,84,25.8,0,12,8,...,,24,8,7,58,3,,,,87533
4,87511,2014-01-01,2896,787,1177,1719,36.0,142,63,44,...,,263,107,163,1008,279,,,,87511
