# Get the raw Census data into shape
* move to script when ready
* [Census GEOIDs](https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html)

In [1]:
import numpy as np
import pandas as pd
import intake
import os

## Initial cleaning for censusapi dfs

In [None]:
# Downloaded from censusapi: one tract-level CSV per table, loaded in the order the names are listed
pop, housing, pub, agg_pub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Census/population_tract.csv',
        '../data/Census/housing_units_tract.csv',
        '../data/Census/public_assistance_tract.csv',
        '../data/Census/aggregate_public_assistance_tract.csv',
    )
)

In [None]:
# Collect the censusapi dfs under short table names so they can all be cleaned in one loop
censusapi = dict(pop=pop, housing=housing, pub=pub, agg_pub=agg_pub)

In [None]:
# Build the tract GEOID for every censusapi table, then reshape each table to long format
for key, df in censusapi.items():
    # The GEOID parts are stored as integers (per the original note), which drops leading zeros.
    # Cast to string and pad each part back to its fixed width so the GEOID comes out to
    # 11 characters: state (2) + county (3) + tract (6).
    for col, width in [('state', 2), ('county', 3), ('tract', 6)]:
        df[col] = df[col].astype(str).str.zfill(width)
    df['GEOID'] = df['state'] + df['county'] + df['tract']
    df.drop(columns = ['state', 'county', 'tract', 'NAME'], inplace = True)
    # One row per GEOID-year-variable; melt puts the numbers in its default 'value' column
    long_df = df.melt(id_vars = ['GEOID', 'year'], var_name = 'variable')
    # Overwriting an existing key while iterating is safe because the dict's size does not change
    censusapi[key] = long_df

In [None]:
# Write all the censusapi dfs into 1 df
# pd.concat stacks every table in a single call. It replaces DataFrame.append, which is
# deprecated (removed in pandas 2.0) and re-copied the accumulated df on every iteration.
# Row indexes are kept as-is, matching what append did.
long = pd.concat(list(censusapi.values()))

## Initial cleaning for tidycensus dfs

In [None]:
# Downloaded from tidycensus: one tract-level CSV per table, loaded in the order the names are listed
emp, income, edu, pov, pov_fam, food = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Census/employment_tract.csv',
        '../data/Census/income_tract.csv',
        '../data/Census/educational_attainment_tract.csv',
        '../data/Census/poverty_tract.csv',
        '../data/Census/poverty_families_tract.csv',
        '../data/Census/food_stamps_tract.csv',
    )
)

In [None]:
# Save all the dfs into a dictionary
tidycensus = {'emp': emp, 'income': income, 'edu': edu, 'pov': pov,
              'pov_fam': pov_fam, 'food': food}

# Reshape each tidycensus table so it lines up with the censusapi long format:
# one row per GEOID-year-variable, with the variable code ending in E (estimate) or M (margin of error)
for key, df in tidycensus.items():
    # Cast GEOID to string and left-pad with zeros to 11 characters
    df['GEOID'] = df['GEOID'].astype(str).str.zfill(11)
    # var_name must be a scalar for single-level columns (a list is rejected by newer pandas)
    long_df = df.melt(id_vars = ['GEOID', 'NAME', 'year', 'variable'],
                      value_vars = ['estimate', 'moe'], var_name = 'type')
    # Assign the result back instead of calling replace(inplace=True) on the selected column,
    # which does not reliably update long_df under copy-on-write
    long_df['type'] = long_df['type'].replace({'estimate': 'E', 'moe': 'M'})
    long_df['variable'] = long_df['variable'] + long_df['type']
    long_df = long_df.drop(columns = ['NAME', 'type'])
    tidycensus[key] = long_df

In [None]:
# Write all the tidycensus dfs into 1 df
# pd.concat stacks every table in a single call. It replaces DataFrame.append, which is
# deprecated (removed in pandas 2.0) and re-copied the accumulated df on every iteration.
# Row indexes are kept as-is, matching what append did.
long2 = pd.concat(list(tidycensus.values()))

## Append dfs together & export as parquet

In [None]:
# Stack the censusapi and tidycensus long dfs (pd.concat replaces the deprecated DataFrame.append).
# NOTE: `all` shadows the Python builtin; the name is kept because the parquet export cell uses it.
all = pd.concat([long, long2])
all.head()

In [None]:
# Export the combined long df as a parquet file on S3
all.to_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_data.parquet')

## Clean up long df
* Keep the df in long format; it is easier to rename each variable that way

In [2]:
# Read the raw long df from the same S3 path the export cell writes to
df = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_data.parquet')

In [47]:
#df.to_stata('../data/raw_census.dta')

In [3]:
# Generate column that identifies the table it belongs to
# Each variable code starts with its Census table ID; codes matching none of these get 'NA'
table_prefixes = {
    'B01003': 'pop',
    'B25001': 'housing',
    'S1903': 'income',
    'S1501': 'edu',
    'S1701': 'pov',
    'S1702': 'povfam',
    'S2201': 'food',
    'S2301': 'emp',
    'B19058': 'pubassist',
    'B19067': 'aggpubassist',
}

table = [
    next(
        (name for prefix, name in table_prefixes.items() if code.startswith(prefix)),
        'NA',
    )
    for code in df.variable
]

df['table'] = table

In [4]:
# Generate column that identifies whether it's estimate or margin of error (might drop margin of error later)
# The E/M marker is the last character of the variable code, so only that character is checked;
# searching the whole code could match the same letter elsewhere in the string.
# Codes ending in neither letter get 'NA', which keeps the list the same length as df.
suffix_labels = {'E': '', 'M': '_moe'}

est_moe = [suffix_labels.get(code[-1:], 'NA') for code in df.variable]

df['est_moe'] = est_moe

In [5]:
df.head()

Unnamed: 0,GEOID,year,variable,value,table,est_moe
0,6037141400,2010,B01003_001E,4493.0,pop,
1,6037141500,2010,B01003_001E,2715.0,pop,
2,6037141600,2010,B01003_001E,3693.0,pop,
3,6037141700,2010,B01003_001E,3002.0,pop,
4,6037143100,2010,B01003_001E,3900.0,pop,


In [8]:
df.table.value_counts()

food            2153628
pov             1266840
emp              394128
income            93840
edu               93840
pubassist         75072
pop               37536
housing           37536
aggpubassist      37536
povfam            28152
Name: table, dtype: int64

In [44]:
df[df.table=='housing'].variable.value_counts()

B25001_001M    18768
B25001_001E    18768
Name: variable, dtype: int64

In [42]:
def pop_vars(row):
    """Label a row of the population table.

    Returns 'tot_pop' when row.variable contains '_001', otherwise 'NA'.
    """
    return 'tot_pop' if '_001' in row.variable else 'NA'

In [None]:
def housing_vars(row):
    """Label a row of the housing table.

    Returns 'tot_housing' when row.variable contains '_001', otherwise 'NA'.
    """
    return 'tot_housing' if '_001' in row.variable else 'NA'

In [43]:
def emp_vars(row):
    """Label a row of the employment table by the column marker in its variable code.

    The markers are checked in order and the first one found in row.variable wins:
    '_C01' -> 'pop16', '_C02' -> 'lf16', '_C03' -> 'emp16', '_C04' -> 'unemprate16'.
    Returns 'NA' when none of the markers is present.
    """
    column_labels = (
        ('_C01', 'pop16'),
        ('_C02', 'lf16'),
        ('_C03', 'emp16'),
        ('_C04', 'unemprate16'),
    )
    for marker, label in column_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [None]:
def emp_var2(row):
    """Label a row of the employment table by the line-number marker in its variable code.

    The markers are checked in order and the first one found in row.variable wins.
    Returns 'NA' when none of the markers is present, matching the fallback used by
    pop_vars, housing_vars and emp_vars. Markers '_002'-'_008' and '_024' have no
    label here, so codes carrying them also come back as 'NA'.
    """
    line_labels = (
        ('_001', 'tot_hh'),
        ('_009', 'tot_1race'),
        ('_010', 'tot_white'),
        ('_011', 'tot_black'),
        ('_012', 'tot_amerind'),
        ('_013', 'tot_asian'),
        ('_014', 'tot_pacis'),
        ('_015', 'tot_other'),
        ('_016', 'tot_2race'),
        ('_017', 'tot_hisp'),
        ('_018', 'tot_nonhisp'),
        ('_019', 'tot_pop20'),
        ('_020', 'tot_male'),
        ('_021', 'tot_female'),
        ('_022', 'tot_femalewchild'),
        ('_023', 'tot_pov'),
        ('_025', 'tot_pop25'),
        ('_026', 'tot_lesshs'),
        ('_027', 'tot_hs'),
        ('_028', 'tot_college'),
        ('_029', 'tot_ba'),
    )
    for marker, label in line_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [45]:
df[df.table=='emp'].variable.value_counts()

S2301_C01_023M    2346
S2301_C01_014E    2346
S2301_C03_025M    2346
S2301_C04_010M    2346
S2301_C04_016M    2346
                  ... 
S2301_C01_001E    2346
S2301_C02_022M    2346
S2301_C01_021E    2346
S2301_C03_029M    2346
S2301_C03_018E    2346
Name: variable, Length: 168, dtype: int64

In [37]:
def pick_table(row):
    """Send a row to the labelling function that matches its table.

    Rows whose table is 'pop', 'housing' or 'emp' go to pop_vars, housing_vars or
    emp_vars respectively; rows from any other table yield None.
    """
    source_table = row.table
    if source_table == 'pop':
        return pop_vars(row)
    if source_table == 'housing':
        return housing_vars(row)
    if source_table == 'emp':
        return emp_vars(row)
    return None


# Label every row: pick_table is called once per row (axis = 1) and rows from tables it does not handle get None
df['var2'] = df.apply(pick_table, axis = 1)

In [48]:
df[(df.table=='emp')].head(10)

Unnamed: 0,GEOID,year,variable,value,table,est_moe,var2
9,6037101110,2011,S2301_C01_001E,4037.0,emp,,pop16
10,6037101110,2011,S2301_C01_009E,4028.0,emp,,pop16
11,6037101110,2011,S2301_C01_010E,3049.0,emp,,pop16
12,6037101110,2011,S2301_C01_011E,50.0,emp,,pop16
13,6037101110,2011,S2301_C01_012E,36.0,emp,,pop16
14,6037101110,2011,S2301_C01_013E,505.0,emp,,pop16
15,6037101110,2011,S2301_C01_014E,0.0,emp,,pop16
16,6037101110,2011,S2301_C01_015E,388.0,emp,,pop16
17,6037101110,2011,S2301_C01_016E,9.0,emp,,pop16
18,6037101110,2011,S2301_C01_017E,759.0,emp,,pop16


In [None]:
# Combine the table name with the estimate / margin-of-error suffix, e.g. 'pop' or 'pop_moe'
df['var'] = df['table'] + df['est_moe']