# Get the raw Census data into shape
* move to script when ready
* [Census GEOIDs](https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html)

In [1]:
import numpy as np
import pandas as pd
import intake
import os

## Initial cleaning for censusapi dfs

In [2]:
# Downloaded from censusapi
# One tract-level CSV per table; the targets are unpacked in the same order as the paths.
pop, housing, pub, agg_pub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Census/population_tract.csv',
        '../data/Census/housing_units_tract.csv',
        '../data/Census/public_assistance_tract.csv',
        '../data/Census/aggregate_public_assistance_tract.csv',
    )
)

In [3]:
# Collect the censusapi frames under short keys so they can be cleaned in one loop
censusapi = dict(pop = pop, housing = housing, pub = pub, agg_pub = agg_pub)

In [4]:
for key, df in censusapi.items():
    # Create GEOID, must come out to 11 characters:
    # 2-digit state + 3-digit county + 6-digit tract.
    # The pieces are cast to str and left-padded with zeros, because numeric parsing
    # of the CSV drops leading zeros. The tract piece was previously left unpadded,
    # which produced short GEOIDs for tracts whose code starts with 0.
    geoid = (
        df['state'].astype(str).str.zfill(2)
        + df['county'].astype(str).str.zfill(3)
        + df['tract'].astype(str).str.zfill(6)
    )
    # Build a new frame instead of dropping columns in place, so the frames loaded
    # from CSV stay intact and this cell can be re-run after rebuilding the dict.
    long_df = (
        df.drop(columns = ['state', 'county', 'tract', 'NAME'])
        .assign(GEOID = geoid)
        .melt(id_vars = ['GEOID', 'year'], var_name = 'variable')
    )
    # Replacing the value of an existing key is safe while iterating over .items()
    censusapi[key] = long_df

In [5]:
# Write all the censusapi dfs into 1 df
# `DataFrame.append` was removed in pandas 2.0; one pd.concat call stacks the same
# frames (original row indices kept) without re-copying the result on every iteration.
long = pd.concat(list(censusapi.values()))

## Initial cleaning for tidycensus dfs

In [6]:
# Downloaded from tidycensus
# One tract-level CSV per table; the targets are unpacked in the same order as the paths.
emp, income, edu, pov, pov_fam, food = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Census/employment_tract.csv',
        '../data/Census/income_tract.csv',
        '../data/Census/educational_attainment_tract.csv',
        '../data/Census/poverty_tract.csv',
        '../data/Census/poverty_families_tract.csv',
        '../data/Census/food_stamps_tract.csv',
    )
)

In [7]:
# Collect the tidycensus frames under short keys so they can be cleaned in one loop
tidycensus = dict(
    emp = emp,
    income = income,
    edu = edu,
    pov = pov,
    pov_fam = pov_fam,
    food = food,
)

for key, df in tidycensus.items():
    # Cast GEOID to str and left-pad to 11 characters to restore any dropped leading zero.
    geoid = df['GEOID'].astype(str).str.zfill(11)
    # Stack the estimate and moe columns into rows; `type` records which column each
    # value came from. var_name is a scalar (its documented type) rather than a list.
    long_df = df.assign(GEOID = geoid).melt(
        id_vars = ['GEOID', 'NAME', 'year', 'variable'],
        value_vars = ['estimate', 'moe'],
        var_name = 'type',
    )
    # Suffix the variable code with E (estimate) or M (margin of error). The suffix is
    # built as a new Series; an in-place replace on a column selection may act on a copy.
    suffix = long_df['type'].map({'estimate': 'E', 'moe': 'M'})
    long_df['variable'] = long_df['variable'] + suffix
    # Replacing the value of an existing key is safe while iterating over .items()
    tidycensus[key] = long_df.drop(columns = ['NAME', 'type'])

In [8]:
# Write all the tidycensus dfs into 1 df
# `DataFrame.append` was removed in pandas 2.0; one pd.concat call stacks the same
# frames (original row indices kept) without re-copying the result on every iteration.
long2 = pd.concat(list(tidycensus.values()))

## Append dfs together & export as parquet

In [9]:
# `DataFrame.append` was removed in pandas 2.0; pd.concat stacks the two long frames the same way.
# NOTE: `all` shadows the Python builtin. The name is kept because the parquet export
# further down refers to it.
all = pd.concat([long, long2])
all.head()

Unnamed: 0,GEOID,year,variable,value
0,6037141400,2010,B01003_001E,4493.0
1,6037141500,2010,B01003_001E,2715.0
2,6037141600,2010,B01003_001E,3693.0
3,6037141700,2010,B01003_001E,3002.0
4,6037143100,2010,B01003_001E,3900.0


In [10]:
# Persist the combined long frame as parquet on S3
raw_census_uri = 's3://hcid-cdbg-project-ita-data/data/raw/raw_census_data.parquet'
all.to_parquet(raw_census_uri)

## Clean up long df
* Keep the data in long format; it is easier to rename the variables that way

In [11]:
# Reload the combined long frame from the parquet file on S3
raw_census_uri = 's3://hcid-cdbg-project-ita-data/data/raw/raw_census_data.parquet'
df = pd.read_parquet(raw_census_uri)

In [12]:
#df.to_stata('../data/raw_census.dta')

In [None]:
# Debug aid: set DEBUG_GEOID to an 11-character tract GEOID (e.g. '06037141400') to
# iterate on a single tract. It stays None by default so that a Restart & Run All keeps
# every tract; the unconditional filter would have reduced the whole dataset to one tract.
DEBUG_GEOID = None
if DEBUG_GEOID is not None:
    df = df[df.GEOID == DEBUG_GEOID]

## Tag the table

In [13]:
# Generate column that identifies the table it belongs to
# Each pair is (prefix the variable code must start with, label stored in `table`).
# Pairs are tried in order; a code matching none of them is labelled 'NA'.
table_prefixes = (
    ('B01003', 'pop'),
    ('B25001', 'housing'),
    ('S1903', 'income'),
    ('S1501', 'edu'),
    ('S1701', 'pov'),
    ('S1702', 'povfam'),
    ('S2201', 'food'),
    ('S2301', 'emp'),
    ('B19058', 'pubassist'),
    ('B19067', 'aggpubassist'),
)

table = [
    next((label for prefix, label in table_prefixes if code.startswith(prefix)), 'NA')
    for code in df.variable
]

df['table'] = table

In [14]:
# Number of rows tagged with each table label
df['table'].value_counts()

food            1731348
pov             1266840
emp              394128
edu              375360
income            93840
pubassist         75072
aggpubassist      37536
housing           37536
pop               37536
povfam            28152
Name: table, dtype: int64

In [19]:
# Drop tables that don't have consistent variables over the years
drop_tables = ['pov', 'food', 'povfam']
# .copy() gives an independent frame, so the column assignments in later cells do not
# write into a slice of the unfiltered data (which triggers SettingWithCopyWarning).
df = df[~df.table.isin(drop_tables)].copy()

## Tag main variable

In [21]:
def pop_vars(row):
    """Return 'tot_pop' when row.variable contains '_001', otherwise 'NA'."""
    return 'tot_pop' if '_001' in row.variable else 'NA'

In [22]:
def housing_vars(row):
    """Return 'tot_housing' when row.variable contains '_001', otherwise 'NA'."""
    return 'tot_housing' if '_001' in row.variable else 'NA'

In [23]:
def emp_vars(row):
    """Map the column token found in row.variable to the employment main-variable label.

    Tokens are tried in order ('_C01' to '_C04') and the first one contained in the
    variable code decides the label; 'NA' is returned when none is present.
    """
    column_labels = (
        ('_C01', 'pop16'),
        ('_C02', 'lf16'),
        ('_C03', 'emp16'),
        ('_C04', 'unemprate16'),
    )
    code = row.variable
    for token, label in column_labels:
        if token in code:
            return label
    return 'NA'

In [24]:
def income_vars(row):
    """Map the column token found in row.variable to the income main-variable label.

    '_C01' is checked before '_C02'; 'NA' is returned when neither is present.
    """
    code = row.variable
    for token, label in (('_C01', 'hh'), ('_C02', 'medincome')):
        if token in code:
            return label
    return 'NA'

In [None]:
""" IGNORE TABLES

# Table S1701 (poverty). Variables are not consistent from 2012-2017.
def pov_var2(row):
    if row.variable.find('_001') != -1:
        return 'pop_pov'
        
# Table S1702 (poverty for families). Variables are not consistent from 2010-2017.        
# Table S2201 (food stamps). Variables are not consistent from 2012-2017.

def pov_vars(row):
    if row.variable.find('_C01') != -1:
        return 'pop'
    elif row.variable.find('_C02') != -1:
        return 'pop_pov'
    elif row.variable.find('_C03') != -1:
        return 'pct_pov'
    else:
        return 'NA'
    
    
def food_vars(row):
    if row.variable.find('_C01') != -1:
        return 'hh'
    elif row.variable.find('_C02') != -1:
        return 'pct_hh'
    elif row.variable.find('_C03') != -1:
        return 'hh_food'
    elif row.variable.find('_C04') != -1:
        return 'pct_hh_food'
    else:
        return 'NA'
"""

In [25]:
def edu_vars(row):
    """Return 'pop' when row.variable contains '_C01', otherwise 'NA'."""
    return 'pop' if '_C01' in row.variable else 'NA'

In [26]:
def pubassist_vars(row):
    """Map the row token found in row.variable to the public-assistance label.

    '_001' is checked before '_002'; 'NA' is returned when neither is present.
    """
    code = row.variable
    for token, label in (('_001', 'hh'), ('_002', 'hh_pubassist')):
        if token in code:
            return label
    return 'NA'

In [27]:
def aggpubassist_vars(row):
    """Return 'aggincome' when row.variable contains '_001', otherwise 'NA'."""
    return 'aggincome' if '_001' in row.variable else 'NA'

In [28]:
def pick_table(row):
    """Route a row to the main-variable tagger for its table.

    Looks up row.table among the known table labels and returns whatever the matching
    tagger returns for the row. Rows whose table has no tagger yield None.
    """
    taggers = {
        'pop': pop_vars,
        'housing': housing_vars,
        'emp': emp_vars,
        'income': income_vars,
        'edu': edu_vars,
        'pubassist': pubassist_vars,
        'aggpubassist': aggpubassist_vars,
    }
    tagger = taggers.get(row.table)
    if tagger is None:
        return None
    return tagger(row)

# Row-wise pass: one main-variable label per observation
df['main_var'] = df.apply(pick_table, axis = 1)

## Tag secondary variable

In [29]:
def emp_var2(row):
    """Map the row token found in row.variable to the employment secondary label.

    Tokens are tried in the order listed and the first one contained in the variable
    code decides the label; 'NA' is returned when none is present. '_024' has no
    entry, so it falls through to 'NA'.
    """
    # NOTE(review): '_001' maps to 'hh', the same label income_var2 uses -- confirm that
    # this is the intended label for the employment table.
    row_labels = (
        ('_001', 'hh'),
        ('_009', 'race1'),
        ('_010', 'white'),
        ('_011', 'black'),
        ('_012', 'amerind'),
        ('_013', 'asian'),
        ('_014', 'pacis'),
        ('_015', 'other'),
        ('_016', 'race2'),
        ('_017', 'hisp'),
        ('_018', 'nonhisp'),
        ('_019', 'pop20'),
        ('_020', 'male'),
        ('_021', 'female'),
        ('_022', 'femalewchild'),
        ('_023', 'pov'),
        ('_025', 'pop25'),
        ('_026', 'lhs'),
        ('_027', 'hs'),
        ('_028', 'college'),
        ('_029', 'ba'),
    )
    code = row.variable
    for token, label in row_labels:
        if token in code:
            return label
    return 'NA'

In [30]:
def income_var2(row):
    """Map the row token found in row.variable to the income secondary label.

    Tokens '_001' to '_010' are tried in order and the first one contained in the
    variable code decides the label; 'NA' is returned when none is present.
    """
    row_labels = (
        ('_001', 'hh'),
        ('_002', 'white'),
        ('_003', 'black'),
        ('_004', 'amerind'),
        ('_005', 'asian'),
        ('_006', 'pacis'),
        ('_007', 'other'),
        ('_008', 'race2'),
        ('_009', 'hisp'),
        ('_010', 'nonhisp'),
    )
    code = row.variable
    for token, label in row_labels:
        if token in code:
            return label
    return 'NA'

In [31]:
def edu_var2(row):
    """Map the row token found in row.variable to the education secondary label.

    Tokens '_006' to '_015' are tried in order and the first one contained in the
    variable code decides the label; 'NA' is returned when none is present.
    """
    row_labels = (
        ('_006', 'pop25'),
        ('_007', 'pop25_9'),
        ('_008', 'pop25_12'),
        ('_009', 'pop25_hs'),
        ('_010', 'pop25_college'),
        ('_011', 'pop25_aa'),
        ('_012', 'pop25_ba'),
        ('_013', 'pop25_ma'),
        ('_014', 'pop25_pct_hsplus'),
        ('_015', 'pop25_pct_baplus'),
    )
    code = row.variable
    for token, label in row_labels:
        if token in code:
            return label
    return 'NA'

In [32]:
def pick_secondary_var(row):
    """Route a row to the secondary-variable tagger for its table.

    Only the 'emp', 'income' and 'edu' tables have a secondary tagger; rows from any
    other table yield None.
    """
    taggers = {
        'emp': emp_var2,
        'income': income_var2,
        'edu': edu_var2,
    }
    tagger = taggers.get(row.table)
    if tagger is None:
        return None
    return tagger(row)

# Row-wise pass: one secondary-variable label (or None) per observation
df['second_var'] = df.apply(pick_secondary_var, axis = 1)

## Tag estimate or margin of error

In [78]:
# Generate column that identifies whether it's estimate or margin of error (might drop margin of error later)
# Classify on the LAST character of the variable code: 'E' -> '' (estimate),
# 'M' -> '_moe' (margin of error). Searching for the letter anywhere in the code would
# mis-tag a code that has an 'E' earlier on but ends in 'M'. Any other ending is tagged
# 'NA', so the list always has one entry per row and the assignment cannot fail on a
# length mismatch.
suffix_tags = {'E': '', 'M': '_moe'}
est_moe = [suffix_tags.get(code[-1:], 'NA') for code in df.variable]

df['est_moe'] = est_moe

## Construct variable name that can be used to rename columns

In [79]:
# Drop margin of error obs
# Margin-of-error rows carry the tag '_moe' (with the leading underscore) in est_moe;
# comparing against 'moe' never matched, so no row was actually dropped.
# .copy() keeps the later column assignments off a slice of the unfiltered frame.
df = df[df.est_moe != '_moe'].copy()

In [80]:
# Replace missing labels (None) in both label columns with an empty string
label_cols = ['main_var', 'second_var']
df[label_cols] = df[label_cols].fillna(value = "")

In [81]:
# Construct our new variable name
def _compose_var_name(row):
    """Return main_var alone when second_var is empty, else main_var + '_' + second_var."""
    if row.second_var == "":
        return row.main_var
    return row.main_var + '_' + row.second_var

df['new_var'] = df.apply(_compose_var_name, axis = 1)

In [82]:
# Preview the first five rows of the tagged frame
# NOTE(review): the saved output lists a `length` column that no visible cell creates --
# possibly stale kernel state; confirm with Restart & Run All.
df.head(5)

Unnamed: 0,GEOID,year,variable,value,table,main_var,second_var,new_var,length,est_moe
0,6037141400,2010,B01003_001E,4493.0,pop,tot_pop,,tot_pop,12,
1,6037141500,2010,B01003_001E,2715.0,pop,tot_pop,,tot_pop,12,
2,6037141600,2010,B01003_001E,3693.0,pop,tot_pop,,tot_pop,12,
3,6037141700,2010,B01003_001E,3002.0,pop,tot_pop,,tot_pop,12,
4,6037143100,2010,B01003_001E,3900.0,pop,tot_pop,,tot_pop,12,
