# Get the raw Census data into shape
* TODO: move this logic into a script once it is ready
* [Census GEOIDs](https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html)

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [2]:
"""
# Downloaded from censusapi
pop = pd.read_csv('../data/Census/population_tract.csv')
housing = pd.read_csv('../data/Census/housing_units_tract.csv')
pub = pd.read_csv('../data/Census/public_assistance_tract.csv') 
agg_pub = pd.read_csv('../data/Census/aggregate_public_assistance_tract.csv')

# Downloaded from tidycensus
emp = pd.read_csv('../data/Census/employment_tract.csv') 
income = pd.read_csv('../data/Census/income_tract.csv') 
edu = pd.read_csv('../data/Census/educational_attainment_tract.csv') 
pov = pd.read_csv('../data/Census/poverty_tract.csv') 
pov_fam = pd.read_csv('../data/Census/poverty_families_tract.csv')
food = pd.read_csv('../data/Census/food_stamps_tract.csv') 
"""

"\n# Downloaded from censusapi\npop = pd.read_csv('../data/Census/population_tract.csv')\nhousing = pd.read_csv('../data/Census/housing_units_tract.csv')\npub = pd.read_csv('../data/Census/public_assistance_tract.csv') \nagg_pub = pd.read_csv('../data/Census/aggregate_public_assistance_tract.csv')\n\n# Downloaded from tidycensus\nemp = pd.read_csv('../data/Census/employment_tract.csv') \nincome = pd.read_csv('../data/Census/income_tract.csv') \nedu = pd.read_csv('../data/Census/educational_attainment_tract.csv') \npov = pd.read_csv('../data/Census/poverty_tract.csv') \npov_fam = pd.read_csv('../data/Census/poverty_families_tract.csv')\nfood = pd.read_csv('../data/Census/food_stamps_tract.csv') \n"

## Clean up long df
* Keep the data in long format; it is easier to relabel each variable that way

In [3]:
# Load the raw Census extract (long format: one row per GEOID / year / variable).
df = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_data.parquet')

# Restrict to a single GEOID. The explicit .copy() makes the result an independent
# frame, so the column assignments in the following cells do not write into a
# slice of the full table (which is what triggers pandas' SettingWithCopyWarning).
df = df[df.GEOID=='06037141400'].copy()

## Tag the table

In [4]:
# ACS table ID -> short label used for that table throughout this notebook.
acs_tables = {
    'B01003': 'pop',
    'B25001': 'housing',
    'S1903': 'income',
    'S2301': 'emp',
    'S1501': 'edu',
    'S1702': 'povfam',
    'S2201': 'food',
    'B19058': 'pubassist',
    'B19067': 'aggpubassist',
}

In [5]:
# Position of the first underscore in each variable code (-1 when there is none).
# Kept as a column because a later cell drops 'cut' by name.
df['cut'] = df.variable.str.find('_')

# The ACS table ID is the text before the first underscore. Splitting with the
# vectorised .str accessor avoids a Python-level call per row, and a code with no
# underscore keeps its full text (slicing with cut == -1 would have silently
# dropped its last character).
df['table_name'] = df.variable.str.split('_', n=1).str[0]

# Map the table ID to its short label; IDs missing from acs_tables become NaN.
df['table'] = df.table_name.map(acs_tables)

In [6]:
"""
for t in ['pop', 'housing', 'income', 'edu', 'povfam', 'food', 'emp', 'pubassist', 'aggpubassist']:
    display(t)
    display(df[df.table==t].year.value_counts())
"""

"\nfor t in ['pop', 'housing', 'income', 'edu', 'povfam', 'food', 'emp', 'pubassist', 'aggpubassist']:\n    display(t)\n    display(df[df.table==t].year.value_counts())\n"

## Tag main variable

In [7]:
def pop_vars(row):
    """Main-variable label for a row of the 'pop' table.

    Returns 'tot_pop' when ``row.variable`` contains '_001', otherwise None.
    """
    return 'tot_pop' if '_001' in row.variable else None

In [8]:
def housing_vars(row):
    """Main-variable label for a row of the 'housing' table.

    Returns 'tot_housing' when ``row.variable`` contains '_001', otherwise None.
    """
    return 'tot_housing' if '_001' in row.variable else None

In [9]:
def emp_vars(row):
    """Main-variable label for a row of the 'emp' table.

    The label is chosen by the column token found in ``row.variable``:
    '_C01' -> 'pop16', '_C02' -> 'lf16', '_C03' -> 'emp16', '_C04' -> 'unempr16'.
    Tokens are tested in that order and the first hit wins; None is returned
    when none of them occurs.
    """
    token_labels = (
        ('_C01', 'pop16'),
        ('_C02', 'lf16'),
        ('_C03', 'emp16'),
        ('_C04', 'unempr16'),
    )
    for token, label in token_labels:
        if token in row.variable:
            return label
    return None

In [10]:
def income_vars(row):
    """Main-variable label for a row of the 'income' table.

    '_C01' in ``row.variable`` gives 'hh'; failing that, '_C02' gives
    'medincome'; otherwise None.
    """
    code = row.variable
    if '_C01' in code:
        return 'hh'
    if '_C02' in code:
        return 'medincome'
    return None

In [11]:
def edu_vars(row):
    """Main-variable label for a row of the 'edu' table.

    Returns 'pop' when ``row.variable`` contains '_C01', otherwise None.
    """
    return 'pop' if '_C01' in row.variable else None

In [12]:
def povfam_vars(row):
    """Main-variable label for a row of the 'povfam' table.

    '_C01' in ``row.variable`` gives 'families'; failing that, '_C02' gives
    'families_pov'; otherwise None.
    """
    code = row.variable
    if '_C01' in code:
        return 'families'
    if '_C02' in code:
        return 'families_pov'
    return None

In [13]:
def food_vars(row):
    """Main-variable label for a row of the 'food' table.

    '_C01' in ``row.variable`` gives 'hh'; '_C02' or '_C03' gives 'hh_food';
    anything else gives None. Tokens are tested in that order.
    """
    # NOTE(review): '_C02' and '_C03' share the label 'hh_food' -- confirm this
    # is intentional and not a copy-paste slip.
    token_labels = (
        ('_C01', 'hh'),
        ('_C02', 'hh_food'),
        ('_C03', 'hh_food'),
    )
    return next(
        (label for token, label in token_labels if token in row.variable),
        None,
    )

In [14]:
def pubassist_vars(row):
    """Main-variable label for a row of the 'pubassist' table.

    '_001' in ``row.variable`` gives 'hh'; failing that, '_002' gives
    'hh_pubassist'; otherwise None.
    """
    code = row.variable
    if '_001' in code:
        return 'hh'
    if '_002' in code:
        return 'hh_pubassist'
    return None

In [15]:
def aggpubassist_vars(row):
    """Main-variable label for a row of the 'aggpubassist' table.

    Returns 'aggincome' when ``row.variable`` contains '_001', otherwise None.
    """
    return 'aggincome' if '_001' in row.variable else None

In [16]:
def pick_table(row):
    """Route a row to the main-variable tagger that matches its table label.

    ``row.table`` selects one of the ``*_vars`` functions; the row is passed to
    it and its result is returned unchanged. Rows whose table label is not one
    of the nine handled here yield None.
    """
    # Built on each call so every name is resolved when the row is processed.
    taggers = {
        'pop': pop_vars,
        'housing': housing_vars,
        'emp': emp_vars,
        'income': income_vars,
        'edu': edu_vars,
        'povfam': povfam_vars,
        'food': food_vars,
        'pubassist': pubassist_vars,
        'aggpubassist': aggpubassist_vars,
    }
    tagger = taggers.get(row.table)
    if tagger is None:
        return None
    return tagger(row)

# Tag every row with its main-variable label (None where pick_table finds no match).
df['main_var'] = df.apply(pick_table, axis = 1)

## Tag secondary variable

In [17]:
# Pull out the two characters that sit immediately before the final character of
# each variable code (e.g. the '01' in 'B01003_001E'). These two characters are the
# keys used by the secondary-variable lookup dictionaries below.
df['last2'] = df.variable.str.slice(start=-3, stop=-1)
df.head()

Unnamed: 0,GEOID,year,variable,value,cut,table_name,table,main_var,last2
0,6037141400,2010,B01003_001E,4493.0,6,B01003,pop,tot_pop,1
2772,6037141400,2011,B01003_001E,4732.0,6,B01003,pop,tot_pop,1
5203,6037141400,2012,B01003_001E,4760.0,6,B01003,pop,tot_pop,1
7200,6037141400,2013,B01003_001E,4500.0,6,B01003,pop,tot_pop,1
9990,6037141400,2014,B01003_001E,4471.0,6,B01003,pop,tot_pop,1


In [18]:
# Secondary-variable labels for the 'emp' table, keyed by the two-character code in
# `last2`. pick_secondary_var uses emp2010 for years <= 2014 and emp2015 for
# years >= 2015; the labels are the same in both, only the codes differ.
emp2010 = {
    '01': 'pop16',
    '10': 'white',
    '11': 'black',
    '12': 'amerind',
    '13': 'asian',
    '14': 'pacis',
    '15': 'other',
    '16': 'race2',
    '17': 'hisp',
    '18': 'nonhisp',
    '19': 'pop20',
    '20': 'male',
    '21': 'female',
    '22': 'femalewchild6',
    '23': 'pov',
    '25': 'pop25',
    '26': 'lhs',
    '27': 'hs',
    '28': 'college',
    '29': 'ba',
}

emp2015 = {
    '01': 'pop16',
    '12': 'white',
    '13': 'black',
    '14': 'amerind',
    '15': 'asian',
    '16': 'pacis',
    '17': 'other',
    '18': 'race2',
    '19': 'hisp',
    '20': 'nonhisp',
    '21': 'pop20',
    '22': 'male',
    '23': 'female',
    '25': 'femalewchild6',
    '28': 'pov',
    '31': 'pop25',
    '32': 'lhs',
    '33': 'hs',
    '34': 'college',
    '35': 'ba',
}

In [19]:
# Secondary-variable labels for the 'income' table, keyed by the two-character
# code in `last2`.
# NOTE: this rebinds the name `income_vars`, which an earlier cell defines as a
# function called by pick_table. A top-to-bottom run is unaffected because
# pick_table has already been applied by this point, but re-running the
# main-variable cell after this one would fail (a dict is not callable).
income_vars = {
    '01': 'hh',
    '02': 'white',
    '03': 'black',
    '04': 'amerind',
    '05': 'asian',
    '06': 'pacis',
    '07': 'other',
    '08': 'race2',
    '09': 'hisp',
    '10': 'nonhisp',
}

In [20]:
# Secondary-variable labels for the 'edu' table, keyed by the two-character code in
# `last2`. pick_secondary_var uses edu2010 for years <= 2014 and edu2015 for
# years >= 2015. Codes '06'-'15' are identical in both; the remaining ten labels
# are the same but sit under different codes.
edu2010 = {
    '06': 'pop25',
    '07': 'hs9',
    '08': 'hs12',
    '09': 'hs',
    '10': 'college',
    '11': 'aa',
    '12': 'ba',
    '13': 'ma',
    '14': 'pct_hsplus',
    '15': 'pct_baplus',
    '28': 'pov_lhs',
    '29': 'pov_hs',
    '30': 'college_pov',
    '31': 'ba_pov',
    '32': 'pop25_medearning',
    '33': 'lhs_medearning',
    '34': 'hs_medearning',
    '35': 'college_medearning',
    '36': 'ba_medearning',
    '37': 'ma_medearning',
}


edu2015 = {
    '06': 'pop25',
    '07': 'hs9',
    '08': 'hs12',
    '09': 'hs',
    '10': 'college',
    '11': 'aa',
    '12': 'ba',
    '13': 'ma',
    '14': 'pct_hsplus',
    '15': 'pct_baplus',
    '55': 'pov_lhs',
    '56': 'pov_hs',
    '57': 'college_pov',
    '58': 'ba_pov',
    '59': 'pop25_medearning',
    '60': 'lhs_medearning',
    '61': 'hs_medearning',
    '62': 'college_medearning',
    '63': 'ba_medearning',
    '64': 'ma_medearning',
}

In [21]:
# Secondary-variable labels for the 'food' table, keyed by the two-character code
# in `last2`. pick_secondary_var uses food2010 for years <= 2014 and food2015 for
# years >= 2015.
food2010 = {
    '01': 'hh',
    '04': 'hh_pov',
    '16': 'medhhincome',
}

food2015 = {
    '01': 'hh',
    '21': 'hh_pov',
    '34': 'medhhincome',
}

In [22]:
# Secondary-variable labels for the 'povfam' table, keyed by the two-character
# code in `last2`.
# NOTE: this rebinds the name `povfam_vars`, which an earlier cell defines as a
# function called by pick_table. A top-to-bottom run is unaffected, but
# re-running the main-variable cell after this one would fail.
povfam_vars = {
    '01': 'families_pov',
}

In [23]:
def pick_secondary_var(row):
    """Look up the secondary-variable label for a row.

    The lookup dictionary is chosen by ``row.table`` and, for the 'emp', 'edu'
    and 'food' tables, by ``row.year`` (<= 2014 selects the *2010 dictionary,
    >= 2015 the *2015 one). The label is the dictionary entry for ``row.last2``.

    Returns None for tables without a secondary-variable dictionary. Raises
    KeyError when the selected dictionary has no entry for ``row.last2``.
    """
    table = row.table
    if table == 'income':
        return income_vars[row.last2]

    # Both year comparisons are made for every non-income row, whatever its table.
    pre_2015 = row.year <= 2014
    from_2015 = row.year >= 2015

    if table == 'emp':
        if pre_2015:
            return emp2010[row.last2]
        if from_2015:
            return emp2015[row.last2]
    elif table == 'edu':
        if pre_2015:
            return edu2010[row.last2]
        if from_2015:
            return edu2015[row.last2]
    elif table == 'food':
        if pre_2015:
            return food2010[row.last2]
        if from_2015:
            return food2015[row.last2]
    elif table == 'povfam':
        return povfam_vars[row.last2]
    return None
    
# Tag every row with its secondary-variable label (None for tables that have no lookup dictionary).
df['second_var'] = df.apply(pick_secondary_var, axis = 1)

In [24]:
# Preview the first ten rows, now including the second_var column.
df.head(10)

Unnamed: 0,GEOID,year,variable,value,cut,table_name,table,main_var,last2,second_var
0,6037141400,2010,B01003_001E,4493.0,6,B01003,pop,tot_pop,1,
2772,6037141400,2011,B01003_001E,4732.0,6,B01003,pop,tot_pop,1,
5203,6037141400,2012,B01003_001E,4760.0,6,B01003,pop,tot_pop,1,
7200,6037141400,2013,B01003_001E,4500.0,6,B01003,pop,tot_pop,1,
9990,6037141400,2014,B01003_001E,4471.0,6,B01003,pop,tot_pop,1,
11836,6037141400,2015,B01003_001E,4533.0,6,B01003,pop,tot_pop,1,
15549,6037141400,2016,B01003_001E,4636.0,6,B01003,pop,tot_pop,1,
17612,6037141400,2017,B01003_001E,4871.0,6,B01003,pop,tot_pop,1,
18768,6037141400,2010,B01003_001M,309.0,6,B01003,pop,tot_pop,1,
21540,6037141400,2011,B01003_001M,442.0,6,B01003,pop,tot_pop,1,


## Tag estimate or margin of error

In [25]:
# Flag each row as an estimate ('est') or a margin of error ('moe') from the last
# character of its variable code: a trailing 'E' means estimate, anything else is
# treated as margin of error (the margin-of-error rows might be dropped later).
# Vectorised with the .str accessor instead of a Python-level call per row;
# na=False sends a missing variable code to 'moe' rather than raising.
df['est_moe'] = np.where(df.variable.str.endswith('E', na=False), 'est', 'moe')

## Construct variable name that can be used to rename columns

In [26]:
# Keep only the estimate rows, then discard the helper columns that were needed
# solely to build the labels. Filtering and dropping in one chained expression
# returns a fresh frame; an in-place drop on the filtered slice is what makes the
# column assignments in later cells emit SettingWithCopyWarning.
df = (
    df.loc[df.est_moe != 'moe']
      .drop(columns=['cut', 'table_name', 'last2', 'est_moe'])
)

In [27]:
# Replace missing labels (None / NaN) with an empty string in both label columns
# so they can be concatenated as text.
label_cols = ['main_var', 'second_var']
df[label_cols] = df[label_cols].fillna("")

In [28]:
# Build the final variable name: main_var on its own when there is no secondary
# label, otherwise 'main_var_second_var'. Both columns are expected to hold
# strings here (missing labels were replaced by "" in the previous cell).
# Vectorised with np.where instead of a Python-level call per row.
df['new_var'] = np.where(
    df.second_var == "",
    df.main_var,
    df.main_var + '_' + df.second_var,
)

In [29]:
# Preview the cleaned frame with the constructed new_var column.
df.head()

Unnamed: 0,GEOID,year,variable,value,table,main_var,second_var,new_var
0,6037141400,2010,B01003_001E,4493.0,pop,tot_pop,,tot_pop
2772,6037141400,2011,B01003_001E,4732.0,pop,tot_pop,,tot_pop
5203,6037141400,2012,B01003_001E,4760.0,pop,tot_pop,,tot_pop
7200,6037141400,2013,B01003_001E,4500.0,pop,tot_pop,,tot_pop
9990,6037141400,2014,B01003_001E,4471.0,pop,tot_pop,,tot_pop


In [30]:
#df.to_stata('../data/raw_census.dta')

In [31]:
# Count the rows per constructed name to see how many distinct names were produced.
df.new_var.value_counts()

hh_hh                  16
unempr16_lhs            8
lf16_pov                8
pop_pop25               8
hh_food_medhhincome     8
                       ..
unempr16_ba             8
hh_asian                8
emp16_college           8
pop_college_pov         8
pop_ma                  8
Name: new_var, Length: 131, dtype: int64