# Get the raw Census data into shape
* TODO: move this into a script when ready
* [Census GEOIDs](https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html)

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [None]:
# Disabled: the per-table CSV loads below sit inside a string literal, so none of them run.
"""
# Downloaded from censusapi
pop = pd.read_csv('../data/Census/population_tract.csv')
housing = pd.read_csv('../data/Census/housing_units_tract.csv')
pub = pd.read_csv('../data/Census/public_assistance_tract.csv') 
agg_pub = pd.read_csv('../data/Census/aggregate_public_assistance_tract.csv')

# Downloaded from tidycensus
emp = pd.read_csv('../data/Census/employment_tract.csv') 
income = pd.read_csv('../data/Census/income_tract.csv') 
edu = pd.read_csv('../data/Census/educational_attainment_tract.csv') 
pov = pd.read_csv('../data/Census/poverty_tract.csv') 
pov_fam = pd.read_csv('../data/Census/poverty_families_tract.csv')
food = pd.read_csv('../data/Census/food_stamps_tract.csv') 
"""

## Clean up long df
* Keep the data in long format; it is easier to rename the variables that way

In [2]:
# Load the raw Census extract (long format) from the S3 parquet file
df = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_data.parquet')

In [3]:
# Keep only the rows for GEOID 06037141400.
# NOTE(review): looks like a temporary single-tract filter for development --
# confirm whether it should stay before the export at the end of the notebook.
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
df = df[df.GEOID=='06037141400'].copy()

## Tag the table

In [4]:
# Generate column that identifies the table it belongs to
# (prefix of the variable name, short table name) pairs, tried in this order
table_prefixes = (
    ('B01003', 'pop'),
    ('B25001', 'housing'),
    ('S1903', 'income'),
    ('S1501', 'edu'),
    ('S1701', 'pov'),
    ('S1702', 'povfam'),
    ('S2201', 'food'),
    ('S2301', 'emp'),
    ('B19058', 'pubassist'),
    ('B19067', 'aggpubassist'),
)

# The first prefix the variable name starts with wins; 'NA' when none matches
table = [
    next((name for prefix, name in table_prefixes if var_name.startswith(prefix)), 'NA')
    for var_name in df.variable
]

df['table'] = table

In [5]:
# Count how many rows were tagged with each table name
df.table.value_counts()

emp             1280
income           320
edu              320
food              96
pubassist         32
pop               16
povfam            16
housing           16
aggpubassist      16
Name: table, dtype: int64

In [6]:
# For every table name, show the name followed by its row count per year
tables_to_check = [
    'pop', 'housing', 'income', 'edu', 'pov',
    'povfam', 'food', 'emp', 'pubassist', 'aggpubassist',
]

for table_name in tables_to_check:
    display(table_name)
    rows_for_table = df[df.table == table_name]
    display(rows_for_table.year.value_counts())

'pop'

2015    2
2014    2
2013    2
2012    2
2011    2
2010    2
2017    2
2016    2
Name: year, dtype: int64

'housing'

2015    2
2014    2
2013    2
2012    2
2011    2
2010    2
2017    2
2016    2
Name: year, dtype: int64

'income'

2017    40
2016    40
2015    40
2014    40
2013    40
2012    40
2011    40
2010    40
Name: year, dtype: int64

'edu'

2017    40
2016    40
2015    40
2014    40
2013    40
2012    40
2011    40
2010    40
Name: year, dtype: int64

'pov'

Series([], Name: year, dtype: int64)

'povfam'

2015    2
2014    2
2013    2
2012    2
2011    2
2010    2
2017    2
2016    2
Name: year, dtype: int64

'food'

2017    12
2016    12
2015    12
2014    12
2013    12
2012    12
2011    12
2010    12
Name: year, dtype: int64

'emp'

2017    160
2016    160
2015    160
2014    160
2013    160
2012    160
2011    160
2010    160
Name: year, dtype: int64

'pubassist'

2015    4
2014    4
2013    4
2012    4
2011    4
2010    4
2017    4
2016    4
Name: year, dtype: int64

'aggpubassist'

2015    2
2014    2
2013    2
2012    2
2011    2
2010    2
2017    2
2016    2
Name: year, dtype: int64

## Tag main variable

In [7]:
def pop_vars(row):
    """Return the main-variable label for a row tagged with table 'pop'.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        'tot_pop' when `row.variable` contains '_001', otherwise 'NA'.
    """
    return 'tot_pop' if '_001' in row.variable else 'NA'

In [8]:
def housing_vars(row):
    """Return the main-variable label for a row tagged with table 'housing'.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        'tot_housing' when `row.variable` contains '_001', otherwise 'NA'.
    """
    return 'tot_housing' if '_001' in row.variable else 'NA'

In [10]:
def emp_vars(row):
    """Return the main-variable label for a row tagged with table 'emp'.

    The label is chosen by the column-group marker ('_C01' ... '_C04') found
    in `row.variable`; markers are tried in order and the first hit wins.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        One of 'pop16', 'lf16', 'emp16', 'unempr16', or 'NA' when no marker
        is present.
    """
    column_labels = (
        ('_C01', 'pop16'),
        ('_C02', 'lf16'),
        ('_C03', 'emp16'),
        ('_C04', 'unempr16'),
    )
    for marker, label in column_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [9]:
def income_vars(row):
    """Return the main-variable label for a row tagged with table 'income'.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        'hh' when `row.variable` contains '_C01', 'medincome' when it
        contains '_C02' (checked second), otherwise 'NA'.
    """
    column_labels = (
        ('_C01', 'hh'),
        ('_C02', 'medincome'),
    )
    for marker, label in column_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [None]:
# Disabled code: everything inside the string literal below is inert.
# pick_table has no branch for the 'pov', 'povfam' or 'food' tables, so rows
# from those tables get no main_var label.
""" IGNORE TABLES

# Table S1701 (poverty). Variables are not consistent from 2012-2017.
def pov_var2(row):
    if row.variable.find('_001') != -1:
        return 'pop_pov'
        
# Table S1702 (poverty for families). Variables are not consistent from 2010-2017.        
# Table S2201 (food stamps). Variables are not consistent from 2012-2017.

def pov_vars(row):
    if row.variable.find('_C01') != -1:
        return 'pop'
    elif row.variable.find('_C02') != -1:
        return 'pop_pov'
    elif row.variable.find('_C03') != -1:
        return 'pct_pov'
    else:
        return 'NA'
    
    
def food_vars(row):
    if row.variable.find('_C01') != -1:
        return 'hh'
    elif row.variable.find('_C02') != -1:
        return 'pct_hh'
    elif row.variable.find('_C03') != -1:
        return 'hh_food'
    elif row.variable.find('_C04') != -1:
        return 'pct_hh_food'
    else:
        return 'NA'
"""

In [11]:
def edu_vars(row):
    """Return the main-variable label for a row tagged with table 'edu'.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        'pop' when `row.variable` contains '_C01', otherwise 'NA'.
    """
    return 'pop' if '_C01' in row.variable else 'NA'

In [12]:
def pubassist_vars(row):
    """Return the main-variable label for a row tagged with table 'pubassist'.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        'hh' when `row.variable` contains '_001', 'hh_pubassist' when it
        contains '_002' (checked second), otherwise 'NA'.
    """
    line_labels = (
        ('_001', 'hh'),
        ('_002', 'hh_pubassist'),
    )
    for marker, label in line_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [13]:
def aggpubassist_vars(row):
    """Return the main-variable label for a row tagged with table 'aggpubassist'.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        'aggincome' when `row.variable` contains '_001', otherwise 'NA'.
    """
    return 'aggincome' if '_001' in row.variable else 'NA'

In [14]:
def pick_table(row):
    """Route a row to the main-variable tagger that matches its table.

    Parameters
    ----------
    row : object with attributes `table` and `variable`.

    Returns
    -------
    str or None
        The label produced by the table's tagger, or None when `row.table`
        has no tagger (for example 'pov', 'povfam', 'food' or 'NA').
    """
    taggers = {
        'pop': pop_vars,
        'housing': housing_vars,
        'emp': emp_vars,
        'income': income_vars,
        'edu': edu_vars,
        'pubassist': pubassist_vars,
        'aggpubassist': aggpubassist_vars,
    }
    tagger = taggers.get(row.table)
    if tagger is None:
        return None
    return tagger(row)

# Tag each row with its main variable; rows whose table pick_table does not handle get None
df['main_var'] = df.apply(pick_table, axis = 1)

## Tag secondary variable

In [None]:
def emp_var2010(row):
    """Return the secondary label for an 'emp' row with year <= 2014.

    pick_secondary_var calls this for 'emp' rows whose year is 2014 or
    earlier. The label is chosen by the line-number marker found in
    `row.variable`; markers are tried in the order listed and the first hit
    wins.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        The matching label, or 'NA' when no marker is present.
    """
    line_labels = (
        ('_001', 'hh'),
        ('_010', 'white'),
        ('_011', 'black'),
        ('_012', 'amerind'),
        ('_013', 'asian'),
        ('_014', 'pacis'),
        ('_015', 'other'),
        ('_016', 'race2'),
        ('_017', 'hisp'),
        ('_018', 'nonhisp'),
        ('_019', 'pop20'),
        ('_020', 'male'),
        ('_021', 'female'),
        ('_022', 'femalewchild'),
        ('_023', 'pov'),
        ('_025', 'pop25'),
        ('_026', 'lhs'),
        ('_027', 'hs'),
        ('_028', 'college'),
        ('_029', 'ba'),
    )
    for marker, label in line_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [None]:
def emp_var2015(row):
    """Return the secondary label for an 'emp' row with year >= 2015.

    pick_secondary_var calls this for 'emp' rows whose year is 2015 or
    later. The label is chosen by the line-number marker found in
    `row.variable`; markers are tried in the order listed and the first hit
    wins.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        The matching label, or 'NA' when no marker is present.
    """
    line_labels = (
        ('_001', 'hh'),
        ('_012', 'white'),
        ('_013', 'black'),
        ('_014', 'amerind'),
        ('_015', 'asian'),
        ('_016', 'pacis'),
        ('_017', 'other'),
        ('_018', 'race2'),
        ('_019', 'hisp'),
        ('_020', 'nonhisp'),
        ('_021', 'pop20'),
        ('_022', 'male'),
        ('_023', 'female'),
        ('_025', 'femalewchild'),
        ('_028', 'pov'),
        ('_031', 'pop25'),
        ('_032', 'lhs'),
        ('_033', 'hs'),
        ('_034', 'college'),
        ('_035', 'ba'),
    )
    for marker, label in line_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [None]:
def income_var2(row):
    """Return the secondary label for an 'income' row.

    pick_secondary_var calls this for every row tagged 'income'. The label
    is chosen by the line-number marker found in `row.variable`; markers are
    tried in the order listed and the first hit wins.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        The matching label, or 'NA' when no marker is present.
    """
    line_labels = (
        ('_001', 'hh'),
        ('_002', 'white'),
        ('_003', 'black'),
        ('_004', 'amerind'),
        ('_005', 'asian'),
        ('_006', 'pacis'),
        ('_007', 'other'),
        ('_008', 'race2'),
        ('_009', 'hisp'),
        ('_010', 'nonhisp'),
    )
    for marker, label in line_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [None]:
def edu_var2010(row):
    """Return the secondary label for an 'edu' row with year <= 2014.

    pick_secondary_var calls this for 'edu' rows whose year is 2014 or
    earlier. The label is chosen by the line-number marker found in
    `row.variable`; markers are tried in the order listed and the first hit
    wins.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        The matching label, or 'NA' when no marker is present.
    """
    line_labels = (
        ('_006', 'pop25'),
        ('_007', 'hs9'),
        ('_008', 'hs12'),
        ('_009', 'hs'),
        ('_010', 'college'),
        ('_011', 'aa'),
        ('_012', 'ba'),
        ('_013', 'ma'),
        ('_014', 'pct_hsplus'),
        ('_015', 'pct_baplus'),
        ('_028', 'pov_lhs'),
        ('_029', 'pov_hs'),
        ('_030', 'college_pov'),
        ('_031', 'ba_pov'),
        ('_032', 'pop25_medearning'),
        ('_033', 'lhs_medearning'),
        ('_034', 'hs_medearning'),
        ('_035', 'college_medearning'),
        ('_036', 'ba_medearning'),
        ('_037', 'ma_medearning'),
    )
    for marker, label in line_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [None]:
def edu_var2015(row):
    """Return the secondary label for an 'edu' row with year >= 2015.

    pick_secondary_var calls this for 'edu' rows whose year is 2015 or
    later. The label is chosen by the line-number marker found in
    `row.variable`; markers are tried in the order listed and the first hit
    wins.

    Parameters
    ----------
    row : object with a string attribute `variable`.

    Returns
    -------
    str
        The matching label, or 'NA' when no marker is present.
    """
    line_labels = (
        ('_006', 'pop25'),
        ('_007', 'hs9'),
        ('_008', 'hs12'),
        ('_009', 'hs'),
        ('_010', 'college'),
        ('_011', 'aa'),
        ('_012', 'ba'),
        ('_013', 'ma'),
        ('_014', 'pct_hsplus'),
        ('_015', 'pct_baplus'),
        ('_055', 'pov_lhs'),
        ('_056', 'pov_hs'),
        ('_057', 'college_pov'),
        ('_058', 'ba_pov'),
        ('_059', 'pop25_medearning'),
        ('_060', 'lhs_medearning'),
        ('_061', 'hs_medearning'),
        ('_062', 'college_medearning'),
        ('_063', 'ba_medearning'),
        ('_064', 'ma_medearning'),
    )
    for marker, label in line_labels:
        if marker in row.variable:
            return label
    return 'NA'

In [None]:
# Lookup-table version of edu_var2010 / edu_var2015.
# NOTE(review): the keys here are two-digit codes, while the functions match
# three-digit markers such as '_006' -- confirm the key format before use.

# Codes that carry the same label in both layouts
_edu_shared = {
    '06': 'pop25',
    '07': 'hs9',
    '08': 'hs12',
    '09': 'hs',
    '10': 'college',
    '11': 'aa',
    '12': 'ba',
    '13': 'ma',
    '14': 'pct_hsplus',
    '15': 'pct_baplus',
}

edu2010 = dict(_edu_shared)
edu2010.update({
    '28': 'pov_lhs',
    '29': 'pov_hs',
    '30': 'college_pov',
    '31': 'ba_pov',
    '32': 'pop25_medearning',
    '33': 'lhs_medearning',
    '34': 'hs_medearning',
    '35': 'college_medearning',
    '36': 'ba_medearning',
    '37': 'ma_medearning',
})


edu2015 = dict(_edu_shared)
edu2015.update({
    '55': 'pov_lhs',
    '56': 'pov_hs',
    '57': 'college_pov',
    '58': 'ba_pov',
    '59': 'pop25_medearning',
    '60': 'lhs_medearning',
    '61': 'hs_medearning',
    '62': 'college_medearning',
    '63': 'ba_medearning',
    '64': 'ma_medearning',
})

In [None]:
def pick_secondary_var(row):
    """Route a row to the secondary-variable tagger for its table and year.

    'emp' and 'edu' rows use one tagger for years up to 2014 and another
    for 2015 onward; 'income' rows use a single tagger for every year.

    Parameters
    ----------
    row : object with attributes `table`, `year` and `variable`.

    Returns
    -------
    str or None
        The label from the selected tagger, or None when no branch applies
        (any other table, or a year that satisfies neither comparison).
    """
    # Both year comparisons are evaluated up front for every row
    up_to_2014 = row.year <= 2014
    from_2015 = row.year >= 2015
    table_name = row.table

    if table_name == 'emp':
        if up_to_2014:
            return emp_var2010(row)
        if from_2015:
            return emp_var2015(row)
    elif table_name == 'income':
        return income_var2(row)
    elif table_name == 'edu':
        if up_to_2014:
            return edu_var2010(row)
        if from_2015:
            return edu_var2015(row)
    return None
    

# Tag each row with its secondary variable; rows that pick_secondary_var does not handle get None
df['second_var'] = df.apply(pick_secondary_var, axis = 1)

In [16]:
# Change the large section of if-statements to dictionary instead!

mapping = {'CXX': 'white', 'CYY': 'black'}

id = 'CXX8372472'

# Counts from 0-3, but doesn't include the 3rd character
new_val = mapping[id[:3]]
new_val

'white'

## Tag estimate or margin of error

In [None]:
# Generate column that identifies whether it's estimate or margin of error (might drop margin of error later)
est_moe = []

for row in df.variable:
    if row.find('E') != -1:
        est_moe.append('est')
    elif row.find('M') != -1:
        est_moe.append('moe')

df['est_moe'] = est_moe

## Construct variable name that can be used to rename columns

In [None]:
# Drop margin of error obs
df = df[df.est_moe != 'moe']

In [None]:
# Fill in "None" with empty string
for col in ['main_var', 'second_var']:
    df[col] = df[col].fillna(value = "") 

In [None]:
# Construct our new variable name
df['new_var'] = df.apply(lambda row: (row.main_var) if row.second_var=="" 
                         else (row.main_var + '_' + row.second_var), axis = 1)

In [None]:
# Preview the first rows to check the new columns
df.head()

In [None]:
# Write the tagged long table to a Stata file
# NOTE(review): an earlier cell filters df to a single GEOID, so this file
# holds only that GEOID's rows -- confirm that is intended.
df.to_stata('../data/raw_census.dta')

In [None]:
# Count rows per constructed variable name
df.new_var.value_counts()