# Additional cleaning of ACS tables
* Might have to convert from long to wide to derive percents?
* Create number and percent columns to store values (better for aggregating later on)

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [None]:
# Load the long-format census extract and keep only GEOID 06037141400.
df = (
    pd.read_parquet('../data/raw_census_long.parquet')
    .loc[lambda frame: frame['GEOID'] == '06037141400']
)

In [None]:
# Row count per source table in the filtered frame.
df['table'].value_counts()

## Employment

In [None]:
# Take an explicit copy: columns are added to `emp` afterwards, and assigning
# onto a filtered slice of `df` triggers pandas' SettingWithCopyWarning.
emp = df[df['table'] == 'emp'].copy()

In [None]:
def emp_type(row):
    """Classify an employment row by the kind of value it stores.

    Returns 'number' when ``row.main_var`` is 'pop' and 'percent' for every
    other ``main_var``.
    """
    return 'number' if row.main_var == 'pop' else 'percent'
    
# Tag every employment row with its value type (row-wise classification).
emp['var_type'] = emp.apply(emp_type, axis='columns')

In [None]:
# Population rows supply the counts that each percentage is applied to.
emp_pop = emp[emp.main_var == 'pop']

new_pct_col = "pct"
new_num_col = "num"
merge_keys = ['GEOID', 'year', 'table', 'second_var']
output_cols = merge_keys + ['variable_y', 'main_var_y', 'new_var_y', new_pct_col, new_num_col]

emp_dfs = {}

# For each percent-based subset: join it to the population rows, rescale the
# percent to a 0-1 share, derive the count, and store the result by subset name.
for subset in ['lf', 'epr', 'unemp']:
    subset_df = emp[emp.main_var == subset]
    merged = (
        pd.merge(emp_pop, subset_df, on=merge_keys)
        .drop(columns=['variable_x', 'main_var_x', 'new_var_x'])
        .rename(columns={'value_x': 'num_pop', 'value_y': new_pct_col})
    )
    merged[new_pct_col] = merged[new_pct_col] / 100
    merged[new_num_col] = merged.num_pop * merged[new_pct_col]
    # Keep only the subset's own identifiers and drop the merge suffixes.
    merged = merged[output_cols].rename(
        columns={'variable_y': 'variable', 'main_var_y': 'main_var', 'new_var_y': 'new_var'}
    )
    emp_dfs[subset] = merged

In [None]:
# Population rows are already counts: carry the value over as `num` and give
# them a share of 1. `assign` builds a new frame rather than writing into the
# filtered slice of `emp`, which avoids pandas' SettingWithCopyWarning.
emp_pop = (
    emp_pop
    .assign(num=emp_pop['value'], pct=1)
    .drop(columns=['value', 'var_type'])
)
emp_pop.head()

In [None]:
# Stack the population rows and every derived subset into one long table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the original row indices, as append did.
appended = pd.concat([emp_pop, *emp_dfs.values()], sort=False)

In [None]:
# Start the dictionary of cleaned tables with the employment result.
final_dfs = dict(emp=appended)

## Income

In [None]:
# Take an explicit copy: columns are added to `income` afterwards, and assigning
# onto a filtered slice of `df` triggers pandas' SettingWithCopyWarning.
income = df[df['table'] == 'income'].copy()

In [None]:
def income_type(row):
    """Classify an income row as 'number', 'dollar' or 'percent'.

    * ``main_var == 'medincome'`` -> 'dollar'
    * ``main_var == 'hh'`` with ``second_var == 'total'`` -> 'number'
    * other ``hh`` rows -> 'percent' for years up to 2016, 'number' from 2017
    * anything else -> ``None``
    """
    if row.main_var == 'medincome':
        return 'dollar'
    if row.main_var != 'hh':
        return None
    if row.second_var == 'total':
        return 'number'
    # NOTE(review): the original logic excludes second_var == 'hh' here;
    # possibly 'total' was intended -- confirm against the data.
    if row.second_var == 'hh':
        return None
    if row.year <= 2016:
        return 'percent'
    if row.year >= 2017:
        return 'number'
    return None
    
# Tag every income row with its value type (row-wise classification).
income['var_type'] = income.apply(income_type, axis='columns')

In [None]:
# Create a denominator column. Use this to convert percent values into numbers.
# The 'hh_total' row carries the denominator; Series.where keeps its value on
# that row (NaN elsewhere) without a row-by-row apply.
income['denom'] = income['value'].where(income['new_var'] == 'hh_total')
# Spread each GEOID/year total to the remaining rows of that group.
income['denom'] = income['denom'].fillna(
    income.groupby(['GEOID', 'year'])['denom'].transform('max')
)

In [None]:
# Express every row as a 0-1 share of its denominator
def income_pct_col(row):
    """Return the row's value as a fraction.

    'percent' rows are divided by 100, 'number' rows by ``row.denom``,
    'dollar' rows yield NaN, and any other ``var_type`` yields ``None``.
    """
    kind = row.var_type
    if kind == 'dollar':
        return np.nan
    if kind == 'number':
        return row.value / row.denom
    if kind == 'percent':
        return row.value / 100
    return None
    
# Fractional share for each row (NaN for dollar amounts).
income['pct'] = income.apply(income_pct_col, axis='columns')

# Counts and dollar amounts are kept as reported; percent rows are turned
# back into counts through the denominator.
income['num'] = income.apply(
    lambda rec: rec.value if rec.var_type in ('number', 'dollar') else rec.pct * rec.denom,
    axis='columns',
)

In [None]:
# Working columns are no longer needed once `pct` and `num` exist.
drop_me = ['value', 'var_type', 'denom']
income = income.drop(drop_me, axis='columns')

In [None]:
# Register the cleaned income table.
final_dfs['income'] = income

## Education

In [None]:
# Take an explicit copy: columns are added to `edu` afterwards, and assigning
# onto a filtered slice of `df` triggers pandas' SettingWithCopyWarning.
edu = df[df['table'] == 'edu'].copy()

In [None]:
def edu_type(row):
    """Classify an education row from substrings of ``row.second_var``.

    Contains 'pct' or 'pov' -> 'percent'; otherwise contains 'medearning'
    -> 'dollar'; everything else -> 'number'.
    """
    label = row.second_var
    if 'pct' in label or 'pov' in label:
        return 'percent'
    if 'medearning' in label:
        return 'dollar'
    return 'number'
    
# Tag every education row with its value type (row-wise classification).
edu['var_type'] = edu.apply(edu_type, axis='columns')

In [None]:
# Create a denominator column. Use this to convert percent values into numbers.
# The 'pop_total_pop25' row carries the denominator; Series.where keeps its
# value on that row (NaN elsewhere) without a row-by-row apply.
edu['denom'] = edu['value'].where(edu['new_var'] == 'pop_total_pop25')
# Spread each GEOID/year total to the remaining rows of that group.
edu['denom'] = edu['denom'].fillna(
    edu.groupby(['GEOID', 'year'])['denom'].transform('max')
)

In [None]:
# Express every row as a 0-1 share of its denominator
def edu_pct_col(row):
    """Return the row's value as a fraction.

    'percent' rows are divided by 100, 'number' rows by ``row.denom``,
    'dollar' rows yield NaN, and any other ``var_type`` yields ``None``.
    """
    kind = row.var_type
    if kind == 'dollar':
        return np.nan
    if kind == 'number':
        return row.value / row.denom
    if kind == 'percent':
        return row.value / 100
    return None

# Fractional share for each row (NaN for dollar amounts).
edu['pct'] = edu.apply(edu_pct_col, axis='columns')

# Counts and dollar amounts are kept as reported; percent rows are turned
# back into counts through the denominator.
edu['num'] = edu.apply(
    lambda rec: rec.value if rec.var_type in ('number', 'dollar') else rec.pct * rec.denom,
    axis='columns',
)

In [None]:
# Working columns are no longer needed once `pct` and `num` exist.
drop_me = ['value', 'var_type', 'denom']
edu = edu.drop(drop_me, axis='columns')

In [None]:
# Register the cleaned education table.
final_dfs['edu'] = edu

## Population and Housing Units

In [None]:
# Population and housing-unit tables are handled together.
# Take an explicit copy: columns are added to `pop_housing` afterwards, and
# assigning onto a filtered slice of `df` triggers SettingWithCopyWarning.
pop_housing = df[df['table'].isin(['pop', 'housing'])].copy()

In [None]:
# Every row in these two tables is treated as a count.
pop_housing['var_type'] = 'number'

In [None]:
# Create a denominator column from each table's total row ('pop' or 'housing').
# Series.where keeps the value on those rows (NaN elsewhere) without a
# row-by-row apply.
pop_housing['denom'] = pop_housing['value'].where(
    pop_housing['new_var'].isin(['pop', 'housing'])
)
# Group by table as well: grouping on GEOID/year alone pools the two tables and
# would hand every non-total row the larger of the population and housing totals.
pop_housing['denom'] = pop_housing['denom'].fillna(
    pop_housing.groupby(['GEOID', 'year', 'table'])['denom'].transform('max')
)

In [None]:
# No adjustments needed: every row is a count, so the share is value / denominator.
pop_housing['pct'] = pop_housing['value'] / pop_housing['denom']

# Same derivation as the other tables; with var_type fixed to 'number' this
# keeps the reported value.
pop_housing['num'] = pop_housing.apply(
    lambda rec: rec.value if rec.var_type in ('number', 'dollar') else rec.pct * rec.denom,
    axis='columns',
)

In [None]:
# Working columns are no longer needed once `pct` and `num` exist.
drop_me = ['value', 'var_type', 'denom']
pop_housing = pop_housing.drop(drop_me, axis='columns')

In [None]:
# Register the cleaned population / housing table.
final_dfs['pop_housing'] = pop_housing

## Poverty

In [None]:
# Take an explicit copy: columns are added to `povfam` afterwards, and assigning
# onto a filtered slice of `df` triggers pandas' SettingWithCopyWarning.
povfam = df[df['table'] == 'povfam'].copy()

In [None]:
def povfam_type(row):
    """Classify a family-poverty row.

    Returns 'percent' when ``row.main_var`` contains 'pov', else 'number'.
    """
    return 'percent' if 'pov' in row.main_var else 'number'
    
# Tag every family-poverty row with its value type (row-wise classification).
povfam['var_type'] = povfam.apply(povfam_type, axis='columns')

In [None]:
# The 'fam' row carries the denominator; Series.where keeps its value on that
# row (NaN elsewhere) without a row-by-row apply.
povfam['denom'] = povfam['value'].where(povfam['new_var'] == 'fam')
# Spread each GEOID/year total to the remaining rows of that group.
povfam['denom'] = povfam['denom'].fillna(
    povfam.groupby(['GEOID', 'year'])['denom'].transform('max')
)

In [None]:
# Display the frame to inspect the derived `var_type` and `denom` columns.
povfam

In [None]:
# Express every row as a 0-1 share of its denominator
def povfam_pct_col(row):
    """Return the row's value as a fraction.

    'percent' rows are divided by 100, 'number' rows by ``row.denom``,
    'dollar' rows yield NaN, and any other ``var_type`` yields ``None``.
    """
    kind = row.var_type
    if kind == 'dollar':
        return np.nan
    if kind == 'number':
        return row.value / row.denom
    if kind == 'percent':
        return row.value / 100
    return None

# Fractional share for each row.
povfam['pct'] = povfam.apply(povfam_pct_col, axis='columns')

# Counts are kept as reported; percent rows are turned back into counts
# through the denominator.
povfam['num'] = povfam.apply(
    lambda rec: rec.value if rec.var_type in ('number', 'dollar') else rec.pct * rec.denom,
    axis='columns',
)

In [None]:
# Working columns are no longer needed once `pct` and `num` exist.
drop_me = ['value', 'var_type', 'denom']
povfam = povfam.drop(drop_me, axis='columns')

In [None]:
# Register the cleaned family-poverty table.
final_dfs['povfam'] = povfam

## Food Stamps

In [None]:
# Take an explicit copy: columns are added to `food` afterwards, and assigning
# onto a filtered slice of `df` triggers pandas' SettingWithCopyWarning.
food = df[df['table'] == 'food'].copy()

In [None]:
def food_type(row):
    """Classify a food-stamp row as 'number', 'dollar' or 'percent'.

    * ``second_var == 'total'`` -> 'number'
    * ``second_var`` contains 'income' -> 'dollar'
    * ``second_var == 'pov'`` -> 'percent' for years up to 2014, 'number' from 2015
    * anything else -> ``None``
    """
    label = row.second_var
    if label == 'total':
        return 'number'
    if 'income' in label:
        return 'dollar'
    if label != 'pov':
        return None
    if row.year <= 2014:
        return 'percent'
    if row.year >= 2015:
        return 'number'
    return None
    
# Tag every food-stamp row with its value type (row-wise classification).
food['var_type'] = food.apply(food_type, axis='columns')

In [None]:
# Make sure denominator includes main_var. Double checked with American Fact Finder that values and percents match reported.
# The 'total' row of each main_var carries the denominator; Series.where keeps
# its value on that row (NaN elsewhere) without a row-by-row apply.
food['denom'] = food['value'].where(food['second_var'] == 'total')
# Spread each GEOID/year/main_var total to the remaining rows of that group.
food['denom'] = food['denom'].fillna(
    food.groupby(['GEOID', 'year', 'main_var'])['denom'].transform('max')
)

In [None]:
# Express every row as a 0-1 share of its denominator
def food_pct_col(row):
    """Return the row's value as a fraction.

    'percent' rows are divided by 100, 'number' rows by ``row.denom``,
    'dollar' rows yield NaN, and any other ``var_type`` yields ``None``.
    """
    kind = row.var_type
    if kind == 'dollar':
        return np.nan
    if kind == 'number':
        return row.value / row.denom
    if kind == 'percent':
        return row.value / 100
    return None

# Fractional share for each row (NaN for dollar amounts).
food['pct'] = food.apply(food_pct_col, axis='columns')

# Counts and dollar amounts are kept as reported; percent rows are turned
# back into counts through the denominator.
food['num'] = food.apply(
    lambda rec: rec.value if rec.var_type in ('number', 'dollar') else rec.pct * rec.denom,
    axis='columns',
)

In [None]:
# Working columns are no longer needed once `pct` and `num` exist.
drop_me = ['value', 'var_type', 'denom']
food = food.drop(drop_me, axis='columns')

In [None]:
# Register the cleaned food-stamp table.
final_dfs['food'] = food

In [None]:
# Export the working frame in Stata format.
# NOTE(review): `df` was filtered to a single GEOID right after loading, so this
# file holds only that subset despite the "raw_census_long" name -- confirm intent.
df.to_stata('../data/raw_census_long.dta')

In [None]:
# Public-assistance rows: these are numbers of families that received food
# stamps, so percents still need to be derived.
pubassist = df[df['table'].eq('pubassist')]