# Reshape by ACS table

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [2]:
df = pd.read_parquet('../data/raw_census_long.parquet')
df = df[df.GEOID=='06037141400']

In [3]:
df.table.value_counts()

emp             640
edu             160
income          160
food             48
povfam           16
pubassist        16
aggpubassist      8
pop               8
housing           8
Name: table, dtype: int64

## Employment

In [48]:
emp = df[df.table=='emp']

In [49]:
def emp_type(row):
    if (row.main_var=='pop'):
        return 'number'
    elif (row.main_var != 'pop'):
        return 'percent'
    
emp['var_type'] = emp.apply(emp_type, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [50]:
emp_pop16 = emp[emp.main_var=='pop']

emp_dfs = {}

# For each table, do a merge with pop numbers, convert the percents back to numbers and save into dictionary
for subset in ['lf', 'epr', 'unempr']:
    new_pct_col = "pct"
    new_num_col = "num"
    subset_df = emp[emp.main_var == subset]
    merged = pd.merge(emp_pop16, subset_df, on = ['GEOID', 'year', 'table', 'second_var'])
    merged = merged.drop(columns = ['variable_x', 'main_var_x', 'new_var_x'])
    merged.rename(columns = {'value_x': 'num_pop', 'value_y': new_pct_col}, inplace = True)
    merged[new_pct_col] = merged[new_pct_col] / 100
    merged[new_num_col] = merged.num_pop * merged[new_pct_col]
    merged = merged[['GEOID', 'year', 'table', 'second_var', 'variable_y', 'main_var_y', 'new_var_y', new_pct_col, new_num_col]]
    merged.rename(columns = {'variable_y': 'variable', 'main_var_y': 'main_var', 'new_var_y': 'new_var'}, inplace = True)
    emp_dfs[subset] = merged

In [51]:
emp_pop['num'] = emp_pop.value
emp_pop['pct'] = 1
emp_pop = emp_pop.drop(columns = ['value', 'var_type'])
emp_pop.head()

Unnamed: 0,GEOID,year,variable,table,main_var,second_var,new_var,num_pop16,pct_pop16


In [44]:
# Append the emp dfs together to be long
appended = pd.DataFrame()

for key, value in emp_dfs.items():
    appended = appended.append(value)

Unnamed: 0,GEOID,year,variable,value,table,main_var,second_var,new_var,var_type
26890,6037141400,2010,S2301_C01_019E,2732.0,emp,pop16,pop20,pop16_pop20,number
26910,6037141400,2010,S2301_C02_019E,76.3,emp,lf16,pop20,lf16_pop20,percent
26930,6037141400,2010,S2301_C03_019E,70.7,emp,emp16,pop20,emp16_pop20,percent
26950,6037141400,2010,S2301_C04_019E,7.4,emp,unempr16,pop20,unempr16_pop20,percent
214570,6037141400,2011,S2301_C01_019E,2905.0,emp,pop16,pop20,pop16_pop20,number
214590,6037141400,2011,S2301_C02_019E,78.2,emp,lf16,pop20,lf16_pop20,percent
214610,6037141400,2011,S2301_C03_019E,72.4,emp,emp16,pop20,emp16_pop20,percent
214630,6037141400,2011,S2301_C04_019E,7.4,emp,unempr16,pop20,unempr16_pop20,percent
402330,6037141400,2012,S2301_C01_019E,2924.0,emp,pop16,pop20,pop16_pop20,number
402350,6037141400,2012,S2301_C02_019E,76.1,emp,lf16,pop20,lf16_pop20,percent


## Income

In [None]:
income = df[df.table=='income']

In [None]:
def income_type(row):
    if (row.main_var=='hh') & (row.second_var=='hh'):
        return 'number'
    elif (row.main_var == 'medincome'):
        return 'dollar'
    elif (row.main_var=='hh') & (row.second_var != 'hh') & (row.year <= 2016):
        return 'percent'
    elif (row.main_var=='hh') & (row.second_var != 'hh') & (row.year >= 2017):
        return 'number'
    
income['var_type'] = income.apply(income_type, axis = 1)

In [None]:
income['denom'] = income.apply(lambda row: row.value if row.new_var=='hh_hh' else np.nan, axis = 1)
income['denom'] = income['denom'].fillna(income.groupby(['GEOID', 'year'])['denom'].transform('max'))

In [None]:
# Replace values that were percents with numbers
def income_adjust(row):
    if (row.var_type=='percent'):
        return (row.value/100) * row.denom
    else:
        return row.value

income['value_adj'] = income.apply(income_adjust, axis = 1)

In [None]:
keep_me = ['GEOID', 'year', 'variable', 'table', 'main_var', 'second_var', 
           'new_var', 'value_adj', 'var_type']

income = income[keep_me]

In [None]:
# Update the var_type column now that values have been adjusted
def income_type_new(row):
    if (row.main_var=='hh') & (row.second_var=='hh'):
        return 'number'
    elif (row.main_var == 'medincome'):
        return 'dollar'
    elif (row.main_var=='hh') & (row.second_var != 'hh'):
        return 'number'
    
income['var_type'] = income.apply(income_type_new, axis = 1)

## Education

In [None]:
edu = df[df.table=='edu']

In [None]:
def edu_type(row):
    if (row.second_var.find('pct') != -1) or (row.second_var.find('pov') != -1):
        return 'percent'
    elif row.second_var.find('medearning') != -1:
        return 'dollar'
    elif (row.second_var.find('pct') == -1) & (row.second_var.find('medearning') == -1):
        return 'number'
    
edu['var_type'] = edu.apply(edu_type, axis = 1)

In [None]:
# Replace values that were percents with numbers
def edu_adjust(row):
    if (row.var_type=='percent'):
        return row.value / 100
    else:
        return row.value

edu['value_adj'] = edu.apply(edu_adjust, axis = 1)

In [None]:
keep_me = ['GEOID', 'year', 'variable', 'table', 'main_var', 'second_var', 
           'new_var', 'value_adj', 'var_type']

edu = edu[keep_me]

## Population and Housing Units

In [None]:
pop_housing = df[(df.table=='pop') | (df.table=='housing') ]

In [None]:
def pop_housing_type(row):
    if row.second_var.find('tot') != -1:
        return 'number'
    
pop_housing['var_type'] = pop_housing.apply(pop_housing_type, axis = 1)

In [None]:
# Create value_adj column (no adjustments needed, but need this so later can incorporate into loop)
def pop_housing_adjust(row):
    if row.var_type=='percent':
        return row.value / 100
    else:
        return row.value

pop_housing['value_adj'] = pop_housing.apply(pop_housing_adjust, axis = 1)

In [None]:
keep_me = ['GEOID', 'year', 'variable', 'table', 'main_var', 'second_var', 
           'new_var', 'value_adj', 'var_type']

pop_housing = pop_housing[keep_me]

## Poverty

In [None]:
povfam = df[df.table=='povfam']

In [None]:
def povfam_type(row):
    if row.main_var.find('pov') != -1:
        return 'percent'
    else:
        return 'number'
    
povfam['var_type'] = povfam.apply(povfam_type, axis = 1)

In [None]:
povfam['denom'] = povfam.apply(lambda row: row.value if row.new_var=='families' else np.nan, axis = 1)
povfam['denom'] = povfam['denom'].fillna(povfam.groupby(['GEOID', 'year'])['denom'].transform('max'))

In [None]:
# Replace values that were percents with numbers
def povfam_adjust(row):
    if (row.var_type=='percent'):
        return (row.value/100) * row.denom
    else:
        return row.value

povfam['value_adj'] = povfam.apply(povfam_adjust, axis = 1)

In [None]:
# Structure df so that there are number and percent columns, with values filled in
povfam['value_pct'] = povfam.value_adj / povfam.denom

In [None]:
povfam

In [None]:
"""
Food stamps table has inconsistent values.
2010-2014, hh in poverty is given in %. same as 
2015-2017, hh in poverty is given as numbers.
Do each table separately.

Store column that tells me if they are numbers, percents, or dollars.
"""
food = df[df.table=='food']
food[food.year==2012]

In [None]:
df.to_stata('../data/raw_census_long.dta')

In [None]:
pubassist = df[df.table=='pubassist']
# These are number of families that received food stamps
# Need to derive percents

In [None]:
pubassist