# Check Census data and compile, clean, convert to the right units
* Census API doesn't seem to be working properly...check that all years have the same number of obs

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
df = pd.read_parquet('../data/raw_census.parquet')

In [4]:
geoid = ['06037101110']
df = df[df.GEOID.isin(geoid)]

In [5]:
acs_tables = {
    'S1903': 'income', 
    'B19001': 'incomerange',
    'S0801': 'commute',
    'S0802': 'vehicles',
    'B25008': 'tenure',
    'B02001': 'race',
}

In [6]:
def tag_acs_table(df):
    pattern = re.compile('([A-Za-z0-9]+)_')
    
    df['table'] = df.apply(
        lambda row: acs_tables.get(pattern.match(row.variable).group(1)),
        axis = 1
    )
    
    # Find the other B19001A, B19001B, etc tables and tag them
    df['table'] = df.apply(
        lambda row: 'incomerange' if 'B19001' in row.variable else row.table,
        axis = 1
    )
    
    return df

In [7]:
df2 = tag_acs_table(df)

In [8]:
df2.table.value_counts()

incomerange    1530
commute         270
vehicles        180
income          180
race             72
tenure           27
Name: table, dtype: int64

In [None]:
"""
table_name = "vehicles"
df2 = df[(df.table==table_name)]

for year in range(2010, 2011):
    display(df2[df2.year==year].head(3))
    #display(df2[df2.year==year]['variable'].value_counts())
"""

In [30]:
# Tag main variable
def income_vars(row):
    if '_C01' in row.variable:
        return 'hh'
    elif '_C02' in row.variable:
        return 'medincome'
    elif '_C03' in row.variable:
        return 'medincome'
    
def incomerange_vars(row):
    if 'B19001_' in row.variable:
        return 'total'
    elif 'B19001A' in row.variable:
        return 'white'
    elif 'B19001B' in row.variable:
        return 'black'
    elif 'B19001C' in row.variable:
        return 'amerind'
    elif 'B19001D' in row.variable:
        return 'asian'
    elif 'B19001E' in row.variable:
        return 'pacis'
    elif 'B19001F' in row.variable:
        return 'other'
    elif 'B19001G' in row.variable:
        return 'race2'
    elif 'B19001H' in row.variable:
        return 'nonhisp'
    elif 'B19001I' in row.variable:
        return 'hisp'
    
def vehicle_vars(row):
    if '_001' in row.variable:
        return 'workers'

def commute_vars(row):
    if 'C01' in row.variable:
        return 'workers'
    elif 'C02' in row.variable:
        return 'male'
    elif 'C03' in row.variable:
        return 'female'

def tenure_vars(row):
    if 'B25008' in row.variable:
        return 'pop'
    
def race_vars(row):
    if 'B02001' in row.variable:
        return 'pop'
    
main_vars_dict = {
    'income': income_vars,
    'incomerange': incomerange_vars,
    'vehicles': vehicle_vars,
    'commute': commute_vars,
    'tenure': tenure_vars,
    'race': race_vars,
}

In [31]:
df2['main_var'] = df2.apply(lambda row: main_vars_dict[row['table']](row), axis = 1)

In [32]:
df2.main_var.value_counts()

total        153
black        153
white        153
hisp         153
amerind      153
asian        153
race2        153
pacis        153
nonhisp      153
other        153
workers      126
pop           99
male          90
medincome     90
hh            90
female        90
Name: main_var, dtype: int64

In [42]:
# Secondary variable - use last 2 characters
income = {'01': 'total', '02': 'white', '03': 'black', '04': 'amerind', '05': 'asian',
       '06': 'pacis', '07': 'other', '08': 'race2', '09': 'hisp', '10': 'nonhisp'}

incomerange = {'01': 'total', '02': 'lt10', '03': 'r10to14', '04': 'r15to19', '05': 'r20to24',
           '06': 'r25to29', '07': 'r30to34', '08': 'r35to39', '09': 'r40to44', '10': 'r45to49',
           '11': 'r50to59', '12': 'r60to74', '13': 'r75to99', '14': 'r100to124', '15': 'r125to149',
           '16': 'r150to199', '17': 'gt200'}

vehicles = {'01': 'total', '94': 'veh0', '95': 'veh1', '96': 'veh2', '97': 'veh3'}


commute = {'01': 'total', '03': 'car1', '05': 'car2', '06': 'car3', '07': 'car4',
          '09': 'transit', '10': 'walk', '11': 'bike', '12': 'other', '13': 'telecommute'}

tenure = {'01': 'total', '02': 'owner', '03': 'renter'}


race = {'01': 'total', '02': 'white', '03': 'black', '04': 'amerind', '05': 'asian',
           '06': 'pacis', '07': 'other', '08': 'race2'}

In [46]:
def tag_secondary_variable(df):    
    df['last2'] = df['variable'].str[-2:]
     
    def pick_secondary_var(row):
        if row.table=='income':
            return income[row.last2]
        elif row.table=='incomerange':
            return incomerange[row.last2]
        elif row.table=="vehicles":
            return vehicles[row.last2]
        elif row.table=="commute":
            return commute[row.last2]
        elif row.table=="tenure":
            return tenure[row.last2]
        elif row.table=="race":
            return race[row.last2]
   
    df['second_var'] = df.apply(pick_secondary_var, axis = 1)
   
    return df

In [47]:
df3 = tag_secondary_variable(df2)

In [48]:
df3

Unnamed: 0,GEOID,variable,estimate,year,table,main_var,last2,second_var
0,06037101110,S0801_C01_001,2362.0,2010,commute,workers,01,total
1,06037101110,S0801_C01_003,83.9,2010,commute,workers,03,car1
2,06037101110,S0801_C01_005,8.3,2010,commute,workers,05,car2
3,06037101110,S0801_C01_006,0.0,2010,commute,workers,06,car3
4,06037101110,S0801_C01_007,0.0,2010,commute,workers,07,car4
...,...,...,...,...,...,...,...,...
150147,06037101110,B02001_004,4.0,2018,race,pop,04,amerind
150148,06037101110,B02001_005,329.0,2018,race,pop,05,asian
150149,06037101110,B02001_006,4.0,2018,race,pop,06,pacis
150150,06037101110,B02001_007,499.0,2018,race,pop,07,other
