# etl_us_census.ipynb

Preprocess raw data from the U.S. Census Bureau into a format amenable for joining with the Johns Hopkins county-level data for the United States.

Inputs:
* `inputs/ACSDP5Y2018.DP02`: Selected Social Characteristics in the United States data for 2018, from the U.S. Census Bureau. Available at [data.census.gov](https://data.census.gov/cedsci/table?layer=county&d=ACS%205-Year%20Estimates%20Data%20Profiles&g=0100000US.050000&hidePreview=false&vintage=2018&tid=ACSDP5Y2018.DP02)
* `inputs/ACSST1Y2018.S1901`: Income in the Past 12 Months from the U.S. Census Bureau ACS 1-Year Estimates Subject tables. Available at [data.census.gov](https://data.census.gov/cedsci/table?q=income%20by%20county&g=0100000US.050000&hidePreview=true&tid=ACSST1Y2018.S1901&t=Income%20%28Households,%20Families,%20Individuals%29&moe=false)

Each input directory contains the raw output of the `data.census.gov` "download" function.

Outputs:
* `outputs/us_counties_income.csv`: Income statistics by county, extracted from `inputs/ACSST1Y2018.S1901`.

**Note:** You can redirect these input and output files by setting the environment variables `COVID_INPUTS_DIR` and `COVID_OUTPUTS_DIR` to replacement values for the prefixes `inputs` and `outputs`, respectively, in the above paths.

In [1]:
# Initialization boilerplate
import io
import os
import pandas as pd
import numpy as np
import regex
from typing import *

# Local file of utility functions
import util

# Data locations default to "inputs" and "outputs"; the COVID_INPUTS_DIR and
# COVID_OUTPUTS_DIR environment variables take precedence when they are set.
_INPUTS_DIR = os.environ.get("COVID_INPUTS_DIR", "inputs")
_OUTPUTS_DIR = os.environ.get("COVID_OUTPUTS_DIR", "outputs")

# Create the output directory if necessary, before any cell writes to it.
util.ensure_dir_exists(_OUTPUTS_DIR)

# Income data set

In [2]:
# Read the Income in the Past 12 Months data set's first CSV file

# The download included two apparently identical files:
# ACSST5Y2018.S1901_data_with_overlays_2020-04-26T101114.csv
# ACSST1Y2018.S1901_data_with_overlays_2020-04-26T101114.csv
# Pick one.
# NOTE(review): the file chosen below carries the "ACSST5Y" prefix while its
# directory is named "ACSST1Y2018.S1901" -- confirm this is the intended file.
file_dir = "ACSST1Y2018.S1901"
file_name = "ACSST5Y2018.S1901_data_with_overlays_2020-04-26T101114.csv"

# `os` is already imported in the initialization cell, so the path is built
# with os.path.join, the same way the other cells of this notebook do it.
income_data_file = os.path.join(_INPUTS_DIR, file_dir, file_name)


# This file has two header rows. The first row contains a short name for each
# column, such as "S1901_C01_001E", while the second row contains extra-long
# names like "Estimate!!Households!!Total". Reading both rows as the header
# means each column is addressed by a (short name, long name) pair.
raw_income = pd.read_csv(income_data_file, header=[0,1], low_memory=False)
raw_income

Unnamed: 0_level_0,GEO_ID,NAME,S1901_C01_001E,S1901_C01_001M,S1901_C02_001E,S1901_C02_001M,S1901_C03_001E,S1901_C03_001M,S1901_C04_001E,S1901_C04_001M,...,S1901_C04_015E,S1901_C04_015M,S1901_C01_016E,S1901_C01_016M,S1901_C02_016E,S1901_C02_016M,S1901_C03_016E,S1901_C03_016M,S1901_C04_016E,S1901_C04_016M
Unnamed: 0_level_1,id,Geographic Area Name,Estimate!!Households!!Total,Margin of Error!!Households MOE!!Total,Estimate!!Families!!Total,Margin of Error!!Families MOE!!Total,Estimate!!Married-couple families!!Total,Margin of Error!!Married-couple families MOE!!Total,Estimate!!Nonfamily households!!Total,Margin of Error!!Nonfamily households MOE!!Total,...,Estimate!!Nonfamily households!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Nonfamily households MOE!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Households MOE!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Families MOE!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Married-couple families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Married-couple families MOE!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Nonfamily households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Nonfamily households MOE!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months
0,0500000US01001,"Autauga County, Alabama",21115.0,383.0,15161.0,488.0,11988.0,495.0,5954.0,523.0,...,(X),(X),(X),(X),(X),(X),(X),(X),30.7,(X)
1,0500000US01003,"Baldwin County, Alabama",78622.0,1183.0,51359.0,1309.0,41452.0,1207.0,27263.0,1223.0,...,(X),(X),(X),(X),(X),(X),(X),(X),29.6,(X)
2,0500000US01005,"Barbour County, Alabama",9186.0,280.0,6030.0,284.0,3908.0,237.0,3156.0,280.0,...,(X),(X),(X),(X),(X),(X),(X),(X),28.9,(X)
3,0500000US01007,"Bibb County, Alabama",6840.0,321.0,4947.0,366.0,3626.0,370.0,1893.0,255.0,...,(X),(X),(X),(X),(X),(X),(X),(X),40.7,(X)
4,0500000US01009,"Blount County, Alabama",20600.0,396.0,15104.0,429.0,11942.0,475.0,5496.0,467.0,...,(X),(X),(X),(X),(X),(X),(X),(X),30.9,(X)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,0500000US72145,"Vega Baja Municipio, Puerto Rico",18081.0,547.0,12747.0,633.0,7630.0,582.0,5334.0,469.0,...,(X),(X),(X),(X),(X),(X),(X),(X),31.9,(X)
3216,0500000US72147,"Vieques Municipio, Puerto Rico",2470.0,266.0,1350.0,251.0,772.0,192.0,1120.0,214.0,...,(X),(X),(X),(X),(X),(X),(X),(X),16.1,(X)
3217,0500000US72149,"Villalba Municipio, Puerto Rico",7712.0,261.0,5658.0,295.0,2920.0,282.0,2054.0,249.0,...,(X),(X),(X),(X),(X),(X),(X),(X),18.3,(X)
3218,0500000US72151,"Yabucoa Municipio, Puerto Rico",11722.0,400.0,7851.0,462.0,4039.0,364.0,3871.0,415.0,...,(X),(X),(X),(X),(X),(X),(X),(X),15.7,(X)


In [3]:
# Print the long-form name (second header row) of every column -- 130 in all.
for _short_name, long_name in raw_income.columns:
    print(long_name)

id
Geographic Area Name
Estimate!!Households!!Total
Margin of Error!!Households MOE!!Total
Estimate!!Families!!Total
Margin of Error!!Families MOE!!Total
Estimate!!Married-couple families!!Total
Margin of Error!!Married-couple families MOE!!Total
Estimate!!Nonfamily households!!Total
Margin of Error!!Nonfamily households MOE!!Total
Estimate!!Households!!Total!!Less than $10,000
Margin of Error!!Households MOE!!Total!!Less than $10,000
Estimate!!Families!!Total!!Less than $10,000
Margin of Error!!Families MOE!!Total!!Less than $10,000
Estimate!!Married-couple families!!Total!!Less than $10,000
Margin of Error!!Married-couple families MOE!!Total!!Less than $10,000
Estimate!!Nonfamily households!!Total!!Less than $10,000
Margin of Error!!Nonfamily households MOE!!Total!!Less than $10,000
Estimate!!Households!!Total!!$10,000 to $14,999
Margin of Error!!Households MOE!!Total!!$10,000 to $14,999
Estimate!!Families!!Total!!$10,000 to $14,999
Margin of Error!!Families MOE!!Total!!$10,000 to $1

In [4]:
# Break each composite "!!"-delimited column name into its component fields
# and collect the result in a new dataframe, one row per data column.
names_to_skip = ["id", "Geographic Area Name"]

# Select the data columns once (everything except the two metadata columns),
# then pull the short and long halves of each header pair from that list.
data_columns = [col for col in raw_income.columns
                if col[1] not in names_to_skip]
short_names = [col[0] for col in data_columns]
long_names = [col[1] for col in data_columns]
names_lists = [full_name.split("!!") for full_name in long_names]

# Names drill down to different depths, so right-pad the shallower ones with
# nulls until every row has the same number of fields.
max_len = max(len(parts) for parts in names_lists)
names_padded = [parts + [None] * (max_len - len(parts))
                for parts in names_lists]
names_arr = np.array(names_padded)

# Assemble the dataframe: ShortName, FullName, then the split-out fields.
names = pd.DataFrame(names_arr)
names.insert(0, "FullName", long_names)
names.insert(0, "ShortName", short_names)
names

Unnamed: 0,ShortName,FullName,0,1,2,3
0,S1901_C01_001E,Estimate!!Households!!Total,Estimate,Households,Total,
1,S1901_C01_001M,Margin of Error!!Households MOE!!Total,Margin of Error,Households MOE,Total,
2,S1901_C02_001E,Estimate!!Families!!Total,Estimate,Families,Total,
3,S1901_C02_001M,Margin of Error!!Families MOE!!Total,Margin of Error,Families MOE,Total,
4,S1901_C03_001E,Estimate!!Married-couple families!!Total,Estimate,Married-couple families,Total,
...,...,...,...,...,...,...
123,S1901_C02_016M,Margin of Error!!Families MOE!!PERCENT ALLOCAT...,Margin of Error,Families MOE,PERCENT ALLOCATED,Nonfamily income in the past 12 months
124,S1901_C03_016E,Estimate!!Married-couple families!!PERCENT ALL...,Estimate,Married-couple families,PERCENT ALLOCATED,Nonfamily income in the past 12 months
125,S1901_C03_016M,Margin of Error!!Married-couple families MOE!!...,Margin of Error,Married-couple families MOE,PERCENT ALLOCATED,Nonfamily income in the past 12 months
126,S1901_C04_016E,Estimate!!Nonfamily households!!PERCENT ALLOCA...,Estimate,Nonfamily households,PERCENT ALLOCATED,Nonfamily income in the past 12 months


In [5]:
# Swap the positional labels 0..3 of the split-out fields for descriptive
# names, in order.
descriptive_labels = dict(enumerate(
    ["ValueType1", "Category1", "ValueType2", "Category2"]
))
names = names.rename(columns=descriptive_labels)
names


Unnamed: 0,ShortName,FullName,ValueType1,Category1,ValueType2,Category2
0,S1901_C01_001E,Estimate!!Households!!Total,Estimate,Households,Total,
1,S1901_C01_001M,Margin of Error!!Households MOE!!Total,Margin of Error,Households MOE,Total,
2,S1901_C02_001E,Estimate!!Families!!Total,Estimate,Families,Total,
3,S1901_C02_001M,Margin of Error!!Families MOE!!Total,Margin of Error,Families MOE,Total,
4,S1901_C03_001E,Estimate!!Married-couple families!!Total,Estimate,Married-couple families,Total,
...,...,...,...,...,...,...
123,S1901_C02_016M,Margin of Error!!Families MOE!!PERCENT ALLOCAT...,Margin of Error,Families MOE,PERCENT ALLOCATED,Nonfamily income in the past 12 months
124,S1901_C03_016E,Estimate!!Married-couple families!!PERCENT ALL...,Estimate,Married-couple families,PERCENT ALLOCATED,Nonfamily income in the past 12 months
125,S1901_C03_016M,Margin of Error!!Married-couple families MOE!!...,Margin of Error,Married-couple families MOE,PERCENT ALLOCATED,Nonfamily income in the past 12 months
126,S1901_C04_016E,Estimate!!Nonfamily households!!PERCENT ALLOCA...,Estimate,Nonfamily households,PERCENT ALLOCATED,Nonfamily income in the past 12 months


In [6]:
# Show the distinct labels that occur in the ValueType2 field.
names["ValueType2"].unique()

array(['Total', 'Median income (dollars)', 'Mean income (dollars)',
       'PERCENT ALLOCATED'], dtype=object)

In [7]:
# Filter out the data-quality columns and the per-household-type grand
# totals, leaving the estimate columns that carry actual income information.
is_estimate = names["ValueType1"] == "Estimate"
is_percent_allocated = names["ValueType2"].isin(["PERCENT ALLOCATED"])
is_grand_total = names["Category2"].isna() & (names["ValueType2"] == "Total")
names[is_estimate & ~is_percent_allocated & ~is_grand_total]

Unnamed: 0,ShortName,FullName,ValueType1,Category1,ValueType2,Category2
8,S1901_C01_002E,"Estimate!!Households!!Total!!Less than $10,000",Estimate,Households,Total,"Less than $10,000"
10,S1901_C02_002E,"Estimate!!Families!!Total!!Less than $10,000",Estimate,Families,Total,"Less than $10,000"
12,S1901_C03_002E,Estimate!!Married-couple families!!Total!!Less...,Estimate,Married-couple families,Total,"Less than $10,000"
14,S1901_C04_002E,Estimate!!Nonfamily households!!Total!!Less th...,Estimate,Nonfamily households,Total,"Less than $10,000"
16,S1901_C01_003E,"Estimate!!Households!!Total!!$10,000 to $14,999",Estimate,Households,Total,"$10,000 to $14,999"
18,S1901_C02_003E,"Estimate!!Families!!Total!!$10,000 to $14,999",Estimate,Families,Total,"$10,000 to $14,999"
20,S1901_C03_003E,"Estimate!!Married-couple families!!Total!!$10,...",Estimate,Married-couple families,Total,"$10,000 to $14,999"
22,S1901_C04_003E,"Estimate!!Nonfamily households!!Total!!$10,000...",Estimate,Nonfamily households,Total,"$10,000 to $14,999"
24,S1901_C01_004E,"Estimate!!Households!!Total!!$15,000 to $24,999",Estimate,Households,Total,"$15,000 to $24,999"
26,S1901_C02_004E,"Estimate!!Families!!Total!!$15,000 to $24,999",Estimate,Families,Total,"$15,000 to $24,999"


In [8]:
# The histogram lives in the columns whose ValueType1 is "Estimate" and whose
# ValueType2 is "Total".
#
# The remaining columns describe the quality of those histogram counts:
# "Margin of Error" in ValueType1 gives the confidence interval width, and
# "PERCENT ALLOCATED" in ValueType2 indicates sample coverage.
#
# For initial exploration we pull out only the counts. Any in-depth analysis
# of these histograms should bring the data quality metrics back in to
# estimate the error of the final results.
hist_cols_mask = (names["ValueType1"].eq("Estimate")
                  & names["ValueType2"].eq("Total"))
names[hist_cols_mask]

Unnamed: 0,ShortName,FullName,ValueType1,Category1,ValueType2,Category2
0,S1901_C01_001E,Estimate!!Households!!Total,Estimate,Households,Total,
2,S1901_C02_001E,Estimate!!Families!!Total,Estimate,Families,Total,
4,S1901_C03_001E,Estimate!!Married-couple families!!Total,Estimate,Married-couple families,Total,
6,S1901_C04_001E,Estimate!!Nonfamily households!!Total,Estimate,Nonfamily households,Total,
8,S1901_C01_002E,"Estimate!!Households!!Total!!Less than $10,000",Estimate,Households,Total,"Less than $10,000"
10,S1901_C02_002E,"Estimate!!Families!!Total!!Less than $10,000",Estimate,Families,Total,"Less than $10,000"
12,S1901_C03_002E,Estimate!!Married-couple families!!Total!!Less...,Estimate,Married-couple families,Total,"Less than $10,000"
14,S1901_C04_002E,Estimate!!Nonfamily households!!Total!!Less th...,Estimate,Nonfamily households,Total,"Less than $10,000"
16,S1901_C01_003E,"Estimate!!Households!!Total!!$10,000 to $14,999",Estimate,Households,Total,"$10,000 to $14,999"
18,S1901_C02_003E,"Estimate!!Families!!Total!!$10,000 to $14,999",Estimate,Families,Total,"$10,000 to $14,999"


In [9]:
# Histogram columns with None in Category2 are totals.
# The others are organized as a hierarchy of nested sets:
#
# Households
#   +- Families
#      +- Married-couple families
#      +- [Families - Married-couple families] (not included)
#   +- Nonfamily households
#
# For now, keep only the top-level "Households" numbers.
#
# Although the table labels these values "Total", they are actually
# percentages, so the new column names say "Percent".
#
# TODO: Use the leaf nodes of the above hierarchy, including the one that
#  needs to be generated by subtracting (Families - Married-couple families)
is_households = names["Category1"] == "Households"
has_income_bracket = names["Category2"].notna()
hist_to_retain = names[hist_cols_mask
                       & is_households
                       & has_income_bracket].copy()
hist_to_retain["NewName"] = "Percent " + hist_to_retain["Category2"]
hist_to_retain

Unnamed: 0,ShortName,FullName,ValueType1,Category1,ValueType2,Category2,NewName
8,S1901_C01_002E,"Estimate!!Households!!Total!!Less than $10,000",Estimate,Households,Total,"Less than $10,000","Percent Less than $10,000"
16,S1901_C01_003E,"Estimate!!Households!!Total!!$10,000 to $14,999",Estimate,Households,Total,"$10,000 to $14,999","Percent $10,000 to $14,999"
24,S1901_C01_004E,"Estimate!!Households!!Total!!$15,000 to $24,999",Estimate,Households,Total,"$15,000 to $24,999","Percent $15,000 to $24,999"
32,S1901_C01_005E,"Estimate!!Households!!Total!!$25,000 to $34,999",Estimate,Households,Total,"$25,000 to $34,999","Percent $25,000 to $34,999"
40,S1901_C01_006E,"Estimate!!Households!!Total!!$35,000 to $49,999",Estimate,Households,Total,"$35,000 to $49,999","Percent $35,000 to $49,999"
48,S1901_C01_007E,"Estimate!!Households!!Total!!$50,000 to $74,999",Estimate,Households,Total,"$50,000 to $74,999","Percent $50,000 to $74,999"
56,S1901_C01_008E,"Estimate!!Households!!Total!!$75,000 to $99,999",Estimate,Households,Total,"$75,000 to $99,999","Percent $75,000 to $99,999"
64,S1901_C01_009E,"Estimate!!Households!!Total!!$100,000 to $149,999",Estimate,Households,Total,"$100,000 to $149,999","Percent $100,000 to $149,999"
72,S1901_C01_010E,"Estimate!!Households!!Total!!$150,000 to $199,999",Estimate,Households,Total,"$150,000 to $199,999","Percent $150,000 to $199,999"
80,S1901_C01_011E,"Estimate!!Households!!Total!!$200,000 or more",Estimate,Households,Total,"$200,000 or more","Percent $200,000 or more"


In [10]:
# Summary statistics about income, broken down by household type, sit in the
# columns where ValueType1 is "Estimate", Category2 is None, and ValueType2
# is anything other than "Total".
stats_cols_mask = (
    (names["ValueType1"] == "Estimate")
    & names["Category2"].isna()
    & (names["ValueType2"] != "Total")
)
names[stats_cols_mask]

Unnamed: 0,ShortName,FullName,ValueType1,Category1,ValueType2,Category2
88,S1901_C01_012E,Estimate!!Households!!Median income (dollars),Estimate,Households,Median income (dollars),
90,S1901_C02_012E,Estimate!!Families!!Median income (dollars),Estimate,Families,Median income (dollars),
92,S1901_C03_012E,Estimate!!Married-couple families!!Median inco...,Estimate,Married-couple families,Median income (dollars),
94,S1901_C04_012E,Estimate!!Nonfamily households!!Median income ...,Estimate,Nonfamily households,Median income (dollars),
96,S1901_C01_013E,Estimate!!Households!!Mean income (dollars),Estimate,Households,Mean income (dollars),
98,S1901_C02_013E,Estimate!!Families!!Mean income (dollars),Estimate,Families,Mean income (dollars),
100,S1901_C03_013E,Estimate!!Married-couple families!!Mean income...,Estimate,Married-couple families,Mean income (dollars),
102,S1901_C04_013E,Estimate!!Nonfamily households!!Mean income (d...,Estimate,Nonfamily households,Mean income (dollars),


In [11]:
# Mean and median income follow the same household-type hierarchy as the
# histograms. Keep only the top-level "Households" statistics for now, and
# reuse each statistic's own label as its output column name.
households_only = names["Category1"] == "Households"
stats_to_retain = names[stats_cols_mask & households_only].copy()
stats_to_retain["NewName"] = stats_to_retain["ValueType2"]
stats_to_retain

Unnamed: 0,ShortName,FullName,ValueType1,Category1,ValueType2,Category2,NewName
88,S1901_C01_012E,Estimate!!Households!!Median income (dollars),Estimate,Households,Median income (dollars),,Median income (dollars)
96,S1901_C01_013E,Estimate!!Households!!Mean income (dollars),Estimate,Households,Mean income (dollars),,Mean income (dollars)


In [12]:
# Now we're ready to start constructing our data table.

# Start with the metadata columns.

# The first column we need is FIPS county code that the Johns Hopkins 
# COVID-19 data uses as a primary key. The Census data sets we deal 
# with here use INCITS-31 codes (see 
# https://standards.incits.org/apps/group_public/project/details.php?project_id=204)
# Since we know every row in this data set is a U.S. county, we can 
# generate FIPS codes by just taking the last 5 characters and converting
# to an integer.
def incits_31_to_fips(code: str) -> int:
    """
    Turn the INCITS-31 code of a US county into that county's FIPS code.

    The FIPS code is the integer value of the final five characters of
    `code`, so any leading zeros are dropped.
    """
    # TODO: Verify that the input really is the INCITS-31 code for a
    # U.S. county
    county_digits = code[-5:]
    return int(county_digits)

# Likewise, the human-readable county names are encoded as <county name> County, <state>.
def name_to_county_state(name: str) -> Tuple[str,str]:
    """
    Split a Geographic Area Name value into its county and state parts.

    Names of the form "<county> County, <state>" or
    "<county> Parish, <state>" lose the "County"/"Parish" suffix. Any other
    name containing ", " is split there, keeping the left-hand side intact.

    Returns a tuple of (county name, state name), or (None, None) when the
    name matches neither form.
    """
    # TODO: Validate the format of the input string
    strict = regex.match(r"(.*) (County|Parish), (.*)", name)
    if strict is not None:
        return strict[1], strict[3]

    # No County/Parish suffix: fall back to splitting on the comma alone.
    loose = regex.match(r"(.*), (.*)", name)
    if loose is not None:
        return loose[1], loose[2]

    return None, None
    

# Parse every Geographic Area Name exactly once into a (county, state) pair,
# rather than running the regular expressions twice per row (once for each
# output column).
parsed_names = [name_to_county_state(n)
                for n in raw_income[("NAME", "Geographic Area Name")]]

income = pd.DataFrame({
    # Note how we need to use tuples to access the series in raw_income,
    # because its columns are keyed by (short name, long name) pairs.
    "FIPS": raw_income[("GEO_ID", "id")].apply(incits_31_to_fips),
    # The plain lists below line up positionally with the rows of the FIPS
    # series, which supplies the index of the new dataframe.
    "County": [county for county, _state in parsed_names],
    "State": [state for _county, state in parsed_names],
})
income

Unnamed: 0,FIPS,County,State
0,1001,Autauga,Alabama
1,1003,Baldwin,Alabama
2,1005,Barbour,Alabama
3,1007,Bibb,Alabama
4,1009,Blount,Alabama
...,...,...,...
3215,72145,Vega Baja Municipio,Puerto Rico
3216,72147,Vieques Municipio,Puerto Rico
3217,72149,Villalba Municipio,Puerto Rico
3218,72151,Yabucoa Municipio,Puerto Rico


In [13]:
# Copy each retained column from the raw data into `income` under its new
# name: first the histogram columns, then the summary statistics.
for retained in (hist_to_retain, stats_to_retain):
    for short_name, full_name, new_name in zip(retained["ShortName"],
                                               retained["FullName"],
                                               retained["NewName"]):
        # raw_income columns are keyed by (short name, long name) tuples.
        income[new_name] = raw_income[(short_name, full_name)]

income

Unnamed: 0,FIPS,County,State,"Percent Less than $10,000","Percent $10,000 to $14,999","Percent $15,000 to $24,999","Percent $25,000 to $34,999","Percent $35,000 to $49,999","Percent $50,000 to $74,999","Percent $75,000 to $99,999","Percent $100,000 to $149,999","Percent $150,000 to $199,999","Percent $200,000 or more",Median income (dollars),Mean income (dollars)
0,1001,Autauga,Alabama,7.1,5.8,10.1,7.9,11.9,18.1,14.1,16.6,5.5,3.1,58786.0,75515.0
1,1003,Baldwin,Alabama,5.4,5.2,9.2,11.0,13.5,19.3,12.3,13.6,5.5,5.1,55962.0,77212.0
2,1005,Barbour,Alabama,13.3,8.9,16.5,12.1,14.2,14.3,8.8,9.1,1.3,1.4,34186.0,47909.0
3,1007,Bibb,Alabama,6.4,8.2,12.9,11.1,15.0,16.7,14.6,10.2,3.1,1.7,45340.0,58529.0
4,1009,Blount,Alabama,8.4,5.6,11.4,12.6,13.1,20.9,11.9,10.6,3.5,1.9,48695.0,60646.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,72145,Vega Baja Municipio,Puerto Rico,28.9,12.8,17.6,13.3,10.0,11.5,3.4,1.7,0.5,0.4,19096.0,27927.0
3216,72147,Vieques Municipio,Puerto Rico,28.8,19.8,16.7,13.8,12.5,6.2,0.4,1.7,0.0,0.0,15539.0,22635.0
3217,72149,Villalba Municipio,Puerto Rico,24.3,14.0,20.0,14.9,13.2,9.2,2.5,1.5,0.2,0.2,19855.0,29895.0
3218,72151,Yabucoa Municipio,Puerto Rico,33.7,13.8,18.4,14.7,8.9,6.5,2.2,1.5,0.1,0.1,16013.0,22916.0


In [14]:
# Write out a CSV file of income information.
# The output path gets its own name so that `income_data_file`, which holds
# the path of the raw input CSV read earlier, is not overwritten.
income_output_file = os.path.join(_OUTPUTS_DIR, "us_counties_income.csv")
print(f"Writing income data to {income_output_file}")
# index=False: the row index is not written to the file.
income.to_csv(income_output_file, index=False)

Writing income data to outputs/us_counties_income.csv


# Social Characteristics in the United States data set

## NOTE: This part of the notebook is not yet complete.

In [15]:
# Read the Social Characteristics in the United States data set's main CSV file.
# Directory and file name are passed to os.path.join as separate components,
# so the separator comes from the platform instead of being hidden inside an
# implicitly concatenated string literal.
social_characteristics_data_file = os.path.join(
    _INPUTS_DIR,
    "ACSDP5Y2018.DP02",
    "ACSDP5Y2018.DP02_data_with_overlays_2020-04-26T100043.csv")

# This file has two header rows. The first row contains a short name for each
# column, such as "DP02_0001E", while the second row contains extra-long names
# like "Estimate!!HOUSEHOLDS BY TYPE!!Total households"
raw_soc_chars = pd.read_csv(social_characteristics_data_file, header=[0,1], 
                            low_memory=False)
raw_soc_chars

Unnamed: 0_level_0,GEO_ID,NAME,DP02_0001E,DP02_0001M,DP02_0001PE,DP02_0001PM,DP02_0002E,DP02_0002M,DP02_0002PE,DP02_0002PM,...,DP02_0150PE,DP02_0150PM,DP02_0151E,DP02_0151M,DP02_0151PE,DP02_0151PM,DP02_0152E,DP02_0152M,DP02_0152PE,DP02_0152PM
Unnamed: 0_level_1,id,Geographic Area Name,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Margin of Error!!HOUSEHOLDS BY TYPE!!Total households,Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households,Percent Margin of Error!!HOUSEHOLDS BY TYPE!!Total households,Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families),Margin of Error!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families),Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families),Percent Margin of Error!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families),...,Percent Estimate!!COMPUTERS AND INTERNET USE!!Total households,Percent Margin of Error!!COMPUTERS AND INTERNET USE!!Total households,Estimate!!COMPUTERS AND INTERNET USE!!Total households!!With a computer,Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a computer,Percent Estimate!!COMPUTERS AND INTERNET USE!!Total households!!With a computer,Percent Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a computer,Estimate!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,Percent Estimate!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,Percent Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription
0,0500000US01001,"Autauga County, Alabama",21115.0,383.0,21115.0,(X),15161.0,488.0,71.8,2.3,...,21115.0,(X),18368.0,482.0,87.0,1.7,16651.0,500.0,78.9,1.9
1,0500000US01003,"Baldwin County, Alabama",78622.0,1183.0,78622.0,(X),51359.0,1309.0,65.3,1.4,...,78622.0,(X),69635.0,1314.0,88.6,0.8,61424.0,1421.0,78.1,1.3
2,0500000US01005,"Barbour County, Alabama",9186.0,280.0,9186.0,(X),6030.0,284.0,65.6,2.7,...,9186.0,(X),6986.0,236.0,76.1,2.2,5548.0,273.0,60.4,2.7
3,0500000US01007,"Bibb County, Alabama",6840.0,321.0,6840.0,(X),4947.0,366.0,72.3,3.7,...,6840.0,(X),5217.0,376.0,76.3,3.6,4521.0,362.0,66.1,3.9
4,0500000US01009,"Blount County, Alabama",20600.0,396.0,20600.0,(X),15104.0,429.0,73.3,2.0,...,20600.0,(X),16202.0,526.0,78.7,2.0,14103.0,457.0,68.5,1.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,0500000US72145,"Vega Baja Municipio, Puerto Rico",,,,,,,,,...,,,,,,,,,,
3216,0500000US72147,"Vieques Municipio, Puerto Rico",,,,,,,,,...,,,,,,,,,,
3217,0500000US72149,"Villalba Municipio, Puerto Rico",,,,,,,,,...,,,,,,,,,,
3218,0500000US72151,"Yabucoa Municipio, Puerto Rico",,,,,,,,,...,,,,,,,,,,


In [16]:
# Types that Pandas inferred for each column when reading the CSV file
raw_soc_chars.dtypes

GEO_ID       id                                                                                                                object
NAME         Geographic Area Name                                                                                              object
DP02_0001E   Estimate!!HOUSEHOLDS BY TYPE!!Total households                                                                   float64
DP02_0001M   Margin of Error!!HOUSEHOLDS BY TYPE!!Total households                                                            float64
DP02_0001PE  Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households                                                           float64
                                                                                                                               ...   
DP02_0151PM  Percent Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a computer                           float64
DP02_0152E   Estimate!!COMPUTERS AND INTERNET USE!!Total house

In [17]:
# Each value appears twice in this table: as an absolute number of households
# and as a (sometimes incorrectly computed) percentage of households.
# The percentages can be regenerated on demand, so keep only the columns
# whose long name does not start with "Percent".
not_percent_cols = []
for col in raw_soc_chars.columns:
    if col[1].startswith("Percent"):
        continue
    not_percent_cols.append(col)

soc_chars_vals = raw_soc_chars[not_percent_cols]
soc_chars_vals

Unnamed: 0_level_0,GEO_ID,NAME,DP02_0001E,DP02_0001M,DP02_0002E,DP02_0002M,DP02_0003E,DP02_0003M,DP02_0004E,DP02_0004M,...,DP02_0148E,DP02_0148M,DP02_0149E,DP02_0149M,DP02_0150E,DP02_0150M,DP02_0151E,DP02_0151M,DP02_0152E,DP02_0152M
Unnamed: 0_level_1,id,Geographic Area Name,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Margin of Error!!HOUSEHOLDS BY TYPE!!Total households,Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families),Margin of Error!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families),Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!With own children of the householder under 18 years,Margin of Error!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!With own children of the householder under 18 years,Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!Married-couple family,Margin of Error!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!Married-couple family,...,Estimate!!ANCESTRY!!Total population!!Welsh,Margin of Error!!ANCESTRY!!Total population!!Welsh,Estimate!!ANCESTRY!!Total population!!West Indian (excluding Hispanic origin groups),Margin of Error!!ANCESTRY!!Total population!!West Indian (excluding Hispanic origin groups),Estimate!!COMPUTERS AND INTERNET USE!!Total households,Margin of Error!!COMPUTERS AND INTERNET USE!!Total households,Estimate!!COMPUTERS AND INTERNET USE!!Total households!!With a computer,Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a computer,Estimate!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription
0,0500000US01001,"Autauga County, Alabama",21115.0,383.0,15161.0,488.0,6787.0,434.0,11988.0,495.0,...,412.0,292.0,0.0,28.0,21115.0,383.0,18368.0,482.0,16651.0,500.0
1,0500000US01003,"Baldwin County, Alabama",78622.0,1183.0,51359.0,1309.0,18035.0,860.0,41452.0,1207.0,...,994.0,312.0,864.0,512.0,78622.0,1183.0,69635.0,1314.0,61424.0,1421.0
2,0500000US01005,"Barbour County, Alabama",9186.0,280.0,6030.0,284.0,2423.0,238.0,3908.0,237.0,...,48.0,57.0,33.0,26.0,9186.0,280.0,6986.0,236.0,5548.0,273.0
3,0500000US01007,"Bibb County, Alabama",6840.0,321.0,4947.0,366.0,1728.0,243.0,3626.0,370.0,...,94.0,83.0,0.0,22.0,6840.0,321.0,5217.0,376.0,4521.0,362.0
4,0500000US01009,"Blount County, Alabama",20600.0,396.0,15104.0,429.0,5566.0,331.0,11942.0,475.0,...,148.0,112.0,13.0,20.0,20600.0,396.0,16202.0,526.0,14103.0,457.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,0500000US72145,"Vega Baja Municipio, Puerto Rico",,,,,,,,,...,,,,,,,,,,
3216,0500000US72147,"Vieques Municipio, Puerto Rico",,,,,,,,,...,,,,,,,,,,
3217,0500000US72149,"Villalba Municipio, Puerto Rico",,,,,,,,,...,,,,,,,,,,
3218,0500000US72151,"Yabucoa Municipio, Puerto Rico",,,,,,,,,...,,,,,,,,,,


In [18]:
# Break each composite "!!"-delimited column name into its component fields
# and collect the result in a new dataframe, one row per data column.
names_to_skip = ["id", "Geographic Area Name"]

# Work around a bug in Pandas by walking our own list of retained columns
# rather than the second level of the index of soc_chars_vals.
names_to_split = [long_name for _short_name, long_name in not_percent_cols
                  if long_name not in names_to_skip]
names_lists = [full_name.split("!!") for full_name in names_to_split]

# Names drill down to different depths, so right-pad the shallower ones with
# nulls until every row has the same number of fields.
max_len = max(len(parts) for parts in names_lists)
names_padded = [parts + [None] * (max_len - len(parts))
                for parts in names_lists]
names_arr = np.array(names_padded)

# Assemble the dataframe: FullName first, then the split-out fields.
names = pd.DataFrame(names_arr)
names.insert(0, "FullName", names_to_split)
names

Unnamed: 0,FullName,0,1,2,3,4,5
0,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Estimate,HOUSEHOLDS BY TYPE,Total households,,,
1,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,Margin of Error,HOUSEHOLDS BY TYPE,Total households,,,
2,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Estimate,HOUSEHOLDS BY TYPE,Total households,Family households (families),,
3,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,Margin of Error,HOUSEHOLDS BY TYPE,Total households,Family households (families),,
4,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Estimate,HOUSEHOLDS BY TYPE,Total households,Family households (families),With own children of the householder under 18 ...,
...,...,...,...,...,...,...,...
299,Margin of Error!!COMPUTERS AND INTERNET USE!!T...,Margin of Error,COMPUTERS AND INTERNET USE,Total households,,,
300,Estimate!!COMPUTERS AND INTERNET USE!!Total ho...,Estimate,COMPUTERS AND INTERNET USE,Total households,With a computer,,
301,Margin of Error!!COMPUTERS AND INTERNET USE!!T...,Margin of Error,COMPUTERS AND INTERNET USE,Total households,With a computer,,
302,Estimate!!COMPUTERS AND INTERNET USE!!Total ho...,Estimate,COMPUTERS AND INTERNET USE,Total households,With a broadband Internet subscription,,


In [19]:
# Swap the positional labels 0..2 of the first three split-out fields for
# descriptive names; the deeper drill-down levels keep their numeric labels.
descriptive_labels = dict(enumerate(["ValueType", "Category", "Attribute"]))
names = names.rename(columns=descriptive_labels)
names

Unnamed: 0,FullName,ValueType,Category,Attribute,3,4,5
0,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Estimate,HOUSEHOLDS BY TYPE,Total households,,,
1,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,Margin of Error,HOUSEHOLDS BY TYPE,Total households,,,
2,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Estimate,HOUSEHOLDS BY TYPE,Total households,Family households (families),,
3,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,Margin of Error,HOUSEHOLDS BY TYPE,Total households,Family households (families),,
4,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Estimate,HOUSEHOLDS BY TYPE,Total households,Family households (families),With own children of the householder under 18 ...,
...,...,...,...,...,...,...,...
299,Margin of Error!!COMPUTERS AND INTERNET USE!!T...,Margin of Error,COMPUTERS AND INTERNET USE,Total households,,,
300,Estimate!!COMPUTERS AND INTERNET USE!!Total ho...,Estimate,COMPUTERS AND INTERNET USE,Total households,With a computer,,
301,Margin of Error!!COMPUTERS AND INTERNET USE!!T...,Margin of Error,COMPUTERS AND INTERNET USE,Total households,With a computer,,
302,Estimate!!COMPUTERS AND INTERNET USE!!Total ho...,Estimate,COMPUTERS AND INTERNET USE,Total households,With a broadband Internet subscription,,


In [20]:
# The field we call "Category" breaks these 300 values into a manageable
# number of categories. List the distinct category labels.
names["Category"].unique()

array(['HOUSEHOLDS BY TYPE', 'RELATIONSHIP', 'MARITAL STATUS',
       'FERTILITY', 'GRANDPARENTS', 'SCHOOL ENROLLMENT',
       'EDUCATIONAL ATTAINMENT', 'VETERAN STATUS',
       'DISABILITY STATUS OF THE CIVILIAN NONINSTITUTIONALIZED POPULATION',
       'RESIDENCE 1 YEAR AGO', 'PLACE OF BIRTH',
       'U.S. CITIZENSHIP STATUS', 'YEAR OF ENTRY',
       'WORLD REGION OF BIRTH OF FOREIGN BORN', 'LANGUAGE SPOKEN AT HOME',
       'ANCESTRY', 'COMPUTERS AND INTERNET USE'], dtype=object)

In [21]:
# Within the "ANCESTRY" category the names drill down only one level.
is_ancestry = names["Category"] == "ANCESTRY"
names[is_ancestry]

Unnamed: 0,FullName,ValueType,Category,Attribute,3,4,5
242,Estimate!!ANCESTRY!!Total population,Estimate,ANCESTRY,Total population,,,
243,Margin of Error!!ANCESTRY!!Total population,Margin of Error,ANCESTRY,Total population,,,
244,Estimate!!ANCESTRY!!Total population!!American,Estimate,ANCESTRY,Total population,American,,
245,Margin of Error!!ANCESTRY!!Total population!!A...,Margin of Error,ANCESTRY,Total population,American,,
246,Estimate!!ANCESTRY!!Total population!!Arab,Estimate,ANCESTRY,Total population,Arab,,
247,Margin of Error!!ANCESTRY!!Total population!!Arab,Margin of Error,ANCESTRY,Total population,Arab,,
248,Estimate!!ANCESTRY!!Total population!!Czech,Estimate,ANCESTRY,Total population,Czech,,
249,Margin of Error!!ANCESTRY!!Total population!!C...,Margin of Error,ANCESTRY,Total population,Czech,,
250,Estimate!!ANCESTRY!!Total population!!Danish,Estimate,ANCESTRY,Total population,Danish,,
251,Margin of Error!!ANCESTRY!!Total population!!D...,Margin of Error,ANCESTRY,Total population,Danish,,


In [22]:
# Collect the distinct ancestry labels from drill-down level 3 of the
# ANCESTRY rows. A missing label is reported as "Total".
ancestry_rows = names[names["Category"] == "ANCESTRY"]
ancestry_names = [
    label if label is not None else "Total"
    for label in ancestry_rows[3].unique()
]
ancestry_names

['Total',
 'American',
 'Arab',
 'Czech',
 'Danish',
 'Dutch',
 'English',
 'French (except Basque)',
 'French Canadian',
 'German',
 'Greek',
 'Hungarian',
 'Irish',
 'Italian',
 'Lithuanian',
 'Norwegian',
 'Polish',
 'Portuguese',
 'Russian',
 'Scotch-Irish',
 'Scottish',
 'Slovak',
 'Subsaharan African',
 'Swedish',
 'Swiss',
 'Ukrainian',
 'Welsh',
 'West Indian (excluding Hispanic origin groups)']

In [23]:
# TODO: Extract relevant fields and write out a CSV file.