In [26]:
import pandas as pd
import geopandas as gpd
import geofeather
import os
import warnings

In [27]:
# NOTE(review): %pylab is deprecated (see the message printed below this cell);
# it also injects numpy/matplotlib names into the namespace, so check for uses
# of those bare names before swapping it for `%matplotlib inline`.
%pylab inline
# Suppresses every Python warning for the rest of the session, which also hides
# pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [28]:
user = 'aolsen'

In [29]:
DATA_PATH = f'/Users/{user}/Box/Modeling and Surveys/Census/2020/DHC'
DATA_PATH

'/Users/aolsen/Box/Modeling and Surveys/Census/2020/DHC'

In [30]:
# 5-character county code -> county name, for the nine counties used to subset the data
bayareafips_full = dict([
    ('06001', 'Alameda'),
    ('06013', 'Contra Costa'),
    ('06041', 'Marin'),
    ('06055', 'Napa'),
    ('06075', 'San Francisco'),
    ('06081', 'San Mateo'),
    ('06085', 'Santa Clara'),
    ('06097', 'Sonoma'),
    ('06095', 'Solano'),
])


# Get table matrix and segment map

In [31]:
table_matrix_path = os.path.join(DATA_PATH,'2020-dhc-table-matrix.xlsx')

In [32]:
# Read the table -> segment lookup sheet from the table matrix workbook
segment_sheet = pd.read_excel(table_matrix_path, sheet_name='DHC Table_Segments')
# remove any line breaks from the column headers
segment_sheet.columns = segment_sheet.columns.str.replace('\n', '')
# strip surrounding whitespace from the table ids
table_segment_map = segment_sheet.assign(TABLE_ID=segment_sheet['TABLE_ID'].str.strip())

In [33]:
table_segment_map.query('TABLE_ID=="P12" ')

Unnamed: 0,TABLE_ID,SEGMENT_NUMBER,TOTAL_RECORDS,TABLE_SORT_ORDER
69,P12,6,49,3


In [34]:
table_segment_map.query('TABLE_ID=="PCT7" ')

Unnamed: 0,TABLE_ID,SEGMENT_NUMBER,TOTAL_RECORDS,TABLE_SORT_ORDER
160,PCT7,18,16,7


In [35]:
# Load the list of tables from the Excel table matrix.
# skiprows=1 skips the first spreadsheet row, so the following row supplies the headers.
table_matrix = (
    pd.read_excel(table_matrix_path, 'DHC Table Matrix', skiprows=1)
    .rename(columns={'Table number': 'TABLE_ID',
                     'Table contents': 'DESCRIPTION',
                     'Data dictionary reference name': 'TABLE_DETAIL'})
)

# Keep only rows that carry a table id. The explicit copy makes the column
# assignments below act on an independent frame instead of a filtered slice.
table_matrix = table_matrix[table_matrix['TABLE_ID'].notna()].copy()

# some table names have trailing whitespaces - strip these
table_matrix['TABLE_ID'] = table_matrix['TABLE_ID'].str.strip()

# The row holding the overall table title is identified by a bracketed number,
# e.g. "SEX BY AGE FOR SELECTED AGE CATEGORIES [49]". A raw string avoids the
# invalid-escape warning, and no capture group is needed for a containment test.
is_title_row = table_matrix['DESCRIPTION'].str.contains(r'\[\d{1,3}\]', na=False)

# store the title on its own row and forward-fill it onto the detail rows below
table_matrix['TABLE_NAME'] = table_matrix['DESCRIPTION'].where(is_title_row).ffill()
table_matrix.sample(5)

Unnamed: 0,Person (P) or Housing (H),TABLE_ID,TABLE_DETAIL,Segment,Max size,DESCRIPTION,TABLE_NAME
7110,P,PCT13D,,,,SEX BY AGE FOR THE POPULATION IN HOUSEHOLDS (A...,SEX BY AGE FOR THE POPULATION IN HOUSEHOLDS (A...
7576,P,PCT17B,PCT17B026,39.0,9.0,Other relatives,HOUSEHOLD TYPE (INCLUDING LIVING ALONE) BY REL...
7837,P,PCT18A,PCT018A015,40.0,9.0,Correctional facilities for adults (101-106),GROUP QUARTERS POPULATION BY SEX BY AGE BY MAJ...
6597,P,PCT12N,PCT012N118,35.0,9.0,11 years,"SEX BY SINGLE-YEAR AGE (SOME OTHER RACE ALONE,..."
9098,H,H4P,H004P0001,1.0,9.0,Total:,"TENURE (WHITE ALONE, HISPANIC OR LATINO HOUSEH..."


### Data dictionary errors
I checked for duplicate detailed table IDs and found what appear to be a few errors in the census data dictionary:
* P19: the id P0190001 is simply repeated on every row instead of the number being incremented.
* PCO8 and PCT8 both use the table identifiers PCO0080001..PCO0080025. These duplicates are in different segment files, so they are less of a practical problem than P19, but it appears that PCT8 simply has the wrong data dictionary reference names (e.g. PCO0080001 instead of PCT0080001).

In [36]:
# Rows whose TABLE_DETAIL id occurs more than once, counted per (id, table, segment)
detail_id_counts = table_matrix['TABLE_DETAIL'].value_counts()
repeated_detail_ids = detail_id_counts[detail_id_counts > 1].index
has_repeated_id = table_matrix['TABLE_DETAIL'].isin(repeated_detail_ids)
table_matrix[has_repeated_id].groupby(['TABLE_DETAIL', 'TABLE_ID', 'Segment']).size()

TABLE_DETAIL  TABLE_ID  Segment
P0190001      P19       15.0       11
PCO0080001    PCO8      17.0        1
              PCT8      19.0        1
PCO0080002    PCO8      17.0        1
              PCT8      19.0        1
PCO0080003    PCO8      17.0        1
              PCT8      19.0        1
PCO0080004    PCO8      17.0        1
              PCT8      19.0        1
PCO0080005    PCO8      17.0        1
              PCT8      19.0        1
PCO0080006    PCO8      17.0        1
              PCT8      19.0        1
PCO0080007    PCO8      17.0        1
              PCT8      19.0        1
PCO0080008    PCO8      17.0        1
              PCT8      19.0        1
PCO0080009    PCO8      17.0        1
              PCT8      19.0        1
PCO0080010    PCO8      17.0        1
              PCT8      19.0        1
PCO0080011    PCO8      17.0        1
              PCT8      19.0        1
PCO0080012    PCO8      17.0        1
              PCT8      19.0        1
PCO0080013    PCO8

## Get table details for field mapping

In [37]:
def get_table_headers(table_id='P12', matrix=None):
    """
    Extracts the table data dictionary for a given table ID.

    Rows of the table matrix that belong to ``table_id`` and have a
    TABLE_DETAIL value are labeled with a group prefix: a description that
    contains ':' but not 'Total' starts a new group, and that group is carried
    forward onto the rows below it. Rows before the first group get the
    prefix 'Total:'. A row whose description equals its group keeps the
    description unchanged.

    Args:
        table_id (str, optional): The table ID. Defaults to 'P12'.
        matrix (pandas.DataFrame, optional): Table matrix with the columns
            TABLE_ID, TABLE_DETAIL and DESCRIPTION. Defaults to the
            notebook-level ``table_matrix``.

    Returns:
        dict: A dictionary mapping table details to variable names. Empty if
        no row matches ``table_id``.
    """
    if matrix is None:
        # fall back to the matrix loaded earlier in the notebook
        matrix = table_matrix

    table_headers = matrix.query("TABLE_ID==@table_id")
    table_headers = table_headers[table_headers['TABLE_DETAIL'].notna()]
    descriptions = table_headers['DESCRIPTION']

    # Develop hierarchy: group headers end in ':' but are not the 'Total:' row.
    # Boolean masks are combined with & / ~; missing descriptions never match.
    is_group_header = (descriptions.str.contains(':', regex=False, na=False)
                       & ~descriptions.str.contains('Total', regex=False, na=False))

    # Built as a standalone Series so nothing is assigned onto a slice of the matrix.
    hierarchy = descriptions.where(is_group_header).ffill().fillna('Total:')

    # A plain comprehension (rather than a row-wise apply) also works when the
    # table has no rows.
    variables = [
        group if group == description else f'{group} {description}'
        for group, description in zip(hierarchy, descriptions)
    ]

    table_map = dict(zip(table_headers['TABLE_DETAIL'], variables))
    return table_map

# Load helper data

In [38]:
# tabblock_path = '/Users/aolsen/Dropbox/Documents/Data/GIS/Census/blocks/2020/tl_2020_06_tabblock20/tl_2020_06_tabblock20.shp'
# ca_blocks = gpd.read_file(tabblock_path)

In [39]:
# ca_blocks.to_feather('/Users/aolsen/Dropbox/Documents/Data/GIS/Census/blocks/2020/tl_2020_06_tabblock20/tl_2020_06_tabblock20.feather')
ca_blocks_path = f'/Users/{user}/Box/Modeling and Surveys/Census/2020/geo/tl_2020_06_tabblock20.feather'
ca_blocks = gpd.read_feather(ca_blocks_path)

In [40]:
# 5-character county key: state code followed by county code
ca_blocks['STCOUNTY'] = ca_blocks['STATEFP20'] + ca_blocks['COUNTYFP20']

# subset ca census blocks to bay area counties and reproject to EPSG:26910
in_bay_area = ca_blocks['STCOUNTY'].isin(bayareafips_full)
bayarea_blocks = ca_blocks[in_bay_area].to_crs('EPSG:26910')

# one point per block, stored as a second geometry column
bayarea_blocks['geom_pt'] = bayarea_blocks.geometry.representative_point()

In [41]:
# # bay area 2020 tracts

# censustracts_2020 = gpd.read_file(
#     f'/Users/{user}/Box/Modeling and Surveys/Census/2020/tracts/censustracts_bayarea_2020_v2.shp')
# censustracts_2020 = censustracts_2020.to_crs('EPSG:26910')

In [42]:
# Get TAZ zones
zones_path = f'/Users/{user}/Box/Modeling and Surveys/Urban Modeling/Spatial/Zones/TAZ1454/zones1454.shp'

# read the shapefile, then reproject to EPSG:26910 (the same CRS the blocks are projected to)
zones_unprojected = gpd.read_file(zones_path)
zones = zones_unprojected.to_crs('EPSG:26910')

In [43]:
# assign containing zone to census block
# each block is represented by its 'geom_pt' point for the spatial join
block_points = bayarea_blocks.set_geometry('geom_pt')
bayarea_blocks_x_zones = gpd.sjoin(block_points, zones, how='inner')

In [44]:
# # Fetch pre-processed block geodata from redistricting data

# BLOCK_PATH = f'/Users/{user}/Box/Modeling and Surveys/Census/2020/tracts/bayarea_blocks.feather'

# bayarea_blocks = gpd.read_feather(BLOCK_PATH)
# bayarea_blocks.sum()

# census2020_blockdata = geofeather.from_geofeather(
#     f'/Users/{user}/Box/Modeling and Surveys/Census/2020/tracts/censusblocks_20202020_w_dat2.feather')
# census2020_blockdata['geom_pt'] = census2020_blockdata.representative_point()
# census2020_blockdata.sum()

## geoheader 2020

In [45]:
SUMLEV = {'tract': 140, 'block': 100, 'place': 160, 'cousub': 60}

DROP_COLS = ['CHARITER', 'CIFSN', 'LOGRECNO']

In [46]:
geo_header = ["FILEID", "STUSAB", "SUMLEV", "GEOVAR", "GEOCOMP", "CHARITER", "CIFSN", "LOGRECNO", "GEOID",
              "GEOCODE", "REGION", "DIVISION", "STATE", "STATENS", "COUNTY", "COUNTYCC", "COUNTYNS", "COUSUB",
              "COUSUBCC", "COUSUBNS", "SUBMCD", "SUBMCDCC", "SUBMCDNS", "ESTATE", "ESTATECC", "ESTATENS",
              "CONCIT", "CONCITCC", "CONCITNS", "PLACE", "PLACECC", "PLACENS", "TRACT", "BLKGRP", "BLOCK",
              "AIANHH", "AIHHTLI", "AIANHHFP", "AIANHHCC", "AIANHHNS", "AITS", "AITSFP", "AITSCC", "AITSNS",
              "TTRACT", "TBLKGRP", "ANRC", "ANRCCC", "ANRCNS", "CBSA", "MEMI", "CSA", "METDIV", "NECTA",
              "NMEMI", "CNECTA", "NECTADIV", "CBSAPCI", "NECTAPCI", "UA", "UATYPE", "UR", "CD116", "CD118",
              "CD119", "CD120", "CD121", "SLDU18", "SLDU22", "SLDU24", "SLDU26", "SLDU28", "SLDL18", "SLDL22",
              "SLDL24", "SLDL26", "SLDL28", "VTD", "VTDI", "ZCTA", "SDELM", "SDSEC", "SDUNI", "PUMA", "AREALAND",
              "AREAWATR", "BASENAME", "NAME", "FUNCSTAT", "GCUNI", "POP100", "HU100", "INTPTLAT", "INTPTLON",
              "LSADC", "PARTFLAG", "UGA"]

In [47]:
# get the geo file, with all logrecno's
file_geo_path = os.path.join(DATA_PATH, 'ca2020.dhc', 'cageo2020.dhc')

# these identifier columns are read as strings rather than numbers
geo_string_columns = {'GEOID': str, 'GEOCODE': str, 'COUNTY': str}

filegeo = pd.read_csv(
    file_geo_path,
    sep='|',
    header=None,
    names=geo_header,
    dtype=geo_string_columns,
    encoding='windows-1252',
)

# same 5-character key format as the keys of bayareafips_full
filegeo['STCOUNTY'] = '06' + filegeo['COUNTY']

In [48]:
# block sumlev is oddly not in https://www.census.gov/programs-surveys/geography/technical-documentation/naming-convention/cartographic-boundary-file/carto-boundary-summary-level.html
# we assume it has geocode string length 15 and check the corresponding sumlev
# (.str.len() is vectorised and treats a missing GEOCODE as "not 15" instead of raising)
filegeo[filegeo.GEOCODE.str.len() == 15].SUMLEV.value_counts()

# so 100 is the most likely candidate for census block - 519,723 records in state

100    519723
70       2342
441       382
144       178
265       122
267        59
324         7
Name: SUMLEV, dtype: int64

In [49]:
# subset relevant LOGRECNOS using SUMLEV
is_block_record = filegeo.SUMLEV == SUMLEV['block']
is_bay_area_county = filegeo.STCOUNTY.isin(bayareafips_full)
bayareablocks = filegeo.loc[is_block_record & is_bay_area_county]

# mapping of logrecnos to census block
bayareablocks_logrecno = bayareablocks.set_index('LOGRECNO').GEOCODE
bayareablocks_logrecno

LOGRECNO
102452    060014001001000
102453    060014001001001
102454    060014001001002
102455    060014001001003
102456    060014001001004
               ...       
578365    060971543081015
578366    060979901000001
578367    060979901000002
578368    060979901000003
578369    060979901000004
Name: GEOCODE, Length: 82210, dtype: object

# Census data 

In [50]:
def segment_loader(segment):
    """
    Loads the census 2020 population and households characteristics file and subsets it to the San Francisco Bay Area census blocks.

    The segment file is read as pipe-delimited text without a header row.
    Column names are the five identifier fields followed by the TABLE_DETAIL
    values that ``table_matrix`` lists for the segment. Rows are kept when
    their LOGRECNO appears in ``bayareablocks_logrecno``.

    Args:
        segment (int): The segment number.

    Returns:
        pandas.DataFrame: Subset of the census data for the San Francisco Bay Area census blocks,
        indexed by (COUNTY, TRACT, zone_id, GEOID20). zone_id is NaN for
        blocks that have no entry in ``bayarea_blocks_x_zones``.
    """

    # path built component by component, like the geo header file path
    segment_path = os.path.join(DATA_PATH, 'ca2020.dhc', f'ca000{segment:02d}2020.dhc')

    identifiers = ["FILEID", "STUSAB", "CHARITER", "CIFSN", "LOGRECNO"]

    # get headers for segment, for csv load call
    segment_headers = table_matrix.query(
        "Segment==@segment").TABLE_DETAIL.tolist()

    segment_data = pd.read_csv(segment_path,
                               sep='|', header=None, names=identifiers + segment_headers)

    # subset to blocks; the explicit copy means the new columns below are
    # written to an independent frame rather than to a slice of segment_data
    in_bay_area = segment_data.LOGRECNO.isin(bayareablocks_logrecno.index)
    segment_data_block = segment_data.loc[in_bay_area].copy()

    # block GEOCODE looked up from the record number
    geoid = segment_data_block.LOGRECNO.map(bayareablocks_logrecno)
    segment_data_block['GEOID20'] = geoid
    # first 11 characters of the block id
    segment_data_block['TRACT'] = geoid.str.slice(0, 11)
    # first 5 characters are the county key used by bayareafips_full
    segment_data_block['COUNTY'] = geoid.str.slice(0, 5).map(bayareafips_full)

    # zone from the block/zone spatial join; unmatched blocks get NaN
    segment_data_block['zone_id'] = geoid.map(
        bayarea_blocks_x_zones.set_index('GEOID20').zone_id)

    return segment_data_block.set_index(
        ['COUNTY', 'TRACT', 'zone_id', 'GEOID20'])

## Segment 6 - population by age and others

In [51]:
p12_map = get_table_headers('P12')
p12_map

{'P0120001': 'Total:',
 'P0120002': 'Male:',
 'P0120003': 'Male: Under 5 years',
 'P0120004': 'Male: 5 to 9 years',
 'P0120005': 'Male: 10 to 14 years',
 'P0120006': 'Male: 15 to 17 years',
 'P0120007': 'Male: 18 and 19 years',
 'P0120008': 'Male: 20 years',
 'P0120009': 'Male: 21 years',
 'P0120010': 'Male: 22 to 24 years',
 'P0120011': 'Male: 25 to 29 years',
 'P0120012': 'Male: 30 to 34 years',
 'P0120013': 'Male: 35 to 39 years',
 'P0120014': 'Male: 40 to 44 years',
 'P0120015': 'Male: 45 to 49 years',
 'P0120016': 'Male: 50 to 54 years',
 'P0120017': 'Male: 55 to 59 years',
 'P0120018': 'Male: 60 and 61 years',
 'P0120019': 'Male: 62 to 64 years',
 'P0120020': 'Male: 65 and 66 years',
 'P0120021': 'Male: 67 to 69 years',
 'P0120022': 'Male: 70 to 74 years',
 'P0120023': 'Male: 75 to 79 years',
 'P0120024': 'Male: 80 to 84 years',
 'P0120025': 'Male: 85 years and over',
 'P0120026': 'Female:',
 'P0120027': 'Female: Under 5 years',
 'P0120028': 'Female: 5 to 9 years',
 'P0120029': '

In [52]:
# bespoke mapping of census tables to TM age groups
# Within each sex, P12 lists 23 age bands in ascending order. Consecutive bands
# collapse into the TM age groups below: (TM group, number of P12 bands in it).
_tm_band_counts = [('AGE0004', 1),   # under 5
                   ('AGE0519', 4),   # 5-9, 10-14, 15-17, 18-19
                   ('AGE2044', 7),   # 20, 21, 22-24, 25-29, 30-34, 35-39, 40-44
                   ('AGE4564', 5),   # 45-49, 50-54, 55-59, 60-61, 62-64
                   ('AGE65P', 6)]    # 65-66, 67-69, 70-74, 75-79, 80-84, 85+
_tm_group_per_band = [_group for _group, _n_bands in _tm_band_counts for _ in range(_n_bands)]

p12_to_TM_map = {'P0120001': 'TOTPOP'}

# The sex subtotals P0120002 ('Male:') and P0120026 ('Female:') are left out;
# male bands start at cell 3 and female bands at cell 27.
for _first_cell in (3, 27):
    for _offset, _tm_group in enumerate(_tm_group_per_band):
        p12_to_TM_map[f'P012{_first_cell + _offset:04d}'] = _tm_group

In [53]:
segment_6_data_block = segment_loader(segment=6)
segment_6_data_block.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,FILEID,STUSAB,CHARITER,CIFSN,LOGRECNO,P0100001,P0100002,P0100003,P0100004,P0100005,...,P012A040,P012A041,P012A042,P012A043,P012A044,P012A045,P012A046,P012A047,P012A048,P012A049
COUNTY,TRACT,zone_id,GEOID20,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Alameda,6001400100,1155.0,60014001001000,DHCST,CA,0,6,102452,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001001,DHCST,CA,0,6,102453,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001002,DHCST,CA,0,6,102454,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001003,DHCST,CA,0,6,102455,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001004,DHCST,CA,0,6,102456,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# rename columns to labeled categories
# Columns are selected with an explicit list of ids: indexing a DataFrame with
# a dict is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
p12_labeled = segment_6_data_block[list(p12_map)].rename(
    columns=p12_map)
p12_labeled

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total:,Male:,Male: Under 5 years,Male: 5 to 9 years,Male: 10 to 14 years,Male: 15 to 17 years,Male: 18 and 19 years,Male: 20 years,Male: 21 years,Male: 22 to 24 years,...,Female: 50 to 54 years,Female: 55 to 59 years,Female: 60 and 61 years,Female: 62 to 64 years,Female: 65 and 66 years,Female: 67 to 69 years,Female: 70 to 74 years,Female: 75 to 79 years,Female: 80 to 84 years,Female: 85 years and over
COUNTY,TRACT,zone_id,GEOID20,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Alameda,06001400100,1155.0,060014001001000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sonoma,06097154308,1403.0,060971543081015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sonoma,06097990100,,060979901000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sonoma,06097990100,,060979901000002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sonoma,06097990100,,060979901000003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# rename and recategorize age categories to TM values
# (columns selected with a list - a dict indexer is rejected by pandas >= 2.0)
p12_labeled_tm = segment_6_data_block[list(p12_to_TM_map)].rename(
    columns=p12_to_TM_map)

# collapse identically labeled columns by summing them; grouping the transposed
# frame replaces groupby(axis=1), which is deprecated in recent pandas
p12_labeled_tm = p12_labeled_tm.T.groupby(level=0).sum().T

p12_labeled_tm

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,AGE0004,AGE0519,AGE2044,AGE4564,AGE65P,TOTPOP
COUNTY,TRACT,zone_id,GEOID20,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alameda,06001400100,1155.0,060014001001000,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001001,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001002,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001003,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001004,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
Sonoma,06097154308,1403.0,060971543081015,0,0,0,0,0,0
Sonoma,06097990100,,060979901000001,0,0,0,0,0,0
Sonoma,06097990100,,060979901000002,0,0,0,0,0,0
Sonoma,06097990100,,060979901000003,0,0,0,0,0,0


## Segment 1 - household tenure by size and others

In [56]:
h12_map = get_table_headers('H12')

In [57]:
segment_1_data_block = segment_loader(segment=1)
segment_1_data_block.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,FILEID,STUSAB,CHARITER,CIFSN,LOGRECNO,H0010001,H0020001,H0020002,H0020003,H0020004,...,H012C0008,H012C0009,H012C0010,H012C0011,H012C0012,H012C0013,H012C0014,H012C0015,H012C0016,H012C0017
COUNTY,TRACT,zone_id,GEOID20,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Alameda,6001400100,1155.0,60014001001000,DHCST,CA,0,1,102452,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001001,DHCST,CA,0,1,102453,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001002,DHCST,CA,0,1,102454,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001003,DHCST,CA,0,1,102455,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001004,DHCST,CA,0,1,102456,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# rename columns to labeled categories
# Columns are selected with an explicit list of ids: indexing a DataFrame with
# a dict is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
h12_labeled = segment_1_data_block[list(h12_map)].rename(
    columns=h12_map)

In [59]:
# wide -> long: one row per (block, H12 category); 'level_4' is the name
# reset_index gives the unnamed stacked column-label level after the four index levels
h12_labeled_long = h12_labeled.stack().reset_index(name='value')

# raw strings keep the regex escapes intact (a plain '\d' is an invalid string escape)
h12_labeled_long['tenure'] = h12_labeled_long['level_4'].str.extract(r'(Owner|Renter)', expand=False)
# first digit found in the label
h12_labeled_long['hhsize'] = h12_labeled_long['level_4'].str.extract(r'(\d)', expand=False)

# keep only the categories that carry a household size; copy so later column
# assignments do not target a slice
h12_labeled_long = h12_labeled_long[h12_labeled_long['hhsize'].notna()].copy()

In [60]:
h12_labeled_long.groupby(['COUNTY','tenure','hhsize']).value.sum().unstack(2)

Unnamed: 0_level_0,hhsize,1,2,3,4,5,6,7
COUNTY,tenure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alameda,Owner,54845,93646,58688,59456,21985,9978,7882
Alameda,Renter,87433,80505,45583,37294,18356,8808,7177
Contra Costa,Owner,50132,86527,46355,48157,20770,8783,6317
Contra Costa,Renter,37078,35969,24062,21613,11648,5428,4190
Marin,Owner,15246,24305,10019,10464,3327,848,423
Marin,Renter,14612,10701,5547,4666,2221,1024,764
Napa,Owner,6780,11571,4367,4112,1984,897,731
Napa,Renter,5673,4930,2981,2673,1678,811,550
San Francisco,Owner,33210,41297,20495,17736,6873,3591,3722
San Francisco,Renter,101524,79979,31525,18622,7097,3256,2924


In [61]:
h12_labeled_long.groupby(['COUNTY','hhsize']).value.sum()

COUNTY   hhsize
Alameda  1         142278
         2         174151
         3         104271
         4          96750
         5          40341
                    ...  
Sonoma   3          28679
         4          24850
         5          11601
         6           4942
         7           3604
Name: value, Length: 63, dtype: int64

## Segment 1 - population in households

In [62]:
h8_map = get_table_headers('H8')

# rename columns to labeled categories
# Columns are selected with an explicit list of ids: indexing a DataFrame with
# a dict is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
h8_labeled = segment_1_data_block[list(h8_map)].rename(
    columns=h8_map)
h8_labeled

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total: Total
COUNTY,TRACT,zone_id,GEOID20,Unnamed: 4_level_1
Alameda,06001400100,1155.0,060014001001000,0
Alameda,06001400100,1005.0,060014001001001,0
Alameda,06001400100,1005.0,060014001001002,0
Alameda,06001400100,1005.0,060014001001003,0
Alameda,06001400100,1005.0,060014001001004,0
...,...,...,...,...
Sonoma,06097154308,1403.0,060971543081015,0
Sonoma,06097990100,,060979901000001,0
Sonoma,06097990100,,060979901000002,0
Sonoma,06097990100,,060979901000003,0


## Segment 13 - population in households
We can use household population to back out group quarters population, which is not natively available at the census block level.

In [63]:
p15_map = get_table_headers('P15')
p15_map

{'P0150001': 'Total:',
 'P0150002': 'Total: Under 18 years',
 'P0150003': 'Total: 18 years and over'}

In [64]:
segment_13_data_block = segment_loader(segment=13)
segment_13_data_block.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,FILEID,STUSAB,CHARITER,CIFSN,LOGRECNO,P012X001,P012X002,P012X003,P012X004,P012X005,...,P016A009,P016B001,P016B002,P016B003,P016B004,P016B005,P016B006,P016B007,P016B008,P016B009
COUNTY,TRACT,zone_id,GEOID20,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Alameda,6001400100,1155.0,60014001001000,DHCST,CA,0,13,102452,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001001,DHCST,CA,0,13,102453,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001002,DHCST,CA,0,13,102454,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001003,DHCST,CA,0,13,102455,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alameda,6001400100,1005.0,60014001001004,DHCST,CA,0,13,102456,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# rename columns to labeled categories
# Columns are selected with an explicit list of ids: indexing a DataFrame with
# a dict is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
p15_labeled = segment_13_data_block[list(p15_map)].rename(
    columns=p15_map)
p15_labeled

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total:,Total: Under 18 years,Total: 18 years and over
COUNTY,TRACT,zone_id,GEOID20,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alameda,06001400100,1155.0,060014001001000,0,0,0
Alameda,06001400100,1005.0,060014001001001,0,0,0
Alameda,06001400100,1005.0,060014001001002,0,0,0
Alameda,06001400100,1005.0,060014001001003,0,0,0
Alameda,06001400100,1005.0,060014001001004,0,0,0
...,...,...,...,...,...,...
Sonoma,06097154308,1403.0,060971543081015,0,0,0
Sonoma,06097990100,,060979901000001,0,0,0
Sonoma,06097990100,,060979901000002,0,0,0
Sonoma,06097990100,,060979901000003,0,0,0


## Segment 13 - household types

In [66]:
p16_map = get_table_headers('P16')
p16_map

{'P0160001': 'Total:',
 'P0160002': 'Family households:',
 'P0160003': 'Family households: Married couple family',
 'P0160004': 'Other family:',
 'P0160005': 'Other family: Male householder, no spouse present',
 'P0160006': 'Other family: Female householder, no spouse present',
 'P0160007': 'Nonfamily households:',
 'P0160008': 'Nonfamily households: Householder living alone',
 'P0160009': 'Nonfamily households: Householder not living alone'}

In [67]:
# rename columns to labeled categories
# Columns are selected with an explicit list of ids: indexing a DataFrame with
# a dict is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
p16_labeled = segment_13_data_block[list(p16_map)].rename(
    columns=p16_map)
p16_labeled

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total:,Family households:,Family households: Married couple family,Other family:,"Other family: Male householder, no spouse present","Other family: Female householder, no spouse present",Nonfamily households:,Nonfamily households: Householder living alone,Nonfamily households: Householder not living alone
COUNTY,TRACT,zone_id,GEOID20,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Alameda,06001400100,1155.0,060014001001000,0,0,0,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001001,0,0,0,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001002,0,0,0,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001003,0,0,0,0,0,0,0,0,0
Alameda,06001400100,1005.0,060014001001004,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
Sonoma,06097154308,1403.0,060971543081015,0,0,0,0,0,0,0,0,0
Sonoma,06097990100,,060979901000001,0,0,0,0,0,0,0,0,0
Sonoma,06097990100,,060979901000002,0,0,0,0,0,0,0,0,0
Sonoma,06097990100,,060979901000003,0,0,0,0,0,0,0,0,0


## Group quarter population

In [69]:
# Household population per block (P15 'Total:').
# .sum(level=...) is deprecated and removed in pandas 2.0; groupby(level=...) is
# the replacement, and sort=False keeps the groups in their original row order.
# NOTE(review): groupby drops rows whose index contains NaN by default (here,
# blocks without a zone_id) - confirm that excluding them is intended.
block_hhpop = p15_labeled['Total:'].groupby(
    level=['COUNTY', 'TRACT', 'zone_id', 'GEOID20'], sort=False).sum()

In [70]:
# Total population per block (P12 'Total:').
# .sum(level=...) is deprecated and removed in pandas 2.0; groupby(level=...) is
# the replacement, and sort=False keeps the groups in their original row order.
# NOTE(review): groupby drops rows whose index contains NaN by default (here,
# blocks without a zone_id) - confirm that excluding them is intended.
block_totpop = p12_labeled['Total:'].groupby(
    level=['COUNTY', 'TRACT', 'zone_id', 'GEOID20'], sort=False).sum()

In [71]:
block_gqpop = block_totpop - block_hhpop

In [72]:
# Group quarters population by county.
# .sum(level=...) is deprecated and removed in pandas 2.0; groupby(level=...) is the replacement.
block_gqpop.groupby(level='COUNTY', sort=False).sum()

COUNTY
Alameda          53805
Contra Costa     11255
Marin             7691
Napa              5172
San Francisco    27892
San Mateo         9352
Santa Clara      39607
Solano           11137
Sonoma            8866
Name: Total:, dtype: int64

# Sandbox

Check data files for which segments actually contain data for the Bay Area, and for which variables.

In [60]:
def check_segment_files(segments=None):
    """
    Checks a list of segment files from the Census Bureau's Census 2020 Demographics and Housing Characteristics Files
    to determine if they contain census block level data for the San Francisco Bay Area.

    Each segment is loaded with ``segment_loader``; a segment counts as having
    block level data when the loaded frame has at least one row. The segment
    number and the frame's shape are printed as each file is processed.

    Args:
        segments (iterable of int, optional): Segment numbers to check.
            Defaults to segments 1 through 44, excluding 15.

    Returns:
        segments_with_block_level_data (list): A list of segment numbers that contain block level data.
        segments_with_no_block_level_data (list): A list of segment numbers that do not contain block level data.
    """
    if segments is None:
        # NOTE(review): segment 15 is skipped, presumably because its P19
        # columns all share one data-dictionary name (see "Data dictionary
        # errors") and so cannot be used as unique column names - confirm.
        segments = [number for number in range(1, 45) if number != 15]

    segments_with_block_level_data = []
    segments_with_no_block_level_data = []

    for r in segments:
        print(r)
        segment_data = segment_loader(segment=r)

        print('\t', segment_data.shape)
        # no rows left after the Bay Area block subset -> no block level data
        if segment_data.shape[0] == 0:
            segments_with_no_block_level_data.append(r)
        else:
            segments_with_block_level_data.append(r)

    return segments_with_block_level_data, segments_with_no_block_level_data
segments_with_block_level_data, segments_with_no_block_level_data = check_segment_files()

1
	 (82210, 239)
2
	 (82210, 254)
3
	 (82210, 144)
4
	 (0, 66)
5
	 (82210, 204)
6
	 (82210, 247)
7
	 (82210, 250)
8
	 (82210, 250)
9
	 (82210, 250)
10
	 (82210, 250)
11
	 (82210, 250)
12
	 (82210, 250)
13
	 (82210, 255)
14
	 (82210, 216)
16
	 (0, 240)
17
	 (0, 94)
18
	 (0, 243)
19
	 (0, 237)
20
	 (0, 191)
21
	 (0, 214)
22
	 (0, 214)
23
	 (0, 214)
24
	 (0, 214)
25
	 (0, 214)
26
	 (0, 214)
27
	 (0, 214)
28
	 (0, 214)
29
	 (0, 214)
30
	 (0, 214)
31
	 (0, 214)
32
	 (0, 214)
33
	 (0, 214)
34
	 (0, 214)
35
	 (0, 214)
36
	 (0, 214)
37
	 (0, 250)
38
	 (0, 253)
39
	 (0, 245)
40
	 (0, 224)
41
	 (0, 194)
42
	 (0, 194)
43
	 (0, 68)
44
	 (0, 200)


In [61]:
table_segment_map[table_segment_map.SEGMENT_NUMBER.isin(segments_with_block_level_data)]

Unnamed: 0,TABLE_ID,SEGMENT_NUMBER,TOTAL_RECORDS,TABLE_SORT_ORDER
0,H1,1,1,1
1,H2,1,4,2
2,H3,1,3,3
3,H4,1,4,4
4,H4A,1,4,5
...,...,...,...,...
135,P16S,14,9,17
136,P16T,14,9,18
137,P16U,14,9,19
138,P16V,14,9,20


In [62]:
table_matrix[table_matrix.Segment.isin(segments_with_block_level_data)].TABLE_NAME.unique()

array(['TOTAL POPULATION [1]', 'URBAN AND RURAL [4]', 'RACE [8]',
       'HISPANIC OR LATINO ORIGIN [3]',
       'HISPANIC OR LATINO ORIGIN BY RACE [17]',
       'RACE (TOTAL RACES TALLIED) [7]',
       'HISPANIC OR LATINO ORIGIN BY RACE (TOTAL RACES TALLIED) [15]',
       'RACE [71]',
       'HISPANIC OR LATINO, AND NOT HISPANIC OR LATINO BY RACE [73]',
       'RACE FOR THE POPULATION 18 YEARS AND OVER [71]',
       'HISPANIC OR LATINO, AND NOT HISPANIC OR LATINO BY RACE FOR THE POPULATION 18 YEARS AND OVER [73]',
       'SEX BY AGE FOR SELECTED AGE CATEGORIES [49]',
       'SEX BY AGE FOR SELECTED AGE CATEGORIES (WHITE ALONE) [49]',
       'SEX BY AGE FOR SELECTED AGE CATEGORIES (BLACK OR AFRICAN AMERICAN ALONE) [49]',
       'SEX BY AGE FOR SELECTED AGE CATEGORIES (AMERICAN INDIAN AND ALASKA NATIVE ALONE) [49]',
       'SEX BY AGE FOR SELECTED AGE CATEGORIES (ASIAN ALONE) [49]',
       'SEX BY AGE FOR SELECTED AGE CATEGORIES (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE) [49]',
