In [447]:
import pandas as pd
import numpy as np
import requests
from functools import lru_cache
import hashlib

In [448]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): later cells add columns to frames obtained by boolean filtering,
# so this silences the warning rather than removing what triggers it.
pd.options.mode.chained_assignment = None

In [449]:
# Census Data API key; passed to census_tract_api, which sends it as the `key` query parameter.
# NOTE(review): this credential is hard-coded in the notebook. Consider reading it
# from an environment variable (and rotating this value) before sharing the file.
k = "f7217cc1222e7cecb49db88b036d517681dd703b"

In [450]:
@lru_cache(maxsize=None)
def census_tract_api(year, profile, state_id, k, timeout=120):
    """Download one ACS 5-year data-profile group for every tract in a state.

    Parameters
    ----------
    year : int or str
        Survey year placed in the request path.
    profile : str
        Profile group requested through ``group(...)``, e.g. ``"DP02"``.
    state_id : str
        State code used in the ``in=state:`` clause.
    k : str
        Census API key, sent as the ``key`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        The decoded JSON loaded as-is. No row is promoted to a header here;
        columns are positional.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.

    Notes
    -----
    Results are memoised with ``lru_cache``, so all arguments must be hashable
    and repeated calls with the same arguments return the *same* DataFrame
    object. Copy it before modifying it in place.
    """
    url = f"https://api.census.gov/data/{year}/acs/acs5/profile?get=group({profile})&for=tract:*&in=state:{state_id}&key={k}"

    # bound the wait so an unresponsive server cannot hang the notebook
    get_response = requests.get(url, timeout=timeout)

    # fail loudly on an HTTP error instead of trying to parse an error page as JSON
    get_response.raise_for_status()

    # decode the JSON body and load it into a data frame
    return pd.DataFrame(get_response.json())

In [451]:
@lru_cache(maxsize=None)
def census_var_names(var, year=2020, timeout=30):
    """Look up the descriptive label of one ACS 5-year profile variable.

    Parameters
    ----------
    var : str
        Variable code, e.g. ``"DP02_0001E"``.
    year : int or str, optional
        Survey year whose variable dictionary is queried. Defaults to 2020,
        the year that was previously fixed in the URL.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    str
        The value stored under ``"label"`` in the variable's JSON document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. unknown code).
    KeyError
        If the returned document has no ``"label"`` entry.

    Notes
    -----
    Memoised with ``lru_cache``: each distinct (var, year, timeout) combination
    triggers at most one HTTP request per kernel session.
    """
    url = f"https://api.census.gov/data/{year}/acs/acs5/profile/variables/{var}.json"

    # bound the wait so an unresponsive server cannot hang the notebook
    get_response = requests.get(url, timeout=timeout)

    # fail loudly on an HTTP error instead of trying to parse an error page as JSON
    get_response.raise_for_status()

    # the label is the only field of the variable document that is needed
    return get_response.json()["label"]

def apply_var_cols(df, n_geo_cols=5):
    """Return a copy of ``df`` whose columns carry descriptive variable labels.

    The first row of ``df`` is expected to hold the variable codes. The last
    ``n_geo_cols`` entries of that row are kept verbatim as column names; every
    other code is translated to its label through ``census_var_names`` (one
    memoised lookup per code).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first row contains the variable codes.
    n_geo_cols : int, optional
        Number of trailing columns whose codes are kept as-is. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data (code row included) and renamed columns.
        ``df`` itself is not modified, so a frame that came out of an
        ``lru_cache``-wrapped loader is not altered behind the cache's back.
    """
    # read the codes positionally so the result does not depend on the row label being 0
    codes = df.iloc[0].tolist()

    # everything before the split gets a label; the tail keeps its code
    split_at = max(len(codes) - n_geo_cols, 0)
    var_codes = codes[:split_at]
    col_keep = codes[split_at:]

    # look up the label of every variable code
    var_labels = [census_var_names(var) for var in var_codes]

    # rename on a copy so the caller's frame stays untouched
    labelled = df.copy()
    labelled.columns = var_labels + col_keep

    return labelled

In [452]:
# call census api using function
# (year 2020, profile group "DP02", every tract in state "19", authenticated with key `k`)
df_api_return = census_tract_api(2020, "DP02", "19", k)

In [453]:
# df_api_return.head()

In [454]:
# name the columns after the variable labels; the variable codes stay in the first row
df_census = apply_var_cols(df_api_return)

In [455]:
df_census.head()

Unnamed: 0,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!Total households,Margin of Error!!HOUSEHOLDS BY TYPE!!Total households,Annotation of Margin of Error!!HOUSEHOLDS BY TYPE!!Total households,Percent!!HOUSEHOLDS BY TYPE!!Total households,Annotation of Percent!!HOUSEHOLDS BY TYPE!!Total households,Percent Margin of Error!!HOUSEHOLDS BY TYPE!!Total households,Annotation of Percent Margin of Error!!HOUSEHOLDS BY TYPE!!Total households,Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Married-couple household,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Married-couple household,...,Annotation of Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,Percent!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,Annotation of Percent!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,Percent Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,Annotation of Percent Margin of Error!!COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription,GEO_ID,NAME,state,county,tract
0,DP02_0001E,DP02_0001EA,DP02_0001M,DP02_0001MA,DP02_0001PE,DP02_0001PEA,DP02_0001PM,DP02_0001PMA,DP02_0002E,DP02_0002EA,...,DP02_0154MA,DP02_0154PE,DP02_0154PEA,DP02_0154PM,DP02_0154PMA,GEO_ID,NAME,state,county,tract
1,1891,,160,,1891,,-888888888,(X),1232,,...,,88.9,,4.6,,1400000US19155031601,"Census Tract 316.01, Pottawattamie County, Iowa",19,155,031601
2,1192,,197,,1192,,-888888888,(X),824,,...,,78.4,,8.0,,1400000US19155031602,"Census Tract 316.02, Pottawattamie County, Iowa",19,155,031602
3,1587,,161,,1587,,-888888888,(X),1021,,...,,86.1,,5.4,,1400000US19155031700,"Census Tract 317, Pottawattamie County, Iowa",19,155,031700
4,1339,,149,,1339,,-888888888,(X),703,,...,,82.2,,6.0,,1400000US19155031800,"Census Tract 318, Pottawattamie County, Iowa",19,155,031800


In [456]:
# Reference table of variable codes: one row per non-geo column of df_census,
# pairing the code (taken from the first data row) with the column label.
df_variable_codes = (
    df_census.iloc[:1, :-5]
    .T.reset_index()
    .rename({"index": "variable", 0: "code"}, axis=1)
)

df_variable_codes = df_variable_codes[["code", "variable"]]

# Parsed pieces of every label, kept in the same order as the table rows.
measurement = []
demographic_target = []
demographic = []

for var in df_variable_codes.variable:
    # labels look like "<measurement>!!<demographic target>!!<demographic>[!!...]"
    var_split = var.split("!!")

    # str.split always yields at least one element, so part 0 is always there
    measurement.append(var_split[0].lower())  # measurement value

    # A label with fewer parts gets an empty string rather than raising, and
    # every label appends exactly once so the lists stay aligned with the rows.
    demographic_target.append(var_split[1].lower() if len(var_split) > 1 else "")  # demographic target

    # join handles one, many, or zero remaining parts uniformly
    demographic.append(" ".join(var_split[2:]).lower())  # demographic

df_variable_codes["measurement"] = measurement
df_variable_codes["demographic_target"] = demographic_target
df_variable_codes["demographic"] = demographic

In [457]:
df_variable_codes.head()

Unnamed: 0,code,variable,measurement,demographic_target,demographic
0,DP02_0001E,Estimate!!HOUSEHOLDS BY TYPE!!Total households,estimate,households by type,total households
1,DP02_0001EA,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!To...,annotation of estimate,households by type,total households
2,DP02_0001M,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,margin of error,households by type,total households
3,DP02_0001MA,Annotation of Margin of Error!!HOUSEHOLDS BY T...,annotation of margin of error,households by type,total households
4,DP02_0001PE,Percent!!HOUSEHOLDS BY TYPE!!Total households,percent,households by type,total households


In [458]:
# promote the first row (the variable codes) to the header ...
df_census.columns = df_census.iloc[0]

# ... and drop that row from the body
df_census = df_census.iloc[1:]

# move the last five columns to the front, ahead of all the others
column_order = list(df_census.columns)
df_census = df_census[column_order[-5:] + column_order[:-5]]

In [459]:
# create geo table
# (the first five columns of df_census, selected by position)

df_census_geo = df_census.iloc[:, :5]

df_census_geo.head()

Unnamed: 0,GEO_ID,NAME,state,county,tract
1,1400000US19155031601,"Census Tract 316.01, Pottawattamie County, Iowa",19,155,31601
2,1400000US19155031602,"Census Tract 316.02, Pottawattamie County, Iowa",19,155,31602
3,1400000US19155031700,"Census Tract 317, Pottawattamie County, Iowa",19,155,31700
4,1400000US19155031800,"Census Tract 318, Pottawattamie County, Iowa",19,155,31800
5,1400000US19155031900,"Census Tract 319, Pottawattamie County, Iowa",19,155,31900


In [460]:
# create demographic target ids:
# the MD5 hex digest of each demographic target's UTF-8 text
demographic_target_ids = (
    df_variable_codes["demographic_target"]
    .map(lambda target_text: hashlib.md5(target_text.encode("utf-8")).hexdigest())
    .tolist()
)

df_variable_codes["demographic_target_id"] = demographic_target_ids

In [467]:
# create id for every demographic in a demographic target:
# the MD5 hex digest of "<demographic target> <demographic>"
demographic_ids = [
    hashlib.md5((target_text + " " + demo_text).encode("utf-8")).hexdigest()
    for target_text, demo_text in zip(
        df_variable_codes["demographic_target"], df_variable_codes["demographic"]
    )
]

df_variable_codes["demographic_id"] = demographic_ids

In [472]:
df_variable_codes.head()

Unnamed: 0,code,variable,measurement,demographic_target,demographic,demographic_target_id,demographic_id
0,DP02_0001E,Estimate!!HOUSEHOLDS BY TYPE!!Total households,estimate,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
1,DP02_0001EA,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!To...,annotation of estimate,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
2,DP02_0001M,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,margin of error,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
3,DP02_0001MA,Annotation of Margin of Error!!HOUSEHOLDS BY T...,annotation of margin of error,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
4,DP02_0001PE,Percent!!HOUSEHOLDS BY TYPE!!Total households,percent,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8


In [469]:
# create dataframe for each demographic target

# variable-code rows, keyed by demographic target id
demo_target_df_dict = {}
for target_id in df_variable_codes.demographic_target_id.unique():
    is_target = df_variable_codes["demographic_target_id"] == target_id
    demo_target_df_dict[target_id] = df_variable_codes[is_target]

# census values per demographic target: the first geo column plus that target's codes
demo_target_geo_df_dict = {
    target_id: df_census[[df_census_geo.columns[0]] + target_df.code.unique().tolist()]
    for target_id, target_df in demo_target_df_dict.items()
}

In [486]:
# demo_target_test = demo_target_df_dict["43551e2203882bcfa813b5a1f84408c3"]

# demo_target_test = demo_target_test[demo_target_test["measurement"] == "estimate"]

# demo_target_test

# break out the variable-code table by measurement
measurement_df_dict = {
    measurement_name: df_variable_codes[df_variable_codes["measurement"] == measurement_name]
    for measurement_name in df_variable_codes.measurement.unique()
}

# append geo id to each measurement
measurement_geo_df_dict = dict()

# The loop variable is deliberately not called `measurement`: that name holds
# the list of parsed measurement values built when df_variable_codes was
# created, and a for-loop target would overwrite it at module level.
for measurement_name, measurement_df in measurement_df_dict.items():
    variable_codes = measurement_df.code.unique().tolist()
    measurement_geo_df_dict[measurement_name] = df_census[ [df_census_geo.columns[0]] + variable_codes ]

In [492]:
# Convenience names for the per-measurement slices of the variable-code table.
# A KeyError here means that measurement text does not occur in df_variable_codes.
# NOTE(review): `estimate_anotations` and `margin_of_errore_variables` look
# misspelled; they are left as-is in case other cells refer to them.
estimate_variables = measurement_df_dict["estimate"]
estimate_anotations = measurement_df_dict["annotation of estimate"]
margin_of_errore_variables = measurement_df_dict["margin of error"]
margin_of_error_annotation = measurement_df_dict["annotation of margin of error"]
percent_variables = measurement_df_dict["percent"]
percent_annotations = measurement_df_dict["annotation of percent"]
percent_margin_of_error_variables = measurement_df_dict["percent margin of error"]
percent_margin_of_error_annotations = measurement_df_dict["annotation of percent margin of error"]

estimate_variables

Unnamed: 0,code,variable,measurement,demographic_target,demographic,demographic_target_id,demographic_id
0,DP02_0001E,Estimate!!HOUSEHOLDS BY TYPE!!Total households,estimate,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
8,DP02_0002E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,estimate,households by type,total households married-couple household,43551e2203882bcfa813b5a1f84408c3,729132cbd2d86abb56276288c1fa7689
16,DP02_0003E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,estimate,households by type,total households married-couple household with...,43551e2203882bcfa813b5a1f84408c3,49dd32656aec4e3e107ed6cb07b7e1a7
24,DP02_0004E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,estimate,households by type,total households cohabiting couple household,43551e2203882bcfa813b5a1f84408c3,ab9838ecf4da5ffc6a7e87823da3cfc5
32,DP02_0005E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,estimate,households by type,total households cohabiting couple household w...,43551e2203882bcfa813b5a1f84408c3,828e831e49e496641fc8711765c58e38
...,...,...,...,...,...,...,...
1192,DP02_0150E,Estimate!!ANCESTRY!!Total population!!Welsh,estimate,ancestry,total population welsh,fcec51e8d355c65c71c8e4ebce8699bb,cd51c946f47fbafd9ed9e5a6695d68df
1200,DP02_0151E,Estimate!!ANCESTRY!!Total population!!West Ind...,estimate,ancestry,total population west indian (excluding hispan...,fcec51e8d355c65c71c8e4ebce8699bb,db29291bc95ddc5a4c5129ea22147ef6
1208,DP02_0152E,Estimate!!COMPUTERS AND INTERNET USE!!Total ho...,estimate,computers and internet use,total households,bceb3c1f81e05c551b04247cbbf4998e,e589eadf35554fe19e286d6f44ccddd1
1216,DP02_0153E,Estimate!!COMPUTERS AND INTERNET USE!!Total ho...,estimate,computers and internet use,total households with a computer,bceb3c1f81e05c551b04247cbbf4998e,76e578c8c1b6f25ae4b11331c19834ab


In [488]:
measurement_geo_df_dict["estimate"]

Unnamed: 0,GEO_ID,DP02_0001E,DP02_0002E,DP02_0003E,DP02_0004E,DP02_0005E,DP02_0006E,DP02_0007E,DP02_0008E,DP02_0009E,...,DP02_0145E,DP02_0146E,DP02_0147E,DP02_0148E,DP02_0149E,DP02_0150E,DP02_0151E,DP02_0152E,DP02_0153E,DP02_0154E
1,1400000US19155031601,1891,1232,-888888888,132,-888888888,189,-888888888,146,90,...,0,0,85,7,0,21,0,1891,1737,1681
2,1400000US19155031602,1192,824,-888888888,45,-888888888,99,-888888888,61,43,...,0,9,88,0,0,36,0,1192,1075,934
3,1400000US19155031700,1587,1021,-888888888,85,-888888888,144,-888888888,87,24,...,0,55,124,22,0,0,0,1587,1455,1367
4,1400000US19155031800,1339,703,-888888888,13,-888888888,276,-888888888,228,58,...,0,0,26,11,0,25,0,1339,1200,1100
5,1400000US19155031900,1372,843,-888888888,122,-888888888,112,-888888888,91,19,...,0,0,103,0,0,14,0,1372,1308,1236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,1400000US19155030900,941,78,-888888888,60,-888888888,349,-888888888,328,95,...,0,0,13,0,0,9,0,941,678,494
893,1400000US19155031000,935,367,-888888888,100,-888888888,166,-888888888,92,38,...,0,0,26,5,0,45,0,935,824,740
894,1400000US19155031100,1084,405,-888888888,74,-888888888,314,-888888888,258,59,...,0,0,61,0,0,25,0,1084,1049,903
895,1400000US19155031200,1314,560,-888888888,49,-888888888,212,-888888888,180,69,...,0,11,108,17,0,0,0,1314,1239,1179


In [485]:
df_census_geo

Unnamed: 0,GEO_ID,NAME,state,county,tract
1,1400000US19155031601,"Census Tract 316.01, Pottawattamie County, Iowa",19,155,031601
2,1400000US19155031602,"Census Tract 316.02, Pottawattamie County, Iowa",19,155,031602
3,1400000US19155031700,"Census Tract 317, Pottawattamie County, Iowa",19,155,031700
4,1400000US19155031800,"Census Tract 318, Pottawattamie County, Iowa",19,155,031800
5,1400000US19155031900,"Census Tract 319, Pottawattamie County, Iowa",19,155,031900
...,...,...,...,...,...
892,1400000US19155030900,"Census Tract 309, Pottawattamie County, Iowa",19,155,030900
893,1400000US19155031000,"Census Tract 310, Pottawattamie County, Iowa",19,155,031000
894,1400000US19155031100,"Census Tract 311, Pottawattamie County, Iowa",19,155,031100
895,1400000US19155031200,"Census Tract 312, Pottawattamie County, Iowa",19,155,031200


In [474]:
demo_target_geo_df_dict["43551e2203882bcfa813b5a1f84408c3"]

Unnamed: 0,GEO_ID,DP02_0001E,DP02_0001EA,DP02_0001M,DP02_0001MA,DP02_0001PE,DP02_0001PEA,DP02_0001PM,DP02_0001PMA,DP02_0002E,...,DP02_0016PM,DP02_0016PMA,DP02_0017E,DP02_0017EA,DP02_0017M,DP02_0017MA,DP02_0017PE,DP02_0017PEA,DP02_0017PM,DP02_0017PMA
1,1400000US19155031601,1891,,160,,1891,,-888888888,(X),1232,...,-888888888,(X),3.19,,0.20,,-888888888,(X),-888888888,(X)
2,1400000US19155031602,1192,,197,,1192,,-888888888,(X),824,...,-888888888,(X),2.49,,0.20,,-888888888,(X),-888888888,(X)
3,1400000US19155031700,1587,,161,,1587,,-888888888,(X),1021,...,-888888888,(X),3.21,,0.27,,-888888888,(X),-888888888,(X)
4,1400000US19155031800,1339,,149,,1339,,-888888888,(X),703,...,-888888888,(X),2.93,,0.24,,-888888888,(X),-888888888,(X)
5,1400000US19155031900,1372,,117,,1372,,-888888888,(X),843,...,-888888888,(X),3.20,,0.25,,-888888888,(X),-888888888,(X)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,1400000US19155030900,941,,143,,941,,-888888888,(X),78,...,-888888888,(X),3.26,,0.68,,-888888888,(X),-888888888,(X)
893,1400000US19155031000,935,,108,,935,,-888888888,(X),367,...,-888888888,(X),2.69,,0.24,,-888888888,(X),-888888888,(X)
894,1400000US19155031100,1084,,177,,1084,,-888888888,(X),405,...,-888888888,(X),3.13,,0.27,,-888888888,(X),-888888888,(X)
895,1400000US19155031200,1314,,114,,1314,,-888888888,(X),560,...,-888888888,(X),3.35,,0.25,,-888888888,(X),-888888888,(X)


In [463]:
# create id for every demographic in a demographic target:
# each per-target frame gets a "demographic_id" column holding the MD5 hex
# digest of "<demographic target> <demographic>"
for demo_target_id, demo_target_df in demo_target_df_dict.items():
    demo_target_df["demographic_id"] = [
        hashlib.md5((target_text + " " + demo_text).encode("utf-8")).hexdigest()
        for demo_text, target_text in zip(
            demo_target_df.demographic, demo_target_df.demographic_target
        )
    ]

In [464]:
demo_target_df_dict["43551e2203882bcfa813b5a1f84408c3"].head()

Unnamed: 0,code,variable,measurement,demographic_target,demographic,demographic_target_id,demographic_id
0,DP02_0001E,Estimate!!HOUSEHOLDS BY TYPE!!Total households,estimate,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
1,DP02_0001EA,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!To...,annotation of estimate,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
2,DP02_0001M,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,margin of error,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
3,DP02_0001MA,Annotation of Margin of Error!!HOUSEHOLDS BY T...,annotation of margin of error,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8
4,DP02_0001PE,Percent!!HOUSEHOLDS BY TYPE!!Total households,percent,households by type,total households,43551e2203882bcfa813b5a1f84408c3,44e920fd50fbc189b315aa67243c13f8


In [465]:
df_variable_codes

Unnamed: 0,code,variable,measurement,demographic_target,demographic,demographic_target_id
0,DP02_0001E,Estimate!!HOUSEHOLDS BY TYPE!!Total households,estimate,households by type,total households,43551e2203882bcfa813b5a1f84408c3
1,DP02_0001EA,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!To...,annotation of estimate,households by type,total households,43551e2203882bcfa813b5a1f84408c3
2,DP02_0001M,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,margin of error,households by type,total households,43551e2203882bcfa813b5a1f84408c3
3,DP02_0001MA,Annotation of Margin of Error!!HOUSEHOLDS BY T...,annotation of margin of error,households by type,total households,43551e2203882bcfa813b5a1f84408c3
4,DP02_0001PE,Percent!!HOUSEHOLDS BY TYPE!!Total households,percent,households by type,total households,43551e2203882bcfa813b5a1f84408c3
...,...,...,...,...,...,...
1227,DP02_0154MA,Annotation of Margin of Error!!COMPUTERS AND I...,annotation of margin of error,computers and internet use,total households with a broadband internet sub...,bceb3c1f81e05c551b04247cbbf4998e
1228,DP02_0154PE,Percent!!COMPUTERS AND INTERNET USE!!Total hou...,percent,computers and internet use,total households with a broadband internet sub...,bceb3c1f81e05c551b04247cbbf4998e
1229,DP02_0154PEA,Annotation of Percent!!COMPUTERS AND INTERNET ...,annotation of percent,computers and internet use,total households with a broadband internet sub...,bceb3c1f81e05c551b04247cbbf4998e
1230,DP02_0154PM,Percent Margin of Error!!COMPUTERS AND INTERNE...,percent margin of error,computers and internet use,total households with a broadband internet sub...,bceb3c1f81e05c551b04247cbbf4998e


In [424]:
# create reference table for variable codes

# flip one demographic target's table so that every variable code becomes a column
test_df = demo_target_df_dict["43551e2203882bcfa813b5a1f84408c3"].transpose().reset_index()

# the first row now holds the codes: use it as the header ...
test_df.columns = test_df.iloc[0]

# ... drop it from the body, renumber the rows, and call the attribute column "type"
test_df = (
    test_df.iloc[1:]
    .reset_index()
    .drop(columns="index")
    .rename(columns={"code": "type"})
)

test_df #.columns #.reset_index()

Unnamed: 0,type,DP02_0001E,DP02_0001EA,DP02_0001M,DP02_0001MA,DP02_0001PE,DP02_0001PEA,DP02_0001PM,DP02_0001PMA,DP02_0002E,...,DP02_0016PM,DP02_0016PMA,DP02_0017E,DP02_0017EA,DP02_0017M,DP02_0017MA,DP02_0017PE,DP02_0017PEA,DP02_0017PM,DP02_0017PMA
0,variable,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!To...,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,Annotation of Margin of Error!!HOUSEHOLDS BY T...,Percent!!HOUSEHOLDS BY TYPE!!Total households,Annotation of Percent!!HOUSEHOLDS BY TYPE!!Tot...,Percent Margin of Error!!HOUSEHOLDS BY TYPE!!T...,Annotation of Percent Margin of Error!!HOUSEHO...,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,...,Percent Margin of Error!!HOUSEHOLDS BY TYPE!!T...,Annotation of Percent Margin of Error!!HOUSEHO...,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!To...,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,Annotation of Margin of Error!!HOUSEHOLDS BY T...,Percent!!HOUSEHOLDS BY TYPE!!Total households!...,Annotation of Percent!!HOUSEHOLDS BY TYPE!!Tot...,Percent Margin of Error!!HOUSEHOLDS BY TYPE!!T...,Annotation of Percent Margin of Error!!HOUSEHO...
1,measurement,estimate,annotation of estimate,margin of error,annotation of margin of error,percent,annotation of percent,percent margin of error,annotation of percent margin of error,estimate,...,percent margin of error,annotation of percent margin of error,estimate,annotation of estimate,margin of error,annotation of margin of error,percent,annotation of percent,percent margin of error,annotation of percent margin of error
2,demographic_target,households by type,households by type,households by type,households by type,households by type,households by type,households by type,households by type,households by type,...,households by type,households by type,households by type,households by type,households by type,households by type,households by type,households by type,households by type,households by type
3,demographic,total households,total households,total households,total households,total households,total households,total households,total households,total households married-couple household,...,total households average household size,total households average household size,total households average family size,total households average family size,total households average family size,total households average family size,total households average family size,total households average family size,total households average family size,total households average family size
4,demographic_target_id,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,...,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3
5,demographic_id,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,729132cbd2d86abb56276288c1fa7689,...,e29605bfd8eba5a37612b7fdfb24a3a3,e29605bfd8eba5a37612b7fdfb24a3a3,6906a5f8bdeae1618a19aa12117dca96,6906a5f8bdeae1618a19aa12117dca96,6906a5f8bdeae1618a19aa12117dca96,6906a5f8bdeae1618a19aa12117dca96,6906a5f8bdeae1618a19aa12117dca96,6906a5f8bdeae1618a19aa12117dca96,6906a5f8bdeae1618a19aa12117dca96,6906a5f8bdeae1618a19aa12117dca96


In [445]:
# stack every per-target table, then flip it so each variable code becomes a column
variable_code_df = pd.concat(list(demo_target_df_dict.values())).transpose().reset_index()

# the first row now holds the codes: use it as the header ...
variable_code_df.columns = variable_code_df.iloc[0]

# ... drop it from the body, renumber the rows, and call the attribute column "type"
variable_code_df = (
    variable_code_df.iloc[1:]
    .reset_index()
    .drop(columns="index")
    .rename(columns={"code": "type"})
)

variable_code_df #.drop_duplicates()

# variable_code_df

Unnamed: 0,type,DP02_0001E,DP02_0001EA,DP02_0001M,DP02_0001MA,DP02_0001PE,DP02_0001PEA,DP02_0001PM,DP02_0001PMA,DP02_0002E,...,DP02_0153PM,DP02_0153PMA,DP02_0154E,DP02_0154EA,DP02_0154M,DP02_0154MA,DP02_0154PE,DP02_0154PEA,DP02_0154PM,DP02_0154PMA
0,variable,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Annotation of Estimate!!HOUSEHOLDS BY TYPE!!To...,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,Annotation of Margin of Error!!HOUSEHOLDS BY T...,Percent!!HOUSEHOLDS BY TYPE!!Total households,Annotation of Percent!!HOUSEHOLDS BY TYPE!!Tot...,Percent Margin of Error!!HOUSEHOLDS BY TYPE!!T...,Annotation of Percent Margin of Error!!HOUSEHO...,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,...,Percent Margin of Error!!COMPUTERS AND INTERNE...,Annotation of Percent Margin of Error!!COMPUTE...,Estimate!!COMPUTERS AND INTERNET USE!!Total ho...,Annotation of Estimate!!COMPUTERS AND INTERNET...,Margin of Error!!COMPUTERS AND INTERNET USE!!T...,Annotation of Margin of Error!!COMPUTERS AND I...,Percent!!COMPUTERS AND INTERNET USE!!Total hou...,Annotation of Percent!!COMPUTERS AND INTERNET ...,Percent Margin of Error!!COMPUTERS AND INTERNE...,Annotation of Percent Margin of Error!!COMPUTE...
1,measurement,estimate,annotation of estimate,margin of error,annotation of margin of error,percent,annotation of percent,percent margin of error,annotation of percent margin of error,estimate,...,percent margin of error,annotation of percent margin of error,estimate,annotation of estimate,margin of error,annotation of margin of error,percent,annotation of percent,percent margin of error,annotation of percent margin of error
2,demographic_target,households by type,households by type,households by type,households by type,households by type,households by type,households by type,households by type,households by type,...,computers and internet use,computers and internet use,computers and internet use,computers and internet use,computers and internet use,computers and internet use,computers and internet use,computers and internet use,computers and internet use,computers and internet use
3,demographic,total households,total households,total households,total households,total households,total households,total households,total households,total households married-couple household,...,total households with a computer,total households with a computer,total households with a broadband internet sub...,total households with a broadband internet sub...,total households with a broadband internet sub...,total households with a broadband internet sub...,total households with a broadband internet sub...,total households with a broadband internet sub...,total households with a broadband internet sub...,total households with a broadband internet sub...
4,demographic_target_id,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,43551e2203882bcfa813b5a1f84408c3,...,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e,bceb3c1f81e05c551b04247cbbf4998e
5,demographic_id,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,44e920fd50fbc189b315aa67243c13f8,729132cbd2d86abb56276288c1fa7689,...,76e578c8c1b6f25ae4b11331c19834ab,76e578c8c1b6f25ae4b11331c19834ab,2c8ba9d62282dd5f4a52a3e39244e6b6,2c8ba9d62282dd5f4a52a3e39244e6b6,2c8ba9d62282dd5f4a52a3e39244e6b6,2c8ba9d62282dd5f4a52a3e39244e6b6,2c8ba9d62282dd5f4a52a3e39244e6b6,2c8ba9d62282dd5f4a52a3e39244e6b6,2c8ba9d62282dd5f4a52a3e39244e6b6,2c8ba9d62282dd5f4a52a3e39244e6b6


In [367]:
demo_target_geo_df_dict["43551e2203882bcfa813b5a1f84408c3"].head()

Unnamed: 0,GEO_ID,DP02_0001E,DP02_0001EA,DP02_0001M,DP02_0001MA,DP02_0001PE,DP02_0001PEA,DP02_0001PM,DP02_0001PMA,DP02_0002E,...,DP02_0016PM,DP02_0016PMA,DP02_0017E,DP02_0017EA,DP02_0017M,DP02_0017MA,DP02_0017PE,DP02_0017PEA,DP02_0017PM,DP02_0017PMA
1,1400000US19155031601,1891,,160,,1891,,-888888888,(X),1232,...,-888888888,(X),3.19,,0.2,,-888888888,(X),-888888888,(X)
2,1400000US19155031602,1192,,197,,1192,,-888888888,(X),824,...,-888888888,(X),2.49,,0.2,,-888888888,(X),-888888888,(X)
3,1400000US19155031700,1587,,161,,1587,,-888888888,(X),1021,...,-888888888,(X),3.21,,0.27,,-888888888,(X),-888888888,(X)
4,1400000US19155031800,1339,,149,,1339,,-888888888,(X),703,...,-888888888,(X),2.93,,0.24,,-888888888,(X),-888888888,(X)
5,1400000US19155031900,1372,,117,,1372,,-888888888,(X),843,...,-888888888,(X),3.2,,0.25,,-888888888,(X),-888888888,(X)


In [201]:
# two-column slice for a quick look: the tract GEO_ID and the DP02_0001E values
df_census_test = df_census[["GEO_ID", "DP02_0001E"]]

df_census_test

Unnamed: 0,GEO_ID,DP02_0001E
1,1400000US19155031601,1891
2,1400000US19155031602,1192
3,1400000US19155031700,1587
4,1400000US19155031800,1339
5,1400000US19155031900,1372
...,...,...
892,1400000US19155030900,941
893,1400000US19155031000,935
894,1400000US19155031100,1084
895,1400000US19155031200,1314


In [203]:
# report how many distinct demographic targets and demographics were parsed
for count_label, count_column in (("demo_target:", "demographic_target"), ("demo:", "demographic")):
    print(count_label, len(df_variable_codes[count_column].unique()))

demo_target: 17
demo: 152


In [205]:
df_variable_codes[df_variable_codes["demographic_target"] == "households by type"]

array(['total households', 'total households married-couple household',
       'total households married-couple household with children of the householder under 18 years',
       'total households cohabiting couple household',
       'total households cohabiting couple household with children of the householder under 18 years',
       'total households male householder, no spouse/partner present',
       'total households male householder, no spouse/partner present with children of the householder under 18 years',
       'total households male householder, no spouse/partner present householder living alone',
       'total households male householder, no spouse/partner present householder living alone 65 years and over',
       'total households female householder, no spouse/partner present',
       'total households female householder, no spouse/partner present with children of the householder under 18 years',
       'total households female householder, no spouse/partner present house

In [None]:
## create measurement df
# for every code: capture the measurement, demo target & demographic
#
# Collects the distinct measurement / demographic-target / demographic texts
# found in "!!"-delimited column labels.
# NOTE(review): these three names are also used for the per-row lists built
# when df_variable_codes is created; running this cell rebinds them to sets.

measurement = set()
demographic_target = set()
demographic = set()

# Walk the column labels directly. Pairing them with the row index through
# zip() stops at the shorter of the two, which silently drops columns whenever
# the frame has fewer rows than columns.
for col in df_census.columns[:-5]:
    col_split = col.split("!!")

    # str.split never returns an empty list, so check for the three parts that
    # are indexed below; anything shorter is not a full label and is skipped.
    if len(col_split) < 3:
        continue

    measurement.add(col_split[0].lower())  # measurement value
    demographic_target.add(col_split[1].lower())  # demographic target

    # join handles one or many remaining parts uniformly
    demographic.add(" ".join(col_split[2:]).lower())  # demographic

In [None]:
# replace header with first row
# NOTE(review): this cell repeats the header-promotion cell earlier in the
# notebook. It is not idempotent: each run turns the current first row into the
# header and drops it, and rotates the last five columns to the front again.
df_census.columns = df_census.iloc[0]

df_census = df_census[1:]

# re order headers
df_census = df_census[ df_census.columns.tolist()[-5:] + df_census.columns.tolist()[:-5] ]

In [13]:
# create categories from column names
category_dict = {"measurement": [], "profile": [], "population": [], "type": []}

# Every label is split on "!!":
#   part 0      -> measurement
#   part 1      -> profile
#   part 2      -> population
#   parts 3...  -> type, kept as a list (["Total"] when nothing is left)

# iterate through columns created from a variable code (all but the last five)
for c in df_new.columns[:-5]:
    col_split = c.split("!!")

    category_dict["measurement"].append(col_split[0])
    category_dict["profile"].append(col_split[1])
    category_dict["population"].append(col_split[2])

    # an empty remainder is falsy, so it falls back to the ["Total"] placeholder
    category_dict["type"].append(col_split[3:] or ["Total"])

In [14]:
# create unique IDs
def unique_id(dict, dict_k):
    """Number the distinct values stored under one key of a mapping.

    Parameters
    ----------
    dict : mapping
        Mapping whose entries are iterables of hashable values. (The parameter
        name shadows the builtin; it is kept so existing callers still work.)
    dict_k : hashable
        Key of the entry whose values are numbered.

    Returns
    -------
    dict
        ``{1: first distinct value, 2: second distinct value, ...}``.

    Notes
    -----
    Ids follow first-appearance order. Numbering by iterating over a ``set``
    would give string values a different id from one interpreter session to
    the next, which defeats the purpose of an id.
    """
    seen = set()
    col_dict = {}
    for val in dict[dict_k]:
        if val in seen:
            continue
        seen.add(val)
        # ids start at 1 and grow with every new distinct value
        col_dict[len(col_dict) + 1] = val
    return col_dict

In [15]:
# id -> value lookup tables for the distinct measurement and profile strings
measurement_dict = unique_id(category_dict, "measurement")
profile_dict = unique_id(category_dict, "profile")

In [16]:
# profile_dict

In [17]:
# Build an id -> value table (ids start at 1) from category_dict["type"].
# NOTE(review): the cell that fills category_dict["type"] appends lists
# (col_split[3:], or ["Total"] when empty), while the values stored here are
# x[0] strings. If every x is such a list, the first two branches can never
# match, and the result is simply the distinct first elements in first-seen
# order. Confirm how "Total" was meant to be handled.
type_dict = {}
i = 1
for x in category_dict["type"]:
    # skip an entry that is already stored as a value
    if x in list(type_dict.values()):
        continue
    # store an entry equal to the string "total" as-is
    elif x == "total":
        type_dict[i] = x
        i+=1
    # skip an entry whose first element is already stored
    elif x[0] in type_dict.values():
        continue
    # otherwise register the first element under the next id
    else:
       type_dict[i] = x[0]
       i+=1

# type_dict

In [18]:
# iterate through columns
# parse column names
# pair column to each dictionary based on values

### add this to the above loop under "# create categories from column names"

In [19]:
# reverse lookup: first key whose value equals `val`
def get_key(var_dict, val):
    """Return the first key of ``var_dict`` (in iteration order) mapped to ``val``."""
    for key in var_dict:
        if var_dict[key] == val:
            return key