In [None]:
# See https://api.census.gov/data/2020/dec/ddhca/variables.html for an HTML version of the tables

# NOTE: this notebook needs the DHC dataset (.../2020/dec/dhc/), which is what the code below fetches



In [None]:
import json, os, re, requests
from collections import defaultdict
from census_utils import canonicalize_census_table_name, canonicalize_census_column_name

In [None]:
# Name of the dataset; also used as the output sub-directory for the generated description.json
dataset_name = "census2020_block2020"
# TODO: what happens when we get multiple updates beyond just DHC?
# Do we loop?  Is there any conflict between them?  (We already see this for PL vs DHC.)


# JSON listing of every variable in the 2020 decennial DHC dataset
url = "https://api.census.gov/data/2020/dec/dhc/variables.json"


# Use a timeout so a stalled connection cannot hang the kernel forever, and
# fail loudly on an HTTP error instead of trying to parse an error page as JSON.
variables_http_response = requests.get(url, timeout=60)
variables_http_response.raise_for_status()
variables_response = variables_http_response.json()
# Dict keyed by variable name; the processing cell reads the "concept" and "label" of each entry
variables = variables_response['variables']


In [None]:
# Build a description.json for the dataset: group the API variables into tables,
# attach a human-readable label to each column, and write the result to disk.
print(f"{len(variables)} variables before filtering")

# Map table name -> list of column suffixes.  Only variables named
# <LETTERS><digits>[LETTERS]_<digits>[LETTERS] are kept; anything else is skipped.
tables = defaultdict(list)
nvariables = 0
for variable, info in variables.items():
    groups = re.match(r"^([A-Z]+\d+[A-Z]*)_(\d+[A-Z]*)$", variable)
    if groups:
        table, column = groups.groups()
        tables[table].append(column)
        nvariables += 1

print(f"{nvariables} variables in {len(tables)} tables")

table_descs = []
for table_name in sorted(tables.keys()):
    table_desc = {"name": canonicalize_census_table_name(table_name), "description": None, "columns": []}
    for column_name in sorted(tables[table_name]):
        api_info = variables[f"{table_name}_{column_name}"]
        # The table description is the "concept" shared by all of the table's columns;
        # the first column sets it and every later column must agree.
        concept = api_info["concept"]
        if table_desc["description"]:
            assert table_desc["description"] == concept, (
                f"Table {table_name}: concept {concept!r} of column {column_name} "
                f"differs from {table_desc['description']!r}"
            )
        else:
            table_desc["description"] = concept

        # Turn the API's ":!!" level separators into em-dashes and drop any leading "!!"
        reformatted_label = api_info["label"].replace(":!!", " &mdash; ")
        reformatted_label = re.sub(r"^\s*!!\s*", "", reformatted_label)
        # Remove trailing 'N' from column names
        assert column_name[-1:].lower() == 'n', (
            f"Expected column name ending in 'N', got {table_name}_{column_name}"
        )
        column_name = column_name[:-1]

        table_desc["columns"].append([canonicalize_census_column_name(column_name), reformatted_label])
    table_descs.append(table_desc)

description_json = {
    "name": dataset_name,
    "tables": table_descs,
}

dest_path = f"generated_description_jsons/{dataset_name}/description.json"
os.makedirs(os.path.dirname(dest_path), exist_ok=True)
# Use a context manager so the file is flushed and closed before we report success
with open(dest_path, "w") as dest_file:
    json.dump(description_json, dest_file, indent=2)
print(f"Created {dest_path}")

In [None]:
# Run this in a shell (not from the notebook) so that you can enter the password when prompted:
# rsync -av --keep-dirlinks generated_description_jsons/ hal15.andrew.cmu.edu:uwsgi/dotmaptiles-data/data-visualization-tools/examples/lodes/columncache