In [None]:
import re
from functools import cache
from psql_utils.epsql import Engine
@cache
def engine():
    """Return the notebook-wide ``Engine`` instance, creating it on first use.

    ``functools.cache`` memoizes this zero-argument call, so the ``Engine()``
    constructor runs once per kernel and every later call returns that same
    object.
    """
    shared_engine = Engine()
    return shared_engine




In [None]:
# for table in engine().list_tables("census"):
#     if table.startswith("dec2020"):
#         print(table)


In [None]:
from import_nhgis_shapefiles import nhgis_geo_table_name
from psql_utils.epsql import get_table_name

@cache
def get_geo_geoids(year: int | str, geo_level_id: str, basis_id: str|None = None):
    """Return the sorted geoids stored in one NHGIS geography table.

    The table is the one named by
    ``nhgis_geo_table_name(year, geo_level_id, basis_id)``; its ``geoid``
    column is fetched in a single ``json_agg`` query.

    Args:
        year: Year forwarded to ``nhgis_geo_table_name``.
        geo_level_id: Geography level forwarded to ``nhgis_geo_table_name``.
        basis_id: Optional basis forwarded to ``nhgis_geo_table_name``.

    Returns:
        A list of the table's geoids in ascending order. Because the function
        is wrapped in ``functools.cache``, repeated calls with the same
        arguments return the *same* list object -- callers should treat it as
        read-only.
    """
    table_name = nhgis_geo_table_name(year, geo_level_id, basis_id)
    geoids = list(engine().execute_returning_value(f"SELECT json_agg(geoid) FROM {table_name}"))
    geoids.sort()
    return geoids

def parse_table_name(sql_table_name: str):
    """Split a census table name into its year, table and geography-level parts.

    The name is first passed through ``get_table_name`` and the result is
    matched (from its start only; the end is not anchored) against: lowercase
    letters, a run of digits (captured as the year), more lowercase
    letters / word characters, then two ``_``-separated word fields.

    Args:
        sql_table_name: Table name to parse.

    Returns:
        A dict with string values under the keys ``"year"``, ``"table_name"``
        and ``"geo_level"``. For example, a bare name ``dec2020pl_h1_tract``
        matches as year ``"2020"``, table_name ``"h1"``, geo_level ``"tract"``.

    Raises:
        ValueError: if the name does not match the expected pattern.
    """
    bare_name = get_table_name(sql_table_name)
    parsed = re.match(r"[a-z]+(\d+)[a-z]+\w*_(\w+)_(\w+)", bare_name)
    if parsed is None:
        raise ValueError(f"Invalid table name: {sql_table_name}")
    year, table_name, geo_level = parsed.groups()
    return {"year": year, "table_name": table_name, "geo_level": geo_level}

def validate_census_table(sql_table_name: str):
    """Check that a census table's geoids agree with its geography table.

    The year and geography level are parsed out of ``sql_table_name``; the
    geoids of ``census.<sql_table_name>`` are then compared with those returned
    by ``get_geo_geoids`` for that year and level.

    Exactly one status line is printed unless an exception is raised:

    * ``<name>: Invalid table name`` -- the name could not be parsed; the table
      is skipped.
    * ``<name>: <n> in census only`` -- the census table has ``n`` geoids that
      are missing from the geography table (geoids whose characters 5-6 are
      ``"99"`` are not counted).
    * ``<name>: OK`` -- no such geoids.

    Args:
        sql_table_name: Name of a table inside the ``census`` schema.

    Raises:
        Exception: if the geography table holds geoids that the census table
            lacks.
    """
    try:
        table_info = parse_table_name(sql_table_name)
    except Exception:
        # Best-effort skip of tables whose names cannot be parsed. This is
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and a long validation
        # run can be interrupted.
        print(f"{sql_table_name}: Invalid table name")
        return

    # Only set differences and their sizes are reported below, so the geoids
    # are held as sets directly instead of being sorted first.
    census_geoids = set(engine().execute_returning_value(f"SELECT json_agg(geoid) FROM census.{sql_table_name}"))
    geo_geoids = set(get_geo_geoids(table_info["year"], table_info["geo_level"]))
    #geo_geoids = sorted(engine().execute_returning_value(f"SELECT json_agg(geoid) FROM nhgis_geo_wgs84.tract_2020_tl2020"))

    # Filter out tracts starting with 99.
    # NOTE(review): assumes the tract code begins at offset 5 of the geoid
    # (2-digit state + 3-digit county prefix) -- confirm for every geo level.
    census_only = [geoid for geoid in census_geoids - geo_geoids if geoid[5:7] != "99"]
    geo_only = geo_geoids - census_geoids

    if geo_only:
        raise Exception(f"{sql_table_name}: {len(geo_only)} in geo only")
    if census_only:
        print(f"{sql_table_name}: {len(census_only)} in census only")
    else:
        print(f"{sql_table_name}: OK")

# Validate every table listed in the "census" schema, in name order.
# validate_census_table raises when a geography table has geoids the census
# table lacks, so this loop stops at the first such table.
all_census_tables = sorted(engine().list_tables("census"))
for table in all_census_tables:
    validate_census_table(table)

# Explicit checks of the dec2020pl_h1 tables at each geography level (these
# repeat work from the loop above if the tables are listed in the schema).
for h1_table in (
    "dec2020pl_h1_block",
    "dec2020pl_h1_blockgroup",
    "dec2020pl_h1_tract",
    "dec2020pl_h1_county",
):
    validate_census_table(h1_table)
#parse_table_name("dec2020pl_h1_tract")



In [None]:
# Remove pandas' cap on displayed columns so wide query results render in full.
import pandas as pd
pd.set_option('display.max_columns', None)

# Fetch the whole acs2021acs5_b05006pr_state table; as the last expression of
# the cell, the result (presumably a DataFrame) is rendered as the cell output.
state_table_sql = "SELECT * FROM census.acs2021acs5_b05006pr_state"
engine().execute_returning_df(state_table_sql)

In [None]:
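# NOTE(scratch): 21/acs/acs5.B05006PR_001E st
# (Not valid Python as written; looks like a fragment of a Census API path/variable
#  related to the acs2021acs5_b05006pr_state table -- TODO confirm or remove.)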