In [None]:
import io, glob, json, os, re, requests, subprocess, zipfile
import pandas as pd
from functools import cache
from psql_utils.nhgis_api import NhgisApi, DatasetExtractSpec
from utils.utils import SimpleThreadPoolExecutor
@cache
def api():
    """Return the shared NhgisApi client.

    Because of @cache, NhgisApi() is constructed on the first call only and the
    same instance is handed back on every later call.
    """
    client = NhgisApi()
    return client


## Get info for all datasets from the 1990 Census

In [None]:
datasets = api().get_datasets_metadata()
is_1990_census = datasets['group'] == '1990 Census'
selected_datasets = datasets[is_1990_census].copy()

# Per-dataset summary values, collected in the same order as the rows of selected_datasets.
ncols = []
geoms = []

for dataset_name in selected_datasets["name"]:
    metadata, tables_df = api().get_dataset_metadata(dataset_name)
    # Length of the dataset's tables frame.
    ncols.append(len(tables_df))
    # Names of the geographic levels listed in the dataset's metadata.
    geoms.append([level["name"] for level in metadata["geogLevels"]])

selected_datasets["ncols"] = ncols
selected_datasets["geoms"] = geoms

# Remove pandas' column-width cap so long cell values are displayed in full.
pd.set_option('display.max_colwidth', None)

selected_datasets

In [None]:
# Root directory (a path relative to the current working directory) for everything
# download_table writes; each table gets its own subdirectory
# {dataset_name}/{table_name}_{geographic_level} beneath it.
nhgis_data_downloads = "nhgis_data_downloads"

def download_table(dataset_name: str, table_name: str, geographic_level: str):
    """Request (once) and download the NHGIS extract for one table at one geographic level.

    Working files live under
    ``{nhgis_data_downloads}/{dataset_name}/{table_name}_{geographic_level}``:

    * ``extract_requests.json`` holds the JSON list of extract numbers requested
      for this table. It is written the first time the function runs and reused
      afterwards, so re-running does not submit a second extract request.
    * ``extract_<number>/`` is the directory handed to ``download_extract``.
    * ``extract_<number>_done`` is a marker that is checked here but never
      created here (presumably written by a later processing step; TODO confirm).

    Args:
        dataset_name: NHGIS dataset name, e.g. ``"1990_STF1"``.
        table_name: Name of the table within the dataset.
        geographic_level: Geographic level name passed to the extract request.

    Returns:
        None. Progress for extracts that were skipped is printed to stdout.
    """
    data_dir = f"{nhgis_data_downloads}/{dataset_name}/{table_name}_{geographic_level}"
    os.makedirs(data_dir, exist_ok=True)
    extract_requests_path = f"{data_dir}/extract_requests.json"

    extract_numbers: list[int]
    if os.path.exists(extract_requests_path):
        # A request was already submitted on an earlier run; reuse its extract numbers.
        with open(extract_requests_path) as requests_file:
            extract_numbers = json.load(requests_file)
    else:
        dataset_extract_spec = DatasetExtractSpec(dataset_name, [table_name], [geographic_level])
        extract_numbers = [api().request_extract(datasets=[dataset_extract_spec])]
        # Persist the extract numbers so later runs do not request the extract again.
        # The file is closed explicitly rather than left to garbage collection.
        with open(extract_requests_path, "w") as requests_file:
            requests_file.write(json.dumps(extract_numbers) + "\n")

    already_done = 0
    already_downloaded = 0
    for extract_number in extract_numbers:
        extract_done_path = f"{data_dir}/extract_{extract_number}_done"
        if os.path.exists(extract_done_path):
            already_done += 1
            continue
        extract_dir = f"{data_dir}/extract_{extract_number}"
        if os.path.exists(extract_dir):
            # NOTE(review): an existing directory is treated as a finished download;
            # whether an interrupted download can leave a partial directory depends
            # on download_extract; confirm.
            already_downloaded += 1
        else:
            api().download_extract(extract_number, extract_dir)
    if already_done:
        print(f"{table_name}: {already_done} extracts already complete")
    if already_downloaded:
        print(f"{table_name}: {already_downloaded} extracts already downloaded")


def download_dataset(dataset_name: str, geo_level: str = "block", max_workers: int = 4):
    """Download every table of one dataset by submitting download_table jobs to a pool.

    Args:
        dataset_name: NHGIS dataset name, e.g. ``"1990_STF1"``.
        geo_level: Geographic level passed to download_table for every table.
            Defaults to ``"block"``, the level that was previously hard-coded.
        max_workers: Value passed as ``max_workers`` to SimpleThreadPoolExecutor.
            Defaults to 4, the value that was previously hard-coded.

    Returns:
        None.
    """
    print(f"download_dataset({dataset_name})")

    # Only the tables frame is needed here; the dataset-level metadata is ignored.
    _, tables_df = api().get_dataset_metadata(dataset_name)
    pool = SimpleThreadPoolExecutor(max_workers=max_workers)
    for table_name in tables_df["name"]:
        pool.submit(download_table, dataset_name, table_name, geo_level)
    # NOTE(review): presumably blocks until all submitted downloads have finished;
    # confirm against SimpleThreadPoolExecutor.
    pool.shutdown()

def download_datasets(dataset_names: list[str]):
    """Download each named dataset in turn, in the order given.

    Args:
        dataset_names: Dataset names; each one is passed to download_dataset.
    """
    for name in dataset_names:
        download_dataset(name)

# Download every table of the 1990_STF1 dataset.
download_datasets(["1990_STF1"])

# Alternative (disabled): download every dataset listed in selected_datasets instead.
#download_datasets(selected_datasets["name"])