# Inspect Overture divisions GeoParquet files

This notebook scans the `theme=divisions` release folder for GeoParquet files, reports their schema, and shows a handful of sample rows using DuckDB. Each subdirectory is sampled once because the files within share the same structure.

In [1]:
from pathlib import Path
from collections import defaultdict

import duckdb
from IPython.display import display

repo_root = Path.cwd().resolve().parents[1]
base_path = repo_root / 'gis_data' / 'overturemaps-us-west-2' / 'release' / '2025-08-20.1' / 'theme=divisions'
print(f'Base directory: {base_path}')

parquet_files = sorted(base_path.rglob('*.parquet'))
if not parquet_files:
    raise FileNotFoundError('No GeoParquet files found under the divisions theme directory.')

files_by_dir = defaultdict(list)
for file_path in parquet_files:
    files_by_dir[file_path.parent].append(file_path)

print(f'Found {len(parquet_files)} parquet files across {len(files_by_dir)} directories.')
for directory, files in sorted(files_by_dir.items()):
    rel_dir = directory.relative_to(base_path)
    print(f'{rel_dir}: {len(files)} file(s)')


Base directory: /workspace/gis_data/overturemaps-us-west-2/release/2025-08-20.1/theme=divisions
Found 6 parquet files across 3 directories.
type=division: 1 file(s)
type=division_area: 4 file(s)
type=division_boundary: 1 file(s)


In [None]:
con = duckdb.connect(database=':memory:')

for directory, files in sorted(files_by_dir.items()):
    sample_file = files[0]
    rel_dir = directory.relative_to(base_path)
    print(f"\n=== {rel_dir} ===")
    print(f"Sample file: {sample_file.name}")
    schema_df = con.execute(
        "DESCRIBE SELECT * FROM read_parquet(?)", [str(sample_file)]
    ).fetchdf()
    display(schema_df)
    sample_rows_df = con.execute(
        "SELECT * FROM read_parquet(?) LIMIT 5", [str(sample_file)]
    ).fetchdf()
    display(sample_rows_df)


In [2]:
division_path = base_path / 'type=division'
country_pattern = str(division_path / '*.parquet')
results_path = (repo_root / 'data' / 'results')
results_path.mkdir(parents=True, exist_ok=True)
countries_output = results_path / 'countries.parquet'

con = duckdb.connect(database=':memory:')
country_df = con.execute(
    "SELECT * FROM read_parquet(?) WHERE subtype = 'country' ORDER BY id",
    [country_pattern]
).fetchdf()
print(f'Retrieved {len(country_df)} division records with subtype=country.')
country_df.to_parquet(countries_output, index=False)
print(f'Saved results to {countries_output}')
display(country_df.head())
con.close()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Retrieved 219 division records with subtype=country.
Saved results to /workspace/data/results/countries.parquet


Unnamed: 0,id,geometry,bbox,country,version,sources,cartography,subtype,class,names,...,perspectives,local_type,hierarchies,parent_division_id,norms,population,capital_division_ids,capital_of_divisions,theme,type
0,006a49e8-ea13-49f7-af64-8ba7d7851649,"[0, 0, 0, 0, 1, 64, 34, 204, 222, 227, 79, 198...","{'xmin': 9.400137901306152, 'xmax': 9.40013885...",TN,1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,country,,"{'primary': 'تونس', 'common': {'hy': 'Թունիս',...",...,,{'en': 'country'},[[{'division_id': '006a49e8-ea13-49f7-af64-8ba...,,{'driving_side': 'right'},,[af3a25f5-8c3e-40a4-9b9b-abfb37cac09e],,divisions,division
1,04b8a6ab-a5a1-45fe-a569-1d296454f583,"[0, 0, 0, 0, 1, 192, 46, 250, 231, 92, 155, 11...","{'xmin': -15.490046501159668, 'xmax': -15.4900...",GM,1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,country,,"{'primary': 'Gambia', 'common': {'hy': 'Գամբիա...",...,,{'en': 'country'},[[{'division_id': '04b8a6ab-a5a1-45fe-a569-1d2...,,{'driving_side': 'right'},,[8d8f5445-8488-4f94-a608-cfb40317c0db],,divisions,division
2,051da74f-6039-42fb-943f-3774707222d8,"[0, 0, 0, 0, 1, 64, 64, 27, 187, 113, 90, 182,...","{'xmin': 32.21665573120117, 'xmax': 32.2166595...",UG,1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,country,,"{'primary': 'Uganda', 'common': {'hy': 'Ուգանդ...",...,,{'en': 'country'},[[{'division_id': '051da74f-6039-42fb-943f-377...,,{'driving_side': 'left'},,[ae7f1dd6-21fc-4643-a5f6-38780b2b7ca2],,divisions,division
3,05661c9d-68f5-4a26-a653-05f6ef959b50,"[0, 0, 0, 0, 1, 64, 84, 255, 255, 214, 121, 24...","{'xmin': 83.9999771118164, 'xmax': 83.99999237...",NP,1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,country,,"{'primary': 'नेपाल', 'common': {'hy': 'Նեպալ',...",...,,{'en': 'country'},[[{'division_id': '05661c9d-68f5-4a26-a653-05f...,,{'driving_side': 'left'},,[80848502-b347-463c-a683-aeccdd7696c7],,divisions,division
4,08dcf896-627e-46e8-9439-3d905390b7c3,"[0, 0, 0, 0, 1, 64, 22, 137, 139, 227, 206, 11...","{'xmin': 5.634322643280029, 'xmax': 5.63432359...",NL,1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,country,,"{'primary': 'Nederland', 'common': {'hy': 'Նիդ...",...,,{'en': 'country'},[[{'division_id': '08dcf896-627e-46e8-9439-3d9...,,{'driving_side': 'right'},,[29be7bc9-5783-4240-87d6-fefa0a64e0b3],,divisions,division


In [3]:
import geopandas as gpd
import folium
from folium.features import GeoJsonTooltip
from IPython.display import display

countries_gdf = gpd.read_parquet(countries_output)
if countries_gdf.crs is None or countries_gdf.crs.to_epsg() != 4326:
    countries_gdf = countries_gdf.to_crs('EPSG:4326')

m = folium.Map(location=[0, 0], zoom_start=2)
folium.GeoJson(
    countries_gdf,
    tooltip=GeoJsonTooltip(fields=['country'], aliases=['Country:']),
).add_to(m)
display(m)


ValueError: Missing geo metadata in Parquet/Feather file.
            Use pandas.read_parquet/read_feather() instead.