# Inspect Overture buildings GeoParquet files

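This notebook targets the `theme=buildings/type=building` folder of the Overture Maps `2025-08-20.1` release. It lists the GeoParquet files found there, prints the file-level key/value metadata of one sample file with PyArrow, and uses DuckDB to report the schema and preview a few rows. All files share the same structure, so a single sample per directory is sufficient.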

In [4]:
from pathlib import Path
from collections import defaultdict

import duckdb
from IPython.display import display

# The repository root is taken to be two levels above the current working directory.
repo_root = Path.cwd().resolve().parents[1]
base_path = repo_root.joinpath(
    'gis_data',
    'overturemaps-us-west-2',
    'release',
    '2025-08-20.1',
    'theme=buildings',
    'type=building',
)
print(f'Base directory: {base_path}')

# Recursively collect every parquet file below the base directory, in sorted order.
parquet_files = sorted(base_path.rglob('*.parquet'))
if len(parquet_files) == 0:
    raise FileNotFoundError('No GeoParquet files found in theme=buildings/type=building.')

# Group the files by the directory that directly contains them.
files_by_dir = defaultdict(list)
for file_path in parquet_files:
    files_by_dir[file_path.parent].append(file_path)

print(f'Found {len(parquet_files)} parquet files across {len(files_by_dir)} directories.')
# Directory keys are unique, so sorting the keys gives the same order as sorting the items.
for directory in sorted(files_by_dir):
    files = files_by_dir[directory]
    rel_dir = directory.relative_to(base_path)
    print(f'{rel_dir}: {len(files)} file(s)')


Base directory: /workspace/gis_data/overturemaps-us-west-2/release/2025-08-20.1/theme=buildings/type=building
Found 237 parquet files across 1 directories.
.: 237 file(s)


In [5]:
import json
from pyarrow import parquet as pq

# The first discovered parquet file serves as the sample for metadata inspection.
sample_file_for_metadata = parquet_files[0]
print(f"Inspecting metadata for: {sample_file_for_metadata.name}")

pq_file = pq.ParquetFile(sample_file_for_metadata)
# Fall back to an empty mapping when the file carries no key/value metadata.
raw_key_value_metadata = pq_file.metadata.metadata
key_value_metadata = raw_key_value_metadata if raw_key_value_metadata else {}

def _decode_if_bytes(value):
    return value.decode('utf-8', 'replace') if isinstance(value, (bytes, bytearray)) else value

# Decode keys and values to text so the whole mapping can be dumped as JSON.
decoded_metadata = {}
for raw_key, raw_value in key_value_metadata.items():
    decoded_metadata[_decode_if_bytes(raw_key)] = _decode_if_bytes(raw_value)
print(json.dumps(decoded_metadata, indent=2))

# When a 'geo' entry exists, parse it as JSON and look up the bounding box
# stored under columns[primary_column]['bbox'].
geo_metadata = decoded_metadata.get('geo')
if geo_metadata:
    try:
        geo_json = json.loads(geo_metadata)
    except json.JSONDecodeError:
        print('Unable to parse geo metadata as JSON.')
    else:
        primary_column = geo_json.get('primary_column')
        columns = geo_json.get('columns', {})
        # An empty or missing 'columns' mapping yields no geometry info.
        geometry_info = columns.get(primary_column, {}) if columns else {}
        bbox = geometry_info.get('bbox')
        if bbox:
            print('Geometry bounding box:', bbox)


Inspecting metadata for: part-00000-c8f91a12-b93f-4285-9103-a7779634c4ce-c000.zstd.parquet
{
  "org.apache.spark.sql.parquet.row.metadata": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geometry\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"bbox\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"xmin\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"xmax\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ymin\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ymax\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"version\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sources\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"property\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dataset\",\"type\"

In [6]:
# In-memory DuckDB connection used to query the parquet files in place.
con = duckdb.connect(database=':memory:')

# Directory keys are unique, so sorting the keys gives the same order as sorting the items.
for directory in sorted(files_by_dir):
    files = files_by_dir[directory]
    # One sample file per directory: the first in the group.
    sample_file = files[0]
    sample_path = str(sample_file)

    rel_dir = directory.relative_to(base_path)
    rel_label = str(rel_dir)
    print(f"\n=== {rel_label} ===")
    print(f"Sample file: {sample_file.name}")

    # Column names and types as reported by DuckDB for the sample file.
    schema_df = con.execute(
        "DESCRIBE SELECT * FROM read_parquet(?)", [sample_path]
    ).fetchdf()
    display(schema_df)

    # First five rows of the sample file.
    sample_rows_df = con.execute(
        "SELECT * FROM read_parquet(?) LIMIT 5", [sample_path]
    ).fetchdf()
    display(sample_rows_df)

    # Print the 'sources' values in full, one row per line.
    print('Sources column sample:')
    for idx, value in sample_rows_df['sources'].items():
        print(f"Row {idx}: {value}")



=== . ===
Sample file: part-00000-c8f91a12-b93f-4285-9103-a7779634c4ce-c000.zstd.parquet


Unnamed: 0,column_name,column_type,null,key,default,extra
0,id,VARCHAR,YES,,,
1,geometry,BLOB,YES,,,
2,bbox,"STRUCT(xmin FLOAT, xmax FLOAT, ymin FLOAT, yma...",YES,,,
3,version,INTEGER,YES,,,
4,sources,"STRUCT(property VARCHAR, dataset VARCHAR, reco...",YES,,,
5,level,INTEGER,YES,,,
6,subtype,VARCHAR,YES,,,
7,class,VARCHAR,YES,,,
8,height,DOUBLE,YES,,,
9,names,"STRUCT(""primary"" VARCHAR, common MAP(VARCHAR, ...",YES,,,


Unnamed: 0,id,geometry,bbox,version,sources,level,subtype,class,height,names,...,facade_color,facade_material,roof_material,roof_shape,roof_direction,roof_orientation,roof_color,roof_height,theme,type
0,68c64c74-0704-4030-8d79-bb30b20fd032,"[0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 20, 192, ...","{'xmin': -167.40013122558594, 'xmax': -167.399...",1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,,,,"{'primary': 'Marlene Weather Station', 'common...",...,,,,,,,,,buildings,building
1,76c4a544-9ad3-4da0-be7d-22892dcbeb58,"[0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 6, 192, 1...","{'xmin': -179.96853637695312, 'xmax': -179.963...",1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,religious,,,"{'primary': 'St. Paul's Burgers', 'common': No...",...,,,,,,,,,buildings,building
2,34adb9da-7e11-4cc9-b5a6-53ae5518188b,"[0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 5, 192, 9...","{'xmin': -138.461181640625, 'xmax': -138.45275...",1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,,,,,...,,,,,,,,,buildings,building
3,4fe39e6c-ca77-4b7a-84ac-95deb3f7eed4,"[0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 5, 192, 9...","{'xmin': -136.80308532714844, 'xmax': -136.801...",1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,,,,,...,,,,,,,,,buildings,building
4,0f5f2fa8-7810-4a97-87b4-6d3021bee377,"[0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 5, 192, 9...","{'xmin': -136.8033447265625, 'xmax': -136.8023...",1,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,,,,,...,,,,,,,,,buildings,building


Sources column sample:
Row 0: [{'property': '', 'dataset': 'OpenStreetMap', 'record_id': 'w325830447@3', 'update_time': '2015-02-01T01:46:06.000Z', 'confidence': None, 'between': None}]
Row 1: [{'property': '', 'dataset': 'OpenStreetMap', 'record_id': 'w1417351285@1', 'update_time': '2025-07-25T18:07:43.000Z', 'confidence': None, 'between': None}]
Row 2: [{'property': '', 'dataset': 'OpenStreetMap', 'record_id': 'w1380639824@1', 'update_time': '2025-04-24T10:41:55.000Z', 'confidence': None, 'between': None}]
Row 3: [{'property': '', 'dataset': 'OpenStreetMap', 'record_id': 'w1250098696@1', 'update_time': '2024-02-14T01:23:09.000Z', 'confidence': None, 'between': None}]
Row 4: [{'property': '', 'dataset': 'OpenStreetMap', 'record_id': 'w1250098698@1', 'update_time': '2024-02-14T01:23:09.000Z', 'confidence': None, 'between': None}]
