# Embeddings TIF → H3 Conversion (Chunked)
Convert AlphaEarth Foundation Embeddings GeoTIFF to H3 parquet format.

**Uses chunked processing to avoid memory issues.**

**Change `COUNTY` below and run all cells.**

In [33]:
# SETTINGS
COUNTY = "maricopa"  # Options: los_angeles, napa, suffolk, maricopa
H3_RES = 9              # H3 resolution (~174m hexagons)
NUM_CHUNKS = 10         # Split into 10 chunks to save memory

import rasterio
import numpy as np
import pandas as pd
import h3
import glob
import os
from tqdm import tqdm
import gc

BASE = "/home/network-lab/Desktop/EWRI"
RAW = f"{BASE}/raw/{COUNTY}/embeddings_data"
PROCESSED = f"{BASE}/processed/{COUNTY}"
os.makedirs(PROCESSED, exist_ok=True)

print(f"County: {COUNTY.upper()}")
print(f"H3 Resolution: {H3_RES}")
print(f"Chunks: {NUM_CHUNKS}")

County: MARICOPA
H3 Resolution: 9
Chunks: 10


In [34]:
# LIST TIF FILES
tif_files = sorted(glob.glob(f"{RAW}/*.tif"))

# Prefer merged file if exists, else use all tiles
merged = [f for f in tif_files if "merged" in f.lower()]
tif_to_use = merged if merged else tif_files

print(f"Found {len(tif_files)} TIF file(s)")
print(f"Using {len(tif_to_use)} file(s):")
for f in tif_to_use:
    with rasterio.open(f) as src:
        print(f"  - {os.path.basename(f)}: {src.count} bands, {src.width}x{src.height}px")

Found 5 TIF file(s)
Using 5 file(s):
  - Maricopa_embeddings_100m_2017_2024-0000000000-0000000000.tif: 512 bands, 1024x1024px
  - Maricopa_embeddings_100m_2017_2024-0000000000-0000001024.tif: 512 bands, 1024x1024px
  - Maricopa_embeddings_100m_2017_2024-0000000000-0000002048.tif: 512 bands, 507x1024px
  - Maricopa_embeddings_100m_2017_2024-0000001024-0000000000.tif: 512 bands, 1024x695px
  - Maricopa_embeddings_100m_2017_2024-0000001024-0000001024.tif: 512 bands, 1024x695px


In [35]:
# GENERATE BAND NAMES (512 bands = 64 features x 8 years: 2017-2024)
band_names = []
for year in range(2017, 2025):
    for i in range(64):
        band_names.append(f"A{i:02d}_{year}")

print(f"Band names: {band_names[:3]} ... {band_names[-3:]}")
print(f"Total: {len(band_names)} bands")

Band names: ['A00_2017', 'A01_2017', 'A02_2017'] ... ['A61_2024', 'A62_2024', 'A63_2024']
Total: 512 bands


In [36]:
# CHUNKED CONVERSION - Process in parts to save memory
# FIX: Use unique chunk names for each tile!
chunk_files = []
global_chunk_idx = 0

for tile_idx, tif_path in enumerate(tif_to_use):
    print(f"\n{'='*60}")
    print(f"TILE {tile_idx+1}/{len(tif_to_use)}: {os.path.basename(tif_path)}")
    print(f"{'='*60}")
    
    with rasterio.open(tif_path) as src:
        transform = src.transform
        height, width = src.height, src.width
        n_bands = src.count
        
        # Calculate chunk size
        chunk_size = height // NUM_CHUNKS + 1
        print(f"  Rows: {height}, Chunk size: {chunk_size}")
        
        for chunk_idx in range(NUM_CHUNKS):
            start_row = chunk_idx * chunk_size
            end_row = min(start_row + chunk_size, height)
            
            if start_row >= height:
                break
            
            records = []
            
            for row in tqdm(range(start_row, end_row), desc=f"T{tile_idx+1}C{chunk_idx+1}"):
                window = rasterio.windows.Window(0, row, width, 1)
                row_data = src.read(window=window).squeeze(1)
                
                for col in range(width):
                    lng, lat = transform * (col + 0.5, row + 0.5)
                    pixel_vals = row_data[:, col]
                    
                    if np.all(np.isnan(pixel_vals)):
                        continue
                    
                    h3_index = h3.latlng_to_cell(lat, lng, H3_RES)
                    rec = {"h3_index": h3_index}
                    for i, bname in enumerate(band_names[:n_bands]):
                        rec[bname] = float(pixel_vals[i])
                    records.append(rec)
            
            # Save chunk with UNIQUE name (tile_chunk)
            df_chunk = pd.DataFrame(records)
            chunk_file = f"{PROCESSED}/tile{tile_idx:02d}_chunk{chunk_idx:02d}.parquet"
            df_chunk.to_parquet(chunk_file, index=False)
            chunk_files.append(chunk_file)
            print(f"  ✓ {chunk_file.split('/')[-1]}: {len(records):,} records")
            
            # Free memory
            del records, df_chunk
            gc.collect()
            global_chunk_idx += 1

print(f"\n✓ All {len(chunk_files)} chunks saved!")


TILE 1/5: Maricopa_embeddings_100m_2017_2024-0000000000-0000000000.tif
  Rows: 1024, Chunk size: 103


T1C1: 100%|██████████| 103/103 [00:06<00:00, 17.06it/s]


  ✓ tile00_chunk00.parquet: 36,934 records


T1C2: 100%|██████████| 103/103 [00:06<00:00, 15.30it/s]


  ✓ tile00_chunk01.parquet: 101,734 records


T1C3: 100%|██████████| 103/103 [00:09<00:00, 10.49it/s]


  ✓ tile00_chunk02.parquet: 105,391 records


T1C4: 100%|██████████| 103/103 [00:06<00:00, 15.00it/s]


  ✓ tile00_chunk03.parquet: 105,382 records


T1C5: 100%|██████████| 103/103 [00:09<00:00, 10.47it/s]


  ✓ tile00_chunk04.parquet: 105,369 records


T1C6: 100%|██████████| 103/103 [00:06<00:00, 14.97it/s]


  ✓ tile00_chunk05.parquet: 105,388 records


T1C7: 100%|██████████| 103/103 [00:06<00:00, 14.96it/s]


  ✓ tile00_chunk06.parquet: 105,444 records


T1C8: 100%|██████████| 103/103 [00:09<00:00, 10.31it/s]


  ✓ tile00_chunk07.parquet: 105,369 records


T1C9: 100%|██████████| 103/103 [00:06<00:00, 14.92it/s]


  ✓ tile00_chunk08.parquet: 105,399 records


T1C10: 100%|██████████| 97/97 [00:06<00:00, 14.91it/s]


  ✓ tile00_chunk09.parquet: 99,231 records

TILE 2/5: Maricopa_embeddings_100m_2017_2024-0000000000-0000001024.tif
  Rows: 1024, Chunk size: 103


T2C1: 100%|██████████| 103/103 [00:06<00:00, 14.91it/s]


  ✓ tile01_chunk00.parquet: 52,052 records


T2C2: 100%|██████████| 103/103 [00:06<00:00, 16.43it/s]


  ✓ tile01_chunk01.parquet: 93,313 records


T2C3: 100%|██████████| 103/103 [00:09<00:00, 10.32it/s]


  ✓ tile01_chunk02.parquet: 105,472 records


T2C4: 100%|██████████| 103/103 [00:06<00:00, 14.91it/s]


  ✓ tile01_chunk03.parquet: 105,472 records


T2C5: 100%|██████████| 103/103 [00:09<00:00, 10.43it/s]


  ✓ tile01_chunk04.parquet: 105,472 records


T2C6: 100%|██████████| 103/103 [00:06<00:00, 15.05it/s]


  ✓ tile01_chunk05.parquet: 105,472 records


T2C7: 100%|██████████| 103/103 [00:06<00:00, 15.75it/s]


  ✓ tile01_chunk06.parquet: 98,695 records


T2C8: 100%|██████████| 103/103 [00:09<00:00, 11.19it/s]


  ✓ tile01_chunk07.parquet: 95,447 records


T2C9: 100%|██████████| 103/103 [00:06<00:00, 16.81it/s]


  ✓ tile01_chunk08.parquet: 91,022 records


T2C10: 100%|██████████| 97/97 [00:02<00:00, 37.18it/s]


  ✓ tile01_chunk09.parquet: 30,473 records

TILE 3/5: Maricopa_embeddings_100m_2017_2024-0000000000-0000002048.tif
  Rows: 1024, Chunk size: 103


T3C1: 100%|██████████| 103/103 [00:02<00:00, 49.58it/s]


  ✓ tile02_chunk00.parquet: 3,034 records


T3C2: 100%|██████████| 103/103 [00:01<00:00, 95.83it/s]


  ✓ tile02_chunk01.parquet: 8,426 records


T3C3: 100%|██████████| 103/103 [00:02<00:00, 39.10it/s]


  ✓ tile02_chunk02.parquet: 12,695 records


T3C4: 100%|██████████| 103/103 [00:01<00:00, 66.20it/s]


  ✓ tile02_chunk03.parquet: 16,505 records


T3C5: 100%|██████████| 103/103 [00:03<00:00, 26.00it/s]


  ✓ tile02_chunk04.parquet: 34,854 records


T3C6: 100%|██████████| 103/103 [00:03<00:00, 31.29it/s]


  ✓ tile02_chunk05.parquet: 47,706 records


T3C7: 100%|██████████| 103/103 [00:01<00:00, 69.57it/s]


  ✓ tile02_chunk06.parquet: 15,686 records


T3C8: 100%|██████████| 103/103 [00:01<00:00, 55.54it/s]


  ✓ tile02_chunk07.parquet: 0 records


T3C9: 100%|██████████| 103/103 [00:00<00:00, 178.17it/s]


  ✓ tile02_chunk08.parquet: 0 records


T3C10: 100%|██████████| 97/97 [00:00<00:00, 181.13it/s]


  ✓ tile02_chunk09.parquet: 0 records

TILE 4/5: Maricopa_embeddings_100m_2017_2024-0000001024-0000000000.tif
  Rows: 695, Chunk size: 70


T4C1: 100%|██████████| 70/70 [00:07<00:00,  9.08it/s]


  ✓ tile03_chunk00.parquet: 71,610 records


T4C2: 100%|██████████| 70/70 [00:04<00:00, 16.64it/s]


  ✓ tile03_chunk01.parquet: 71,610 records


T4C3: 100%|██████████| 70/70 [00:04<00:00, 16.66it/s]


  ✓ tile03_chunk02.parquet: 71,610 records


T4C4: 100%|██████████| 70/70 [00:06<00:00, 10.12it/s]


  ✓ tile03_chunk03.parquet: 71,610 records


T4C5: 100%|██████████| 70/70 [00:04<00:00, 16.52it/s]


  ✓ tile03_chunk04.parquet: 71,610 records


T4C6: 100%|██████████| 70/70 [00:04<00:00, 16.45it/s]


  ✓ tile03_chunk05.parquet: 71,610 records


T4C7: 100%|██████████| 70/70 [00:04<00:00, 16.46it/s]


  ✓ tile03_chunk06.parquet: 71,610 records


T4C8: 100%|██████████| 70/70 [00:06<00:00, 10.73it/s]


  ✓ tile03_chunk07.parquet: 71,610 records


T4C9: 100%|██████████| 70/70 [00:04<00:00, 16.70it/s]


  ✓ tile03_chunk08.parquet: 71,610 records


T4C10: 100%|██████████| 65/65 [00:03<00:00, 16.65it/s]


  ✓ tile03_chunk09.parquet: 66,326 records

TILE 5/5: Maricopa_embeddings_100m_2017_2024-0000001024-0000001024.tif
  Rows: 695, Chunk size: 70


T5C1: 100%|██████████| 70/70 [00:04<00:00, 17.47it/s]


  ✓ tile04_chunk00.parquet: 16,660 records


T5C2: 100%|██████████| 70/70 [00:01<00:00, 47.93it/s]


  ✓ tile04_chunk01.parquet: 16,632 records


T5C3: 100%|██████████| 70/70 [00:01<00:00, 47.92it/s]


  ✓ tile04_chunk02.parquet: 16,588 records


T5C4: 100%|██████████| 70/70 [00:03<00:00, 17.72it/s]


  ✓ tile04_chunk03.parquet: 16,520 records


T5C5: 100%|██████████| 70/70 [00:01<00:00, 47.90it/s]


  ✓ tile04_chunk04.parquet: 16,520 records


T5C6: 100%|██████████| 70/70 [00:01<00:00, 45.14it/s]


  ✓ tile04_chunk05.parquet: 16,520 records


T5C7: 100%|██████████| 70/70 [00:01<00:00, 47.27it/s]


  ✓ tile04_chunk06.parquet: 16,520 records


T5C8: 100%|██████████| 70/70 [00:03<00:00, 19.11it/s]


  ✓ tile04_chunk07.parquet: 16,520 records


T5C9: 100%|██████████| 70/70 [00:01<00:00, 44.95it/s]


  ✓ tile04_chunk08.parquet: 16,520 records


T5C10: 100%|██████████| 65/65 [00:01<00:00, 46.28it/s]


  ✓ tile04_chunk09.parquet: 14,952 records

✓ All 50 chunks saved!


In [37]:
# MERGE ALL CHUNKS
print(f"Loading {len(chunk_files)} chunk files...")

chunks = []
for f in tqdm(chunk_files, desc="Loading"):
    chunks.append(pd.read_parquet(f))

print("Concatenating...")
df = pd.concat(chunks, ignore_index=True)
del chunks
gc.collect()

print(f"Total records: {len(df):,}")
print("Aggregating by H3 index...")
df = df.groupby("h3_index").mean().reset_index()

# Save final output
output_path = f"{PROCESSED}/embeddings_h3.parquet"
df.to_parquet(output_path, index=False)

print(f"\n{'='*50}")
print(f"DONE: {output_path}")
print(f"{'='*50}")
print(f"H3 cells: {len(df):,}")
print(f"Columns: {len(df.columns)}")

Loading 50 chunk files...


Loading: 100%|██████████| 50/50 [00:02<00:00, 19.86it/s]


Concatenating...
Total records: 2,872,205
Aggregating by H3 index...

DONE: /home/network-lab/Desktop/EWRI/processed/maricopa/embeddings_h3.parquet
H3 cells: 197,573
Columns: 513


In [38]:
# CLEANUP CHUNK FILES
for f in chunk_files:
    if os.path.exists(f):
        os.remove(f)
print(f"✓ Deleted {len(chunk_files)} chunk files")

✓ Deleted 50 chunk files


In [39]:
# VERIFY OUTPUT
print("DATA QUALITY CHECK")
print("="*50)

print(f"\nColumns: {list(df.columns[:5])} ... {list(df.columns[-3:])}")

years = [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
print("\nData by year:")
for year in years:
    year_cols = [c for c in df.columns if c.endswith(f"_{year}")]
    if year_cols:
        valid_pct = (df[year_cols].notna().sum().sum() / df[year_cols].size) * 100
        print(f"  {year}: {len(year_cols)} bands, {valid_pct:.1f}% valid")

embed_cols = [c for c in df.columns if c != "h3_index"]
print(f"\nValue range:")
print(f"  Min: {df[embed_cols].min().min():.4f}")
print(f"  Max: {df[embed_cols].max().max():.4f}")
print(f"  Mean: {df[embed_cols].mean().mean():.4f}")

DATA QUALITY CHECK

Columns: ['h3_index', 'A00_2017', 'A01_2017', 'A02_2017', 'A03_2017'] ... ['A61_2024', 'A62_2024', 'A63_2024']

Data by year:
  2017: 64 bands, 100.0% valid
  2018: 64 bands, 100.0% valid
  2019: 64 bands, 100.0% valid
  2020: 64 bands, 100.0% valid
  2021: 64 bands, 100.0% valid
  2022: 64 bands, 100.0% valid
  2023: 64 bands, 100.0% valid
  2024: 64 bands, 100.0% valid

Value range:
  Min: -0.5260
  Max: 0.4500
  Mean: 0.0053


In [40]:
# GEOGRAPHIC COVERAGE CHECK
print("\nGEOGRAPHIC COVERAGE")
print("="*50)

lats, lngs = [], []
for h in df['h3_index'].values:
    lat, lng = h3.cell_to_latlng(h)
    lats.append(lat)
    lngs.append(lng)

print(f"  South: {min(lats):.4f}°")
print(f"  North: {max(lats):.4f}°")
print(f"  West:  {min(lngs):.4f}°")
print(f"  East:  {max(lngs):.4f}°")


GEOGRAPHIC COVERAGE
  South: 32.5034°
  North: 34.0484°
  West:  -113.3369°
  East:  -111.0395°
