In [None]:
# Ensure project root is on sys.path so `services` package can be imported
import sys
from pathlib import Path
# Resolve the parent directory once and reuse it for both the path entry and the log line.
_project_root = Path('..').resolve()
sys.path.insert(0, str(_project_root))
print('Added project root to sys.path:', _project_root)


# Batch Grid Interpolation and Upload
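This notebook builds interpolated grids for a batch of timestamps from `clean_measurements` — either the pending `grid_runs` slots or, in force mode, the top-N timestamps ranked by mean value. For each timestamp it generates contour GeoJSON, saves a compressed NPZ file locally under `data/processed`, and uploads the artifacts (including a JPEG preview when one is available) to blob storage using the ETL `GridBuilder` and `BlobUploader`.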

Notes:
- Reads configuration from `services/etl/config.py` which loads `.env` (so ensure `.env` in repository root contains DB and Vercel blob credentials).
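- `dry_run` from the config skips uploads when enabled. Note that the setup cell below explicitly sets `cfg.dry_run = False`; remove that override to rehearse a run without uploading.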
- Saves local NPZ files as `data/processed/<TIMESTAMP>/grid.npz`, where `<TIMESTAMP>` is formatted as `YYYYMMDDTHHMMSSZ`.


In [None]:
from __future__ import annotations

import json
from pathlib import Path
from typing import List
import gzip

import numpy as np
import pandas as pd
import sqlalchemy as sa
from dotenv import load_dotenv

# Pull DB and blob credentials from the .env file one directory above the notebook.
load_dotenv(Path('..').joinpath('.env'))

# Absolute project root, plus the folder that receives locally saved grids
# (created on first run, left untouched if it already exists).
BASE_DIR = Path('..').resolve()
PROCESSED_DIR = BASE_DIR.joinpath('data', 'processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print('Base dir:', BASE_DIR)


In [None]:
# Import ETL helper classes
from services.etl.config import load as load_config
from services.etl.grid_builder import GridBuilder, GridArtifacts
from services.etl.uploader import BlobUploader
from services.etl.db import Database
from services.etl.contours import generate_contours_geojson

# Load the ETL configuration.
cfg = load_config()
# Interactive runs of this notebook are meant to perform real uploads, so the
# dry-run flag is switched off here. Remove or comment out the assignment to
# test without uploading.
cfg.dry_run = False
print('Config loaded. dry_run =', cfg.dry_run)

# Helper objects shared by the cells below; all of them are built from `cfg`.
uploader = BlobUploader(cfg)
builder = GridBuilder(
    res_m=cfg.grid_resolution_m,
    padding_m=cfg.grid_padding_m,
)
db = Database(cfg)

# Expose the Database engine for cells that issue SQL directly.
engine = db.engine


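## Select timestamps to process
With `FORCE = True` we group `clean_measurements` by timestamp and select the `FORCE_TOP_N` (default 10) timestamps with the highest mean `value_mm`, creating `grid_runs` rows for them where needed. With `FORCE = False` we instead call `db.ensure_slots()` and process the slots returned by `db.fetch_pending_slots()`.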

In [None]:
# Force mode. When True, the `grid_runs` pending-slot logic (`db.ensure_slots()` /
# `db.fetch_pending_slots()`) is bypassed and the FORCE_TOP_N timestamps with the
# highest mean `value_mm` in `clean_measurements` are processed instead.
# Use with caution: a `grid_runs` row is inserted for every forced timestamp that
# does not already have one, and the processing loop then writes status updates
# to those rows.
FORCE = True
# Number of timestamps selected in force mode (bound to the SQL LIMIT).
FORCE_TOP_N = 10


def _get_or_create_run_for_slot(slot_ts: pd.Timestamp, conn) -> int:
    """Look up the grid_runs row for a slot, inserting a pending one when absent.

    Args:
        slot_ts: Slot timestamp matched against ``grid_runs.ts``.
        conn: Open database connection on which both statements are executed.

    Returns:
        The ``grid_runs.id`` of the row for ``slot_ts`` at the configured grid
        resolution (``cfg.grid_resolution_m``) -- either the row that already
        existed or the one just inserted with status ``'pending'``.
    """
    # Both statements bind the same (ts, res_m) pair.
    bind = {"ts": slot_ts, "res_m": cfg.grid_resolution_m}

    lookup = sa.text("SELECT id FROM grid_runs WHERE ts = :ts AND res_m = :res_m LIMIT 1")
    existing_id = conn.execute(lookup, bind).scalar()
    if existing_id is not None:
        return int(existing_id)

    # No usable row yet: create a pending run and hand back its generated id.
    insert = sa.text(
        "INSERT INTO grid_runs (ts, res_m, bbox, crs, status, created_at, updated_at) "
        "VALUES (:ts, :res_m, '[]'::jsonb, 'EPSG:3857', 'pending', NOW(), NOW()) RETURNING id"
    )
    return int(conn.execute(insert, bind).scalar())


# Build `pending_slots`: a list of (run_id, timestamp) pairs consumed by the
# processing loop further down.
if FORCE:
    print('FORCE mode enabled: fetching top', FORCE_TOP_N, 'timestamps by mean value')
    # Rank timestamps by their mean value_mm.
    # - NULLS LAST: PostgreSQL sorts NULLs first under DESC, so without it a
    #   timestamp whose values are all NULL (AVG -> NULL) would outrank real data.
    # - ts DESC tie-breaker: makes the top-N selection deterministic when several
    #   timestamps share the same mean.
    qry = sa.text('''
        SELECT ts AT TIME ZONE 'UTC' AS ts_utc
        FROM (
            SELECT ts, AVG(value_mm) as mean_mm
            FROM clean_measurements
            GROUP BY ts
            ORDER BY mean_mm DESC NULLS LAST, ts DESC
            LIMIT :limit
        ) t
        ORDER BY mean_mm DESC NULLS LAST, ts DESC
    ''')
    rows = pd.read_sql(qry, engine, params={'limit': FORCE_TOP_N})
    # Re-attach the UTC timezone to the values returned by the query.
    rows['ts_utc'] = pd.to_datetime(rows['ts_utc'], utc=True)

    # Create or find grid_runs entries for each forced timestamp so we can update DB status.
    # All lookups/inserts share one transaction opened by engine.begin().
    forced = []
    with db.engine.begin() as conn:
        for ts in rows['ts_utc'].tolist():
            run_id = _get_or_create_run_for_slot(ts, conn)
            forced.append((run_id, ts))

    pending_slots = forced
    print('Created/located', len(pending_slots), 'grid_runs entries for forced timestamps')
    for rid, ts in pending_slots:
        print(rid, ts)
else:
    # Use the DB helper to ensure slots exist and fetch pending slots to process.
    # This mirrors the ETL service behavior and ensures the SQL table `grid_runs` is updated.
    print('Ensuring slots...')
    db.ensure_slots()

    pending = db.fetch_pending_slots()
    # pending is a list of (run_id, timestamp)
    print('Found pending slots:', len(pending))
    for rid, ts in pending:
        print(rid, ts)

    # Expose ids and timestamps for the loop below
    pending_slots = pending


## Helper: load snapshot for a timestamp
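`load_clean_snapshot` gathers sensor measurements within ±`window_minutes` (5 minutes by default) of a timestamp, the same windowing approach as the single-file notebook. Note that the processing loop below loads its data through `db.load_snapshot` rather than this helper.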

In [None]:
from datetime import timedelta

def ensure_utc(value) -> pd.Timestamp:
    """Coerce ``value`` to a UTC-aware ``pd.Timestamp``.

    Timezone-naive input is labelled as UTC without shifting the clock time;
    timezone-aware input is converted to UTC.
    """
    stamp = pd.Timestamp(value)
    return stamp.tz_localize('UTC') if stamp.tzinfo is None else stamp.tz_convert('UTC')

def load_clean_snapshot(target_ts, window_minutes=5):
    """Fetch clean measurements, joined with sensor coordinates, around a timestamp.

    Args:
        target_ts: Centre of the time window; normalised to UTC via ``ensure_utc``.
        window_minutes: Half-width of the window in minutes. Rows with ``cm.ts``
            between ``target_ts - window_minutes`` and ``target_ts + window_minutes``
            (SQL ``BETWEEN``, so both ends are included) are selected.

    Returns:
        A DataFrame with the selected columns, ``ts`` parsed as UTC datetimes,
        sorted by ``sensor_id``.
    """
    center = ensure_utc(target_ts)
    half_window = pd.Timedelta(minutes=window_minutes)
    bounds = {'start': center - half_window, 'end': center + half_window}
    query = sa.text("""
        SELECT cm.sensor_id, cm.ts, cm.value_mm, cm.imputation_method,
               s.lat, s.lon
        FROM clean_measurements cm
        JOIN sensors s ON s.id = cm.sensor_id
        WHERE cm.ts BETWEEN :start AND :end
    """)
    snapshot = pd.read_sql(query, engine, params=bounds)
    snapshot['ts'] = pd.to_datetime(snapshot['ts'], utc=True)
    return snapshot.sort_values('sensor_id')


## Loop over timestamps: build + save + upload
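For each timestamp we build the grid, save a local NPZ, upload the NPZ, upload a gzipped JSON grid payload via `upload_grid_json`, upload the contour GeoJSON, upload the JPEG preview if available, update the `grids/latest.json` pointer, and record success or failure in the database.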

In [None]:
from datetime import datetime

# Helper to save NPZ
def save_npz(local_path: Path, artifacts: GridArtifacts):
    """Write a grid artifact to ``local_path`` as a compressed NPZ archive.

    The archive holds four entries: ``data`` (grid cast to float32), ``x`` and
    ``y`` (coordinate arrays cast to float64) and ``metadata`` (the artifact's
    ``metadata_json`` value as-is).
    """
    payload = {
        'data': artifacts.data_grid.astype(np.float32),
        'x': artifacts.x_coords.astype(np.float64),
        'y': artifacts.y_coords.astype(np.float64),
        'metadata': artifacts.metadata_json,
    }
    np.savez_compressed(local_path, **payload)

# Loop over pending_slots (run_id, slot): build the grid, save it locally, upload
# the artifacts and record the outcome for each slot. A failure in one slot is
# recorded and does not stop the remaining slots.
processed: list = []  # timestamp labels of slots that completed
errors: list = []     # one dict (run_id, timestamp, error) per failed slot
# Newest slot for which grids/latest.json has been written during this run.
# Slots are not guaranteed to arrive in chronological order (force mode orders
# them by mean value), so the pointer must only ever move forward in time.
# NOTE(review): this only guards ordering within this run; whether a previous
# run already published a newer pointer cannot be determined from here.
latest_pointer_slot = None
for run_id, slot in pending_slots:
    print('---')
    print('Processing slot', slot, 'id', run_id)
    try:
        snapshot = db.load_snapshot(slot)
        if snapshot.empty:
            raise ValueError('no clean measurements for slot')

        artifact = builder.build(snapshot)

        # The compact timestamp label names both the local folder and the blob key prefix.
        timestamp = slot.strftime('%Y%m%dT%H%M%SZ')
        base_key = f'grids/{timestamp}'
        local_dir = PROCESSED_DIR / timestamp
        local_dir.mkdir(parents=True, exist_ok=True)

        # Save local NPZ
        local_npz = local_dir / 'grid.npz'
        save_npz(local_npz, artifact)
        print('Saved local NPZ to', local_npz)

        if cfg.dry_run:
            # Dry run: the local NPZ is kept, nothing is uploaded, and the run is
            # marked successful with empty URLs.
            print('dry-run: would upload artifacts for slot', slot, 'grid shape', artifact.data_grid.shape)
            db.mark_success(run_id, json.dumps(list(artifact.bbox_3857)), npz_url='', json_url='', contours_url='', message='dry-run')
            processed.append(timestamp)
            continue

        # Upload NPZ using uploader.upload_npz which expects a dict of arrays
        npz_payload = {
            'data': artifact.data_grid.astype(np.float32),
            'x': artifact.x_coords.astype(np.float64),
            'y': artifact.y_coords.astype(np.float64),
            'metadata': np.array([artifact.metadata_json]),
        }
        npz_url = uploader.upload_npz(f'{base_key}/grid.npz', npz_payload)
        print('Uploaded NPZ ->', npz_url)

        # Upload grid json gzip
        grid_json_url = uploader.upload_grid_json(f'{base_key}/grid.json.gz', artifact)
        print('Uploaded grid JSON gzip ->', grid_json_url)

        # Generate and upload contours geojson
        contour_bytes = generate_contours_geojson(artifact.x_coords, artifact.y_coords, artifact.data_grid, artifact.thresholds)
        contours_url = uploader.upload_bytes(f'{base_key}/contours.geojson', contour_bytes, 'application/geo+json')
        print('Uploaded contours ->', contours_url)

        # Upload JPEG preview if available
        jpeg_url = None
        if getattr(artifact, 'jpeg_bytes', None):
            try:
                jpeg_url = uploader.upload_bytes(f'{base_key}/preview.jpg', artifact.jpeg_bytes, 'image/jpeg')
                print('Uploaded JPEG preview ->', jpeg_url)
            except Exception as jpeg_exc:
                # The preview is optional: keep processing the slot, but report
                # the failure instead of hiding it.
                print('JPEG preview upload failed (continuing without preview):', jpeg_exc)
                jpeg_url = None

        # Update grids/latest.json pointer, but only when this slot is newer than
        # the one the pointer was last moved to in this run.
        if latest_pointer_slot is None or slot > latest_pointer_slot:
            metadata = json.loads(artifact.metadata_json)
            latest_payload = {
                'timestamp': metadata['timestamp'],
                'grid_npz_url': npz_url,
                'grid_json_url': grid_json_url,
                'grid_preview_jpeg_url': jpeg_url,
                'contours_url': contours_url,
                'res_m': cfg.grid_resolution_m,
                'bbox': metadata['bbox_wgs84'],
                'intensity_classes': metadata.get('intensity_classes', []),
                'intensity_thresholds': metadata.get('intensity_thresholds', []),
            }
            latest_url = uploader.upload_json('grids/latest.json', latest_payload)
            latest_pointer_slot = slot
            print('Updated latest pointer ->', latest_url)
        else:
            print('Skipped latest pointer update: slot', slot, 'is not newer than', latest_pointer_slot)

        # Mark success in DB
        db.mark_success(run_id, json.dumps(list(artifact.bbox_3857)), npz_url=npz_url, json_url=grid_json_url, contours_url=contours_url)

        processed.append(timestamp)
        print('slot processed:', slot.isoformat(), base_key)

    except Exception as exc:
        # Record the failure for this slot and move on to the next one.
        print('slot failed:', slot, exc)
        db.mark_failure(run_id, str(exc))
        errors.append({'run_id': run_id, 'timestamp': str(slot), 'error': str(exc)})

print('Done. processed:', processed)
print('errors:', errors)


## Result
The notebook created the grids and, unless `dry_run` is set, uploaded them. Check the per-timestamp folders under `data/processed` for the saved `grid.npz` files and your Vercel blob storage for the uploaded artifacts.