# Dataset Preparation for Recovery Modeling (DFGCN-style)
This notebook converts your CSV inputs into model-ready tensors for ST-GNN / Temporal Transformer recovery forecasting.

**Inputs:** shelters.csv, hospitals.csv, schools.csv, vulnerability_grid.csv
**Optional:** weather_ensemble.csv, recovery_labels.csv

**Outputs (saved to OUT_DIR):** nodes.csv, edges.csv, X_static.npy, edge_index.npy, edge_weight.npy, X_dynamic.npy (if weather provided), Y.npy (if labels provided), node_ids.json


In [1]:
!pip -q install pandas numpy scikit-learn

In [2]:
import os, json
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree, NearestNeighbors


In [2]:
pip install geopandas fiona pyogrio


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import geopandas as gpd
import pyogrio
import os

GDB_PATH = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\vulnerability\\SVI2022_FLORIDA_tract.gdb"
OUT_CSV  = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\vulnerability\\vulnerability_grid.csv"

# List layers using pyogrio (this is the key fix)
layers = pyogrio.list_layers(GDB_PATH)
print("Layers found:")
for l in layers:
    print(l)

# Usually the first layer contains tract-level SVI data
layer_name = layers[0][0]
print("Using layer:", layer_name)

# Read the layer
gdf = gpd.read_file(GDB_PATH, layer=layer_name)

print("Rows:", len(gdf))
print("Columns:", list(gdf.columns))

# Drop geometry (we only need attributes for modeling)
df = gdf.drop(columns="geometry")

# Save to CSV
os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)
df.to_csv(OUT_CSV, index=False)

print("Saved:", OUT_CSV)


Layers found:
['SVI2022_FLORIDA_tract' 'MultiPolygon']
Using layer: SVI2022_FLORIDA_tract
Rows: 5122
Columns: ['ST', 'STATE', 'ST_ABBR', 'STCNTY', 'COUNTY', 'FIPS', 'LOCATION', 'AREA_SQMI', 'E_TOTPOP', 'M_TOTPOP', 'E_HU', 'M_HU', 'E_HH', 'M_HH', 'E_POV150', 'M_POV150', 'E_UNEMP', 'M_UNEMP', 'E_HBURD', 'M_HBURD', 'E_NOHSDP', 'M_NOHSDP', 'E_UNINSUR', 'M_UNINSUR', 'E_AGE65', 'M_AGE65', 'E_AGE17', 'M_AGE17', 'E_DISABL', 'M_DISABL', 'E_SNGPNT', 'M_SNGPNT', 'E_LIMENG', 'M_LIMENG', 'E_MINRTY', 'M_MINRTY', 'E_MUNIT', 'M_MUNIT', 'E_MOBILE', 'M_MOBILE', 'E_CROWD', 'M_CROWD', 'E_NOVEH', 'M_NOVEH', 'E_GROUPQ', 'M_GROUPQ', 'EP_POV150', 'MP_POV150', 'EP_UNEMP', 'MP_UNEMP', 'EP_HBURD', 'MP_HBURD', 'EP_NOHSDP', 'MP_NOHSDP', 'EP_UNINSUR', 'MP_UNINSUR', 'EP_AGE65', 'MP_AGE65', 'EP_AGE17', 'MP_AGE17', 'EP_DISABL', 'MP_DISABL', 'EP_SNGPNT', 'MP_SNGPNT', 'EP_LIMENG', 'MP_LIMENG', 'EP_MINRTY', 'MP_MINRTY', 'EP_MUNIT', 'MP_MUNIT', 'EP_MOBILE', 'MP_MOBILE', 'EP_CROWD', 'MP_CROWD', 'EP_NOVEH', 'MP_NOVEH', 'EP_

In [7]:
# ==============================
# SVI -> vulnerability_grid.csv -> node-level SVI features (direct code)
# Works with: data/raw/vulnerability/vulnerability_grid.csv
# Outputs:
#   1) data/processed/vulnerability_grid_clean.csv
#   2) (optional) merges into your nodes dataframe as node_svi_features
# ==============================

import os
import numpy as np
import pandas as pd

# ---- paths ----
VULN_GRID_CSV_IN  = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\vulnerability\\vulnerability_grid.csv"      # produced from .gdb using pyogrio
VULN_GRID_CSV_OUT = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\vulnerability\\vulnerability_grid_clean.csv"

# If you already have nodes with lat/lon (recommended), set this:
# Example: "recovery_dataset_out/nodes.csv" or "data/processed/nodes.csv"
NODES_CSV = None  # e.g., "recovery_dataset_out/nodes.csv"
NODES_OUT = None  # e.g., "recovery_dataset_out/nodes_with_svi.csv"

# ---- chosen SVI features (standard, reviewer-safe) ----
SVI_FEATURES = ["RPL_THEME1", "RPL_THEME2", "RPL_THEME3", "RPL_THEME4", "RPL_THEMES"]

# ---- helper: robust numeric conversion ----
def to_num(series: pd.Series) -> pd.Series:
    return pd.to_numeric(series, errors="coerce")

# ---- 1) Load and clean vulnerability grid ----
vuln = pd.read_csv(VULN_GRID_CSV_IN, dtype=str)

# Required identifier columns vary slightly by release; we only strictly need FIPS
if "FIPS" not in vuln.columns:
    raise ValueError("Expected 'FIPS' column in vulnerability_grid.csv. Please open the CSV and check column names.")

# Convert SVI columns to numeric and clean
for c in SVI_FEATURES:
    if c not in vuln.columns:
        raise ValueError(f"Missing SVI column '{c}'. Available columns include: {list(vuln.columns)[:30]} ...")
    vuln[c] = to_num(vuln[c])

# Drop rows with missing key SVI values
vuln_clean = vuln[["FIPS"] + SVI_FEATURES].dropna(subset=SVI_FEATURES).copy()

# Clip to [0,1] just in case (RPL_* are percentiles, should already be 0..1)
for c in SVI_FEATURES:
    vuln_clean[c] = vuln_clean[c].clip(0.0, 1.0)

# Save cleaned vulnerability grid
os.makedirs(os.path.dirname(VULN_GRID_CSV_OUT), exist_ok=True)
vuln_clean.to_csv(VULN_GRID_CSV_OUT, index=False)
print("✅ Saved cleaned vulnerability grid:", VULN_GRID_CSV_OUT)
print("Rows:", len(vuln_clean), "| Feature columns:", SVI_FEATURES)

# ---- 2) OPTIONAL: merge SVI into your nodes table (if you have tract FIPS per node)
# This requires your nodes.csv to have either:
#   - a column named 'FIPS', OR
#   - a column named 'tract_fips', OR
#   - any column you can rename to 'FIPS'
#
# If you do NOT have tract FIPS in nodes yet, you need a spatial join:
#   node (lat/lon) -> tract polygon -> tract FIPS
# That step requires a tract boundary shapefile/geojson. Tell me if you want that.
# ----
if NODES_CSV is not None and NODES_OUT is not None:
    nodes = pd.read_csv(NODES_CSV)

    # Find a plausible join key in nodes
    join_key = None
    for cand in ["FIPS", "tract_fips", "TRACT_FIPS", "GEOID", "geoid"]:
        if cand in nodes.columns:
            join_key = cand
            break

    if join_key is None:
        raise ValueError(
            "Nodes CSV does not contain a tract id column (FIPS/GEOID). "
            "Add tract FIPS to nodes first (spatial join), then re-run this merge."
        )

    # Normalize to string 11-digit tract FIPS where possible
    nodes["__FIPS__"] = nodes[join_key].astype(str).str.replace(r"\.0$", "", regex=True).str.zfill(11)
    vuln_clean["__FIPS__"] = vuln_clean["FIPS"].astype(str).str.replace(r"\.0$", "", regex=True).str.zfill(11)

    merged = nodes.merge(vuln_clean[["__FIPS__"] + SVI_FEATURES], on="__FIPS__", how="left")

    # If any missing, fill with median (reasonable default)
    for c in SVI_FEATURES:
        if merged[c].isna().any():
            med = float(np.nanmedian(merged[c].values))
            merged[c] = merged[c].fillna(med)

    merged = merged.drop(columns=["__FIPS__"])

    os.makedirs(os.path.dirname(NODES_OUT), exist_ok=True)
    merged.to_csv(NODES_OUT, index=False)
    print("✅ Saved nodes with SVI features:", NODES_OUT)
    print("Merged node rows:", len(merged))

# ---- 3) Quick tensor export (optional): save SVI feature matrix for GNN
# If you have a nodes_with_svi.csv already and want an X_static.npy:
#   set NODES_WITH_SVI_CSV and run this block
# ----
NODES_WITH_SVI_CSV = None  # e.g., "recovery_dataset_out/nodes_with_svi.csv"
X_STATIC_OUT = None        # e.g., "recovery_dataset_out/X_svi.npy"

if NODES_WITH_SVI_CSV is not None and X_STATIC_OUT is not None:
    ndf = pd.read_csv(NODES_WITH_SVI_CSV)
    X_svi = ndf[SVI_FEATURES].astype(float).to_numpy(dtype=np.float32)
    os.makedirs(os.path.dirname(X_STATIC_OUT), exist_ok=True)
    np.save(X_STATIC_OUT, X_svi)
    print("✅ Saved SVI feature matrix:", X_STATIC_OUT, "| shape:", X_svi.shape)


✅ Saved cleaned vulnerability grid: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\vulnerability\vulnerability_grid_clean.csv
Rows: 5122 | Feature columns: ['RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4', 'RPL_THEMES']


## 1) Set paths and output folder

In [9]:
import os
import pandas as pd

weather_df = None
if WEATHER_ENSEMBLE_CSV and os.path.exists(WEATHER_ENSEMBLE_CSV):
    weather_df = pd.read_csv(WEATHER_ENSEMBLE_CSV)

labels_df = None
if RECOVERY_LABELS_CSV and os.path.exists(RECOVERY_LABELS_CSV):
    labels_df = pd.read_csv(RECOVERY_LABELS_CSV)


In [11]:
import numpy as np, pandas as pd, os

OUT = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\weather_ensemble.csv"
T = 24          # time steps
N = 200         # nodes (set to your node count)
E = 10          # ensemble members

rows = []
for t in range(T):
    for node_id in range(N):
        base_wind = 10 + 20*np.sin(t/6) + np.random.randn()*1.0
        base_prec = max(0, 5*np.cos(t/4) + np.random.randn()*0.5)
        base_mslp = 1010 - 20*np.sin(t/8) + np.random.randn()*0.8
        for ens in range(E):
            rows.append([t, node_id, ens,
                         base_wind + np.random.randn()*2,
                         max(0, base_prec + np.random.randn()*0.4),
                         base_mslp + np.random.randn()*1.5])

df = pd.DataFrame(rows, columns=["t","node_id","ens","wind10m","precip","mslp"])
os.makedirs(os.path.dirname(OUT), exist_ok=True)
df.to_csv(OUT, index=False)
print("Saved:", OUT, "shape:", df.shape)


Saved: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\weather_ensemble.csv shape: (48000, 6)


In [16]:
import os
import pandas as pd
import geopandas as gpd
import pyogrio

# ---- Paths (edit if needed) ----
SHELTERS_CSV  = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\facilities\\shelters.csv"
HOSPITALS_CSV = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\facilities\\hospitals.csv"
SCHOOLS_CSV   = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\facilities\\schools.csv"

GDB_PATH = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\vulnerability\\SVI2022_FLORIDA_tract.gdb"
OUT_FAC_SVI = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\processed\\facility_svi.csv"

SVI_FEATURES = ["RPL_THEME1","RPL_THEME2","RPL_THEME3","RPL_THEME4","RPL_THEMES"]

def detect_latlon(df):
    candidates = [
        ("latitude","longitude"),
        ("lat","lon"),
        ("LAT","LON"),
        ("Latitude","Longitude"),
        ("Y","X"),
        ("y","x")
    ]
    for a,b in candidates:
        if a in df.columns and b in df.columns:
            return a,b
    raise ValueError(f"Could not detect lat/lon columns. Columns: {list(df.columns)}")

def load_facilities(path, facility_type):
    df = pd.read_csv(path)
    lat_col, lon_col = detect_latlon(df)
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df[lon_col], df[lat_col]),
        crs="EPSG:4326"
    )
    gdf["facility_type"] = facility_type
    # create a stable id
    gdf["facility_id"] = facility_type + "_" + gdf.index.astype(str)
    return gdf

# Load facilities
fac = pd.concat([
    load_facilities(SHELTERS_CSV, "shelter"),
    load_facilities(HOSPITALS_CSV, "hospital"),
    load_facilities(SCHOOLS_CSV, "school")
], ignore_index=True)

# Load SVI tract polygons from GDB
layers = pyogrio.list_layers(GDB_PATH)
layer_name = layers[0][0]
print("Using SVI layer:", layer_name)

tracts = gpd.read_file(GDB_PATH, layer=layer_name).to_crs("EPSG:4326")

# Ensure SVI columns exist
missing = [c for c in SVI_FEATURES if c not in tracts.columns]
if missing:
    raise ValueError(f"Missing SVI columns in tract data: {missing}")

# Spatial join facility points -> tracts
joined = gpd.sjoin(
    fac,
    tracts[SVI_FEATURES + ["geometry"]],
    how="left",
    predicate="within"
)

# Clean / fill missing SVI
for c in SVI_FEATURES:
    joined[c] = pd.to_numeric(joined[c], errors="coerce")
    joined[c] = joined[c].fillna(joined[c].median()).clip(0, 1)

# Save facility-level SVI
out = joined.drop(columns="geometry")[["facility_id","facility_type"] + SVI_FEATURES].copy()
os.makedirs(os.path.dirname(OUT_FAC_SVI), exist_ok=True)
out.to_csv(OUT_FAC_SVI, index=False)

print("✅ Saved:", OUT_FAC_SVI, "shape:", out.shape)
out.head()


Using SVI layer: SVI2022_FLORIDA_tract
✅ Saved: C:\Users\Adrija\Downloads\DFGCN\data\processed\facility_svi.csv shape: (7710, 7)


Unnamed: 0,facility_id,facility_type,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,RPL_THEMES
0,shelter_0,shelter,0.1097,0.6734,0.5022,0.3905,0.2994
1,shelter_1,shelter,0.5787,0.0008,0.5254,0.8065,0.3341
2,shelter_2,shelter,0.0574,0.5521,0.3959,0.5821,0.2783
3,shelter_3,shelter,0.5118,0.3832,0.5423,0.8684,0.6544
4,shelter_4,shelter,0.0339,0.0357,0.3909,0.2744,0.0519


In [17]:
import os
import numpy as np
import pandas as pd

WEA = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\data\\raw\\weather_ensemble.csv"            # must exist if using this option
FAC_SVI = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\processed\\facility_svi.csv"
OUT = "C:\\Users\\Adrija\\Downloads\\DFGCN\\data\\raw\\recovery_labels.csv"

fac = pd.read_csv(FAC_SVI)[["facility_id","facility_type","RPL_THEMES"]]
w = pd.read_csv(WEA)

# If weather is node-based, we don't have facility->node mapping.
# For demo: aggregate to time only (statewide mean), then apply facility vulnerability.
w_t = w.groupby("t", as_index=False)[["wind10m","precip","mslp"]].mean()

# Normalize hazard
wind_norm = (w_t["wind10m"] - w_t["wind10m"].min()) / (w_t["wind10m"].max() - w_t["wind10m"].min() + 1e-9)
prec_norm = (w_t["precip"] - w_t["precip"].min()) / (w_t["precip"].max() - w_t["precip"].min() + 1e-9)
damage_t = 0.6*wind_norm + 0.4*prec_norm

rows = []
T = len(w_t)
for _, r in fac.iterrows():
    vuln = float(np.clip(r["RPL_THEMES"], 0, 1))
    for i, t in enumerate(w_t["t"].values):
        time_factor = i / (T - 1 + 1e-9)
        base = 1.0 - (0.7*damage_t.iloc[i] + 0.3*vuln)
        recovery = np.clip(base + 0.6*time_factor*(1.0 - vuln), 0, 1)
        rows.append([t, r["facility_id"], r["facility_type"], recovery])

labels = pd.DataFrame(rows, columns=["t","facility_id","facility_type","recovery_index"])
os.makedirs(os.path.dirname(OUT), exist_ok=True)
labels.to_csv(OUT, index=False)
print("✅ Saved:", OUT, labels.shape)
labels.head()


✅ Saved: C:\Users\Adrija\Downloads\DFGCN\data\raw\recovery_labels.csv (185040, 4)


Unnamed: 0,t,facility_id,facility_type,recovery_index
0,0,shelter_0,shelter,0.464446
1,1,shelter_0,shelter,0.457115
2,2,shelter_0,shelter,0.456879
3,3,shelter_0,shelter,0.475324
4,4,shelter_0,shelter,0.516928


In [19]:
import os

# --------- Use ONE base folder to avoid duplicated paths ----------
BASE = r"C:\Users\Adrija\Downloads\DFGCN"

SHELTERS_CSV  = os.path.join(BASE, r"data\raw\facilities\shelters.csv")
HOSPITALS_CSV = os.path.join(BASE, r"data\raw\facilities\hospitals.csv")
SCHOOLS_CSV   = os.path.join(BASE, r"data\raw\facilities\schools.csv")

VULN_GRID_CSV = os.path.join(BASE, r"data\raw\vulnerability\vulnerability_grid.csv")

# Optional (only if the file exists)
WEATHER_ENSEMBLE_CSV = os.path.join(BASE, r"data\raw\weather_ensemble.csv")   # t,node_id,ens,...
RECOVERY_LABELS_CSV  = os.path.join(BASE, r"data\raw\recovery_labels.csv")    # depends on your labeling format

# Output directory (MUST be defined)
OUT_DIR = os.path.join(BASE, "recovery_dataset_out")
os.makedirs(OUT_DIR, exist_ok=True)
print("OUT_DIR:", OUT_DIR)

# Safety checks (recommended)
print("Shelters exists:", os.path.exists(SHELTERS_CSV))
print("Hospitals exists:", os.path.exists(HOSPITALS_CSV))
print("Schools exists:", os.path.exists(SCHOOLS_CSV))
print("Vuln exists:", os.path.exists(VULN_GRID_CSV))
print("Weather exists:", os.path.exists(WEATHER_ENSEMBLE_CSV))
print("Labels exists:", os.path.exists(RECOVERY_LABELS_CSV))


OUT_DIR: C:\Users\Adrija\Downloads\DFGCN\recovery_dataset_out
Shelters exists: False
Hospitals exists: False
Schools exists: False
Vuln exists: False
Weather exists: False
Labels exists: True


In [21]:
import os
import glob

BASE = r"C:\Users\Adrija\Downloads\DFGCN"

def find_one(pattern):
    hits = glob.glob(os.path.join(BASE, "**", pattern), recursive=True)
    if not hits:
        print("NOT FOUND:", pattern)
        return None
    # pick shortest path (usually the correct one)
    hits = sorted(hits, key=len)
    print("FOUND:", pattern, "->", hits[0])
    return hits[0]

SHELTERS_CSV  = find_one("shelters.csv")
HOSPITALS_CSV = find_one("hospitals.csv")
SCHOOLS_CSV   = find_one("schools.csv")

# vulnerability file (you have both)
VULN_GRID_CSV = find_one("vulnerability_grid_clean.csv") or find_one("vulnerability_grid.csv")

WEATHER_ENSEMBLE_CSV = find_one("weather_ensemble.csv")  # optional

# you already have this
RECOVERY_LABELS_CSV = find_one("recovery_labels.csv")

OUT_DIR = os.path.join(BASE, "recovery_dataset_out")
os.makedirs(OUT_DIR, exist_ok=True)

print("\n--- Final Paths ---")
print("SHELTERS_CSV:", SHELTERS_CSV)
print("HOSPITALS_CSV:", HOSPITALS_CSV)
print("SCHOOLS_CSV:", SCHOOLS_CSV)
print("VULN_GRID_CSV:", VULN_GRID_CSV)
print("WEATHER_ENSEMBLE_CSV:", WEATHER_ENSEMBLE_CSV)
print("RECOVERY_LABELS_CSV:", RECOVERY_LABELS_CSV)
print("OUT_DIR:", OUT_DIR)


FOUND: shelters.csv -> C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\shelters.csv
FOUND: hospitals.csv -> C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\hospitals.csv
FOUND: schools.csv -> C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\schools.csv
FOUND: vulnerability_grid_clean.csv -> C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\vulnerability\vulnerability_grid_clean.csv
FOUND: weather_ensemble.csv -> C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\weather_ensemble.csv
FOUND: recovery_labels.csv -> C:\Users\Adrija\Downloads\DFGCN\data\raw\recovery_labels.csv

--- Final Paths ---
SHELTERS_CSV: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\shelters.csv
HOSPITALS_CSV: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\hospitals.csv
SCHOOLS_CSV: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\schools.csv
VULN_GRID_CSV: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\vulnerability\vulnerability_grid_c

In [22]:
import pandas as pd
labels = pd.read_csv(RECOVERY_LABELS_CSV)
print(labels.columns.tolist())
print(labels.head())


['t', 'facility_id', 'facility_type', 'recovery_index']
   t facility_id facility_type  recovery_index
0  0   shelter_0       shelter        0.464446
1  1   shelter_0       shelter        0.457115
2  2   shelter_0       shelter        0.456879
3  3   shelter_0       shelter        0.475324
4  4   shelter_0       shelter        0.516928


## 2) Load & standardize CSVs

In [23]:
import os, glob

BASE = r"C:\Users\Adrija\Downloads\DFGCN"

def find_one(filename):
    hits = glob.glob(os.path.join(BASE, "**", filename), recursive=True)
    if not hits:
        raise FileNotFoundError(f"Could not find {filename} anywhere under {BASE}")
    hits = sorted(hits, key=len)
    return hits[0]

SHELTERS_CSV  = find_one("shelters.csv")
HOSPITALS_CSV = find_one("hospitals.csv")
SCHOOLS_CSV   = find_one("schools.csv")

# vulnerability: choose clean if present
try:
    VULN_GRID_CSV = find_one("vulnerability_grid_clean.csv")
except FileNotFoundError:
    VULN_GRID_CSV = find_one("vulnerability_grid.csv")

print("SHELTERS_CSV:", SHELTERS_CSV)
print("HOSPITALS_CSV:", HOSPITALS_CSV)
print("SCHOOLS_CSV:", SCHOOLS_CSV)
print("VULN_GRID_CSV:", VULN_GRID_CSV)

# sanity check
for p in [SHELTERS_CSV, HOSPITALS_CSV, SCHOOLS_CSV, VULN_GRID_CSV]:
    print(os.path.exists(p), p)


SHELTERS_CSV: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\shelters.csv
HOSPITALS_CSV: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\hospitals.csv
SCHOOLS_CSV: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\schools.csv
VULN_GRID_CSV: C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\vulnerability\vulnerability_grid_clean.csv
True C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\shelters.csv
True C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\hospitals.csv
True C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\facilities\schools.csv
True C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\vulnerability\vulnerability_grid_clean.csv


In [26]:
import pandas as pd

def pick_latlon_cols(df):
    candidates = [
        ("lat","lon"),
        ("LAT","LON"),
        ("latitude","longitude"),
        ("Latitude","Longitude"),
        ("LATITUDE","LONGITUDE"),
        ("y","x"),
        ("Y","X"),
    ]
    for a,b in candidates:
        if a in df.columns and b in df.columns:
            return a,b
    raise ValueError(f"Could not detect lat/lon columns. Available columns: {list(df.columns)}")

def standardize_facility(df: pd.DataFrame, kind: str) -> pd.DataFrame:
    df = df.copy()
    df["kind"] = kind
    if "id" not in df.columns:
        df["id"] = [f"{kind}_{i}" for i in range(len(df))]

    lat_col, lon_col = pick_latlon_cols(df)
    df["lat"] = pd.to_numeric(df[lat_col], errors="coerce")
    df["lon"] = pd.to_numeric(df[lon_col], errors="coerce")

    return df.dropna(subset=["lat","lon"]).reset_index(drop=True)

def standardize_grid(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "cell_id" not in df.columns:
        df["cell_id"] = [f"cell_{i}" for i in range(len(df))]

    lat_col, lon_col = pick_latlon_cols(df)
    df["lat"] = pd.to_numeric(df[lat_col], errors="coerce")
    df["lon"] = pd.to_numeric(df[lon_col], errors="coerce")

    return df.dropna(subset=["lat","lon"]).reset_index(drop=True)

shelters  = standardize_facility(pd.read_csv(SHELTERS_CSV), "shelter")
hospitals = standardize_facility(pd.read_csv(HOSPITALS_CSV), "hospital")
schools   = standardize_facility(pd.read_csv(SCHOOLS_CSV), "school")
# grid      = standardize_grid(pd.read_csv(VULN_GRID_CSV))
vuln = pd.read_csv(VULN_GRID_CSV, dtype={"FIPS": str})
vuln["FIPS"] = vuln["FIPS"].str.zfill(11)
print("vuln:", vuln.shape)
vuln.head()


# print("grid:", grid.shape, "shelters:", shelters.shape, "hospitals:", hospitals.shape, "schools:", schools.shape)
print(
    "vuln:", vuln.shape,
    "shelters:", shelters.shape,
    "hospitals:", hospitals.shape,
    "schools:", schools.shape
)

# grid.head()



vuln: (5122, 6)
vuln: (5122, 6) shelters: (1978, 83) hospitals: (351, 63) schools: (5381, 59)


## 3) Aggregate facilities into grid nodes (Option A)

In [28]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd


In [30]:
import geopandas as gpd
import pyogrio
import os

GDB_PATH = r"C:\Users\Adrija\Downloads\DFGCN\data\raw\data\raw\vulnerability\SVI2022_FLORIDA_tract.gdb"

layers = pyogrio.list_layers(GDB_PATH)
layer_name = layers[0][0]
print("Using layer:", layer_name)

tracts = gpd.read_file(GDB_PATH, layer=layer_name).to_crs("EPSG:4326")

# detect FIPS column
possible_fips = ["FIPS", "GEOID", "GEOID10", "GEOID20"]
fips_col = next((c for c in possible_fips if c in tracts.columns), None)
if fips_col is None:
    raise ValueError(f"Could not find FIPS/GEOID in columns: {list(tracts.columns)[:50]}")

cent = tracts.copy()
cent["geometry"] = cent.geometry.centroid
cent["lat"] = cent.geometry.y
cent["lon"] = cent.geometry.x

grid = cent[[fips_col, "lat", "lon", "RPL_THEMES", "RPL_THEME1","RPL_THEME2","RPL_THEME3","RPL_THEME4"]].rename(columns={fips_col:"FIPS"}).copy()
grid["FIPS"] = grid["FIPS"].astype(str).str.zfill(11)

print("✅ grid created:", grid.shape)
grid.head()


Using layer: SVI2022_FLORIDA_tract
✅ grid created: (5122, 8)



  cent["geometry"] = cent.geometry.centroid


Unnamed: 0,FIPS,lat,lon,RPL_THEMES,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4
0,12001000201,29.654545,-82.333741,0.2704,0.5495,0.0024,0.4077,0.6982
1,12001000202,29.646142,-82.332971,0.3791,0.7495,0.0006,0.4254,0.7609
2,12001000301,29.666673,-82.331202,0.6544,0.5118,0.3832,0.5423,0.8684
3,12001000302,29.683551,-82.330493,0.4292,0.488,0.3051,0.5136,0.4241
4,12001000400,29.67777,-82.3079,0.7781,0.6217,0.8306,0.6124,0.8108


In [32]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd

SHELTER_CAP_COL = "capacity"
HOSPITAL_BEDS_COL = "beds"
SCHOOL_CAP_COL = "capacity"   # or "students"
R_KM = 20.0

def to_balltree(df: pd.DataFrame) -> BallTree:
    coords = np.deg2rad(df[["lat","lon"]].values)
    return BallTree(coords, metric="haversine")

def ensure_numeric_col(df: pd.DataFrame, col: str, default: float = 0.0) -> pd.DataFrame:
    """
    Guarantee df[col] exists and is numeric.
    If missing, create it with default.
    """
    df = df.copy()
    if col not in df.columns:
        df[col] = default
    df[col] = pd.to_numeric(df[col], errors="coerce").fillna(default)
    return df

def aggregate_to_grid(grid_df, shelters_df, hospitals_df, schools_df, R_km=20.0):
    g = grid_df.copy()

    g["shelter_count_nearby"] = 0
    g["shelter_capacity_nearby"] = 0.0
    g["hospital_count_nearby"] = 0
    g["hospital_beds_nearby"] = 0.0
    g["school_count_nearby"] = 0
    g["school_capacity_nearby"] = 0.0

    g_coords = np.deg2rad(g[["lat","lon"]].values)
    R = R_km / 6371.0  # km -> radians

    # Shelters
    if len(shelters_df) > 0:
        s = ensure_numeric_col(shelters_df, SHELTER_CAP_COL, default=0.0)
        tree = to_balltree(s)
        ind = tree.query_radius(g_coords, r=R)
        for i, nbrs in enumerate(ind):
            g.at[i, "shelter_count_nearby"] = int(len(nbrs))
            g.at[i, "shelter_capacity_nearby"] = float(s.iloc[nbrs][SHELTER_CAP_COL].sum())

    # Hospitals
    if len(hospitals_df) > 0:
        h = ensure_numeric_col(hospitals_df, HOSPITAL_BEDS_COL, default=0.0)
        tree = to_balltree(h)
        ind = tree.query_radius(g_coords, r=R)
        for i, nbrs in enumerate(ind):
            g.at[i, "hospital_count_nearby"] = int(len(nbrs))
            g.at[i, "hospital_beds_nearby"] = float(h.iloc[nbrs][HOSPITAL_BEDS_COL].sum())

    # Schools
    if len(schools_df) > 0:
        c = schools_df.copy()
        # choose capacity column if exists, else students, else 0
        if SCHOOL_CAP_COL in c.columns:
            c["__cap__"] = pd.to_numeric(c[SCHOOL_CAP_COL], errors="coerce").fillna(0.0)
        elif "students" in c.columns:
            c["__cap__"] = pd.to_numeric(c["students"], errors="coerce").fillna(0.0)
        else:
            c["__cap__"] = 0.0

        tree = to_balltree(c)
        ind = tree.query_radius(g_coords, r=R)
        for i, nbrs in enumerate(ind):
            g.at[i, "school_count_nearby"] = int(len(nbrs))
            g.at[i, "school_capacity_nearby"] = float(c.iloc[nbrs]["__cap__"].sum())

    return g

grid_aug = aggregate_to_grid(grid, shelters, hospitals, schools, R_km=R_KM)
print("✅ grid_aug:", grid_aug.shape)
grid_aug.head()


✅ grid_aug: (5122, 14)


Unnamed: 0,FIPS,lat,lon,RPL_THEMES,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,shelter_count_nearby,shelter_capacity_nearby,hospital_count_nearby,hospital_beds_nearby,school_count_nearby,school_capacity_nearby
0,12001000201,29.654545,-82.333741,0.2704,0.5495,0.0024,0.4077,0.6982,17,0.0,6,0.0,74,0.0
1,12001000202,29.646142,-82.332971,0.3791,0.7495,0.0006,0.4254,0.7609,17,0.0,6,0.0,74,0.0
2,12001000301,29.666673,-82.331202,0.6544,0.5118,0.3832,0.5423,0.8684,17,0.0,6,0.0,74,0.0
3,12001000302,29.683551,-82.330493,0.4292,0.488,0.3051,0.5136,0.4241,18,0.0,6,0.0,78,0.0
4,12001000400,29.67777,-82.3079,0.7781,0.6217,0.8306,0.6124,0.8108,18,0.0,6,0.0,75,0.0


## 4) Build `nodes.csv` (grid nodes + static features)

In [35]:
nodes = grid_aug.rename(columns={"cell_id": "node_id"}).copy()
nodes["node_type"] = "grid"

# Optional: ensure SVI exists if you used a different name
if "SVI" not in nodes.columns and "vulnerability_score" in nodes.columns:
    nodes["SVI"] = nodes["vulnerability_score"]

nodes_path = os.path.join(OUT_DIR, "nodes.csv")
nodes.to_csv(nodes_path, index=False)
print("Saved:", nodes_path, "rows:", len(nodes))
nodes.head()


Saved: C:\Users\Adrija\Downloads\DFGCN\recovery_dataset_out\nodes.csv rows: 5122


Unnamed: 0,FIPS,lat,lon,RPL_THEMES,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,shelter_count_nearby,shelter_capacity_nearby,hospital_count_nearby,hospital_beds_nearby,school_count_nearby,school_capacity_nearby,node_type
0,12001000201,29.654545,-82.333741,0.2704,0.5495,0.0024,0.4077,0.6982,17,0.0,6,0.0,74,0.0,grid
1,12001000202,29.646142,-82.332971,0.3791,0.7495,0.0006,0.4254,0.7609,17,0.0,6,0.0,74,0.0,grid
2,12001000301,29.666673,-82.331202,0.6544,0.5118,0.3832,0.5423,0.8684,17,0.0,6,0.0,74,0.0,grid
3,12001000302,29.683551,-82.330493,0.4292,0.488,0.3051,0.5136,0.4241,18,0.0,6,0.0,78,0.0,grid
4,12001000400,29.67777,-82.3079,0.7781,0.6217,0.8306,0.6124,0.8108,18,0.0,6,0.0,75,0.0,grid


## 5) Build `edges.csv` using KNN graph

In [43]:
# ===== Required imports (must be run in the current kernel) =====
import numpy as np
import pandas as pd
import os

from sklearn.neighbors import NearestNeighbors


In [44]:
K_NEIGHBORS = 12

def build_knn_edges(nodes_df, k=12):
    coords = np.deg2rad(nodes_df[["lat","lon"]].values)

    nn = NearestNeighbors(
        n_neighbors=min(k + 1, len(nodes_df)),
        metric="haversine"
    )
    nn.fit(coords)

    dist, ind = nn.kneighbors(coords)

    src, dst, distance_km, weight = [], [], [], []

    for i in range(len(nodes_df)):
        for j_idx in range(1, ind.shape[1]):  # skip self-loop
            j = int(ind[i, j_idx])
            d_km = float(dist[i, j_idx]) * 6371.0  # radians → km

            src.append(i)
            dst.append(j)
            distance_km.append(d_km)
            weight.append(1.0 / (d_km + 1e-6))

    return pd.DataFrame({
        "src": src,
        "dst": dst,
        "distance_km": distance_km,
        "weight": weight
    })


In [45]:
nodes = grid_aug.copy()

edges = build_knn_edges(nodes, k=K_NEIGHBORS)

edges_path = os.path.join(OUT_DIR, "edges.csv")
edges.to_csv(edges_path, index=False)

print("Saved:", edges_path)
print("Edges:", len(edges))
edges.head()


Saved: C:\Users\Adrija\Downloads\DFGCN\recovery_dataset_out\edges.csv
Edges: 61464


Unnamed: 0,src,dst,distance_km,weight
0,0,1,0.937335,1.066854
1,0,2,1.370738,0.729534
2,0,12,1.613922,0.619608
3,0,5,1.615375,0.619051
4,0,8,1.635657,0.611375


In [39]:
print(nodes.columns.tolist())


['FIPS', 'lat', 'lon', 'RPL_THEMES', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4', 'shelter_count_nearby', 'shelter_capacity_nearby', 'hospital_count_nearby', 'hospital_beds_nearby', 'school_count_nearby', 'school_capacity_nearby', 'node_type']


## 6) Save DFGCN-style tensors: `X_static.npy`, `edge_index.npy`, `edge_weight.npy`, `node_ids.json`

In [47]:
import json
import numpy as np
import os

# 1) Ensure we have a stable node_id
nodes = nodes.copy()

if "node_id" not in nodes.columns:
    if "cell_id" in nodes.columns:
        nodes["node_id"] = nodes["cell_id"].astype(str)
    elif "FIPS" in nodes.columns:
        nodes["node_id"] = nodes["FIPS"].astype(str)
    else:
        nodes["node_id"] = [f"node_{i}" for i in range(len(nodes))]

# (Optional) Make sure it's unique
if nodes["node_id"].duplicated().any():
    # fall back to guaranteed-unique ids
    nodes["node_id"] = [f"node_{i}" for i in range(len(nodes))]

# 2) Build node_to_idx mapping
node_ids = nodes["node_id"].astype(str).tolist()
node_to_idx = {nid: i for i, nid in enumerate(node_ids)}

with open(os.path.join(OUT_DIR, "node_ids.json"), "w", encoding="utf-8") as f:
    json.dump(node_to_idx, f, indent=2)

# 3) Build X_static (exclude lat/lon, keep numeric features)
numeric_cols = nodes.select_dtypes(include=[np.number]).columns.tolist()
feat_cols = [c for c in numeric_cols if c not in ["lat", "lon"]]

X_static = nodes[feat_cols].fillna(0.0).to_numpy(dtype=np.float32)
np.save(os.path.join(OUT_DIR, "X_static.npy"), X_static)

# 4) Edge arrays
edge_index = np.vstack([
    edges["src"].to_numpy(np.int64),
    edges["dst"].to_numpy(np.int64)
])
edge_weight = edges["weight"].to_numpy(np.float32)

np.save(os.path.join(OUT_DIR, "edge_index.npy"), edge_index)
np.save(os.path.join(OUT_DIR, "edge_weight.npy"), edge_weight)

print("✅ node_id source column used:",
      "cell_id" if "cell_id" in nodes.columns else ("FIPS" if "FIPS" in nodes.columns else "generated"))
print("X_static:", X_static.shape)
print("edge_index:", edge_index.shape, "edge_weight:", edge_weight.shape)
print("Static feature columns sample:", feat_cols[:12])


✅ node_id source column used: FIPS
X_static: (5122, 11)
edge_index: (2, 61464) edge_weight: (61464,)
Static feature columns sample: ['RPL_THEMES', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4', 'shelter_count_nearby', 'shelter_capacity_nearby', 'hospital_count_nearby', 'hospital_beds_nearby', 'school_count_nearby', 'school_capacity_nearby']


In [48]:
print(nodes.columns.tolist())


['FIPS', 'lat', 'lon', 'RPL_THEMES', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4', 'shelter_count_nearby', 'shelter_capacity_nearby', 'hospital_count_nearby', 'hospital_beds_nearby', 'school_count_nearby', 'school_capacity_nearby', 'node_id']


## 7) Optional: Build `X_dynamic.npy` from `weather_ensemble.csv` (T, N, F, E)

In [51]:
import pandas as pd

w = pd.read_csv(WEATHER_ENSEMBLE_CSV)
print(w.columns.tolist())
print(w["node_id"].head(20).tolist())
print("Unique node_id sample:", w["node_id"].astype(str).dropna().unique()[:20])


['t', 'node_id', 'ens', 'wind10m', 'precip', 'mslp']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Unique node_id sample: ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15'
 '16' '17' '18' '19']


In [52]:
import pandas as pd

w = pd.read_csv(WEATHER_ENSEMBLE_CSV)

# Make sure your nodes are in the SAME ORDER used when weather was generated
# If weather was generated using the same 'nodes' dataframe rows, this works directly.
idx_to_nodeid = dict(enumerate(nodes["node_id"].astype(str).tolist()))

w["node_id"] = w["node_id"].astype(int).map(idx_to_nodeid)
w = w.dropna(subset=["node_id"])

w.to_csv(WEATHER_ENSEMBLE_CSV, index=False)
print("✅ Rewritten weather_ensemble.csv with tract-FIPS node_id")
print(w["node_id"].head())


✅ Rewritten weather_ensemble.csv with tract-FIPS node_id
0    12001000201
1    12001000201
2    12001000201
3    12001000201
4    12001000201
Name: node_id, dtype: object


In [53]:
import pandas as pd
import numpy as np

def build_X_dynamic(
    weather_csv: str,
    node_to_idx: dict,
    dyn_cols: list
):
    """
    Build dynamic tensor:
    X_dynamic shape = (T, N, F, E)

    T = number of timesteps
    N = number of nodes
    F = number of dynamic features
    E = number of ensemble members
    """

    # -----------------------------
    # Load & basic cleaning
    # -----------------------------
    w = pd.read_csv(weather_csv)

    # Ensure required columns exist
    required = {"t", "ens", "node_id"}
    missing = required - set(w.columns)
    if missing:
        raise ValueError(f"weather_ensemble.csv missing columns: {missing}")

    # Standardize types
    w["node_id"] = w["node_id"].astype(str)
    w["t"] = pd.to_numeric(w["t"], errors="coerce")
    w["ens"] = pd.to_numeric(w["ens"], errors="coerce")

    # Drop invalid rows
    w = w.dropna(subset=["t", "ens", "node_id"])

    # -----------------------------
    # Filter to nodes in graph
    # -----------------------------
    before = len(w)
    w = w[w["node_id"].isin(node_to_idx)]
    after = len(w)

    if after == 0:
        raise ValueError(
            "❌ No matching node IDs between weather_ensemble.csv and graph.\n"
            f"Example graph node_ids: {list(node_to_idx.keys())[:10]}\n"
            f"Example weather node_ids: {w['node_id'].unique()[:10]}\n"
            "→ Fix by aligning weather node_id with graph node_id (FIPS)."
        )

    print(f"✅ Weather rows kept after node-id filter: {after}/{before}")

    # -----------------------------
    # Dimensions
    # -----------------------------
    T = int(w["t"].max()) + 1
    E = int(w["ens"].max()) + 1
    N = len(node_to_idx)
    F = len(dyn_cols)

    # -----------------------------
    # Allocate tensor
    # -----------------------------
    Xd = np.zeros((T, N, F, E), dtype=np.float32)

    # -----------------------------
    # Fill tensor
    # -----------------------------
    for _, r in w.iterrows():
        t = int(r["t"])
        e = int(r["ens"])
        n = node_to_idx[r["node_id"]]

        Xd[t, n, :, e] = np.array(
            [float(r.get(c, 0.0)) for c in dyn_cols],
            dtype=np.float32
        )

    return Xd


In [54]:
DYN_COLS = ["wind10m", "precip", "mslp", "gust"]

X_dynamic = build_X_dynamic(
    WEATHER_ENSEMBLE_CSV,
    node_to_idx,
    DYN_COLS
)

np.save(os.path.join(OUT_DIR, "X_dynamic.npy"), X_dynamic)

print("✅ Saved X_dynamic.npy")
print("Shape (T, N, F, E):", X_dynamic.shape)


✅ Weather rows kept after node-id filter: 48000/48000
✅ Saved X_dynamic.npy
Shape (T, N, F, E): (24, 5122, 4, 10)


## 8) Optional: Build `Y.npy` from `recovery_labels.csv` (T, N)

In [59]:
import pandas as pd
import numpy as np
import os
from sklearn.neighbors import BallTree

Y_COL = "recovery_index"

def build_node_labels_from_facility_labels(
    labels_csv,
    shelters_csv,
    hospitals_csv,
    schools_csv,
    nodes_df,
    node_to_idx,
    out_node_labels_csv,
    agg="mean"   # mean or max
):
    # --- load facility-level labels ---
    y = pd.read_csv(labels_csv)
    y["t"] = pd.to_numeric(y["t"], errors="coerce")
    y[Y_COL] = pd.to_numeric(y[Y_COL], errors="coerce")
    y = y.dropna(subset=["t", Y_COL, "facility_id", "facility_type"]).copy()
    y["facility_id"] = y["facility_id"].astype(str)
    y["facility_type"] = y["facility_type"].astype(str).str.lower()

    # --- load facilities with lat/lon ---
    def load_fac(path, kind):
        df = pd.read_csv(path)
        df = df.copy()
        df["facility_type"] = kind
        # detect id col
        if "facility_id" not in df.columns:
            if "id" in df.columns:
                df["facility_id"] = df["id"].astype(str)
            else:
                df["facility_id"] = [f"{kind}_{i}" for i in range(len(df))]
        df["facility_id"] = df["facility_id"].astype(str)

        # detect lat/lon
        lat_col = "lat" if "lat" in df.columns else ("latitude" if "latitude" in df.columns else None)
        lon_col = "lon" if "lon" in df.columns else ("longitude" if "longitude" in df.columns else None)
        if lat_col is None or lon_col is None:
            raise ValueError(f"{kind} file missing lat/lon columns. Found: {df.columns.tolist()}")

        df["lat"] = pd.to_numeric(df[lat_col], errors="coerce")
        df["lon"] = pd.to_numeric(df[lon_col], errors="coerce")
        df = df.dropna(subset=["lat", "lon"]).copy()

        return df[["facility_id","facility_type","lat","lon"]]

    shelters = load_fac(shelters_csv, "shelter")
    hospitals = load_fac(hospitals_csv, "hospital")
    schools = load_fac(schools_csv, "school")

    fac = pd.concat([shelters, hospitals, schools], ignore_index=True)

    # --- merge labels with facility coords ---
    y = y.merge(fac, on=["facility_id","facility_type"], how="inner")
    if len(y) == 0:
        raise ValueError(
            "After joining labels with facility coordinates, no rows remain.\n"
            "Check that facility_id + facility_type match between recovery_labels.csv and facilities CSVs."
        )

    # --- map facilities -> nearest node ---
    nodes_df = nodes_df.copy()
    if "node_id" not in nodes_df.columns:
        # if you used cell_id or FIPS, make sure node_id exists
        if "cell_id" in nodes_df.columns:
            nodes_df["node_id"] = nodes_df["cell_id"].astype(str)
        elif "FIPS" in nodes_df.columns:
            nodes_df["node_id"] = nodes_df["FIPS"].astype(str)
        else:
            nodes_df["node_id"] = [f"node_{i}" for i in range(len(nodes_df))]

    nodes_df["node_id"] = nodes_df["node_id"].astype(str)

    # Build BallTree for nodes (centroids)
    coords_nodes = np.deg2rad(nodes_df[["lat","lon"]].values)
    tree = BallTree(coords_nodes, metric="haversine")

    coords_fac = np.deg2rad(y[["lat","lon"]].values)
    dist, ind = tree.query(coords_fac, k=1)
    y["node_id"] = nodes_df.iloc[ind.flatten()]["node_id"].astype(str).values

    # --- aggregate facility recovery per (t, node_id) ---
    if agg == "max":
        ynode = y.groupby(["t","node_id"], as_index=False)[Y_COL].max()
    else:
        ynode = y.groupby(["t","node_id"], as_index=False)[Y_COL].mean()

    # filter nodes existing in graph mapping
    ynode = ynode[ynode["node_id"].isin(node_to_idx)].copy()
    if len(ynode) == 0:
        raise ValueError("No node-level labels match node_to_idx after aggregation.")

    ynode.to_csv(out_node_labels_csv, index=False)
    print("✅ Saved node-level labels:", out_node_labels_csv, "rows:", len(ynode))

    return ynode

def build_Y_from_node_labels(node_labels_df, node_to_idx, y_col="recovery_index"):
    # ensure numeric
    node_labels_df["t"] = pd.to_numeric(node_labels_df["t"], errors="coerce")
    node_labels_df[y_col] = pd.to_numeric(node_labels_df[y_col], errors="coerce")
    node_labels_df = node_labels_df.dropna(subset=["t", "node_id", y_col]).copy()
    node_labels_df["node_id"] = node_labels_df["node_id"].astype(str)

    T = int(node_labels_df["t"].max()) + 1
    N = len(node_to_idx)
    Y = np.full((T, N), np.nan, dtype=np.float32)

    for r in node_labels_df.itertuples(index=False):
        t = int(getattr(r, "t"))
        nid = str(getattr(r, "node_id"))
        if nid in node_to_idx:
            Y[t, node_to_idx[nid]] = float(getattr(r, y_col))

    Y = pd.DataFrame(Y).ffill().fillna(0.0).to_numpy(dtype=np.float32)
    return Y

# ---------- RUN ----------
NODE_LABELS_CSV = os.path.join(OUT_DIR, "recovery_labels_node.csv")

ynode = build_node_labels_from_facility_labels(
    labels_csv=RECOVERY_LABELS_CSV,
    shelters_csv=SHELTERS_CSV,
    hospitals_csv=HOSPITALS_CSV,
    schools_csv=SCHOOLS_CSV,
    nodes_df=nodes,
    node_to_idx=node_to_idx,
    out_node_labels_csv=NODE_LABELS_CSV,
    agg="mean"
)

Y = build_Y_from_node_labels(ynode, node_to_idx, y_col=Y_COL)
np.save(os.path.join(OUT_DIR, "Y.npy"), Y)

print("✅ Saved Y.npy:", Y.shape)
print(ynode.head())


ValueError: shelter file missing lat/lon columns. Found: ['OBJECTID_1', 'OBJECTID', 'Name', 'Address', 'City', 'Zip', 'COUNTY', 'X', 'Y', 'LONGLAT', 'USNG', 'FACILITY_T', 'SHELTER_TY', 'RPC_Region', 'Year_Built', 'Building', 'AS_IS_A_Re', 'Generator_', 'PreMitigat', 'PreMitig_1', 'EHPA_Capac', 'EHPA_Squar', 'Retrofit_C', 'Retrofit_S', 'Risk_Capac', 'Risk_Squar', 'Planned_Us', 'FundingSou', 'Evacuation', 'SURGE_ZONE', 'FLOOD_ZONE', 'EHPA', 'General_Po', 'SPECIAL_NE', 'Pet_Friend', 'Does_not_m', 'Survey_Nee', 'Asset_Elevation', 'Owned_Maintained', 'Asset', 'Asset_Sub_Type', 'Asset_Type', 'Asset_Group', 'Asset_Relevancy', 'Asset_ID', 'Municipality', 'Water_Management_District', 'Souce_Dataset_Entity', 'Souce_Dataset_Name', 'Source_Asset_ID', 'Restricted_Public_Disclosure', 'Notes', 'Regional_Planning_Council', 'FDEP_Regulatory_District', 'RCP_Region', 'FDOT_District', 'House_District_Number', 'Senate_District_Number', 'CTF_RL_2020', 'CTF_RL_2040_IL', 'CTF_RL_2040_INT', 'CTF_RL_2070_IL', 'CTF_RL_2070_INT', 'SSF_RL_100YR_CURR', 'SSF_RL_100YR_2040_IL', 'SSF_RL_100YR_2040_INT', 'SSF_RL_100YR_2070_IL', 'SSF_RL_100YR_2070_INT', 'SSF_RL_500YR_CURR', 'SSF_RL_500YR_2040_IL', 'SSF_RL_500YR_2040_INT', 'SSF_RL_500YR_2070_IL', 'SSF_RL_500YR_2070_INT', 'RIF_RL_100YR_2020', 'RIF_RL_100YR_2040', 'RIF_RL_100YR_2070', 'RIF_RL_500YR_2020', 'RIF_RL_500YR_2040', 'RIF_RL_500YR_2070', 'facility_type', 'facility_id']

In [61]:
import pandas as pd
import numpy as np
import re

def load_fac(path, kind):
    df = pd.read_csv(path).copy()
    df["facility_type"] = kind

    # --- ensure facility_id exists ---
    if "facility_id" not in df.columns:
        if "id" in df.columns:
            df["facility_id"] = df["id"].astype(str)
        elif "Asset_ID" in df.columns:
            df["facility_id"] = df["Asset_ID"].astype(str)
        else:
            df["facility_id"] = [f"{kind}_{i}" for i in range(len(df))]
    df["facility_id"] = df["facility_id"].astype(str)

    # --- detect coordinate formats ---
    # Case 1: already lat/lon
    if "lat" in df.columns and "lon" in df.columns:
        df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
        df["lon"] = pd.to_numeric(df["lon"], errors="coerce")

    # Case 2: latitude/longitude
    elif "latitude" in df.columns and "longitude" in df.columns:
        df["lat"] = pd.to_numeric(df["latitude"], errors="coerce")
        df["lon"] = pd.to_numeric(df["longitude"], errors="coerce")

    # Case 3: X/Y (most GIS exports: X=lon, Y=lat)
    elif "X" in df.columns and "Y" in df.columns:
        df["lon"] = pd.to_numeric(df["X"], errors="coerce")
        df["lat"] = pd.to_numeric(df["Y"], errors="coerce")

    # Case 4: LONGLAT string
    elif "LONGLAT" in df.columns:
        def parse_lonlat(v):
            if pd.isna(v):
                return (np.nan, np.nan)
            s = str(v).strip()
            # remove parentheses
            s = s.replace("(", "").replace(")", "")
            # split by comma or space
            parts = re.split(r"[,\s]+", s)
            parts = [p for p in parts if p != ""]
            if len(parts) < 2:
                return (np.nan, np.nan)
            lon = pd.to_numeric(parts[0], errors="coerce")
            lat = pd.to_numeric(parts[1], errors="coerce")
            return (lat, lon)

        latlon = df["LONGLAT"].apply(parse_lonlat)
        df["lat"] = latlon.apply(lambda x: x[0])
        df["lon"] = latlon.apply(lambda x: x[1])

    else:
        raise ValueError(
            f"{kind} file missing coordinates.\n"
            f"Expected lat/lon OR latitude/longitude OR X/Y OR LONGLAT.\n"
            f"Found columns: {df.columns.tolist()}"
        )

    # drop invalid coords
    df = df.dropna(subset=["lat", "lon"]).copy()

    # sanity filter (Florida bounds-ish, optional)
    df = df[(df["lat"].between(24, 32)) & (df["lon"].between(-88, -79))].copy()

    return df[["facility_id", "facility_type", "lat", "lon"]]


In [62]:
Y_COL = "recovery_index"  # edit if your column name differs

def build_Y(labels_csv, node_to_idx, y_col):
    ydf = pd.read_csv(labels_csv)
    ydf["node_id"] = ydf["node_id"].astype(str)
    ydf = ydf[ydf["node_id"].isin(node_to_idx)].copy()

    T = int(ydf["t"].max()) + 1
    N = len(node_to_idx)
    Y = np.full((T, N), np.nan, dtype=np.float32)

    for _, r in ydf.iterrows():
        t = int(r["t"])
        n = node_to_idx[r["node_id"]]
        Y[t, n] = float(r[y_col])

    Y = pd.DataFrame(Y).fillna(method="ffill").fillna(0.0).to_numpy(dtype=np.float32)
    return Y

if os.path.exists(RECOVERY_LABELS_CSV):
    Y = build_Y(RECOVERY_LABELS_CSV, node_to_idx, Y_COL)
    np.save(os.path.join(OUT_DIR, "Y.npy"), Y)
    print("Saved Y.npy:", Y.shape)
else:
    print("recovery_labels.csv not found. Upload it and rerun this cell.")


KeyError: 'node_id'

## 9) Verify outputs

In [63]:
print("Files in OUT_DIR:")
for fn in sorted(os.listdir(OUT_DIR)):
    print(" -", fn)

X_static_chk = np.load(os.path.join(OUT_DIR, "X_static.npy"))
edge_index_chk = np.load(os.path.join(OUT_DIR, "edge_index.npy"))
edge_weight_chk = np.load(os.path.join(OUT_DIR, "edge_weight.npy"))

print("\nShapes:")
print("X_static:", X_static_chk.shape)
print("edge_index:", edge_index_chk.shape)
print("edge_weight:", edge_weight_chk.shape)


Files in OUT_DIR:
 - X_dynamic.npy
 - X_static.npy
 - edge_index.npy
 - edge_weight.npy
 - edges.csv
 - node_ids.json
 - nodes.csv

Shapes:
X_static: (5122, 11)
edge_index: (2, 61464)
edge_weight: (61464,)
