In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import joblib

BASE = Path("..")
DATA_PROCESSED = BASE / "data" / "processed"
OUT_DIR = BASE / "data" / "interim"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("✅ Paths ready")


✅ Paths ready


In [2]:
clean_file = list(DATA_PROCESSED.glob("cleaned.parquet")) or list(DATA_PROCESSED.glob("cleaned.csv"))
assert clean_file, "Run preprocessing first!"
df = pd.read_parquet(clean_file[0]) if clean_file[0].suffix == ".parquet" else pd.read_csv(clean_file[0])

print("Rows:", len(df))
df.head()


Rows: 19098


Unnamed: 0,event_type,state,month,season,magnitude,magnitude_type,begin_lat,begin_lon,damage_property_num
0,Thunderstorm Wind,GEORGIA,3,MAM,52.0,EG,33.4757,-85.238,1000.0
1,Tornado,MICHIGAN,3,MAM,50.0,Unknown,41.79,-86.1,100000.0
2,Flash Flood,TENNESSEE,4,MAM,50.0,Unknown,36.03,-89.33,0.0
3,Thunderstorm Wind,TENNESSEE,4,MAM,52.0,EG,36.18,-88.16,0.0
4,Flash Flood,TENNESSEE,4,MAM,50.0,Unknown,36.3,-88.71,0.0


In [3]:
# US regions mapping (simplified)
state_to_region = {
    "TEXAS":"South", "FLORIDA":"South", "LOUISIANA":"South", "ALABAMA":"South", "GEORGIA":"South",
    "ILLINOIS":"Midwest", "OHIO":"Midwest", "MICHIGAN":"Midwest", "MINNESOTA":"Midwest",
    "NEW YORK":"Northeast", "PENNSYLVANIA":"Northeast", "MASSACHUSETTS":"Northeast",
    "CALIFORNIA":"West", "OREGON":"West", "WASHINGTON":"West", "NEVADA":"West", "ARIZONA":"West",
}

df["region"] = df["state"].map(state_to_region).fillna("Other")
df[["state","region"]].head(10)


Unnamed: 0,state,region
0,GEORGIA,South
1,MICHIGAN,Midwest
2,TENNESSEE,Other
3,TENNESSEE,Other
4,TENNESSEE,Other
5,TENNESSEE,Other
6,TENNESSEE,Other
7,NEW YORK,Northeast
8,TENNESSEE,Other
9,MISSISSIPPI,Other


In [9]:
# Lat/Lon binning into 5-degree cells, convert to string labels
df["lat_bin"] = pd.cut(df["begin_lat"], bins=np.arange(-90, 95, 5)).astype(str)
df["lon_bin"] = pd.cut(df["begin_lon"], bins=np.arange(-180, 185, 5)).astype(str)

df[["begin_lat","lat_bin","begin_lon","lon_bin"]].head()



Unnamed: 0,begin_lat,lat_bin,begin_lon,lon_bin
0,33.4757,"(30, 35]",-85.238,"(-90, -85]"
1,41.79,"(40, 45]",-86.1,"(-90, -85]"
2,36.03,"(35, 40]",-89.33,"(-90, -85]"
3,36.18,"(35, 40]",-88.16,"(-90, -85]"
4,36.3,"(35, 40]",-88.71,"(-90, -85]"


In [5]:
# Keep only top 15 event types, others → "Other"
top_events = df["event_type"].value_counts().head(15).index
df["event_group"] = df["event_type"].where(df["event_type"].isin(top_events), "Other")

df[["event_type","event_group"]].head(20)


Unnamed: 0,event_type,event_group
0,Thunderstorm Wind,Thunderstorm Wind
1,Tornado,Tornado
2,Flash Flood,Flash Flood
3,Thunderstorm Wind,Thunderstorm Wind
4,Flash Flood,Flash Flood
5,Flash Flood,Flash Flood
6,Flash Flood,Flash Flood
7,Hail,Hail
8,Thunderstorm Wind,Thunderstorm Wind
9,Thunderstorm Wind,Thunderstorm Wind


In [6]:
# Combine event_type and season (categorical interaction)
df["event_season"] = df["event_group"].astype(str) + "_" + df["season"].astype(str)

# Combine state and event type (optional, can be sparse)
df["state_event"] = df["state"].astype(str) + "_" + df["event_group"].astype(str)

df[["event_group","season","event_season"]].head(20)


Unnamed: 0,event_group,season,event_season
0,Thunderstorm Wind,MAM,Thunderstorm Wind_MAM
1,Tornado,MAM,Tornado_MAM
2,Flash Flood,MAM,Flash Flood_MAM
3,Thunderstorm Wind,MAM,Thunderstorm Wind_MAM
4,Flash Flood,MAM,Flash Flood_MAM
5,Flash Flood,MAM,Flash Flood_MAM
6,Flash Flood,MAM,Flash Flood_MAM
7,Hail,MAM,Hail_MAM
8,Thunderstorm Wind,MAM,Thunderstorm Wind_MAM
9,Thunderstorm Wind,MAM,Thunderstorm Wind_MAM


In [7]:
# Average historical damage by state-event
avg_damage = df.groupby(["state","event_group"])["damage_property_num"].mean().to_dict()
df["state_event_avg_damage"] = df.set_index(["state","event_group"]).index.map(avg_damage)

df[["state","event_group","state_event_avg_damage"]].head(20)


Unnamed: 0,state,event_group,state_event_avg_damage
0,GEORGIA,Thunderstorm Wind,2487.808896
1,MICHIGAN,Tornado,18214.285714
2,TENNESSEE,Flash Flood,14266.428571
3,TENNESSEE,Thunderstorm Wind,9633.411765
4,TENNESSEE,Flash Flood,14266.428571
5,TENNESSEE,Flash Flood,14266.428571
6,TENNESSEE,Flash Flood,14266.428571
7,NEW YORK,Hail,0.0
8,TENNESSEE,Thunderstorm Wind,9633.411765
9,MISSISSIPPI,Thunderstorm Wind,14138.851351


In [10]:
out_file = OUT_DIR / "features_extended.parquet"
df.to_parquet(out_file, index=False)
print("✅ Extended features saved ->", out_file)


✅ Extended features saved -> ..\data\interim\features_extended.parquet
