In [1]:
# Cell 1 – Imports
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
# Cell 2 – Load Dataset
# Read the cleaned storm-events table that every later cell works from.
df = pd.read_csv("../data/interim/StormEvents_cleaned.csv")

# If MONTH not present, derive it from MONTH_NAME
if "MONTH" not in df.columns:
    if "MONTH_NAME" not in df.columns:
        # MONTH is used as a model feature; fail here with a clear message
        # instead of a bare column-lookup error later on.
        raise KeyError(
            "Neither 'MONTH' nor 'MONTH_NAME' is present in the dataset; "
            "cannot derive the MONTH feature."
        )
    month_map = {
        "JANUARY": 1, "FEBRUARY": 2, "MARCH": 3, "APRIL": 4,
        "MAY": 5, "JUNE": 6, "JULY": 7, "AUGUST": 8,
        "SEPTEMBER": 9, "OCTOBER": 10, "NOVEMBER": 11, "DECEMBER": 12
    }
    # Normalise case and surrounding whitespace before the lookup so that
    # values such as " January" still resolve; names that are not in the
    # map (including missing values) become NaN.
    df["MONTH"] = (
        df["MONTH_NAME"].astype(str).str.strip().str.upper().map(month_map)
    )

# If SEASON not present, derive from MONTH
if "SEASON" not in df.columns:
    def get_season(m):
        """Return the three-letter season code for month number ``m``.

        12/1/2 -> "DJF", 3/4/5 -> "MAM", 6/7/8 -> "JJA", 9/10/11 -> "SON".
        Any other value (for example a missing month) yields "UNK".
        """
        season_months = (
            ("DJF", [12, 1, 2]),
            ("MAM", [3, 4, 5]),
            ("JJA", [6, 7, 8]),
            ("SON", [9, 10, 11]),
        )
        for code, months in season_months:
            if m in months:
                return code
        return "UNK"

    # Label every row with its season code.
    df["SEASON"] = df["MONTH"].apply(get_season)

# Define features and target
# Input columns fed to the models; the two damage columns are predicted jointly.
FEATURES = [
    "EVENT_TYPE",
    "STATE",
    "MONTH",
    "SEASON",
    "MAGNITUDE",
    "MAGNITUDE_TYPE",
    "BEGIN_LAT",
    "BEGIN_LON",
]

X = df.loc[:, FEATURES]
y = df.loc[:, ["DAMAGE_PROPERTY", "DAMAGE_CROPS"]]

# Quick sanity check on the selected frames.
print("✅ Features shape:", X.shape)
print("✅ Targets shape:", y.shape)


✅ Features shape: (33904, 8)
✅ Targets shape: (33904, 2)


In [3]:
# Cell 3 – Define Preprocessor
# Column groups: the first list is one-hot encoded, the second is scaled.
categorical_features = ["EVENT_TYPE", "STATE", "SEASON", "MAGNITUDE_TYPE"]
numeric_features = ["MONTH", "MAGNITUDE", "BEGIN_LAT", "BEGIN_LON"]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric branch: fill gaps with the constant 0 (keeping columns that are
# entirely empty), then standardise.
numeric_transformer = Pipeline(
    [
        (
            "imputer",
            SimpleImputer(strategy="constant", fill_value=0, keep_empty_features=True),
        ),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("num", numeric_transformer, numeric_features),
    ]
)


In [4]:
# Cell 4 – Define Models
# Candidate regressors keyed by display name. Each is wrapped in
# MultiOutputRegressor so both damage targets are predicted by one estimator
# object; insertion order is the order in which they are trained.
models = {}
models["LinearRegression"] = MultiOutputRegressor(LinearRegression())
models["RandomForest"] = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
)
models["GradientBoosting"] = MultiOutputRegressor(
    GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, random_state=42)
)
models["ExtraTrees"] = MultiOutputRegressor(
    ExtraTreesRegressor(n_estimators=300, random_state=42, n_jobs=-1)
)


In [5]:
# Cell 5 – Training & Evaluation
# Fit every candidate on the same training split, score each target on the
# held-out split, and keep the pipeline with the highest mean R².
results = {}
best_model = None
best_score = -np.inf

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for name, regressor in models.items():
    print(f"\n🔹 Training {name}...")

    # A Pipeline without caching fits the step objects it was given rather
    # than copies, so hand each candidate its own unfitted preprocessor.
    # Otherwise all pipelines (including the one saved later) would share a
    # single transformer that is refit by every subsequent candidate.
    pipe = Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("regressor", regressor)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Per-target metrics; predictions are column-aligned with y's columns.
    metrics = {}
    r2_scores = []
    for i, target in enumerate(y.columns):
        rmse = np.sqrt(mean_squared_error(y_test.iloc[:, i], y_pred[:, i]))
        mae = mean_absolute_error(y_test.iloc[:, i], y_pred[:, i])
        r2 = r2_score(y_test.iloc[:, i], y_pred[:, i])
        metrics[target] = {"RMSE": rmse, "MAE": mae, "R2": r2}
        r2_scores.append(r2)
        print(f"{name} - {target}: RMSE={rmse:.2f}, MAE={mae:.2f}, R²={r2:.3f}")

    avg_r2 = np.mean(r2_scores)
    results[name] = {"metrics": metrics, "avg_r2": avg_r2}

    # NOTE(review): the winner is chosen on the same split that produces the
    # reported metrics, so best_score is an optimistic estimate.
    if avg_r2 > best_score:
        best_score = avg_r2
        best_model = (name, pipe)

# Nothing is selected when `models` is empty or every average R² is NaN;
# report that explicitly instead of failing on `best_model[0]`.
if best_model is None:
    raise RuntimeError("No model was selected: no candidate produced a valid average R².")

print("\n✅ Best model selected:", best_model[0])



🔹 Training LinearRegression...
LinearRegression - DAMAGE_PROPERTY: RMSE=63934.48, MAE=14892.13, R²=0.960
LinearRegression - DAMAGE_CROPS: RMSE=29892.94, MAE=10030.54, R²=0.915

🔹 Training RandomForest...
RandomForest - DAMAGE_PROPERTY: RMSE=71349.89, MAE=19091.73, R²=0.950
RandomForest - DAMAGE_CROPS: RMSE=33311.86, MAE=10181.25, R²=0.894

🔹 Training GradientBoosting...
GradientBoosting - DAMAGE_PROPERTY: RMSE=64823.30, MAE=15880.16, R²=0.959
GradientBoosting - DAMAGE_CROPS: RMSE=30133.42, MAE=10076.04, R²=0.913

🔹 Training ExtraTrees...
ExtraTrees - DAMAGE_PROPERTY: RMSE=74494.37, MAE=19332.80, R²=0.946
ExtraTrees - DAMAGE_CROPS: RMSE=36042.54, MAE=10319.57, R²=0.876

✅ Best model selected: LinearRegression


In [6]:
# Cell 6 – Save Final Model
# Persist the winning (name, fitted pipeline) pair chosen during evaluation.
final_name, final_pipeline = best_model

# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
model_path = Path("../models/storm_damage_model.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(final_pipeline, model_path)

print(f"🌟 Final pipeline saved as storm_damage_model.joblib (Best: {final_name}, Avg R² = {best_score:.3f})")


🌟 Final pipeline saved as storm_damage_model.joblib (Best: LinearRegression, Avg R² = 0.937)


In [7]:
# Cell 7 – Compare Results
# Flatten the nested results into one entry per (model, target) pair, then
# transpose so each pair is a row and each metric is a column.
metrics_by_pair = {
    (model_name, target_name): scores
    for model_name, entry in results.items()
    for target_name, scores in entry["metrics"].items()
}
results_df = pd.DataFrame(metrics_by_pair).transpose()

print("\n📊 Model Comparison:")
display(results_df)



📊 Model Comparison:


Unnamed: 0,Unnamed: 1,RMSE,MAE,R2
LinearRegression,DAMAGE_PROPERTY,63934.481811,14892.126413,0.960003
LinearRegression,DAMAGE_CROPS,29892.93576,10030.537934,0.91482
RandomForest,DAMAGE_PROPERTY,71349.894537,19091.734714,0.950187
RandomForest,DAMAGE_CROPS,33311.860758,10181.252403,0.894221
GradientBoosting,DAMAGE_PROPERTY,64823.299531,15880.15962,0.958883
GradientBoosting,DAMAGE_CROPS,30133.420191,10076.039946,0.913444
ExtraTrees,DAMAGE_PROPERTY,74494.374949,19332.801744,0.945699
ExtraTrees,DAMAGE_CROPS,36042.537534,10319.565547,0.876169
