In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn as sk
import numpy as np

In [2]:
# Read the two pre-cleaned event tables (flood events first, then non-flood
# events); paths are relative to the notebook's working directory.
flood_df, non_flood_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/cleaned_flood_data.csv',
        '../data/cleaned_non_flood_data.csv',
    )
)

In [3]:
# Stack the flood and non-flood events into a single modelling table.
# ignore_index=True assigns a fresh 0..n-1 index; without it each frame keeps
# its own row labels, so labels are duplicated in the result (the original
# run reported 18616 entries labelled only 0 to 9307).
full_df = pd.concat([flood_df, non_flood_df], ignore_index=True)
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18616 entries, 0 to 9307
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   YEAR                          18616 non-null  int64  
 1   MONTH                         18616 non-null  int64  
 2   BEGIN_TIME                    18616 non-null  int64  
 3   BEGIN_LAT                     18616 non-null  float64
 4   BEGIN_LON                     18616 non-null  float64
 5   STATE                         18616 non-null  object 
 6   EVENT_TYPE                    18616 non-null  object 
 7   FLOOD_CAUSE                   8782 non-null   object 
 8   EVENT_NARRATIVE               9215 non-null   object 
 9   temperature_2m_mean           18616 non-null  float64
 10  wind_speed_10m_mean           18616 non-null  float64
 11  cloud_cover_mean              18616 non-null  float64
 12  relative_humidity_2m_mean     18616 non-null  float64
 13  dew_poi

In [4]:
# Remove six columns from the table before modelling. FLOOD_CAUSE and
# EVENT_NARRATIVE are the columns that info() reported as containing nulls.
# NOTE: this reassigns full_df, so running the cell a second time raises
# KeyError (the columns are already gone) -- re-run from the concat cell.
full_df = full_df.drop(
    columns=[
        'precipitation_sum',
        'YEAR',
        'MONTH',
        'STATE',
        'FLOOD_CAUSE',
        'EVENT_NARRATIVE',
    ]
)

In [5]:
# Report how many missing entries each remaining column has. The counts are
# computed once for the whole frame and then printed column by column.
for column, num_na in full_df.isna().sum().items():
    print(f"For column {column} there are {num_na} missing values.")

For column BEGIN_TIME there are 0 missing values.
For column BEGIN_LAT there are 0 missing values.
For column BEGIN_LON there are 0 missing values.
For column EVENT_TYPE there are 0 missing values.
For column temperature_2m_mean there are 0 missing values.
For column wind_speed_10m_mean there are 0 missing values.
For column cloud_cover_mean there are 0 missing values.
For column relative_humidity_2m_mean there are 0 missing values.
For column dew_point_2m_mean there are 0 missing values.
For column rain_sum there are 0 missing values.
For column pressure_msl_mean there are 0 missing values.
For column soil_moisture_0_to_10cm_mean there are 0 missing values.
For column elevation there are 0 missing values.
For column is_primary_rain_season there are 0 missing values.
For column is_secondary_rain_season there are 0 missing values.
For column Flood_Zone_A there are 0 missing values.
For column Flood_Zone_AE there are 0 missing values.
For column Flood_Zone_AH there are 0 missing values.


In [6]:
# Target is EVENT_TYPE; every other remaining column is used as a predictor.
y = full_df["EVENT_TYPE"]
X = full_df.drop(columns="EVENT_TYPE")

# Convert the string labels to integer class ids; `le` is kept so that
# predictions can be mapped back with le.inverse_transform.
le = sk.preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class proportions; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# ===== Block 21: KNN + PCA + StandardScaler (model testing) =====

# Imported only for its side effect: it registers the experimental
# HalvingGridSearchCV on sklearn.model_selection and must run before first use.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401

scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
knn = sk.neighbors.KNeighborsClassifier()

# Scale -> project onto principal components -> classify. Keeping all three
# steps in one Pipeline means the scaler and PCA are re-fitted on each CV
# training fold only.
pipeline_knn = sk.pipeline.Pipeline(
    steps=[
        ("scaler", scaler),
        ("pca", pca),
        ("classifier", knn),
    ]
)

# 15 PCA sizes x 25 neighbour counts = 375 candidate configurations.
param_grid_knn = {
    "pca__n_components": list(range(5, 20)),
    "classifier__n_neighbors": list(range(1, 26)),
}

# Successive-halving search over the grid. The search subsamples the training
# data in its early rounds, so random_state is fixed (same seed as the
# train/test split) to make the selected parameters and scores reproducible.
knn_grid = sk.model_selection.HalvingGridSearchCV(
    estimator=pipeline_knn,
    param_grid=param_grid_knn,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1,
)

# Nested CV: the whole search is cloned and re-run inside each of the 5 outer
# folds, so this estimate is not biased by the hyper-parameter selection.
# scoring is stated explicitly to match the inner search and the MLP cell.
scores = sk.model_selection.cross_val_score(
    estimator=knn_grid,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="accuracy",
    verbose=3
)
print(f"KNN + PCA: average accuracy = {scores.mean():.4f}")

# Final search on the full training split to obtain the chosen parameters.
# best_score_ here is the inner-CV score of the winning candidate, which is
# a different quantity from the nested estimate printed above.
knn_grid.fit(X_train, y_train)
print("Best accuracy (KNN+PCA):", knn_grid.best_score_)
print("Best parameters (KNN+PCA):", knn_grid.best_params_)

[CV] END ................................ score: (test=0.620) total time=   3.8s
[CV] END ................................ score: (test=0.676) total time=   3.3s
[CV] END ................................ score: (test=0.620) total time=   3.4s
[CV] END ................................ score: (test=0.621) total time=   3.6s
[CV] END ................................ score: (test=0.735) total time=   3.1s
KNN + PCA: average accuracy = 0.6545


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.1s finished


Best accuracy (KNN+PCA): 0.7266907670005212
Best parameters (KNN+PCA): {'classifier__n_neighbors': 1, 'pca__n_components': 18}


In [9]:
# ===== MLP + StandardScaler: exhaustive grid search, then nested CV =====
scaler = sk.preprocessing.StandardScaler()
mlp = sk.neural_network.MLPClassifier(random_state=42, max_iter=5000)

# Standardise the inputs, then classify; the scaler is re-fitted per CV fold.
pipeline_mlp = sk.pipeline.Pipeline(
    [("scaler", scaler), ("classifier", mlp)]
)

# Only the layer layout and alpha vary (3 x 2 = 6 candidates); activation,
# solver and initial learning rate are each pinned to a single value.
param_grid_mlp = dict(
    classifier__hidden_layer_sizes=[(100, 50), (200, 100), (100, 50, 25)],
    classifier__activation=["tanh"],
    classifier__solver=["adam"],
    classifier__alpha=[0.01, 0.05],
    classifier__learning_rate_init=[0.001],
)

mlp_grid = sk.model_selection.GridSearchCV(
    pipeline_mlp,
    param_grid_mlp,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=3,
)
mlp_grid.fit(X_train, y_train)

# Show the candidates ranked by mean inner-CV accuracy, best first.
cols_to_keep = [
    "param_classifier__hidden_layer_sizes",
    "param_classifier__alpha",
    "mean_test_score",
]
mlp_results = pd.DataFrame(mlp_grid.cv_results_)
print(
    mlp_results.loc[:, cols_to_keep].sort_values(
        by="mean_test_score", ascending=False
    )
)

print("Best parameters (MLP):", mlp_grid.best_params_)
print("Best CV accuracy (MLP):", mlp_grid.best_score_)

# Nested CV: cross_val_score clones the search and re-runs it inside each of
# the 5 outer folds, so the scores below are not biased by model selection.
nested_scores = sk.model_selection.cross_val_score(
    mlp_grid,
    X_train,
    y_train,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=3,
)

final_accuracy = nested_scores.mean()
print(f"\nNested CV Mean Accuracy: {final_accuracy:.4f}")
print(f"Individual Outer Fold Scores: {nested_scores}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
  param_classifier__hidden_layer_sizes  param_classifier__alpha  \
4                           (200, 100)                     0.05   
1                           (200, 100)                     0.01   
5                        (100, 50, 25)                     0.05   
2                        (100, 50, 25)                     0.01   
3                            (100, 50)                     0.05   
0                            (100, 50)                     0.01   

   mean_test_score  
4         0.769003  
1         0.763497  
5         0.753693  
2         0.752082  
3         0.751746  
0         0.743957  
Best parameters (MLP): {'classifier__activation': 'tanh', 'classifier__alpha': 0.05, 'classifier__hidden_layer_sizes': (200, 100), 'classifier__learning_rate_init': 0.001, 'classifier__solver': 'adam'}
Best CV accuracy (MLP): 0.7690027190557768


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


[CV 2/5] END classifier__activation=tanh, classifier__alpha=0.01, classifier__hidden_layer_sizes=(200, 100), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.754 total time=  28.4s
[CV 5/5] END classifier__activation=tanh, classifier__alpha=0.05, classifier__hidden_layer_sizes=(100, 50), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.746 total time=  18.7s
[CV 1/5] END classifier__activation=tanh, classifier__alpha=0.05, classifier__hidden_layer_sizes=(100, 50, 25), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.758 total time=  15.4s
[CV 1/5] END classifier__activation=tanh, classifier__alpha=0.01, classifier__hidden_layer_sizes=(200, 100), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.773 total time=  26.5s
[CV 3/5] END classifier__activation=tanh, classifier__alpha=0.05, classifier__hidden_layer_sizes=(100, 50), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.76

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 10.6min remaining: 15.9min



Nested CV Mean Accuracy: 0.7675
Individual Outer Fold Scores: [0.7734139  0.76837865 0.77602418 0.75453324 0.76494291]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.8min finished


[CV 3/5] END classifier__activation=tanh, classifier__alpha=0.01, classifier__hidden_layer_sizes=(100, 50), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.759 total time=  21.9s
[CV 3/5] END classifier__activation=tanh, classifier__alpha=0.01, classifier__hidden_layer_sizes=(100, 50, 25), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.758 total time=  16.3s
[CV 3/5] END classifier__activation=tanh, classifier__alpha=0.05, classifier__hidden_layer_sizes=(200, 100), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.776 total time=  26.3s
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 2/5] END classifier__activation=tanh, classifier__alpha=0.01, classifier__hidden_layer_sizes=(100, 50), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.724 total time= 2.9min
[CV 5/5] END classifier__activation=tanh, classifier__alpha=0.01, classifier__hidden_layer_sizes=(100, 50), classifier__lea

In [10]:
# Echo the hyper-parameters selected by the MLP grid search fitted above.
print(f"Best Hyperparameters: {mlp_grid.best_params_}")

Best Hyperparameters: {'classifier__activation': 'tanh', 'classifier__alpha': 0.05, 'classifier__hidden_layer_sizes': (200, 100), 'classifier__learning_rate_init': 0.001, 'classifier__solver': 'adam'}
