In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn as sk
import numpy as np

In [45]:
flood_df = pd.read_csv('../data/cleaned_flood_data.csv')
non_flood_df = pd.read_csv('../data/cleaned_non_flood_data.csv')

In [46]:
full_df = pd.concat([flood_df, non_flood_df])
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18616 entries, 0 to 9307
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   YEAR                          18616 non-null  int64  
 1   MONTH                         18616 non-null  int64  
 2   BEGIN_TIME                    18616 non-null  int64  
 3   BEGIN_LAT                     18616 non-null  float64
 4   BEGIN_LON                     18616 non-null  float64
 5   STATE                         18616 non-null  object 
 6   EVENT_TYPE                    18616 non-null  object 
 7   FLOOD_CAUSE                   8782 non-null   object 
 8   EVENT_NARRATIVE               9215 non-null   object 
 9   temperature_2m_mean           18616 non-null  float64
 10  wind_speed_10m_mean           18616 non-null  float64
 11  cloud_cover_mean              18616 non-null  float64
 12  relative_humidity_2m_mean     18616 non-null  float64
 13  dew_poi

In [47]:
full_df = full_df.drop(['precipitation_sum', 'STATE', 'FLOOD_CAUSE', 'EVENT_NARRATIVE'], axis=1)

In [48]:
for column in full_df.columns:
    num_na = full_df[full_df[column].isna()].shape[0]
    print(f"For column {column} there are {num_na} missing values.")

For column YEAR there are 0 missing values.
For column MONTH there are 0 missing values.
For column BEGIN_TIME there are 0 missing values.
For column BEGIN_LAT there are 0 missing values.
For column BEGIN_LON there are 0 missing values.
For column EVENT_TYPE there are 0 missing values.
For column temperature_2m_mean there are 0 missing values.
For column wind_speed_10m_mean there are 0 missing values.
For column cloud_cover_mean there are 0 missing values.
For column relative_humidity_2m_mean there are 0 missing values.
For column dew_point_2m_mean there are 0 missing values.
For column rain_sum there are 0 missing values.
For column pressure_msl_mean there are 0 missing values.
For column soil_moisture_0_to_10cm_mean there are 0 missing values.
For column elevation there are 0 missing values.


In [49]:
X = full_df.drop(columns=["EVENT_TYPE"])
y = full_df["EVENT_TYPE"]

le = sk.preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y
)

In [56]:
scaler = sk.preprocessing.StandardScaler()
mlp = sk.neural_network.MLPClassifier(max_iter=5000, random_state=42)

pipeline_mlp = sk.pipeline.Pipeline(
    steps=[
        ("scaler", scaler),
        ("classifier", mlp),
    ]
)

param_grid_mlp = {
    "classifier__hidden_layer_sizes": [(10,), (15,), (20,)],
    "classifier__activation": ["logistic"],
    "classifier__solver": ["adam"],
    "classifier__alpha": [0.001, 0.01, 0.05],
    "classifier__learning_rate_init": [0.001],
}

mlp_grid = sk.model_selection.GridSearchCV(
    estimator=pipeline_mlp,
    param_grid=param_grid_mlp,
    cv=5,
    scoring="accuracy",
    verbose=3,
    n_jobs=-1,
)

mlp_grid.fit(X_train, y_train)

mlp_results = pd.DataFrame(mlp_grid.cv_results_)
cols_to_keep = [
    "param_classifier__hidden_layer_sizes",
    "param_classifier__alpha",
    "mean_test_score",
]
print(
    mlp_results[cols_to_keep]
    .sort_values("mean_test_score", ascending=False)
)

print("Best parameters (MLP):", mlp_grid.best_params_)
print("Best CV accuracy (MLP):", mlp_grid.best_score_)

nested_scores = sk.model_selection.cross_val_score(
    estimator=mlp_grid,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="accuracy",
    verbose=3,
    n_jobs=-1
)

final_accuracy = -np.mean(nested_scores)
print(f"\nNested CV Mean Accuracy: {final_accuracy:.4f}")
print(f"Individual Outer Fold Scores: {nested_scores}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits
  param_classifier__hidden_layer_sizes  param_classifier__alpha  \
2                                (20,)                    0.001   
5                                (20,)                    0.010   
8                                (20,)                    0.050   
1                                (15,)                    0.001   
4                                (15,)                    0.010   
7                                (15,)                    0.050   
0                                (10,)                    0.001   
3                                (10,)                    0.010   
6                                (10,)                    0.050   

   mean_test_score  
2         0.620602  
5         0.615028  
8         0.601195  
1         0.586826  
4         0.586155  
7         0.579909  
0         0.563321  
3         0.560703  
6         0.553653  
Best parameters (MLP): {'classifier__activation': 'logistic

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


[CV 3/5] END classifier__activation=logistic, classifier__alpha=0.001, classifier__hidden_layer_sizes=(10,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.558 total time=   4.1s
[CV 3/5] END classifier__activation=logistic, classifier__alpha=0.001, classifier__hidden_layer_sizes=(20,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.624 total time=   8.5s
[CV 3/5] END classifier__activation=logistic, classifier__alpha=0.01, classifier__hidden_layer_sizes=(20,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.600 total time=   6.4s
[CV 1/5] END classifier__activation=logistic, classifier__alpha=0.05, classifier__hidden_layer_sizes=(15,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.590 total time=   6.4s
[CV 1/5] END classifier__activation=logistic, classifier__alpha=0.001, classifier__hidden_layer_sizes=(15,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.599 t

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  9.3min remaining: 14.0min



Nested CV Mean Accuracy: -0.6141
Individual Outer Fold Scores: [0.62336354 0.61866398 0.60006716 0.61652116 0.61182001]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  9.6min finished


[CV 5/5] END classifier__activation=logistic, classifier__alpha=0.001, classifier__hidden_layer_sizes=(15,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.586 total time=   6.7s
[CV 5/5] END classifier__activation=logistic, classifier__alpha=0.01, classifier__hidden_layer_sizes=(10,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.552 total time=   4.7s
[CV 5/5] END classifier__activation=logistic, classifier__alpha=0.01, classifier__hidden_layer_sizes=(20,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.623 total time=   7.3s
[CV 3/5] END classifier__activation=logistic, classifier__alpha=0.05, classifier__hidden_layer_sizes=(20,), classifier__learning_rate_init=0.001, classifier__solver=adam;, score=0.588 total time=   4.8s
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END classifier__activation=logistic, classifier__alpha=0.001, classifier__hidden_layer_sizes=(10,), classifier__learni