In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn as sk
import numpy as np

In [2]:
flood_df = pd.read_csv('../data/cleaned_flood_data.csv')
non_flood_df = pd.read_csv('../data/cleaned_non_flood_data.csv')

In [3]:
full_df = pd.concat([flood_df, non_flood_df])
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18616 entries, 0 to 9307
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   YEAR                          18616 non-null  int64  
 1   MONTH                         18616 non-null  int64  
 2   BEGIN_TIME                    18616 non-null  int64  
 3   BEGIN_LAT                     18616 non-null  float64
 4   BEGIN_LON                     18616 non-null  float64
 5   STATE                         18616 non-null  object 
 6   EVENT_TYPE                    18616 non-null  object 
 7   FLOOD_CAUSE                   8782 non-null   object 
 8   EVENT_NARRATIVE               9215 non-null   object 
 9   temperature_2m_mean           18616 non-null  float64
 10  wind_speed_10m_mean           18616 non-null  float64
 11  cloud_cover_mean              18616 non-null  float64
 12  relative_humidity_2m_mean     18616 non-null  float64
 13  dew_poi

In [4]:
full_df = full_df.drop(['precipitation_sum', 'YEAR', 'MONTH', 'STATE', 'FLOOD_CAUSE', 'EVENT_NARRATIVE'], axis=1)

In [5]:
for column in full_df.columns:
    num_na = full_df[full_df[column].isna()].shape[0]
    print(f"For column {column} there are {num_na} missing values.")

For column BEGIN_TIME there are 0 missing values.
For column BEGIN_LAT there are 0 missing values.
For column BEGIN_LON there are 0 missing values.
For column EVENT_TYPE there are 0 missing values.
For column temperature_2m_mean there are 0 missing values.
For column wind_speed_10m_mean there are 0 missing values.
For column cloud_cover_mean there are 0 missing values.
For column relative_humidity_2m_mean there are 0 missing values.
For column dew_point_2m_mean there are 0 missing values.
For column rain_sum there are 0 missing values.
For column pressure_msl_mean there are 0 missing values.
For column soil_moisture_0_to_10cm_mean there are 0 missing values.
For column elevation there are 0 missing values.
For column is_primary_rain_season there are 0 missing values.
For column is_secondary_rain_season there are 0 missing values.
For column Flood_Zone_A there are 0 missing values.
For column Flood_Zone_AE there are 0 missing values.
For column Flood_Zone_AH there are 0 missing values.


In [31]:
X = full_df.drop(columns=["EVENT_TYPE"])

def convert_event_type(row):
    if row == 'Flash Flood':
        row = 'Flood'
    return row

y = full_df['EVENT_TYPE'].apply(convert_event_type)

le = sk.preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y
)

In [36]:
# ===== Block 21: KNN + PCA + StandardScaler (model testing) =====

from sklearn.experimental import enable_halving_search_cv
scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
knn = sk.neighbors.KNeighborsClassifier()

pipeline_knn = sk.pipeline.Pipeline(
    steps=[
        ("scaler", scaler),
        ("pca", pca),
        ("classifier", knn),
    ]
)

param_grid_knn = {
    "pca__n_components": list(range(5, 20)),
    "classifier__n_neighbors": list(range(1, 26)),
}
knn_grid = sk.model_selection.HalvingGridSearchCV(
    estimator=pipeline_knn,
    param_grid=param_grid_knn,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
scores = sk.model_selection.cross_val_score(
    estimator=knn_grid,
    X=X_train,
    y=y_train,
    cv=5,
    verbose=3
)
print(f"KNN + PCA: average accuracy = {scores.mean():.4f}")

knn_grid.fit(X_train, y_train)
print("Best accuracy (KNN+PCA):", knn_grid.best_score_)
print("Best parameters (KNN+PCA):", knn_grid.best_params_)

[CV] END ................................ score: (test=0.682) total time=   5.3s
[CV] END ................................ score: (test=0.782) total time=   3.2s
[CV] END ................................ score: (test=0.665) total time=   3.5s
[CV] END ................................ score: (test=0.753) total time=   3.3s
[CV] END ................................ score: (test=0.775) total time=   3.4s
KNN + PCA: average accuracy = 0.7313


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.7s finished


Best accuracy (KNN+PCA): 0.7333026560434034
Best parameters (KNN+PCA): {'classifier__n_neighbors': 1, 'pca__n_components': 6}


In [None]:
scaler = sk.preprocessing.StandardScaler()
mlp = sk.neural_network.MLPClassifier(max_iter=5000, random_state=42)

pipeline_mlp = sk.pipeline.Pipeline(
    steps=[
        ("scaler", scaler),
        ("classifier", mlp),
    ]
)

param_grid_mlp = {
    "classifier__hidden_layer_sizes": [(200, 100)],
    "classifier__activation": ["tanh"],
    "classifier__solver": ["adam"],
    "classifier__alpha": [0.05],
    "classifier__learning_rate_init": [0.001],
}

mlp_grid = sk.model_selection.GridSearchCV(
    estimator=pipeline_mlp,
    param_grid=param_grid_mlp,
    cv=5,
    scoring="accuracy",
    verbose=3,
    n_jobs=-1,
)

mlp_grid.fit(X_train, y_train)

mlp_results = pd.DataFrame(mlp_grid.cv_results_)
cols_to_keep = [
    "param_classifier__hidden_layer_sizes",
    "param_classifier__alpha",
    "mean_test_score",
]
print(
    mlp_results[cols_to_keep]
    .sort_values("mean_test_score", ascending=False)
)

print("Best parameters (MLP):", mlp_grid.best_params_)
print("Best CV accuracy (MLP):", mlp_grid.best_score_)

nested_scores = sk.model_selection.cross_val_score(
    estimator=mlp_grid,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="accuracy",
    verbose=3,
    n_jobs=-1
)

final_accuracy = np.mean(nested_scores)
print(f"\nNested CV Mean Accuracy: {final_accuracy:.4f}")
print(f"Individual Outer Fold Scores: {nested_scores}")
print("Best Hyperparameters:", mlp_grid.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [32]:
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

classifier = sk.neural_network.MLPClassifier(max_iter=5000, random_state=42, hidden_layer_sizes=(200, 100), alpha=0.05, activation="tanh", solver="adam", learning_rate_init=0.001)

classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the report
print(report)

              precision    recall  f1-score   support

           0       0.50      0.34      0.40      1862
           1       0.50      0.67      0.57      1862

    accuracy                           0.50      3724
   macro avg       0.50      0.50      0.49      3724
weighted avg       0.50      0.50      0.49      3724



In [33]:
unique_values, counts = np.unique(y_test, return_counts=True)

print("Unique Values:", unique_values)
print("Counts:", counts)

Unique Values: [0 1]
Counts: [1862 1862]
