# In [None]:
import numpy as np
import pandas as pd
import sklearn as sk
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# Load the two event tables and stack them row-wise into a single frame.
flood_df = pd.read_csv('../data/cleaned_flood_data.csv')
non_flood_df = pd.read_csv('../data/cleaned_non_flood_data.csv')
full_df = pd.concat([flood_df, non_flood_df])

# Remove the columns that are not fed to the models.
full_df = full_df.drop(
    columns=['precipitation_sum', 'STATE', 'FLOOD_CAUSE', 'EVENT_NARRATIVE']
)

# Report how many missing values remain in every column.
for column, num_na in full_df.isna().sum().items():
    print(f"For column {column} there are {num_na} missing values.")

# Separate the target from the predictors.
y = full_df["EVENT_TYPE"]
X = full_df.drop(columns="EVENT_TYPE")

# Map the EVENT_TYPE labels to integer codes; `le` is kept so the codes can
# be translated back to the original label names when reporting.
le = sk.preprocessing.LabelEncoder().fit(y)
y_encoded = le.transform(y)

# 80/20 hold-out split, stratified on the label so both parts keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ============================================================================
# MLP MODEL
# ============================================================================

# The estimators are imported by name: reaching them through
# `sk.neural_network` / `sk.pipeline` only works when the submodule happens
# to have been loaded already.
scaler = StandardScaler()
mlp = MLPClassifier(max_iter=5000, random_state=42)

# Scaling is a pipeline step so that, during cross-validation, the scaler is
# re-fitted on the training part of every fold only.
pipeline_mlp = Pipeline(
    steps=[
        ("scaler", scaler),
        ("classifier", mlp),
    ]
)

param_grid_mlp = {
    "classifier__hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 50)],
    "classifier__activation": ["relu", "tanh"],
    "classifier__solver": ["adam"],
    "classifier__alpha": [0.0001, 0.001, 0.01],
    "classifier__learning_rate_init": [0.001, 0.01],
}

mlp_grid = GridSearchCV(
    estimator=pipeline_mlp,
    param_grid=param_grid_mlp,
    cv=5,
    scoring="accuracy",
    verbose=3,
    n_jobs=-1,
)

mlp_grid.fit(X_train, y_train)

mlp_results = pd.DataFrame(mlp_grid.cv_results_)

# Show every hyperparameter that actually varies in the grid. Listing only a
# subset makes distinct configurations look like duplicate rows with
# different scores.
cols_to_keep = [
    f"param_{name}"
    for name, values in param_grid_mlp.items()
    if len(values) > 1
] + ["mean_test_score"]
print(
    mlp_results[cols_to_keep]
    .sort_values("mean_test_score", ascending=False)
)

print("Best parameters (MLP):", mlp_grid.best_params_)
print("Best CV accuracy (MLP):", mlp_grid.best_score_)

# Nested cross-validation: the grid search itself is the estimator, so the
# full hyperparameter search is repeated inside each of the 5 outer folds and
# scored on data that the search never saw.
nested_scores = sk.model_selection.cross_val_score(
    estimator=mlp_grid,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="accuracy",
    verbose=3,
    n_jobs=-1,
)

final_accuracy = np.mean(nested_scores)
print(f"\nNested CV Mean Accuracy: {final_accuracy:.4f}")
print(f"Individual Outer Fold Scores: {nested_scores}")

# ============================================================================
# LSTM MODEL
# ============================================================================

print(f"\n{'=' * 80}")
print("LSTM MODEL")
print('=' * 80)

# Standardise the features: statistics are learned from the training split
# and then applied unchanged to the test split.
# NOTE(review): the scaler sees the whole training split before the LSTM grid
# search cross-validates on it, so each validation fold has influenced the
# scaling (unlike the MLP, which scales inside its pipeline). CV scores may be
# slightly optimistic -- confirm whether that is acceptable.
scaler_lstm = sk.preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler_lstm.transform(X_train)
X_test_scaled = scaler_lstm.transform(X_test)

# Keras LSTM layers expect (samples, timesteps, features); every row becomes
# a sequence of length one by inserting a singleton timestep axis.
X_train_lstm = X_train_scaled[:, np.newaxis, :]
X_test_lstm = X_test_scaled[:, np.newaxis, :]

print(f"LSTM Input Shape: {X_train_lstm.shape}")

# Define LSTM model
def create_lstm_model(lstm_units=50, dropout_rate=0.2, learning_rate=0.001,
                      n_features=None):
    """Build and compile a stacked two-layer LSTM binary classifier.

    Architecture: LSTM -> Dropout -> LSTM (half width) -> Dropout ->
    Dense(32, relu) -> Dense(1, sigmoid), compiled with Adam and binary
    cross-entropy, tracking accuracy.

    Args:
        lstm_units: Units in the first LSTM layer; the second layer uses
            half as many (at least one).
        dropout_rate: Dropout fraction applied after each LSTM layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        n_features: Number of features per timestep. When omitted, it is
            taken from the module-level ``X_train_lstm`` array, which
            therefore has to exist before the function is called.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    if n_features is None:
        # Backward-compatible default: size the input from the prepared
        # training tensor defined at module level.
        n_features = X_train_lstm.shape[2]

    model = Sequential([
        # Declare the input explicitly rather than through an `input_shape`
        # layer argument, which newer Keras releases warn about.
        keras.Input(shape=(1, n_features)),
        LSTM(lstm_units, return_sequences=True),
        Dropout(dropout_rate),
        # Integer halving would yield 0 units for lstm_units == 1.
        LSTM(max(1, lstm_units // 2)),
        Dropout(dropout_rate),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Grid search for LSTM
# A fixed random_state makes weight initialisation and shuffling repeatable,
# matching the seed used for the train/test split and the MLP. Without it,
# every run of the search yields different scores.
lstm_model = KerasClassifier(
    model=create_lstm_model,
    epochs=50,
    batch_size=32,
    verbose=0,
    random_state=42,
)

# `model__*` entries are forwarded to create_lstm_model; `batch_size` is a
# parameter of the KerasClassifier wrapper itself.
param_grid_lstm = {
    'model__lstm_units': [32, 64],
    'model__dropout_rate': [0.2, 0.3],
    'model__learning_rate': [0.001, 0.01],
    'batch_size': [32, 64]
}

print("\nStarting LSTM Grid Search...")
# n_jobs=1 runs the candidates one after another -- presumably to avoid
# several TensorFlow processes competing for the same device; confirm.
lstm_grid = GridSearchCV(
    estimator=lstm_model,
    param_grid=param_grid_lstm,
    cv=3,
    scoring='accuracy',
    verbose=2,
    n_jobs=1
)

lstm_grid.fit(X_train_lstm, y_train)

print("\nBest parameters (LSTM):", lstm_grid.best_params_)
print("Best CV accuracy (LSTM):", lstm_grid.best_score_)

# Evaluate LSTM on test set
# GridSearchCV refits the best configuration on the full training data, so
# predict() here uses that refitted model.
y_pred_lstm = lstm_grid.predict(X_test_lstm)
lstm_test_accuracy = accuracy_score(y_test, y_pred_lstm)
print(f"LSTM Test Set Accuracy: {lstm_test_accuracy:.4f}")

print("\nLSTM Classification Report:")
# le.classes_ maps the integer codes back to the original EVENT_TYPE labels.
print(classification_report(y_test, y_pred_lstm, target_names=le.classes_))