In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential

# LOAD DATA
# Both CSVs are read from paths relative to the current working directory.
flood_df = pd.read_csv('../data/cleaned_flood_data.csv')
non_flood_df = pd.read_csv('../data/cleaned_non_flood_data.csv')

print(f"Flood events: {len(flood_df)}")
print(f"Non-flood events: {len(non_flood_df)}")

# Stack the two tables and discard the columns that are not needed for
# modeling; errors='ignore' tolerates columns that are already absent.
full_df = pd.concat([flood_df, non_flood_df]).drop(
    columns=['precipitation_sum', 'STATE', 'FLOOD_CAUSE', 'EVENT_NARRATIVE'],
    errors='ignore',
)

# CREATE TIME SEQUENCES

def create_temporal_sequences(df, sequence_length=5, grid_size=0.5):
    """Build fixed-length event sequences per spatial grid cell for the LSTM.

    Events are bucketed into square grid cells of ``grid_size`` degrees using
    ``BEGIN_LAT`` / ``BEGIN_LON``, ordered by ``YEAR * 12 + MONTH`` inside each
    cell, and cut into overlapping sliding windows (stride 1) of
    ``sequence_length`` consecutive rows. Each window becomes one sample.

    Note that windows span consecutive *rows* of a cell; gaps between the
    calendar months of neighbouring rows are not checked.

    Args:
        df: Event table. Must contain every base feature column listed below
            plus ``EVENT_TYPE``. It is copied, never modified in place.
        sequence_length: Number of rows per window (must be >= 1).
        grid_size: Edge length of a grid cell in degrees (must be > 0).

    Returns:
        A tuple ``(sequences, labels, feature_cols)`` where ``sequences`` is a
        float64 array of shape ``(n_windows, sequence_length, n_features)``,
        ``labels`` is an int array of shape ``(n_windows,)`` holding 1 when the
        last row of the window has ``EVENT_TYPE == 'Flash Flood'`` and 0
        otherwise, and ``feature_cols`` is the ordered list of column names
        matching the last axis of ``sequences``.

    Raises:
        ValueError: If ``sequence_length`` < 1 or ``grid_size`` <= 0.
        KeyError: If a required column is missing from ``df``.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")
    if grid_size <= 0:
        raise ValueError("grid_size must be positive")

    # features that will be used as input to the LSTM
    base_cols = [
        'YEAR', 'MONTH', 'BEGIN_TIME', 'BEGIN_LAT', 'BEGIN_LON',
        'temperature_2m_mean', 'wind_speed_10m_mean', 'cloud_cover_mean',
        'relative_humidity_2m_mean', 'dew_point_2m_mean', 'rain_sum',
        'pressure_msl_mean', 'soil_moisture_0_to_10cm_mean', 'elevation',
        'is_primary_rain_season', 'is_secondary_rain_season',
        'Flood_Zone_A', 'Flood_Zone_AE', 'Flood_Zone_AH', 'Flood_Zone_AO',
        'Flood_Zone_AREA NOT INCLUDED', 'Flood_Zone_OPEN WATER', 'Flood_Zone_VE', 'Flood_Zone_X',
        'Is_In_Floodplain_False', 'Is_In_Floodplain_True'
    ]

    missing = [col for col in base_cols + ['EVENT_TYPE'] if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    df = df.copy()
    df['time_idx'] = df['YEAR'] * 12 + df['MONTH']

    # convert lat/lon into grid IDs; floor (rather than int truncation, which
    # rounds toward zero) keeps the cells on either side of 0 degrees distinct
    df['grid_lat'] = np.floor(df['BEGIN_LAT'] / grid_size).astype(int)
    df['grid_lon'] = np.floor(df['BEGIN_LON'] / grid_size).astype(int)
    df['grid_id'] = df['grid_lat'].astype(str) + '_' + df['grid_lon'].astype(str)

    # Pick up any additional one-hot columns present in the data, but never
    # list a column twice: dict.fromkeys de-duplicates while keeping order.
    one_hot_cols = [
        col for col in df.columns
        if col.startswith(('Flood_Zone_', 'Is_In_Floodplain_'))
    ]
    feature_cols = list(dict.fromkeys(base_cols + one_hot_cols))

    sequences = []
    labels = []

    # Process each grid cell independently
    for _, group in df.groupby('grid_id'):
        if len(group) < sequence_length:
            continue

        # Stable sort so rows sharing a time_idx keep their input order
        group = group.sort_values('time_idx', kind='stable')

        # Extract the numeric feature matrix once per cell instead of once per
        # window; the explicit dtype avoids object arrays for bool/float mixes
        values = group[feature_cols].to_numpy(dtype=np.float64)
        is_flood = (group['EVENT_TYPE'] == 'Flash Flood').to_numpy().astype(int)

        # Build sliding windows of length sequence_length
        n_windows = len(group) - sequence_length + 1
        sequences.extend(values[i:i + sequence_length] for i in range(n_windows))

        # labels (1 = flood event in last timestep of each window)
        labels.extend(is_flood[sequence_length - 1:])

    if not sequences:
        # Keep a well-formed 3-D shape even when no cell has enough rows
        empty_x = np.empty((0, sequence_length, len(feature_cols)), dtype=np.float64)
        return empty_x, np.empty((0,), dtype=int), feature_cols

    return np.stack(sequences), np.asarray(labels, dtype=int), feature_cols

print("\nCreating temporal sequences:")
X_sequences, y_sequences, feature_names = create_temporal_sequences(
    full_df, sequence_length=5, grid_size=0.5
)

print(f"Total sequences created: {len(X_sequences)}")
print(f"Sequence shape: {X_sequences.shape}")

# TRAIN-TEST SPLIT
# Hold out 20% of the sequences, stratified on the label so both splits keep
# the same class ratio; the fixed seed makes the split reproducible.
# NOTE(review): the sliding windows overlap, so a purely random split can put
# near-identical sequences on both sides - consider splitting by grid cell or
# by time if the reported test score looks optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X_sequences,
    y_sequences,
    test_size=0.2,
    stratify=y_sequences,
    random_state=42,
)

# scale data
# StandardScaler works on 2-D input, so collapse (samples, timesteps) into
# rows, scale per feature, then restore the original 3-D shape. The scaler is
# fitted on the training rows only and reused for the test rows.
scaler = StandardScaler()
X_train_2d = X_train.reshape(-1, X_train.shape[-1])
X_test_2d = X_test.reshape(-1, X_test.shape[-1])

scaler.fit(X_train_2d)
X_train_scaled = scaler.transform(X_train_2d).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test_2d).reshape(X_test.shape)

print(f"\nTrain set: {len(X_train_scaled)} sequences")
print(f"Test set: {len(X_test_scaled)} sequences")

# BUILD LSTM MODEL
def build_lstm_model(input_shape):
    """Build and compile the stacked-LSTM binary classifier.

    The network is two LSTM layers (64 then 32 units) followed by two dense
    ReLU layers (32 then 16 units) and a single sigmoid output unit, with
    dropout of 0.3 after each LSTM layer and after the first dense layer.

    Args:
        input_shape: Shape of one sample without the batch axis; the caller
            in this file passes ``(timesteps, n_features)``.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer, binary
        cross-entropy loss and accuracy as the reported metric.
    """
    model = Sequential([
        # Declare the input with an explicit Input layer rather than passing
        # input_shape= to the first LSTM, which Keras 3 warns about.
        Input(shape=input_shape),
        LSTM(64, return_sequences=True),
        Dropout(0.3),
        # Second LSTM emits only its final state, feeding the dense head
        LSTM(32, return_sequences=False),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

# One sample is (timesteps, features): everything after the batch axis
input_shape = X_train_scaled.shape[1:]
lstm_model = build_lstm_model(input_shape)


# calculate class weights: n_samples / (n_classes * class_count), so the
# rarer class contributes proportionally more to the loss
class_weight = {
    cls: len(y_train) / (2 * (y_train == cls).sum())
    for cls in (0, 1)
}

# Stop training early when validation loss stops improving, and roll the
# model back to the weights of the best epoch
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=15,
    verbose=1,
    restore_best_weights=True,
)

# TRAIN MODEL
print("TRAINING LSTM MODEL")

# 20% of the training sequences are held back by Keras for validation
history = lstm_model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    class_weight=class_weight,
    callbacks=[early_stop],
    verbose=1,
)

# EVALUATE
# Turn the predicted probabilities into hard 0/1 labels at a 0.5 threshold
y_pred_proba = lstm_model.predict(X_test_scaled)
y_pred = (y_pred_proba.ravel() > 0.5).astype(int)

test_accuracy = accuracy_score(y_test, y_pred)

print("LSTM MODEL RESULTS")
print(f"Test Accuracy: {test_accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-Flood', 'Flood']))

# PLOT ACCURACY
# Training vs. validation accuracy per epoch, saved next to the notebook
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(history.history['accuracy'], label='Train Accuracy', linewidth=2)
ax.plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
ax.set_title('Model Accuracy', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('lstm_accuracy.png', dpi=300, bbox_inches='tight')

Flood events: 9308
Non-flood events: 9308

Creating temporal sequences:
Total sequences created: 17516
Sequence shape: (17516, 5, 36)

Train set: 14012 sequences
Test set: 3504 sequences


  super().__init__(**kwargs)


TRAINING LSTM MODEL
Epoch 1/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5125 - loss: 0.6930 - val_accuracy: 0.5155 - val_loss: 0.6915
Epoch 2/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5054 - loss: 0.6903 - val_accuracy: 0.4877 - val_loss: 0.6914
Epoch 3/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5103 - loss: 0.6883 - val_accuracy: 0.5191 - val_loss: 0.6879
Epoch 4/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5207 - loss: 0.6875 - val_accuracy: 0.5244 - val_loss: 0.6843
Epoch 5/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5273 - loss: 0.6852 - val_accuracy: 0.5605 - val_loss: 0.6816
Epoch 6/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5455 - loss: 0.6812 - val_accuracy: 0.5419 - val_loss: 0.6839
Ep