In [3]:
import tensorflow.keras as keras
from tensorflow.keras import models, layers, metrics, optimizers, losses, regularizers
from sklearn.model_selection import train_test_split, StratifiedKFold
from tensorflow.keras.callbacks import ModelCheckpoint
import xarray as xr
import numpy as np 
import pandas as pd
import random

#### Function and Metrics

In [4]:
METRICS = [
    metrics.CategoricalAccuracy(name='accuracy'),
    metrics.Precision(class_id = 1, name='precision'),
    metrics.Recall(class_id = 1, name='recall'),
    metrics.AUC(curve='PR', num_thresholds=1000, name='pr_auc')
]

def class_weights(x):
    n_class = x.max() + 1
    csize = []
    for i in range(n_class):
        csize.append((x == i).sum())
    w = dict()
    for i in range(n_class):
        w[i] = max(csize)/(csize[i]*n_class)
    return(w)
    
def onehot(x):
    y = np.zeros((x.size, x.max()+1)) 
    y[np.arange(x.size),x] = 1
    
    return(y)

#### Data preparation

In [5]:
data_x=xr.open_dataset('../input_data/Vertical_Integrated_Moisture_convergence.nc')
data_y=pd.read_csv('../processed_data/extreme_event_days.csv')
data_y['time'] = pd.to_datetime(data_y['time'], errors='coerce')
data_y=data_y.drop('Unnamed: 0', axis=1)
n_time=len(data_x.time.values)
n_lat=len(data_x.lat.values)
n_lon=len(data_x.lon.values)
x_data=data_x.VIMFC.values
x_data=(x_data-x_data.mean(axis=0))/x_data.std(axis=0)
x_data=x_data.reshape(n_time, n_lat, n_lon, 1).astype(np.float32)
y_data=np.array(data_y['label'])
y_data=y_data.astype(np.int32)
y_data_one_hot=onehot(y_data)
ind = np.arange(len(y_data))
x_train, x_test, y_train, y_test, ind_train, ind_test = train_test_split(x_data, y_data_one_hot, ind, test_size=0.20, random_state=42, 
                                                                         shuffle = True, stratify = y_data_one_hot)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(11688, 121, 121, 1) (2922, 121, 121, 1)
(11688, 2) (2922, 2)


#### Model building to test best hyper parameter

In [6]:
def build_model_test(lr, conv_filters, dense_neurons, dense_layers, activity_reg, dropout_rate, input_channels=1):
    model = models.Sequential()
    model.add(layers.Input(shape=(121, 121, input_channels)))
    
    model.add(layers.Conv2D(conv_filters, (3, 3), activity_regularizer=regularizers.l2(activity_reg)))
    model.add(layers.Activation('relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Dropout(dropout_rate))
    
    model.add(layers.Conv2D(conv_filters, (3, 3), activity_regularizer=regularizers.l2(activity_reg)))
    model.add(layers.Activation('relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Dropout(dropout_rate))
    
    model.add(layers.Flatten())
    
    for _ in range(dense_layers):
        model.add(layers.Dense(dense_neurons, activity_regularizer=regularizers.l2(activity_reg)))
        model.add(layers.Activation('relu'))
    
    model.add(layers.Dense(2, activation='softmax'))
    
    model.compile(loss=losses.CategoricalCrossentropy(), 
                  optimizer=optimizers.Adam(learning_rate=lr), 
                  metrics=METRICS)
    return model

#### StratifiedKFold to select the best lr and batch_size

In [7]:
y_labels = np.argmax(y_data_one_hot, axis=1)

param_grid = {
    'lr': [0.0001, 0.0005,  0.001, 0.005, 0.01],
    'batch_size': [128, 256, 512, 1024, 2048]
}

n_splits = 4
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

best_params_recall = None
best_params_prauc = None
best_recall = 0
best_prauc = 0

for lr in param_grid['lr']:
    for batch_size in param_grid['batch_size']:
        print(f"\nTesting: lr={lr}, batch_size={batch_size}")
        
        recalls = []
        pr_aucs = []
        
        for train_idx, test_idx in skf.split(x_data, y_labels):
            X_train, X_test = x_data[train_idx], x_data[test_idx]
            y_train, y_test = y_data_one_hot[train_idx], y_data_one_hot[test_idx]
            y_weight = y_labels[train_idx]
            
            model_test = build_model_test(lr=lr, conv_filters=16, dense_neurons=16, dense_layers=1, activity_reg=0.001, dropout_rate=0.2)
            class_weigh = class_weights(y_weight.astype('int'))
            model_test.fit(X_train, y_train, epochs=15, batch_size=batch_size, class_weight=class_weigh, verbose=0)
            scores = model_test.evaluate(X_test, y_test, verbose=0)
            recalls.append(scores[3])
            pr_aucs.append(scores[4]) 
        
        mean_recall = np.mean(recalls)
        mean_prauc = np.mean(pr_aucs)
        print(f"Mean Recall: {mean_recall:.4f}")
        print(f"Mean PR AUC: {mean_prauc:.4f}")
        
        if mean_recall > best_recall:
            best_recall = mean_recall
            best_params_recall = {'lr': lr, 'batch_size': batch_size}
        if mean_prauc > best_prauc:
            best_prauc = mean_prauc
            best_params_prauc = {'lr': lr, 'batch_size': batch_size}

print("\nBest Hyperparameters for Stage:")
print(f" According to the best Recall :{best_recall} the best parameters is")
print(best_params_recall)
print(f" According to the best PR AUC :{best_prauc} the best parameters is")
print(best_params_prauc)


Testing: lr=0.0001, batch_size=128
Mean Recall: 0.6142
Mean PR AUC: 0.9867

Testing: lr=0.0001, batch_size=256
Mean Recall: 0.7791
Mean PR AUC: 0.9755

Testing: lr=0.0001, batch_size=512
Mean Recall: 0.8871
Mean PR AUC: 0.9451

Testing: lr=0.0001, batch_size=1024
Mean Recall: 0.8825
Mean PR AUC: 0.9014

Testing: lr=0.0001, batch_size=2048
Mean Recall: 0.9341
Mean PR AUC: 0.8592

Testing: lr=0.0005, batch_size=128
Mean Recall: 0.8646
Mean PR AUC: 0.9545

Testing: lr=0.0005, batch_size=256
Mean Recall: 0.7329
Mean PR AUC: 0.9785

Testing: lr=0.0005, batch_size=512
Mean Recall: 0.6865
Mean PR AUC: 0.9805

Testing: lr=0.0005, batch_size=1024
Mean Recall: 0.7410
Mean PR AUC: 0.9609

Testing: lr=0.0005, batch_size=2048
Mean Recall: 0.7329
Mean PR AUC: 0.9655

Testing: lr=0.001, batch_size=128
Mean Recall: 0.9304
Mean PR AUC: 0.9534

Testing: lr=0.001, batch_size=256
Mean Recall: 0.8626
Mean PR AUC: 0.9697

Testing: lr=0.001, batch_size=512
Mean Recall: 0.7526
Mean PR AUC: 0.9679

Testing: l

#### StratifiedKFold to select the best conv_filters, conv_filters and dense_layers

In [9]:
y_labels = np.argmax(y_data_one_hot, axis=1)

param_grid = {
    'conv_filters': [8, 16, 32],
    'dense_neurons': [8, 16, 32],
    'dense_layers': [1, 2, 3]
}

n_splits = 4
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

best_params_recall = None
best_params_prauc = None
best_recall = 0
best_prauc = 0

for conv_filter in param_grid['conv_filters']:
    for dense_neuron in param_grid['dense_neurons']:
        for dense_layer in param_grid['dense_layers']:
            print(f"\nTesting: conv_filters={conv_filter}, dense_neurons={dense_neuron}, dense_layers={dense_layer}")
            
            recalls = []
            pr_aucs = []
            
            for train_idx, test_idx in skf.split(x_data, y_labels):
                X_train, X_test = x_data[train_idx], x_data[test_idx]
                y_train, y_test = y_data_one_hot[train_idx], y_data_one_hot[test_idx]
                y_weight = y_labels[train_idx]
                
                model_test = build_model_test(lr=0.0001, conv_filters=conv_filter, dense_neurons=dense_neuron, dense_layers=dense_layer, activity_reg=0.001, dropout_rate=0.2)
                class_weigh = class_weights(y_weight.astype('int'))
                model_test.fit(X_train, y_train, epochs=15, batch_size=2048, class_weight=class_weigh, verbose=0)
                scores = model_test.evaluate(X_test, y_test, verbose=0)
                recalls.append(scores[3])
                pr_aucs.append(scores[4]) 
            
            mean_recall = np.mean(recalls)
            mean_prauc = np.mean(pr_aucs)
            print(f"Mean Recall: {mean_recall:.4f}")
            print(f"Mean PR AUC: {mean_prauc:.4f}")
            
            if mean_recall > best_recall:
                best_recall = mean_recall
                best_params_recall = {'conv_filters': conv_filter, 'dense_neurons': dense_neuron, 'dense_layers': dense_layer}
            if mean_prauc > best_prauc:
                best_prauc = mean_prauc
                best_params_prauc = {'conv_filters': conv_filter, 'dense_neurons': dense_neuron, 'dense_layers': dense_layer}

print("\nBest Hyperparameters for Stage:")
print(f" According to the best Recall :{best_recall} the best parameters is")
print(best_params_recall)
print(f" According to the best PR AUC :{best_prauc} the best parameters is")
print(best_params_prauc)


Testing: conv_filters=8, dense_neurons=8, dense_layers=1
Mean Recall: 0.8976
Mean PR AUC: 0.6350

Testing: conv_filters=8, dense_neurons=8, dense_layers=2
Mean Recall: 0.8722
Mean PR AUC: 0.8044

Testing: conv_filters=8, dense_neurons=8, dense_layers=3
Mean Recall: 0.8514
Mean PR AUC: 0.7756

Testing: conv_filters=8, dense_neurons=16, dense_layers=1
Mean Recall: 0.8984
Mean PR AUC: 0.8807

Testing: conv_filters=8, dense_neurons=16, dense_layers=2
Mean Recall: 0.8467
Mean PR AUC: 0.8455

Testing: conv_filters=8, dense_neurons=16, dense_layers=3
Mean Recall: 0.8326
Mean PR AUC: 0.8662

Testing: conv_filters=8, dense_neurons=32, dense_layers=1
Mean Recall: 0.8682
Mean PR AUC: 0.8550

Testing: conv_filters=8, dense_neurons=32, dense_layers=2
Mean Recall: 0.8336
Mean PR AUC: 0.8416

Testing: conv_filters=8, dense_neurons=32, dense_layers=3
Mean Recall: 0.9059
Mean PR AUC: 0.8154

Testing: conv_filters=16, dense_neurons=8, dense_layers=1
Mean Recall: 0.9201
Mean PR AUC: 0.8490

Testing: con

#### StratifiedKFold to select the best activity_reg and dropout_rate

In [11]:
y_labels = np.argmax(y_data_one_hot, axis=1)

param_grid = {
    'activity_regs': [0.0001, 0.0005,  0.001, 0.005, 0.01, 0.05, 0.1],
    'dropout_rates': [0.1, 0.2, 0.3, 0.4],
}

n_splits = 4
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

best_params_recall = None
best_params_prauc = None
best_recall = 0
best_prauc = 0

for activity_reg in param_grid['activity_regs']:
    for dropout_rate in param_grid['dropout_rates']:
        print(f"\nTesting: activity_regs={activity_reg}, dropout_rates={dropout_rate}")
        
        recalls = []
        pr_aucs = []
        
        for train_idx, test_idx in skf.split(x_data, y_labels):
            X_train, X_test = x_data[train_idx], x_data[test_idx]
            y_train, y_test = y_data_one_hot[train_idx], y_data_one_hot[test_idx]
            y_weight = y_labels[train_idx]
            
            model_test = build_model_test(lr=0.0001, conv_filters=32, dense_neurons=32, dense_layers=1, activity_reg=activity_reg, dropout_rate=dropout_rate)
            class_weigh = class_weights(y_weight.astype('int'))
            model_test.fit(X_train, y_train, epochs=15, batch_size=2048, class_weight=class_weigh, verbose=0)
            scores = model_test.evaluate(X_test, y_test, verbose=0)
            recalls.append(scores[3])
            pr_aucs.append(scores[4]) 
        
        mean_recall = np.mean(recalls)
        mean_prauc = np.mean(pr_aucs)
        print(f"Mean Recall: {mean_recall:.4f}")
        print(f"Mean PR AUC: {mean_prauc:.4f}")
        
        if mean_recall > best_recall:
            best_recall = mean_recall
            best_params_recall = {'activity_regs': activity_reg, 'dropout_rates': dropout_rate}
        if mean_prauc > best_prauc:
            best_prauc = mean_prauc
            best_params_prauc = {'activity_regs': activity_reg, 'dropout_rates': dropout_rate}

print("\nBest Hyperparameters for Stage:")
print(f" According to the best Recall :{best_recall} the best parameters is")
print(best_params_recall)
print(f" According to the best PR AUC :{best_prauc} the best parameters is")
print(best_params_prauc)


Testing: activity_regs=0.0001, dropout_rates=0.1
Mean Recall: 0.8495
Mean PR AUC: 0.9678

Testing: activity_regs=0.0001, dropout_rates=0.2
Mean Recall: 0.9351
Mean PR AUC: 0.9138

Testing: activity_regs=0.0001, dropout_rates=0.3
Mean Recall: 0.8579
Mean PR AUC: 0.9637

Testing: activity_regs=0.0001, dropout_rates=0.4
Mean Recall: 0.8928
Mean PR AUC: 0.9220

Testing: activity_regs=0.0005, dropout_rates=0.1
Mean Recall: 0.8985
Mean PR AUC: 0.9307

Testing: activity_regs=0.0005, dropout_rates=0.2
Mean Recall: 0.8561
Mean PR AUC: 0.9621

Testing: activity_regs=0.0005, dropout_rates=0.3
Mean Recall: 0.9031
Mean PR AUC: 0.9373

Testing: activity_regs=0.0005, dropout_rates=0.4
Mean Recall: 0.9059
Mean PR AUC: 0.9026

Testing: activity_regs=0.001, dropout_rates=0.1
Mean Recall: 0.9182
Mean PR AUC: 0.9407

Testing: activity_regs=0.001, dropout_rates=0.2
Mean Recall: 0.8626
Mean PR AUC: 0.9629

Testing: activity_regs=0.001, dropout_rates=0.3
Mean Recall: 0.8900
Mean PR AUC: 0.9267

Testing: act