# Imports

In [15]:
%run Imports.ipynb
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    f1_score,
    roc_auc_score
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD

name = 'Kred'

key_featsubgroups = pd.read_pickle('../pickle/2_FS/' + name + '/key_featsubgroups.pkl')
df = pd.read_pickle('../pickle/2_FS/' + name + '/2_df_new_.pkl')

In [16]:
np.__version__

'2.1.3'

In [17]:
print(df[target].value_counts()/df.shape[0])
print('df_shape: ', df.shape)

arrears
1   0.646
0   0.354
Name: count, dtype: float64
df_shape:  (129457, 418)


In [18]:
param_grid = {
    "hidden_layer_size": [16, 32, 64, 128, 200, 256, 300, 350, 400, 406],
    "activation": ["relu", "tanh"],
    "optimizer": ["adam", "sgd"],
    "learning_rate": [0.001, 0.005, 0.01, 0.05],
    "batch_size": [16, 32, 64],
    "epochs": [10, 20, 30]
}


In [None]:
# params grid to test function
pg = {
    "hidden_layer_size": [16, 32],
    "activation": ["relu", "tanh"],
    "optimizer": ["adam", "sgd"],
    "learning_rate": [0.05],
    "batch_size": [16],
    "epochs": [10]
}


In [19]:
with open('../pickle/4_Model_Optuna/results_dict_updated.pkl', 'rb') as f:
    results_dict = pickle.load(f)

In [20]:
with open('../pickle/3_Model/xgb_default_params.pkl', 'rb') as f:
    default_params = pickle.load(f)


# 2) Create Model prediction functions

## 2.1) Split dataset into train/testing while excluding demographic features

In [21]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Splits the dataset into training and testing sets while excluding demographic features.

    Parameters:
    df (DataFrame): The dataset containing features and target variable.
    key_featsubgroups (DataFrame): A mapping of feature subgroups.
    target (str): The name of the target variable.
    test_size (float, optional): The proportion of the dataset to allocate for testing. Default is 0.2.
    random_state (int, optional): Random seed for reproducibility. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test (training and testing datasets)
    """

    # Extract demographic features
    demo_feat = key_featsubgroups.loc[key_featsubgroups['subgroup'] == 'demo', 'list_features'].values[0]
    print("Demographic Features:", demo_feat)

    # Separate features (X) and target variable (y), excluding demographic features
    X = df.drop(columns=[target] + demo_feat)
    y = df[target]

    # Split the dataset into training (80%) and testing (20%) sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Print dataset shapes
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Testing Labels Shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

## 2.2) Model training and prediction

In [22]:
x_train, x_test, y_train, y_test = split_data_4(df)

Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']
Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)


In [23]:
def keras_nn(param_grid, df, results_dict):
    """
    Tunes and trains a single-hidden-layer Keras NN via Optuna,
    evaluates performance, and stores results in the given dictionary.

    Architecture:
    - Input layer: 406 features
    - Hidden layer: size & activation from param_grid
    - Output layer: 1 neuron, sigmoid

    Parameters:
    -----------
    param_grid : dict
        Search space for Optuna. Keys: 
        "hidden_layer_size", "activation", "optimizer",
        "learning_rate", "batch_size", "epochs"
    df : DataFrame
        Dataset containing features and the target.
    results_dict : dict
        Dictionary to store model performance metrics.

    Returns:
    --------
    best_params : dict
        Best parameters found by Optuna.
    results_dict : dict
        Updated results dictionary containing model evaluation metrics.
    """
    start_time = time.time()
    print("\nStarting Optuna hyperparameter search for Keras NN...")

    # split_data_4 must return pandas DataFrames/Series
    X_train, X_test, y_train, y_test = split_data_4(df)

    # total trials = sum of grid sizes
    n_trials = int(np.sum([len(v) for v in param_grid.values()]))
    print(f"No. of trials scheduled: {n_trials}")

    def objective(trial):
        # sample hyperparameters
        params = {k: trial.suggest_categorical(k, v)
                  for k, v in param_grid.items()}

        # build model
        model = Sequential([
            Dense(params["hidden_layer_size"],
                  activation=params["activation"],
                  input_shape=(X_train.shape[1],)),
            Dense(1, activation="sigmoid")
        ])

        # choose optimizer
        optimizer = (Adam(params["learning_rate"])
                     if params["optimizer"] == "adam"
                     else SGD(params["learning_rate"]))

        model.compile(
            optimizer=optimizer,
            loss="binary_crossentropy",
            metrics=["accuracy"]
        )

        # train & return validation accuracy
        history = model.fit(
            X_train, y_train,
            epochs=params["epochs"],
            batch_size=params["batch_size"],
            validation_split=0.2,
            verbose=0
        )
        return history.history["val_accuracy"][-1]

    # run Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    print("Best hyperparameters found:", best_params)

    # build & train final model
    final_model = Sequential([
        Dense(best_params["hidden_layer_size"],
              activation=best_params["activation"],
              input_shape=(X_train.shape[1],)),
        Dense(1, activation="sigmoid")
    ])
    final_optimizer = (Adam(best_params["learning_rate"])
                       if best_params["optimizer"] == "adam"
                       else SGD(best_params["learning_rate"]))
    final_model.compile(
        optimizer=final_optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    final_model.fit(
        X_train, y_train,
        epochs=best_params["epochs"],
        batch_size=best_params["batch_size"],
        verbose=0
    )

    # cross-validated predictions on train set using iloc
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    y_train_pred = np.zeros(len(y_train), dtype=int)
    for train_idx, val_idx in skf.split(X_train, y_train):
        fold_X_tr = X_train.iloc[train_idx]
        fold_y_tr = y_train.iloc[train_idx]
        fold_X_val = X_train.iloc[val_idx]

        # re-build & train for this fold
        fold_model = Sequential([
            Dense(best_params["hidden_layer_size"],
                  activation=best_params["activation"],
                  input_shape=(X_train.shape[1],)),
            Dense(1, activation="sigmoid")
        ])
        fold_model.compile(
            optimizer=final_optimizer,
            loss="binary_crossentropy",
            metrics=["accuracy"]
        )
        fold_model.fit(
            fold_X_tr, fold_y_tr,
            epochs=best_params["epochs"],
            batch_size=best_params["batch_size"],
            verbose=0
        )

        preds = (fold_model.predict(fold_X_val)[:, 0] > 0.5).astype(int)
        y_train_pred[val_idx] = preds

    # test set predictions
    y_test_proba = final_model.predict(X_test)[:, 0]
    y_test_pred  = (y_test_proba > 0.5).astype(int)

    # compute metrics
    cfm_train    = confusion_matrix(y_train, y_train_pred)
    cfm_test     = confusion_matrix(y_test,  y_test_pred)
    acc_train    = accuracy_score(y_train, y_train_pred)
    acc_test     = accuracy_score(y_test,  y_test_pred)
    f1_train_1   = f1_score(y_train, y_train_pred, pos_label=1)
    f1_train_0   = f1_score(y_train, y_train_pred, pos_label=0)
    f1_test_1    = f1_score(y_test,  y_test_pred,  pos_label=1)
    f1_test_0    = f1_score(y_test,  y_test_pred,  pos_label=0)
    roc_auc_test = roc_auc_score(y_test, y_test_proba)

    total_time = (time.time() - start_time) / 60
    print(f"Keras NN optimization & training completed in {total_time:.2f} minutes")

    # store everything
    results_dict["keras_nn"] = {
        "model":                  deepcopy(final_model),
        "cfm_train":              cfm_train,
        "cfm_test":               cfm_test,
        "train_accuracy":         acc_train,
        "test_accuracy":          acc_test,
        "train F1-score label 1": f1_train_1,
        "train F1-score label 0": f1_train_0,
        "test F1-score label 1":  f1_test_1,
        "test F1-score label 0":  f1_test_0,
        "test roc auc score":     roc_auc_test,
        "param_grid":             param_grid,
        "best_params":            best_params,
        "time_m":                 total_time
    }

    return best_params, results_dict

# 7) Neural Network

In [26]:
best_params, results_dict = keras_nn(pg, df, results_dict)


Starting Optuna hyperparameter search for Keras NN...
Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']


[I 2025-06-10 15:30:49,140] A new study created in memory with name: no-name-048c60b4-c1ff-4b08-a53e-51c41e969a58


Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)
No. of trials scheduled: 9


[I 2025-06-10 15:32:09,709] Trial 0 finished with value: 0.35948437452316284 and parameters: {'hidden_layer_size': 32, 'activation': 'tanh', 'optimizer': 'sgd', 'learning_rate': 0.05, 'batch_size': 16, 'epochs': 10}. Best is trial 0 with value: 0.35948437452316284.
[I 2025-06-10 15:33:32,710] Trial 1 finished with value: 0.6419156789779663 and parameters: {'hidden_layer_size': 32, 'activation': 'tanh', 'optimizer': 'adam', 'learning_rate': 0.05, 'batch_size': 16, 'epochs': 10}. Best is trial 1 with value: 0.6419156789779663.
[I 2025-06-10 15:34:54,533] Trial 2 finished with value: 0.6420605182647705 and parameters: {'hidden_layer_size': 16, 'activation': 'tanh', 'optimizer': 'adam', 'learning_rate': 0.05, 'batch_size': 16, 'epochs': 10}. Best is trial 2 with value: 0.6420605182647705.
[I 2025-06-10 15:36:13,164] Trial 3 finished with value: 0.3580842912197113 and parameters: {'hidden_layer_size': 16, 'activation': 'relu', 'optimizer': 'sgd', 'learning_rate': 0.05, 'batch_size': 16, 'ep

Best hyperparameters found: {'hidden_layer_size': 16, 'activation': 'tanh', 'optimizer': 'adam', 'learning_rate': 0.05, 'batch_size': 16, 'epochs': 10}


NotImplementedError: numpy() is only available when eager execution is enabled.

In [None]:
with open('../pickle/7_Model/keras_nn_results.pkl', 'wb') as f:
    pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open('../pickle/7_Model/keras_nn_best_params.pkl', 'wb') as f:
    pickle.dump(best_params, f, pickle.HIGHEST_PROTOCOL)