# Imports

In [1]:
%run Imports.ipynb
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import SGD

# Dataset identifier; it selects the sub-folder of ../pickle/2_FS/ that both pickles are read from.
name = 'Kred'

# Load the feature-subgroup mapping and the modelling DataFrame for `name`.
# `pd` is not imported in this cell — presumably it is provided by Imports.ipynb (confirm).
# NOTE(review): unpickling can execute arbitrary code; only read pickle files from a trusted source.
key_featsubgroups = pd.read_pickle('../pickle/2_FS/' + name + '/key_featsubgroups.pkl')
df = pd.read_pickle('../pickle/2_FS/' + name + '/2_df_new_.pkl')

  from .autonotebook import tqdm as notebook_tqdm
2025-06-05 15:38:37.871448: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-05 15:38:37.926252: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Class balance of the target as fractions, followed by the frame dimensions.
# normalize=True lets pandas compute the shares itself, so the printed Series is labelled
# as a proportion instead of carrying the raw-count label over onto fractional values.
# Shares are relative to the non-missing target values.
print(df[target].value_counts(normalize=True))
print('df_shape: ', df.shape)

arrears
1   0.646
0   0.354
Name: count, dtype: float64
df_shape:  (129457, 418)


# 2) Create Model prediction functions

## 2.1) Split dataset into training/testing sets while excluding demographic features

In [3]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Builds a train/test partition of ``df`` with the demographic columns removed.

    The columns to remove are read from the 'list_features' entry of the first row of
    ``key_featsubgroups`` whose 'subgroup' value equals 'demo'. The defaults for
    ``key_featsubgroups`` and ``target`` are the notebook-level objects of the same
    name, captured when this function is defined.

    Parameters:
    df (DataFrame): Data holding the feature columns and the target column.
    key_featsubgroups (DataFrame): Table with 'subgroup' and 'list_features' columns.
    target (str): Name of the target column in ``df``.
    test_size (float, optional): Share of rows assigned to the test set. Default is 0.2.
    random_state (int, optional): Seed forwarded to ``train_test_split``. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test
    """

    # Feature list stored for the 'demo' subgroup
    demo_rows = key_featsubgroups['subgroup'] == 'demo'
    demo_feat = key_featsubgroups.loc[demo_rows, 'list_features'].values[0]
    print("Demographic Features:", demo_feat)

    # Model inputs are every column except the target and the demographic ones
    excluded_columns = [target] + demo_feat
    y = df[target]
    X = df.drop(columns=excluded_columns)

    # Random split; the test share is given by `test_size`
    partition = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = partition

    # Report the dimensions of each part
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Testing Labels Shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

## 2.2) Model training and prediction

In [4]:
# Notebook-level split with the default arguments of split_data_4; x_train / y_train are
# reused by the model cells further down.
x_train, x_test, y_train, y_test = split_data_4(df)

Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']
Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)


In [None]:
def keras_nn(default_params, param_grid, df, results_dict):
    """
    Tunes a one-hidden-layer Keras binary classifier with Optuna and evaluates the best setup.

    Each trial samples one value per hyperparameter from ``param_grid``, trains the network
    on the training split and is scored by the validation accuracy of its last epoch.
    The study uses Optuna's default sampler. The best configuration is then refitted on the
    full training split and handed to ``model_pred`` for evaluation.

    Parameters:
    -----------
    default_params : dict
        Default hyperparameters (each as a list). Currently not used by this function;
        kept so the call signature stays stable.
    param_grid : dict
        Search space: maps each hyperparameter name to the list of candidate values.
        Must contain the keys 'hidden_layer_size', 'activation', 'learning_rate',
        'optimizer', 'batch_size' and 'epochs'. An 'optimizer' value of "adam" selects
        Adam; any other value selects SGD.
    df : DataFrame
        Input data with features and target; split via ``split_data_4``.
    results_dict : dict
        Dictionary to store results; passed to and replaced by the return of ``model_pred``.

    Returns:
    --------
    best_params : dict
        Best hyperparameters found by Optuna.
    results_dict : dict
        The value returned by ``model_pred``.

    Raises:
    -------
    KeyError
        If ``param_grid`` lacks one of the required hyperparameter keys.
    """
    # Fail fast with a clear message instead of an opaque error deep inside a trial.
    required_keys = ("hidden_layer_size", "activation", "learning_rate",
                     "optimizer", "batch_size", "epochs")
    missing_keys = [key for key in required_keys if key not in param_grid]
    if missing_keys:
        raise KeyError(f"param_grid is missing required hyperparameters: {missing_keys}")

    start_time = time.time()
    X_train, X_test, y_train, y_test = split_data_4(df)
    n_features = X_train.shape[1]

    # One trial per candidate value across all hyperparameters (sum, not product, of list sizes).
    n_trials = sum(len(candidates) for candidates in param_grid.values())
    print("No. of trials:", n_trials)

    def build_model(params):
        """Builds and compiles the one-hidden-layer network described by `params`."""
        model = Sequential()
        model.add(Dense(params["hidden_layer_size"], activation=params["activation"], input_shape=(n_features,)))
        model.add(Dense(1, activation="sigmoid"))

        if params["optimizer"] == "adam":
            optimizer = Adam(learning_rate=params["learning_rate"])
        else:
            optimizer = SGD(learning_rate=params["learning_rate"])
        model.compile(optimizer=optimizer, loss=BinaryCrossentropy(), metrics=["accuracy"])
        return model

    def objective(trial):
        # Drop Keras global state left by earlier trials so memory does not grow
        # with the number of models built in this loop.
        tf.keras.backend.clear_session()

        # Sample hyperparameters
        params = {k: trial.suggest_categorical(k, v) for k, v in param_grid.items()}

        model = build_model(params)
        history = model.fit(X_train, y_train, validation_split=0.2, batch_size=params["batch_size"], epochs=params["epochs"], verbose=0)

        # Score the trial by the validation accuracy of the final epoch
        return history.history["val_accuracy"][-1]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params

    # Train final model on the whole training split (no validation hold-out)
    final_model = build_model(best_params)
    final_model.fit(X_train, y_train, batch_size=best_params["batch_size"], epochs=best_params["epochs"], verbose=0)

    results_dict = model_pred(X_train, X_test, y_train, y_test, final_model, 'keras_optuna', 'opt', results_dict)

    print(f"Optimization completed in {(time.time() - start_time) / 60:.2f} minutes")
    return best_params, results_dict


# 7) Neural Network

In [None]:
def nn(x_train, n_layers: int = 1, layer_depth: int = 64, loss='binary_crossentropy', metrics=None):
    """
    Builds and compiles a feed-forward binary classifier with a variable number of hidden Dense layers.

    Every hidden layer has `layer_depth` ReLU units; the output layer is a single sigmoid
    unit. The model is compiled with the 'adam' optimizer.

    Parameters:
    x_train: Training features; only `x_train.shape[1]` is read, to set the input width.
    n_layers (int): Number of hidden Dense layers. 0 builds a model with only the output layer.
    layer_depth (int): Number of neurons in each hidden layer.
    loss: Loss passed to `compile`. Default is 'binary_crossentropy'.
    metrics: Metrics passed to `compile`. Defaults to ['accuracy', 'mse'] when None.

    Returns:
    model: The compiled Keras model.

    Raises:
    ValueError: If `n_layers` is negative.
    """
    if n_layers < 0:
        raise ValueError(f"n_layers must be >= 0, got {n_layers}")

    # Create the default list per call so no mutable object is shared between calls
    if metrics is None:
        metrics = ['accuracy', 'mse']

    # Only the first layer of the stack declares the input shape
    input_kwargs = {'input_shape': (x_train.shape[1],)}
    layers = []

    # Hidden layers
    for index in range(n_layers):
        extra = input_kwargs if index == 0 else {}
        layers.append(Dense(layer_depth, activation='relu', **extra))

    # Output layer (binary classification); it takes the input shape when there is no hidden layer
    extra = input_kwargs if n_layers == 0 else {}
    layers.append(Dense(1, activation='sigmoid', **extra))

    # Create and compile model
    model = Sequential(layers)
    model.compile(optimizer='adam', loss=loss, metrics=metrics)

    return model

## 7.1) NN without tuning 

In [7]:
# Baseline: a single Dense unit (no activation) applied directly to the input features.
# The input width is read from x_train rather than hard-coded, so the cell keeps working
# when the number of feature columns changes.
model = Sequential([
    Dense(1, input_shape=(x_train.shape[1],))
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 407       
                                                                 
Total params: 407 (1.59 KB)
Trainable params: 407 (1.59 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


2025-06-05 15:39:27.639841: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [8]:
# Forward pass of the untrained baseline over the whole training set.
# Model.predict feeds the data through in batches, which Keras recommends over calling the
# model object directly on large inputs; the result is a NumPy array of per-row outputs.
output = model.predict(x_train, verbose=0)

KeyboardInterrupt: 

## 7.2) NN with tuning

In [9]:
# Build the network from nn() with all optional arguments left at their defaults.
n = nn(x_train)

In [10]:
# Print the layer/parameter overview, then train for 50 epochs with mini-batches of 15 rows,
# holding out 20% of the training data for validation (validation_split=0.2).
n.summary()
n.fit(x_train, y_train, epochs=50, batch_size=15, validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 64)                26048     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 26113 (102.00 KB)
Trainable params: 26113 (102.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


KeyboardInterrupt: 