# Imports

In [30]:
%run Imports.ipynb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

name = 'Kred'

# Both inputs are read from the '../pickle/2_FS/<name>/' folder.
key_featsubgroups = pd.read_pickle(f'../pickle/2_FS/{name}/key_featsubgroups.pkl')
df = pd.read_pickle(f'../pickle/2_FS/{name}/2_df_new_.pkl')

In [31]:
# Display the installed NumPy version so it is recorded in the cell output.
np.__version__

'2.2.0'

In [32]:
# Share of rows per target class (counts divided by the total row count),
# followed by the overall dimensions of the frame.
print(df[target].value_counts().div(len(df)))
print('df_shape: ', df.shape)

arrears
1   0.646
0   0.354
Name: count, dtype: float64
df_shape:  (129457, 418)


In [33]:
# Search space: one list of candidate values per tunable setting.
param_grid = dict(
    hidden_layer_size=[16, 32, 64, 128, 200, 256, 300, 350, 400, 406],
    activation=["relu", "tanh"],
    optimizer=["adam", "sgd"],
    learning_rate=[0.001, 0.005, 0.01, 0.05],
    batch_size=[16, 32, 64],
    epochs=[10, 20, 30],
)


In [34]:
# Load the previously pickled results dictionary; it is later passed to keras_nn,
# which returns an updated version of it.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('../pickle/4_Model_Optuna/results_dict_updated.pkl', 'rb') as f:
    results_dict = pickle.load(f)

In [35]:
# Load a pickled parameter set from the '../pickle/3_Model/' folder.
# NOTE(review): `default_params` is not referenced by any cell visible in this
# section - confirm it is still needed here.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('../pickle/3_Model/xgb_default_params.pkl', 'rb') as f:
    default_params = pickle.load(f)


# 2) Create Model prediction functions

## 2.1) Split dataset into train/test sets while excluding demographic features

In [28]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Produce a train/test split of `df` with the demographic columns removed.

    The demographic column names are looked up in `key_featsubgroups`: the first
    row whose 'subgroup' equals 'demo' supplies them via its 'list_features' entry.
    Those columns, together with the target column, are dropped from the feature
    matrix before splitting.

    Parameters:
    df (DataFrame): Full dataset holding both the features and the target column.
    key_featsubgroups (DataFrame): Table with 'subgroup' and 'list_features' columns.
    target (str): Name of the target column in `df`.
    test_size (float, optional): Fraction of rows placed in the test set. Default is 0.2.
    random_state (int, optional): Seed forwarded to train_test_split. Default is 42.

    Returns:
    tuple: (X_train, X_test, y_train, y_test)
    """

    # Look up the column names registered under the 'demo' subgroup
    is_demo_row = key_featsubgroups['subgroup'] == 'demo'
    demo_feat = key_featsubgroups.loc[is_demo_row, 'list_features'].iloc[0]
    print("Demographic Features:", demo_feat)

    # Target first, then every feature that is neither the target nor demographic
    y = df[target]
    excluded_columns = [target] + demo_feat
    X = df.drop(columns=excluded_columns)

    # Random split controlled by test_size / random_state
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )

    # Report the dimensions of each piece
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Testing Labels Shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

## 2.2) Model training and prediction

In [29]:
# Split with the function's default settings (test_size=0.2, random_state=42).
# Note: keras_nn calls split_data_4(df) itself, so it does not read these
# module-level variables.
x_train, x_test, y_train, y_test = split_data_4(df)

Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']
Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)


# 7) Neural Network

In [None]:
def keras_nn(param_grid, df, results_dict):
    """
    Tunes a one-hidden-layer Keras binary classifier with Optuna, then refits the
    best configuration on the full training split and evaluates it.

    The network has:
    - One input per feature column of the training split (derived from the data)
    - One hidden layer (size and activation tunable)
    - One output neuron with sigmoid activation (binary classification)

    Each trial is scored by the validation accuracy of its last epoch, using a
    20% hold-out taken from the training split via Keras' `validation_split`.

    Parameters
    ----------
    param_grid : dict
        Search space. Must contain the keys "hidden_layer_size", "activation",
        "optimizer", "learning_rate", "batch_size" and "epochs", each mapped to
        a non-empty list of candidate values (sampled as categorical choices).
    df : DataFrame
        Input dataset; it is split with `split_data_4(df)`.
    results_dict : dict
        Evaluation results dictionary, forwarded to `model_pred`.

    Returns
    -------
    best_params : dict
        Best-found parameters by Optuna.
    results_dict : dict
        The value returned by `model_pred` for the final model.

    Raises
    ------
    ValueError
        If `param_grid` yields zero trials (it is empty or all lists are empty).
    """
    # Local import: only needed here, to release Keras state between trials.
    from tensorflow.keras import backend as keras_backend

    # Trial budget = total number of candidate values across all settings.
    n_trials = sum(len(candidates) for candidates in param_grid.values())
    if n_trials == 0:
        raise ValueError("param_grid must contain at least one candidate value")

    X_train, X_test, y_train, y_test = split_data_4(df)

    # Size the input layer from the data instead of hard-coding the column count,
    # so the function keeps working when the feature set changes.
    n_features = X_train.shape[1]

    print("No. of trials:", n_trials)

    def build_model(params):
        # Single place that turns a parameter dict into a compiled network;
        # used for both the trial models and the final model.
        model = Sequential()
        model.add(Dense(params["hidden_layer_size"], activation=params["activation"],
                        input_shape=(n_features,)))
        model.add(Dense(1, activation="sigmoid"))

        # Any optimizer name other than "adam" falls back to SGD.
        if params["optimizer"] == "adam":
            optimizer = Adam(learning_rate=params["learning_rate"])
        else:
            optimizer = SGD(learning_rate=params["learning_rate"])

        model.compile(optimizer=optimizer, loss=BinaryCrossentropy(), metrics=["accuracy"])
        return model

    def objective(trial):
        # Drop state left over from the previous trial's model so memory use
        # does not grow with the number of trials.
        keras_backend.clear_session()

        params = {k: trial.suggest_categorical(k, v) for k, v in param_grid.items()}
        model = build_model(params)

        history = model.fit(X_train, y_train, epochs=params["epochs"], batch_size=params["batch_size"],
                            validation_split=0.2, verbose=0)

        # Score = validation accuracy after the last epoch (not the best epoch).
        return history.history["val_accuracy"][-1]

    start_time = time.time()
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params

    # Build and train final model on the whole training split (no validation hold-out)
    keras_backend.clear_session()
    final_model = build_model(best_params)
    final_model.fit(X_train, y_train, batch_size=best_params["batch_size"], epochs=best_params["epochs"], verbose=0)

    # Evaluate and log
    # NOTE(review): model_pred is defined outside this section; presumably it scores
    # the model on both splits and stores the metrics in results_dict - confirm.
    results_dict = model_pred(X_train, X_test, y_train, y_test, final_model, 'keras_optuna', 'opt', results_dict)
    print(f"Optimization completed in {(time.time() - start_time) / 60:.2f} minutes")

    return best_params, results_dict


In [None]:
# Run the Optuna search and final refit; results_dict is rebound to the value
# returned by keras_nn.
best_params, results_dict = keras_nn(param_grid, df, results_dict)