# imports and data loading

In [2]:
# Execute the shared Imports notebook inside this kernel. Names used below
# without being defined here (pd, pickle, time, deepcopy, target,
# train_test_split, the sklearn metric functions, ...) are presumably provided
# by it — TODO confirm against Imports.ipynb.
%run Imports.ipynb
# Dataset identifier; it is interpolated into the pickle paths of this cell.
name = 'Kred'

# specific imports
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV


# Load the artifacts of the 2_FS stage for dataset `name`:
# - key_featsubgroups: table with (at least) the columns 'subgroup' and
#   'list_features', as read by split_data_4
# - df: the modelling dataset containing the features and the target column
key_featsubgroups = pd.read_pickle('../../pickle/2_FS/' + name + '/key_featsubgroups.pkl')
df = pd.read_pickle('../../pickle/2_FS/' + name + '/2_df_new_.pkl')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Share of each target class relative to the total number of rows
print(df[target].value_counts() / len(df))
# Overall dataset dimensions (rows, columns)
print('df_shape: ', df.shape)

arrears
1   0.646
0   0.354
Name: count, dtype: float64
df_shape:  (129457, 418)


In [4]:
# Hyper-parameter search space for scikit-learn's MLPClassifier.
# Every key is an MLPClassifier constructor argument; every value lists the
# candidates to try.
param_grid = dict(
    # a single hidden layer, with the candidate widths given below
    hidden_layer_sizes=[(width,) for width in (5, 10, 20, 50, 100, 200, 300, 406)],
    # activation function of the hidden layer
    activation=['relu', 'tanh', 'logistic'],
    # weight optimization algorithm
    solver=['adam', 'sgd'],
    # strength of the L2 regularization term
    alpha=[0.0001, 0.001, 0.01, 0.1],
    # learning rate schedule (per the scikit-learn docs only used by solver='sgd')
    learning_rate=['constant', 'adaptive'],
    # initial step size of the weight updates
    learning_rate_init=[0.001, 0.005, 0.01, 0.05],
    # upper bound on the number of training iterations (epochs)
    max_iter=[100, 200, 500, 750, 1000],
)

In [5]:
# Load a previously saved results dictionary.
# NOTE(review): `results_dict` is not referenced again by the visible cells
# (train_nn is called with `rd`) — confirm whether this load is still needed.
# pickle.load can execute arbitrary code; only load trusted project artifacts.
with open('../../pickle/4_Model_Optuna/results_dict_updated.pkl', 'rb') as f:
    results_dict = pickle.load(f)

In [6]:
# Load the object stored in xgb_default_params.pkl.
# NOTE(review): `default_params` is not referenced again by the visible cells —
# confirm whether this load is still needed in this notebook.
with open('../../pickle/3_Model/xgb_default_params.pkl', 'rb') as f:
    default_params = pickle.load(f)


In [7]:
# Load the results dictionary `rd`. It is later passed to train_nn and written
# back to this same path, so the file on disk is overwritten by that later cell.
with open('../../pickle/4_Model_Optuna/results_dict_updated_5.pkl', 'rb') as f:
    rd = pickle.load(f)

# data splitting

In [8]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Build a train/test partition of `df` with the demographic columns removed.

    The columns to exclude are read from the 'list_features' entry of the first
    row of `key_featsubgroups` whose 'subgroup' equals 'demo'.

    Parameters:
    df (DataFrame): Data holding the feature columns and the target column.
    key_featsubgroups (DataFrame): Table with the columns 'subgroup' and
        'list_features'. Defaults to the module-level object of the same name
        as bound when this function was defined.
    target (str): Name of the target column. Defaults to the module-level
        `target` as bound when this function was defined.
    test_size (float, optional): Share of rows assigned to the test set. Default is 0.2.
    random_state (int, optional): Seed forwarded to train_test_split. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test
    """

    # Select the 'demo' subgroup and take the column list of its first row.
    # The list concatenation below assumes this entry is a Python list — TODO confirm.
    demo_rows = key_featsubgroups['subgroup'] == 'demo'
    demographic_cols = key_featsubgroups.loc[demo_rows, 'list_features'].values[0]
    print("Demographic Features:", demographic_cols)

    # Model inputs: everything except the target and the demographic columns
    excluded_cols = [target] + demographic_cols
    features = df.drop(columns=excluded_cols)
    labels = df[target]

    # Random (non-stratified) split controlled by test_size and random_state
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

    # Report the size of every part of the split
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Testing Labels Shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

In [9]:
# Module-level train/test split (default test_size=0.2, random_state=42) with
# the demographic columns removed; used by the cells that fit and evaluate clf_best.
x_train, x_test, y_train, y_test = split_data_4(df)

Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']
Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)


# Task 2

In [10]:
# Load the hyper-parameters that are unpacked into MLPClassifier(**best_params, ...).
# The dict must not contain 'random_state', because the callers pass
# random_state=0 explicitly and a duplicate keyword raises TypeError.
with open('../../pickle/Monedo_7/without_optuna/best_params.pkl', 'rb') as f:
    best_params = pickle.load(f)


In [None]:
def train_nn(df, best_params, results_dict, n_repeats=30):
    """
    Train a scikit-learn MLPClassifier, evaluate it, and record the outcome.

    The data is split with split_data_4 (default arguments). The classifier is
    fitted on the training split. The "train" metrics are computed from 5-fold
    cross-validated (out-of-fold) predictions on the training split, not from
    in-sample predictions; the "test" metrics come from the fitted model's
    predictions on the test split.

    Parameters:
    df (DataFrame): Dataset passed to split_data_4.
    best_params (dict): Keyword arguments for MLPClassifier. Must not contain
        'random_state', which is fixed to 0 here.
    results_dict (dict): Dictionary that receives the results; it is modified
        in place.
    n_repeats (int, optional): Number of shuffles per feature used for the
        permutation importance. Default is 30.

    Returns:
    dict: The same `results_dict` object with the "sklearn_nn" entry added or
    replaced.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having been executed first.
    from sklearn.inspection import permutation_importance

    start_time = time.time()
    print("\nStarting sklearn neural network model training and evaluation...")

    # Split dataset
    X_train, X_test, y_train, y_test = split_data_4(df)

    # Initialize and train the MLP with the supplied parameters
    nn = MLPClassifier(**best_params, random_state=0)
    nn.fit(X_train, y_train)

    # MLPClassifier has no `feature_importances_` attribute (accessing it raises
    # AttributeError), so importance is estimated model-agnostically by
    # permuting each feature on the held-out test split.
    fi = permutation_importance(
        nn, X_test, y_test,
        n_repeats=n_repeats, random_state=0, n_jobs=-1
    ).importances_mean

    # Out-of-fold predictions on the training split (CV=5); cross_val_predict
    # fits clones, so the model fitted above is left untouched
    y_train_pred = cross_val_predict(nn, X_train, y_train, cv=5)

    # Make predictions on test set
    y_test_pred = nn.predict(X_test)

    # Compute confusion matrices
    cfm_train = confusion_matrix(y_train, y_train_pred)
    cfm_test = confusion_matrix(y_test, y_test_pred)

    # Compute accuracy scores
    accs_train = accuracy_score(y_train, y_train_pred)
    accs_test = accuracy_score(y_test, y_test_pred)

    # Compute F1-scores for both classes (0 and 1)
    f1s_train_p1 = f1_score(y_train, y_train_pred, pos_label=1)
    f1s_train_p0 = f1_score(y_train, y_train_pred, pos_label=0)
    f1s_test_p1 = f1_score(y_test, y_test_pred, pos_label=1)
    f1s_test_p0 = f1_score(y_test, y_test_pred, pos_label=0)

    # Compute ROC-AUC score for test data from the class-1 probabilities
    test_ras = roc_auc_score(y_test, nn.predict_proba(X_test)[:, 1])

    # Total runtime in minutes (includes the permutation importance above)
    total_time = (time.time() - start_time) / 60
    print(f"Sklearn neural network Model training completed in {total_time:.2f} minutes")

    # Store computed values in results dictionary
    results_dict["sklearn_nn"] = {
        "classifier": deepcopy(nn),
        "cfm_train": cfm_train,
        "cfm_test": cfm_test,
        "train_accuracy": accs_train,
        "test_accuracy": accs_test,
        "train F1-score label 1": f1s_train_p1,
        "train F1-score label 0": f1s_train_p0,
        "test F1-score label 1": f1s_test_p1,
        "test F1-score label 0": f1s_test_p0,
        "test roc auc score": test_ras,
        "best_params": best_params,
        "feature_imp": fi,
        "time_m": total_time
    }

    return results_dict

In [12]:
# Train and evaluate the MLP; train_nn stores its results under rd["sklearn_nn"]
# and returns the same dictionary object.
rd = train_nn(df, best_params, rd)


Starting XGBoost model training and evaluation...
Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']
Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)
Sklearn neural network Model training completed in 37.24 minutes


In [13]:
# Persist the extended results dictionary. This is the same path `rd` was
# loaded from, so the previous file content is overwritten.
with open('../../pickle/4_Model_Optuna/results_dict_updated_5.pkl', 'wb') as f:
    pickle.dump(rd, f, pickle.HIGHEST_PROTOCOL)

In [9]:
# Fit an MLPClassifier with the loaded best_params on the module-level training
# split. The configuration (best_params, random_state=0) is the same one
# train_nn uses, so this trains an equivalent model a second time.
clf_best = MLPClassifier(**best_params, random_state=0)
clf_best.fit(x_train, y_train)

In [10]:
# Predict the test split: hard class labels, and the predicted probability
# taken from column 1 of predict_proba (the second entry of clf_best.classes_).
y_test_pred  = clf_best.predict(x_test)
y_test_proba = clf_best.predict_proba(x_test)[:, 1]

In [11]:
# Estimate feature importance with permutation importance on the test split.
# Each feature is shuffled n_repeats=30 times; random_state=0 fixes the
# shuffles and n_jobs=-1 uses all available cores.
from sklearn.inspection import permutation_importance

perm_imp = permutation_importance(
    clf_best, x_test, y_test,
    n_repeats=30, random_state=0, n_jobs=-1
)
# Mean importance per feature over the repeats, in the column order of x_test
feature_importances = perm_imp.importances_mean

In [12]:
# Bundle the test-set predictions and the permutation importance scores

results_M7_T2 = dict(
    y_test_pred=y_test_pred,
    y_test_proba=y_test_proba,
    feature_importances=feature_importances,
)

# please update the path as needed
with open("../../pickle/Monedo_7/without_optuna/results_M7_T2.pkl", "wb") as f:
    pickle.dump(results_M7_T2, f, protocol=pickle.HIGHEST_PROTOCOL)