HistGradientBoostingClassifier

In [1]:
from functions_PR_7 import *

In [2]:
#from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('data/data_reduced_ML.csv')

In [5]:
X = df.drop('Bankrupt', axis=1).copy()
y = df['Bankrupt'].copy()

## Run model for imbalanced data

In [10]:
X_train, X_test, y_train, y_test = split_scale(X, y, test_size=0.2, random_state=4576)

clf = HistGradientBoostingClassifier(random_state=4576)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [13]:
scores = model_score(y_test, y_pred)

display(scores)

print(confusion_matrix(y_test, y_pred))

Unnamed: 0,precission,accuracy,recall,f1_score
0,0.636,0.966,0.14,0.23


[[1310    4]
 [  43    7]]


In [8]:
y_train.value_counts()

Bankrupt
0    5285
1     170
Name: count, dtype: int64

In [9]:
y_test.value_counts()

Bankrupt
0    1314
1      50
Name: count, dtype: int64

## Uscale data and then run the model

In [14]:
X_train_up, y_train_up = data_resampling('up', X_train, y_train)

clf_up = HistGradientBoostingClassifier(random_state=4576)

clf_up.fit(X_train_up, y_train_up)

y_pred_up = clf_up.predict(X_test)

score_up = model_score(y_test, y_pred_up)

In [23]:
X_train_up.shape

(10570, 18)

In [15]:
y_train.value_counts()

Bankrupt
0    5285
1     170
Name: count, dtype: int64

In [16]:
y_train_up.value_counts()

Bankrupt
0    5285
1    5285
Name: count, dtype: int64

In [17]:
y_test.value_counts()

Bankrupt
0    1314
1      50
Name: count, dtype: int64

In [18]:
display(score_up)

print(confusion_matrix(y_test, y_pred_up))

Unnamed: 0,precission,accuracy,recall,f1_score
0,0.444,0.96,0.32,0.372


[[1294   20]
 [  34   16]]


LR results with upscaled data

## Calibrating the model

In [20]:
from sklearn.calibration import CalibratedClassifierCV

In [21]:
initial_model = HistGradientBoostingClassifier(random_state=4576)

calibrated_model = CalibratedClassifierCV(initial_model, method='sigmoid', cv='prefit')

initial_model.fit(X_train_up, y_train_up)

calibrated_model.fit(X_train_up, y_train_up)

y_pred_calibrated = calibrated_model.predict(X_test)

scores_calibrated = model_score(y_test, y_pred_calibrated)

In [22]:
print("Scores for calibrated model (with upscaled training data)")
display(scores_calibrated)
print(confusion_matrix(y_test, y_pred_calibrated))

Scores for calibrated model (with upscaled training data)


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.75,0.968,0.18,0.29


[[1311    3]
 [  41    9]]


In [23]:
y_train_up.value_counts()

Bankrupt
0    5285
1    5285
Name: count, dtype: int64

## Hyperparameter search

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

**Tuning f1 score**

In [34]:
# Define the parameter grid for grid search
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'max_leaf_nodes': [15, 31, 63],
    'max_bins': [50, 100, 200, 250],
    'l2_regularization': [0.0, 0.1, 0.5],
    'random_state': [42]
}

scorers = {
    'f1_score': make_scorer(f1_score),
}


# Create the HistGradientBoostingClassifier
hgc = HistGradientBoostingClassifier(max_iter=500)

# Perform grid search with cross-validation
grid_search = GridSearchCV(hgc, param_grid, cv=5, scoring=scorers, refit='f1_score', n_jobs=-1)

grid_search.fit(X_train_up, y_train_up)

In [48]:
best_params = grid_search.best_params_

best_model = grid_search.best_estimator_

best_model.fit(X_train_up, y_train_up)

y_pred_best = best_model.predict(X_test)

In [52]:
best_scores = model_score(y_test, y_pred_best)

print("Scores after hypertuning of f1_score (with upscaled training data)")
display(best_scores)
print(confusion_matrix(y_test, y_pred_best))

Scores after hypertuning of f1_score (with upscaled training data)


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.579,0.966,0.22,0.319


[[1306    8]
 [  39   11]]


In [38]:
grid_search.best_estimator_

In [None]:
loss='log_loss',
    *,
    learning_rate=0.1,
    max_iter=100,
    max_leaf_nodes=31,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0.0,
    max_features=1.0,
    max_bins=255,
    categorical_features='warn',
    monotonic_cst=None,
    interaction_cst=None,
    warm_start=False,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
    verbose=0,
    random_state=None,
    class_weight=None,

**Tuning recall**

In [44]:
from sklearn.metrics import recall_score

In [45]:
# Define the parameter grid for grid search
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'max_leaf_nodes': [15, 31, 63],
    'max_bins': [50, 100, 200, 250],
    'l2_regularization': [0.0, 0.1, 0.5],
    'random_state': [42]
}

scorers = {
    'recall': make_scorer(recall_score),
}


# Create the HistGradientBoostingClassifier
hgc_2 = HistGradientBoostingClassifier(max_iter=500)

# Perform grid search with cross-validation
grid_search_2 = GridSearchCV(hgc_2, param_grid, cv=5, scoring=scorers, refit='recall', n_jobs=-1)

In [46]:
grid_search_2.fit(X_train_up, y_train_up)

In [47]:
grid_search_2.best_params_

{'l2_regularization': 0.0,
 'learning_rate': 0.01,
 'max_bins': 50,
 'max_depth': 7,
 'max_leaf_nodes': 15,
 'random_state': 42}

In [51]:
best_model_2 = grid_search_2.best_estimator_

best_model_2.fit(X_train_up, y_train_up)

y_pred_best_2 = best_model_2.predict(X_test)

In [54]:
best_scores_2 = model_score(y_test, y_pred_best_2)

print("Scores after hypertuning of recall (with upscaled training data)")
display(best_scores_2)
print(confusion_matrix(y_test, y_pred_best_2))

Scores after hypertuning of recall (with upscaled training data)


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.308,0.93,0.72,0.431


[[1233   81]
 [  14   36]]
