Подготовка данных

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Seed the legacy global NumPy RNG so the shuffle below (and therefore the
# train/validation split) is reproducible.
np.random.seed(42)

# Forward slashes are accepted on Windows, Linux and macOS alike; the previous
# backslash-separated path only resolved on Windows.
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', index_col='id')

# Features are every column except the target.
X = df.drop(columns=['stroke'])
y = df['stroke'].copy()

# Encode the two-valued categorical columns as 0/1.
# NOTE(review): 'Other' is mapped to the same code as 'Male' - confirm this is intended.
X['gender'] = X['gender'].map({'Male': 1, 'Female': 0, 'Other': 1})
X['Residence_type'] = X['Residence_type'].map({'Urban': 1, 'Rural': 0})
X['ever_married'] = X['ever_married'].map({'Yes': 1, 'No': 0})

# One-hot encode the multi-valued categorical columns.
X = pd.get_dummies(X, columns=['work_type', 'smoking_status'], dtype=float)

# Train/val split: shuffle the row labels and keep the first 80% for training.
obj_ind = X.index.tolist()
np.random.shuffle(obj_ind)
num_train = int(0.8*len(obj_ind))
train_ind, val_ind = obj_ind[:num_train], obj_ind[num_train:]

# Fill missing BMI values with the median of the *training* rows only, so no
# statistic computed from validation rows leaks into preprocessing. The same
# value is applied to every row, so X itself still ends up without missing BMI.
bmi_median = X.loc[train_ind, 'bmi'].median()
X['bmi'] = X['bmi'].fillna(value=bmi_median)

# Explicit label-based selection for both the features and the target.
X_train, y_train = X.loc[train_ind, :], y.loc[train_ind]
X_val, y_val = X.loc[val_ind, :], y.loc[val_ind]

X_train.head()

Реализация градиентного бустинга

In [None]:
class GradientBoosting:
    """Binary classifier built as an additive ensemble of regression trees.

    The model keeps a raw score per sample (starting at zero), turns it into a
    probability with a sigmoid, and at every boosting step fits a
    ``DecisionTreeRegressor`` to the anti-gradient of the cross-entropy loss
    with respect to that raw score.

    Parameters
    ----------
    num_trees : int
        Number of boosting steps (one tree is fitted per step).
    max_depth : int
        ``max_depth`` passed to every ``DecisionTreeRegressor``.
    lr : float
        Multiplier applied to each tree's output before it is added to the
        raw score, both in ``fit`` and in ``predict``.

    Attributes
    ----------
    trees_list : list
        Fitted trees, in the order they were trained.
    loss_train : list
        Mean training cross-entropy after each boosting step (set by ``fit``).
    """

    def __init__(self, num_trees=100, max_depth=2, lr=0.1):
        self.num_trees = num_trees
        self.max_depth = max_depth
        self.lr = lr
        self.trees_list = []

    @staticmethod
    def sigmoid(x):
        """Return ``1 / (1 + exp(-x))`` element-wise without overflowing.

        Only ``exp(-|x|)`` is ever evaluated, so large-magnitude inputs
        saturate to 0 or 1 instead of overflowing inside ``np.exp``.
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))  # always in (0, 1]
        out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # Hand back a plain scalar for scalar input rather than a 0-d array.
        return out if out.ndim else out.item()

    @staticmethod
    def ce_loss(y_true, probs, get_grad=False):
        """Per-sample binary cross-entropy, or its derivative w.r.t. ``probs``.

        Parameters
        ----------
        y_true : array-like of 0/1 labels (must support NumPy arithmetic).
        probs : predicted probabilities of the positive class.
        get_grad : bool
            If True, return dL/dprobs (with a 1e-8 guard in the denominators)
            instead of the loss values.

        Returns
        -------
        Array with one loss (or gradient) value per sample.
        """
        if get_grad:
            return -y_true/(probs+1e-8) + (1-y_true)/(1-probs+1e-8)
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid would
        # otherwise give log(0) = -inf and 0 * -inf = nan in the loss.
        safe_probs = np.clip(probs, 1e-8, 1 - 1e-8)
        return -y_true*np.log(safe_probs) - (1-y_true)*np.log(1-safe_probs)

    def fit(self, X_train, y_train):
        """Train ``num_trees`` regression trees on ``(X_train, y_train)``.

        Any previously fitted trees are discarded, so calling ``fit`` again
        retrains the model from scratch. ``self.loss_train`` receives the mean
        training loss after every boosting step.
        """
        targets = np.asarray(y_train, dtype=float)

        # Reset state so a refit does not stack new trees onto an old ensemble.
        self.trees_list = []
        self.loss_train = []

        # initial prediction: raw score 0 for every sample (probability 0.5)
        preds_raw = np.zeros(len(X_train))
        probs = GradientBoosting.sigmoid(preds_raw)

        for _ in range(self.num_trees):
            # Anti-gradient of the loss w.r.t. the RAW score, which is what the
            # trees are added to: -dL/dF = -(dL/dp) * p * (1 - p) = y - p.
            # Using dL/dp directly would give targets that grow like 1/p.
            anti_grad = targets - probs
            # train regressor on (X, -g)
            tree = DecisionTreeRegressor(max_depth=self.max_depth, criterion='squared_error')
            tree.fit(X_train, anti_grad)
            self.trees_list.append(tree)
            # update raw predictions and probabilities
            preds_raw += self.lr * np.asarray(tree.predict(X_train))
            probs = GradientBoosting.sigmoid(preds_raw)
            self.loss_train.append(
                np.mean(GradientBoosting.ce_loss(targets, probs))
            )

    def predict(self, X_test, threshold=0.1):
        """Return ``(probs, pred_labels)`` for the rows of ``X_test``.

        Parameters
        ----------
        X_test : samples to score; passed unchanged to every tree's ``predict``.
        threshold : float
            A sample is labelled 1 when its probability is strictly greater
            than this value. Defaults to 0.1, the cut-off previously
            hard-coded here.

        Returns
        -------
        probs : ndarray of positive-class probabilities.
        pred_labels : ndarray of 0/1 integer labels.
        """
        preds_raw = np.zeros(len(X_test))
        for tree in self.trees_list:
            preds_raw += self.lr * np.asarray(tree.predict(X_test))

        probs = GradientBoosting.sigmoid(preds_raw)
        pred_labels = (probs > threshold).astype(int)

        return probs, pred_labels


In [None]:
import itertools
from sklearn.metrics import roc_auc_score

# Hyperparameter tuning: exhaustive search over the grid below, scored by
# validation ROC AUC computed on the predicted probabilities.
grid = {
    'max_depth': list(range(1, 10, 1)),
    'num_trees': list(range(50, 200, 20)),
}
# Start below any attainable score so the first evaluated combination is
# always recorded and best_param cannot stay None after the loop.
best_metr = float('-inf')
best_param = None
param_comb = list(itertools.product(*grid.values()))

for comb in param_comb:
    # Pair every value with its grid key so the constructor arguments are
    # passed by name rather than by position in the tuple.
    params = dict(zip(grid, comb))
    gbdt_custom = GradientBoosting(**params)
    gbdt_custom.fit(X_train, y_train)
    preds_probs, preds_labels = gbdt_custom.predict(X_val)

    # AUC on thresholded labels is printed for reference only; model selection
    # uses the probability-based AUC.
    metr = roc_auc_score(y_val, preds_labels)
    metr_probs = roc_auc_score(y_val, preds_probs)

    # save best metric and parameters
    if metr_probs > best_metr:
        # The score is an immutable scalar: no copy is needed, and calling
        # .copy() fails outright when it is a built-in float.
        best_metr = metr_probs
        best_param = comb

    print("metr: " , metr)
    print("metr_prob: ", metr_probs)

print(best_metr, best_param)

In [None]:
# Refit the custom booster with the winning (max_depth, num_trees) pair
best_depth, best_num_trees = best_param
gbdt_custom = GradientBoosting(num_trees=best_num_trees, max_depth=best_depth)
gbdt_custom.fit(X_train, y_train)

Scikit Gradient Boosting

In [None]:
import itertools
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Start below any attainable score so the first evaluated combination is
# always recorded and best_param cannot stay None after the loop.
best_metr = float('-inf')
best_param = None

# Hyperparameter tuning: exhaustive search over the grid below, scored by
# validation ROC AUC on the predicted positive-class probabilities.
grid = {
    'max_depth': list(range(1, 10, 1)),
    'n_estimators': list(range(50, 200, 20)),
}
param_comb = list(itertools.product(*grid.values()))

for comb in param_comb:
    # Pass the hyperparameters by name instead of by tuple position.
    # NOTE: despite the `rf_` prefix this is a gradient-boosting classifier;
    # the variable name is kept unchanged in case other cells refer to it.
    rf_scikit = GradientBoostingClassifier(**dict(zip(grid, comb)))
    rf_scikit.fit(X_train, y_train)
    # Column 1 of predict_proba is taken as the positive-class probability.
    preds = rf_scikit.predict_proba(X_val)[:, 1]

    metr = roc_auc_score(y_val, preds)

    if metr > best_metr:
        # The score is an immutable scalar: no copy is needed, and calling
        # .copy() fails outright when it is a built-in float.
        best_metr = metr
        best_param = comb

    print(metr)

print(best_metr, best_param)