Подготовка данных

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Seed NumPy's global RNG so the shuffle below is reproducible.
np.random.seed(42)
# Forward slashes are accepted on Windows as well as POSIX; a
# backslash-separated literal only resolves on Windows.
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', index_col='id')

# Separate the features from the 'stroke' target column.
X = df.drop(columns=['stroke'])
y = df['stroke'].copy()

# Encode the string-valued columns as 0/1.
# NOTE: 'Other' is deliberately folded into the same code as 'Male'; any
# value missing from a mapping becomes NaN.
X['gender'] = X['gender'].map({'Male': 1, 'Female': 0, 'Other': 1})
X['Residence_type'] = X['Residence_type'].map({'Urban': 1, 'Rural': 0})
X['ever_married'] = X['ever_married'].map({'Yes': 1, 'No': 0})

# One-hot encode the multi-valued categorical columns.
X = pd.get_dummies(X, columns=['work_type', 'smoking_status'], dtype=float)
# NOTE(review): no missing-value handling is done here - confirm that the
# feature columns contain no NaN, or that the installed scikit-learn tree
# estimators accept NaN inputs.

# Train/val split: shuffle the row labels and cut at 80 %.
obj_ind = X.index.tolist()
np.random.shuffle(obj_ind)
num_train = int(0.8 * len(obj_ind))
train_ind, val_ind = obj_ind[:num_train], obj_ind[num_train:]

# Use .loc for the target too, so selection is explicitly label-based.
X_train, y_train = X.loc[train_ind, :], y.loc[train_ind]
X_val, y_val = X.loc[val_ind, :], y.loc[val_ind]

X_train.head()

Реализация случайного леса

In [None]:
def get_bootstrap_sample(X, y):
    """Draw a bootstrap resample of ``(X, y)`` plus its out-of-bag rows.

    Index labels of ``X`` are sampled with replacement (as many draws as
    there are rows) using NumPy's global RNG. Labels that were never drawn
    form the out-of-bag (OOB) set.

    Args:
        X: Feature table; rows are selected via ``X.loc``.
        y: Targets, indexed with the same labels as ``X``.

    Returns:
        A 4-tuple ``(X_bootstrap, y_bootstrap, X_oob, y_oob)``.
    """
    all_labels = X.index.tolist()
    n_rows = len(all_labels)
    drawn = np.random.choice(all_labels, size=n_rows, replace=True)

    # Labels that never appeared among the draws are the out-of-bag rows.
    leftover = set(all_labels) - set(drawn)
    oob_labels = np.array(list(leftover))

    X_boot, y_boot = X.loc[drawn, :], y[drawn]
    X_oob, y_oob = X.loc[oob_labels, :], y[oob_labels]
    return X_boot, y_boot, X_oob, y_oob

In [None]:
class RandomForest:
    """Bagged ensemble of scikit-learn decision trees with majority voting.

    Every tree is a ``DecisionTreeClassifier`` fitted on a bootstrap resample
    obtained from ``get_bootstrap_sample``, using ``max_features='sqrt'`` and
    ``class_weight='balanced'``.

    Attributes set by the code below:
        num_trees: Number of trees built by ``fit``.
        max_depth: Depth limit passed to every tree.
        random_forest: List of fitted trees (empty before ``fit``).
        oob_error_: Mean per-tree out-of-bag error of the last ``fit`` call,
            ``None`` before fitting, NaN if no tree had out-of-bag rows.
    """

    def __init__(self, num_trees=100, max_depth=20):
        self.num_trees = num_trees
        self.max_depth = max_depth
        self.random_forest = []
        self.oob_error_ = None

    def fit(self, X_train, y_train):
        """Fit ``num_trees`` trees on bootstrap resamples and print the OOB error.

        The reported value is the average, over trees, of each tree's
        misclassification rate on its own out-of-bag rows.
        """
        # Start from an empty ensemble so a second fit() retrains the forest
        # instead of stacking new trees on top of the previous ones.
        self.random_forest = []
        oob_error = []
        for _ in range(self.num_trees):
            # get bootstrap data
            X_bootstrap, y_bootstrap, X_oob, y_oob = get_bootstrap_sample(X_train, y_train)
            # train tree
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                max_features='sqrt',
                class_weight='balanced',
            )
            tree.fit(X_bootstrap, y_bootstrap)
            self.random_forest.append(tree)

            # A resample can cover every row; then there is nothing to score
            # (and the error rate would be a 0/0 division).
            if len(y_oob) == 0:
                continue
            oob_pred = tree.predict(X_oob)
            oob_error.append(np.mean(oob_pred != np.asarray(y_oob)))

        self.oob_error_ = np.mean(oob_error) if oob_error else float('nan')
        print("OOB error: ", self.oob_error_)

    def predict(self, X_test):
        """Return the majority-vote class label for every row of ``X_test``.

        Ties go to the smallest label, because ``np.unique`` returns labels
        sorted and ``np.argmax`` picks the first maximum.
        """
        # Rows: samples, columns: one vote per tree.
        votes_per_sample = np.array([tree.predict(X_test) for tree in self.random_forest]).T
        preds_cls = []
        for votes in votes_per_sample:
            # votes: [0,1,0,0,0,1,...,0,1]
            labels, counts = np.unique(votes, return_counts=True)
            # argmax(counts) is a position within `labels`, not a class label:
            # map it back, otherwise a unanimous vote for class 1 yields 0.
            preds_cls.append(labels[np.argmax(counts)])

        return np.array(preds_cls)


In [None]:
import itertools
from sklearn.metrics import roc_auc_score

# Hyperparameter tuning: exhaustive search over the grid below, scoring every
# combination with ROC AUC on the validation split.
# NOTE: RandomForest.predict returns hard class labels, so the AUC here is
# computed from labels rather than from probability scores.
grid = {
    'max_depth': list(range(3, 15, 2)),
    'num_trees': list(range(50, 200, 20)),
}
best_metr = 0.0
best_param = None
param_comb = list(itertools.product(*grid.values()))

for comb in param_comb:
    # Bind values to constructor arguments by grid key instead of by
    # position, so the pairing cannot drift if the grid is edited.
    params = dict(zip(grid.keys(), comb))
    rf_custom = RandomForest(**params)
    rf_custom.fit(X_train, y_train)
    preds = rf_custom.predict(X_val)

    metr = roc_auc_score(y_val, preds)

    # save best metric and parameters
    if metr > best_metr:
        # The score is an immutable scalar: plain assignment is enough, and
        # works whether it is a Python float or a NumPy scalar.
        best_metr = metr
        best_param = comb

    print(metr)

print(best_metr, best_param)

In [None]:
# Refit the custom forest with the best hyperparameters found by the search;
# best_param is the (max_depth, num_trees) combination.
rf_custom = RandomForest(
    num_trees=best_param[1],
    max_depth=best_param[0],
)
rf_custom.fit(X_train, y_train)

# Majority-vote predictions for the validation split (last expression of the
# cell).
rf_custom.predict(X_val)

Scikit Random Forest

In [None]:
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

best_metr = 0.0
best_param = None

# Hyperparameter tuning: exhaustive search over the grid below, scoring every
# combination with ROC AUC on the validation split.
# NOTE: the score is computed from hard class predictions (predict), matching
# how the custom forest is scored, not from predicted probabilities.
grid = {
    'max_depth': list(range(1, 15, 2)),
    'n_estimators': list(range(10, 200, 20)),
}
param_comb = list(itertools.product(*grid.values()))

for comb in param_comb:
    # Bind values to constructor arguments by grid key instead of by
    # position, so the pairing cannot drift if the grid is edited.
    params = dict(zip(grid.keys(), comb))
    rf_scikit = RandomForestClassifier(**params, class_weight='balanced', max_features='sqrt')
    rf_scikit.fit(X_train, y_train)
    preds = rf_scikit.predict(X_val)

    metr = roc_auc_score(y_val, preds)

    if metr > best_metr:
        # The score is an immutable scalar: plain assignment is enough, and
        # works whether it is a Python float or a NumPy scalar.
        best_metr = metr
        best_param = comb

    print(metr)

print(best_metr, best_param)

Метрики для каждого отдельного дерева из ансамбля

In [None]:
# Score every individual tree of the custom forest on the validation split,
# printing each ROC AUC and finally their average.
mean_metr = []
for tree in rf_custom.random_forest:
    metr = roc_auc_score(y_val, tree.predict(X_val))
    mean_metr.append(metr)
    print(metr)

print("Mean metr: ", np.mean(mean_metr))