In [28]:
import pandas as pd
import numpy as np
import random

In [29]:
class MyTreeReg:
    def __init__(self, max_depth = 5, min_samples_split = 2, max_leafs = 20, bins = None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.bins = bins
        self.leafs_cnt = 0
        self.n_samples_ensemble = None

    def __repr__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}"
    def _mse(self, y):
        y_mean = np.mean(y)
        return np.mean((y - y_mean) ** 2)

    def _get_best_split(self, X, y):
        
        N, n_features = X.shape
        I_p = self._mse(y)
        gain = 0
        split_value = 0
        col_index  = None
        for j in range(n_features):
            thresholds = self.global_thresholds_[j]
            for t in thresholds:
                mask_left = X[:, j] <= t
                mask_right = X[:, j] > t
                if mask_left.sum() == 0 or mask_right.sum() == 0:
                    continue
                X_r, y_r = X[mask_right], y[mask_right]
                X_l, y_l = X[mask_left], y[mask_left]
                
                I_r, I_l = self._mse(y_r), self._mse(y_l)
                N_r, N_l = len(y_r), len(y_l)
                
                IG = I_p - N_r / N * I_r - N_l/N * I_l
                if IG > gain:
                    gain = IG
                    split_value = t
                    col_index = j
                        
        return col_index, split_value, gain
        
    def _build_tree(self, X_train, y_train, feature_names, idx, depth = 0):
        
        stop_reasons = []
        if depth >= self.max_depth:
                stop_reasons.append("max_depth")
        
        if len(np.unique(y_train)) == 1:
            stop_reasons.append("pure_node")
    
        if len(y_train) == 1:
            stop_reasons.append("single_sample")
    
        if len(y_train) < self.min_samples_split:
            stop_reasons.append("min_samples_split")
    
        if self.potential_leafs >= self.max_leafs:
            stop_reasons.append("max_leafs")
            
        if stop_reasons:
            pred = np.mean(y_train)
            self.pred_sum += pred
            self.leafs_cnt += 1                      
            return {
                "type" : "leaf",
                "prediction" : pred,
                "n_samples" : len(y_train),
                "depth" : depth,
                "indices" : idx
                }
        
        best_feature, best_split, ig  = self._get_best_split(X_train, y_train)
        
        if ig <= 0:
            pred = np.mean(y_train)
            self.pred_sum += pred
            self.leafs_cnt += 1
            return {
                "type" : "leaf",
                "prediction" : pred,
                "n_samples" : len(y_train),
                "depth" : depth,
                "indices" : idx
            }
        n_samples_node = len(y_train)
        fn = feature_names[best_feature]
        self.fi[fn] += n_samples_node / self.n_samples_ensemble * ig
        self.potential_leafs += 1
        
        mask_left = X_train[:, best_feature] <= best_split
        mask_right = X_train[:, best_feature] > best_split
        X_r, y_r = X_train[mask_right], y_train[mask_right]
        X_l, y_l = X_train[mask_left], y_train[mask_left]

        idx_l = idx[mask_left]
        idx_r = idx[mask_right]
        
        left_subtree = self._build_tree(X_l, y_l, feature_names,  idx_l, depth +1)
        right_subtree = self._build_tree(X_r, y_r, feature_names, idx_r, depth +1)
        
        return {
            "type" : "node",
            'feature' : best_feature,
            'split' : best_split,
            'feature_name' : fn,
            'depth' : depth,
            "n_samples" : n_samples_node,
            'leaf_left' : left_subtree,
            'leaf_right' : right_subtree,        
        }
    def fit(self, X, y, n_samples_ensemble, feature_names = None):
        if isinstance(X, pd.DataFrame):
            feature_names = X.columns.to_list()
            X_train = X.to_numpy()
        else:
            X_train = np.asarray(X)

        self.n_samples_ensemble = n_samples_ensemble
        
        if isinstance(y, pd.DataFrame):
            y_train = y.to_numpy()
        else:
            y_train = np.asarray(y)
        self.f_names = feature_names
        if self.max_leafs < 2:
            self.max_leafs = 2
        self.pred_sum = 0
        self.potential_leafs = 1
        n_samples = X.shape[0]
        self.fi = {f : 0 for f in feature_names}
        
        self.global_thresholds_ = []
        for j in range(X_train.shape[1]):
            features = X_train[:, j]
            f = np.sort(np.unique(features))
            native_thresholds = (f[:-1] + f[1:]) / 2 
            if self.bins is None:
                thresholds = native_thresholds
            else:
                if self.bins - 1 > len(native_thresholds):
                    thresholds = native_thresholds
                else:
                    thresholds = np.histogram(X_train[:, j], self.bins)[1][1:-1]
            self.global_thresholds_.append(thresholds)
        y_indices = np.arange(len(y_train))    
        self.tree_ = self._build_tree(X_train, y_train, feature_names, y_indices, depth = 0)
               
            
    def print_tree(self, node = None, path = "1", side = None):
        if node is None:
            node = self.tree_
        if node["type"] == "leaf":
            if side is not None:
                print(' '*node['depth'], f"{path}.{side} - {node['prediction']}")
            else:
                print(f"{path} - {node['prediction']}")
            return
        feature = node["feature_name"]
        split = node["split"]
        depth = node['depth']
        print(' '*depth, f"{path} - {feature} > {split}")
        self.print_tree(node["leaf_left"], path + ".1", side = "left") 
        self.print_tree(node["leaf_right"], path + ".2", side = "right")
    
    def predict(self, X, feature_names = None):
        if isinstance(X, pd.DataFrame):
            if feature_names is None:
                feature_names = X.columns.to_list()
            X_test = X.to_numpy()
        else:
            X_test = np.asarray(X)
            if feature_names is None:
                feature_names = self.f_names
        n_samples = X.shape[0]
        preds = np.zeros(n_samples)
        for i in range(n_samples):
            node = self.tree_
            while node["type"] != "leaf":
                feature_name = node['feature_name']
                feature_number = feature_names.index(feature_name)
                predicat = node['split']
                if X_test[i, feature_number] <= predicat:
                    node = node['leaf_left']
                else:
                    node = node['leaf_right']
            result = node['prediction']
            preds[i] = result
        return preds

In [30]:
class MyBoostReg:
    def __init__(self, n_estimators = 10, learning_rate = 0.1, max_depth = 5, 
                 min_samples_split = 2, max_leafs  = 20, bins = 16, loss = "MSE", 
                 metric = None, max_features = 0.5, max_samples = 0.5, random_state = 42,
                 reg = 0.1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.bins = bins
        self.loss = loss.upper()
        self.metric = metric
        self.trees = []
        self.best_score = None
        self.max_features = max_features
        self.max_samples = max_samples
        self.random_state = random_state
        self.reg = reg
        self.leaf_count = 0
       
        self.np_improve_count = 0
        
    def __repr__(self):
        return f"""MyBoostReg class: n_estimators={self.n_estimators}, learning_rate={self.learning_rate}, 
                max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, 
                bins={self.bins}"""
    def _calculate_metric(self, y_true, y_pred):
        
        if self.metric == "MAE":
            return np.meean(np.abs(y_true - y_pred))
        elif self.metric == "MSE":
            return np.mean((y_true - y_pred) ** 2)
        elif self.metric == "RMSE":
            return np.sqrt(np.mean((y_true - y_pred) **2))
        elif self.metric == "MAPE":
            return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        elif self.metric == "R2":
            y_mean = np.mean(y_true)
            rss = np.sum((y_true - y_pred) ** 2)
            tss = np.sum((y_true - y_mean) ** 2)
            return 1 - rss/tss
        else:
            raise ValueError("Неправильная метрика")
    def _calculate_loss(self, y_true, y_pred):
        if self.loss == "MSE":
            return np.mean((y_true - y_pred)**2)
        elif self.loss == "MAE":
            return np.mean(np.abs(y_true - y_pred))
        else: raise ValueError("Неправильный лосс")

    def _update_leaf_predictions(self, node, y_true, y_pred):
        if node["type"] == "leaf":
            indices = node['indices']
            y_true_sample = y_true[indices]
            y_pred_sample = y_pred[indices]
            residuals = y_true_sample - y_pred_sample
            if self.loss == "MSE":
                node['prediction'] = np.mean(residuals) + self.leaf_count * self.reg
            elif self.loss == "MAE":
                node['prediction'] = np.median(residuals) + self.leaf_count * self.reg
        else:
            self._update_leaf_predictions(node['leaf_left'], y_true, y_pred)
            self._update_leaf_predictions(node['leaf_right'], y_true, y_pred)
           
    def _get_learning_rate(self, iteration):
        if callable(self.learning_rate):
            return self.learning_rate(iteration)
        else:
            return self.learning_rate
            
    def fit(self, X, y, X_eval = None, y_eval = None, early_stopping = None, verbose = None):
        random.seed(self.random_state)
        self.n_samples, self.n_features = X.shape
        self.feature_names = list(X.columns)
        X_train = X.to_numpy()
        y_train = y.to_numpy()
        
        self.fi = {f : 0 for f in self.feature_names}
        if self.loss == "MSE":
            self.pred_0 = np.mean(y_train)
        elif self.loss == "MAE":
            self.pred_0 = np.median(y_train)
        else: raise ValueError("Неправильный лосс")
          
        predictions = np.full(self.n_samples, self.pred_0)

        self.tree_learning_rates = []

        use_early_stopping = (
           early_stopping is not None and 
           X_eval is not None and 
           y_eval is not None
        )
        if use_early_stopping: 
            if self.metric == "R2":
                best_eval_score = 0
            else: best_eval_score = np.inf
            no_improve_count = 0
        
        for i in range(self.n_estimators):
            current_lr = self._get_learning_rate(i + 1)
            self.tree_learning_rates.append(current_lr)
            if self.loss == "MSE":
                grad = 2 * (predictions - y_train)
               
            elif self.loss == "MAE":
                grad = np.sign(predictions - y_train)
         
            else: raise ValueError("Неправильный лосс")
 
            cols_idx = random.sample(range(self.n_features), round(self.n_features * self.max_features))
            rows_idx = random.sample(range(self.n_samples), round(self.n_samples * self.max_samples))
            
            X_train_sample = X_train[rows_idx][:, cols_idx]
            grad_sample = grad[rows_idx]
            feature_names_sample = [self.feature_names[k] for k in cols_idx]
            
            tree = MyTreeReg(max_depth = self.max_depth, min_samples_split = self.min_samples_split, max_leafs = self.max_leafs, bins = self.bins)
            tree.fit(X_train_sample, -grad_sample, self.n_samples, feature_names_sample)
                 
                
            self._update_leaf_predictions(tree.tree_, y_train[rows_idx], predictions[rows_idx])
            self.trees.append(tree)
            self.leaf_count += tree.leafs_cnt

            y_pred_tree = tree.predict(X)
            predictions += y_pred_tree * current_lr
            if use_early_stopping:
                y_pred_eval = self.predict(X_eval)
                if self.metric:
                    eval_score = self._calculate_metric(y_eval, y_pred_eval)
                else:
                    eval_score = self._calculate_loss(y_eval, y_pred_eval)
                if self.metric == 'R2':
                    improve = eval_score > best_eval_score
                else:
                    improve = eval_score <= best_eval_score
                    
                if improve:
                    best_eval_score = eval_score
                    no_improve_count = 0
                else:
                    no_improve_count += 1

                if no_improve_count >= early_stopping:
                    print(f"Early stopping: остановка на итерации {i}")
                    self.trees = self.trees[:-early_stopping]
                    break
                    
            loss_t = self._calculate_loss(y_train, predictions)
            for item in tree.fi.items():
                self.fi[item[0]] += item[1]
                
            if verbose and i % verbose == 0:
                if self.loss == "MSE":
                    print(f"{i}. Loss[MSE]: {round(loss_t, 2)}", f"|{self.metric}: {self._calculate_metric(y_train, predictions)}" if self.metric else "",
                          f"|eval loss: {eval_score}" if use_early_stopping else "")
                elif self.loss == "MAE":
                    print(f"{i}. Loss[MAE]: {round(loss_t, 2)}", f"|{self.metric}: {self._calculate_metric(y_train, predictions)}" if self.metric else "",
                          f"|eval loss: {eval_score}" if use_early_stopping else "")
                else: raise ValueError("Неправильный лосс")
        if use_early_stopping: 
            self.best_score = best_eval_score
        else:
            if self.metric:
                self.best_score = self._calculate_metric(y_train, predictions)
            else:
                self.best_score = loss_t


    def predict(self, X):
        total_preds =  np.full(X.shape[0], self.pred_0)
        for i in range(len(self.trees)):
            tree = self.trees[i]
            tree_lr = self.tree_learning_rates[i]
            pred = tree.predict(X, self.feature_names)
            total_preds += tree_lr * pred
        return total_preds     
   

In [31]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=1500, n_features=14, n_informative=10, noise=15, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]
X_test = np.array([
    [-0.24795421,  0.57556193,  0.24889397, -0.42903725, 0.2321945,   0.65863195,
     -0.47996815, 0.41687306, -0.29505619, -0.46285629, -1.70478591, 0.99541519,
     -0.39115637, -0.51598099],
    [ 1.0062508,  -0.51077719,  0.65882024,  0.4249196,  0.35356615, -0.79067364,
       2.34030022, 1.67618951,  0.08796428,  0.9104611,  1.6766508,  0.73247653,
      -0.22888016, -0.65677036],
    [-1.13511942, -0.97918044,  0.51613497, -0.50122901, 0.64391301, 0.43917042,
      -0.86673023, -2.13962756, 0.07968098, -0.14346973, -1.02074021, -1.79807836,
       1.52988349, 0.20259852],
    [-0.54683846, -0.60451386,  0.35833407, -1.85158683, -0.16624207, 0.8720902,
      -0.98785807, 0.32762622, 1.74449555, -0.06136604, 0.81854872, 2.67962869,
      -2.12864912, 1.27084562],
    [ 0.90267814, -2.09818746, -2.91180394, 0.77493611, -0.58684268, 2.1482392,
       0.32363247, 1.68266291, 1.32188099, -0.04705854, 2.01919925, -1.99998656,
      -1.44464125, -0.02133238],
    [ 0.71271203,  1.27886581,  2.28165184, -1.53504025, 0.42544486, 0.18517554,
      -1.88315021, 0.43731319, -0.95231638, -0.67699514, -1.88001003, 0.22228986,
      -0.61764168, -0.37231905],
    [-1.63824566, -0.08857107, -0.17963037, -1.57722837, 0.87172442, -0.51068394,
      -0.25959672, -0.50544721, 0.68471485, -1.07695454, -0.85019775, 1.06854956,
       0.77913063, 1.2114275],
    [ 0.36300428,  0.02979617, -1.53268397, 0.972444, -0.58372213, 1.17653325,
      -1.72107752, -1.22499989, -0.64015446, 1.48374391, 0.87278741, -1.08574771,
      -1.12859067, -0.35840358],
    [ 1.17390093,  1.39104248, -0.28758415, -0.13778586, 0.07310425, -0.9486219,
      -1.01803767, 0.4136479, -1.13878294, -0.65389709, -0.83560124, -0.90963217,
      -0.2067544, 0.31815933],
    [-0.29951504, -1.43135485,  1.29914982, -0.71406512, 0.01252186, 0.16524005,
      -1.23570302, 1.4381692, -1.28559911, -1.61584647, 1.17841932, 1.63721164,
       0.66754808, -0.17204136],
    [ 1.4521177,  -0.2215366,   0.79185931, 0.85246864, -0.58445529, 0.80891927,
       0.01414926, -0.29918177, -1.17696211, 1.82054391, -1.2291836, -0.74935769,
      -0.57064043, -0.59641141],
    [ 0.91260231, -0.34072077, -0.19230318, -0.29251619, 0.59515732, -0.61191305,
       0.89862787, -1.03996815, 0.45583768, -0.55605256, 0.67099265, 2.44881231,
       0.5446125, 0.11784433],
    [ 0.34616582, 0.81432285, -0.11718903, 0.95359495, 1.57861539, -0.01511359,
       0.06839386, -0.12536699, -0.19861572, -1.00254196, -0.59216633, -0.65822224,
      -2.02503631, 1.53839498],
    [-0.75134547, -0.2724269, 0.20713944, 1.47812123, -0.69380442, 2.05124694,
       1.52009174, 0.88484258, -1.954593, -0.3103968, -1.48239679, -0.09931434,
      -0.23927315, 0.13111932],
    [-0.43585735, -0.96775234, 1.21600654, -0.13057546, -0.94277524, -0.60599005,
       0.46432056, 1.08326178, 0.5644982, -1.1617513, 0.46133071, -0.30757739,
      -0.71932169, 0.55539886],
    [-0.71726391, -1.13918212, 0.29658384, 0.6952675, -1.6540355, 0.69843278,
      -0.11417027, -0.05261187, -0.30737032, 1.24394192, 1.12728112, 1.10937157,
       0.42656791, -0.61014975],
    [ 0.87896397, 0.96476989, -0.78235392, 0.24748055, -1.30110927, -0.42963498,
       0.48060859, -0.45985926, 1.29998737, 0.96030517, -0.00966442, -0.42898072,
       0.85316531, 1.44656329],
    [-0.78531792, -1.22988312, -0.64489586, -0.52370428, 1.83413456, 1.17720879,
       1.94896311, 0.92590418, -0.80372366, -0.4697298, -1.36579593, -1.77142255,
      -0.72943705, 0.73565514],
    [-0.69193084, 0.23140635, -1.49828181, -0.17732279, -0.32674459, 1.74085808,
       0.48996231, 0.29979617, 0.46496165, 0.22794726, 0.2528928, -0.26288247,
       2.33999632, 0.96006087],
    [-1.0633545, -1.34799578, 1.2371808, -0.45802518, 0.75950004, -0.90968979,
       1.02162476, -1.05005479, -0.60568, -0.03204237, 0.47533493, -0.61677536,
      -0.25447608, -1.05598981]
])

In [32]:
X_eval = X.tail(300)
y_eval = y.tail(300)
X_train = X.iloc[:-300]
y_train = y.iloc[:-300]

In [33]:
%%time
gb = MyBoostReg( learning_rate = 1.6, n_estimators = 8, max_depth = 5, min_samples_split = 2, max_leafs = 20, bins = 16, 
                max_samples = 1, max_features = 1, reg = 0.01, metric = "RMSE")
gb.fit(X_train, y_train, verbose = 1)
print(gb.best_score)

0. Loss[MSE]: 27914.02 |RMSE: 167.07488104018276 
1. Loss[MSE]: 22126.04 |RMSE: 148.74825418077944 
2. Loss[MSE]: 18846.78 |RMSE: 137.2835689917619 
3. Loss[MSE]: 17180.32 |RMSE: 131.0737110760486 
4. Loss[MSE]: 15947.66 |RMSE: 126.28404577932763 
5. Loss[MSE]: 14857.49 |RMSE: 121.89129599227245 
6. Loss[MSE]: 14075.4 |RMSE: 118.63979868883578 
7. Loss[MSE]: 13413.24 |RMSE: 115.81555371567671 
115.81555371567671
CPU times: user 1.19 s, sys: 0 ns, total: 1.19 s
Wall time: 1.19 s


In [7]:
gb.fi

{'col_0': 1629.0731484255657,
 'col_1': 2533.3298525139107,
 'col_2': 9716.39290175655,
 'col_3': 42207.35337681094,
 'col_4': 17164.269130111385,
 'col_5': 932.3215024500184,
 'col_6': 3719.3150569225645,
 'col_7': 31109.543426853976,
 'col_8': 1420.5224863703252,
 'col_9': 795.1360605012211,
 'col_10': 25370.968866817315,
 'col_11': 856.6601565233777,
 'col_12': 7270.152357979473,
 'col_13': 27264.5194643781}

In [98]:
class MyBoostClf:
    def __init__(self, n_estimators = 10, learning_rate = 0.1, max_depth = 3, 
                 min_samples_split = 2, max_leafs  = 20, bins = 16, metric = None, 
                max_features = 0.5, max_samples = 0.5, random_state  = 42, reg = 0.1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.bins = bins
        self.trees = []
        self.metric = metric
        self.max_features = max_features
        self.max_samples = max_samples
        self.random_state = random_state
        self.reg = reg
        self.n_leaves = 0
        self.fi = {}

    def __repr__(self):
        return f"""MyBoostReg class: n_estimators={self.n_estimators}, learning_rate={self.learning_rate}, 
                max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, 
                bins={self.bins}"""
        
    def _confusion_matrix(self, y_true, y_pred):
        pred = np.where(y_pred > 0.5, 1, 0)
        
        tp = np.sum((y_true == 1) & (pred == 1))
        tn = np.sum((y_true == 0) & (pred == 0))
        fp = np.sum((y_true == 0) & (pred == 1))
        fn = np.sum((y_true == 1) & (pred == 0))
        return tp, tn, fp, fn
    def _accuracy(self, y_true, y_pred):
        tp, tn, fp, fn = self._confusion_matrix(y_true, y_pred) 
        return (tp + tn) / (tp + tn + fp + fn)
    def _precision(self,  y_true, y_pred):
        tp, tn, fp, fn = self._confusion_matrix(y_true, y_pred)
        return tp / (tp + fp)
    def _recall(self,  y_true, y_pred):
        tp, tn, fp, fn = self._confusion_matrix(y_true, y_pred)
        return tp / (tp + fn)
    def _f1(self,  y_true, y_pred):
        tp, tn, fp, fn = self._confusion_matrix(y_true, y_pred)
        return 2 * tp / (2 * tp + fp + fn)
        
    def _roc_auc(self, y_true, y_pred):
        order = np.argsort(y_pred)
        y_pred_sorted = y_pred[order]
        y_true = y_true[order]
        ranks = np.zeros_like(y_pred_sorted, dtype=float)
    
        i = 0
        rank = 1
        while i < len(y_pred_sorted):
            j = i
            while j < len(y_pred_sorted) and y_pred_sorted[i] == y_pred_sorted[j]:
                j += 1
            avg_rank = (rank+(rank + (j- i) - 1)) / 2
            ranks[i:j] = avg_rank
            rank += (j-i)
            i = j
        p = np.sum(y_true == 1)
        n = np.sum(y_true == 0)
        R_pos = np.sum(ranks[y_true == 1])
        auc = (R_pos -p * (p + 1)/ 2) / (p * n)
        return auc
        
    def _calculate_metric(self, y_true, y_pred):
         if self.metric == "accuracy":
            return self._accuracy(y_true, y_pred)
         elif self.metric == "precision":
            return self._precision(y_true, y_pred)
         elif self.metric == "recall":
            return self._recall(y_true, y_pred)
         elif self.metric == "f1":
            return self._f1(y_true, y_pred)
         elif self.metric == "roc_auc":
            return self._roc_auc(y_true, y_pred)
         else:
            raise ValueError("Нет такой метрики!")      
    def _calculate_loss(self, y, p):
        eps = 1e-15 
        log_loss = - np.mean(y * np.log(p + eps) + (1 - y) * np.log(1-p +eps))
        return log_loss
        
    def _get_learning_rate(self, iteration):
        if callable(self.learning_rate):
            return self.learning_rate(iteration)
        else:
            return self.learning_rate        
        
    def _update_leaf_predictions(self, node, y_true, y_pred):
        if node["type"] == "leaf":
            indices = node['indices']
            y_true_sample = y_true[indices]
            y_pred_sample = y_pred[indices]
            residuals = y_true_sample - y_pred_sample
            gamma = np.sum(residuals) / np.sum(y_pred_sample * (1 - y_pred_sample)) + self.n_leaves * self.reg
            node['prediction'] = gamma 
        
        else:
            self._update_leaf_predictions(node['leaf_left'], y_true, y_pred)
            self._update_leaf_predictions(node['leaf_right'], y_true, y_pred)
            
    def fit(self, X, y, X_eval = None, y_eval = None, early_stopping = None, verbose = None):
        random.seed(self.random_state)
        eps = 1e-15 
        self.feature_names = list(X.columns)
        self.n_samples, self.n_features = X.shape
        X_train = X.to_numpy()
        y_train = y.to_numpy()
        p0 = np.mean(y_train)
        self.pred_0 = np.log(p0 / (1- p0) + eps)
        self.fi = {f : 0.0 for f in self.feature_names}
        predictions = np.full(self.n_samples, self.pred_0)
        self.tree_learning_rates = []
        
        use_early_stopping = (
           early_stopping is not None and 
           X_eval is not None and 
           y_eval is not None
        )
        if use_early_stopping:
            if self.metric:
                best_eval_score = 0
            else:
                best_eval_score = np.inf
            no_improve_count = 0
        
        for i in range(self.n_estimators):
            iteration = i + 1
            
            current_lr = self._get_learning_rate(iteration)
            self.tree_learning_rates.append(current_lr)
            
            p_pred = 1/ (1 + np.exp(-predictions))
            grad = p_pred - y_train

            cols_idx = random.sample(range(self.n_features), round(self.n_features * self.max_features))
            rows_idx = random.sample(range(self.n_samples), round(self.n_samples * self.max_samples))
            
            X_train_sample = X_train[rows_idx][:, cols_idx]
            grad_sample = grad[rows_idx]
            feature_names_sample = [self.feature_names[k] for k in cols_idx]
            
            tree = MyTreeReg(max_depth = self.max_depth, min_samples_split = self.min_samples_split, max_leafs = self.max_leafs, bins = self.bins)
            tree.fit(X_train_sample, -grad_sample, self.n_samples, feature_names_sample)
            self._update_leaf_predictions(tree.tree_, y_train[rows_idx], p_pred[rows_idx])
            self.n_leaves += tree.leafs_cnt
    
            self.trees.append(tree)
            tree_pred = tree.predict(X)
            predictions += tree_pred * current_lr
            p_model = 1 / (1 + np.exp(-predictions))
            loss = self._calculate_loss(y_train, p_model)

            for item in tree.fi.items():
                self.fi[item[0]] += item[1]
                
            if self.metric:
                clf_metric = self._calculate_metric(y_train, p_model)
                
            if use_early_stopping:
                y_pred_eval = self.predict_proba(X_eval)
                if self.metric:
                    eval_score = self._calculate_metric(y_eval, y_pred_eval)
                    improve = eval_score > best_eval_score
                else:
                    eval_score = self._calculate_loss(y_eval, y_pred_eval)
                    improve = eval_score <= best_eval_score
            
                if improve:
                    best_eval_score = eval_score
                    no_improve_count = 0
                else:
                    no_improve_count += 1
                    
                if no_improve_count >= early_stopping:
                    print(f"Early stopping: остановка на итерации {i}")
                    self.trees = self.trees[:-early_stopping]
                    break
                    
            if verbose and i % verbose == 0:
                print(f"{i}. Loss: {round(loss, 2)}", f": {self.metric}: {clf_metric}" if self.metric else "",
                      f"|eval loss: {eval_score}" if use_early_stopping else "")

        if use_early_stopping: 
            self.best_score = best_eval_score
        else:
            if self.metric:
                self.best_score = self._calculate_metric(y_train, predictions)
            else:
                self.best_score = loss
                    
                
    def predict_proba(self, X):
        if isinstance(X, pd.DataFrame):
            X_test = X.to_numpy()
            test_features = list(X.columns)
        else:
            X_test = np.asarray(X)
            test_features = self.feature_names
        total_preds =  np.full(X_test.shape[0], self.pred_0)
        
        for i in range(len(self.trees)):
            tree_lr = self.tree_learning_rates[i]
            tree = self.trees[i]
            pred = tree.predict(X_test, test_features)
            total_preds += tree_lr * pred
        probas =  1 / (1 + np.exp(-total_preds))
        return probas
        
    def predict(self, X):
       predict_probas = self.predict_proba(X)
       y_pred = (predict_probas >= 0.5).astype(int)
       return y_pred
            


In [99]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1500, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]

In [100]:
X_test, y_test = make_classification(n_samples=30, n_features=14, n_informative=10, random_state=42)

In [101]:
X_eval = X.tail(300)
y_eval = y.tail(300)
X_train = X.iloc[:-300]
y_train = y.iloc[:-300]

In [108]:
gbr = MyBoostClf(n_estimators = 20, max_features=0.3, max_samples=0.3, min_samples_split = 2, max_depth = 3, metric = 'f1')
gbr.fit(X, y, X_eval, y_eval, 2, verbose = 1)

0. Loss: 0.66 : f1: 0.6776586974443528 |eval loss: 0.7218045112781954
1. Loss: 0.65 : f1: 0.7827145465611686 |eval loss: 0.8070175438596491
2. Loss: 0.63 : f1: 0.7479338842975206 |eval loss: 0.7676767676767676
Early stopping: остановка на итерации 3


In [89]:
gbr.pred_0

0.005333345975363486

In [90]:
y_pred = gbr.predict_proba(X_test)

In [91]:
gbr.best_score

0.8070175438596491

In [37]:
gbr.trees[0].leafs_cnt

8

In [50]:
gbr.fi

{'col_0': 0.003024505842456007,
 'col_1': 0.026517146387864423,
 'col_2': 0.002917065845528964,
 'col_3': 0.008408493663290611,
 'col_4': 0.0031436331498335426,
 'col_5': 0,
 'col_6': 0,
 'col_7': 0,
 'col_8': 0,
 'col_9': 0.0010499354993329425,
 'col_10': 0.019389246946970466,
 'col_11': 0.0005537030288300768,
 'col_12': 0.004554867560909502,
 'col_13': 0.002774837724111546}