# ЛАБОРАТОРНАЯ РАБОТА №5
## Исследование алгоритма градиентного бустинга

Импорт библиотек

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
mean_absolute_error, mean_squared_error, r2_score
)

import matplotlib.pyplot as plt
import seaborn as sns

### Классификация (Credit Card Default)

Загрузка датасета для классификации

In [2]:
# Load the credit-card default dataset from the working directory.
data_cls = pd.read_csv('UCI_Credit_Card.csv')

# Preview the first rows to check the column layout.
data_cls.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


Формирование признаков и целевой переменной

In [3]:
# Name of the binary label column in the credit-card dataset.
target = 'default.payment.next.month'

# Labels first, then the features: every column except the label and the
# row identifier.
y = data_cls[target]
X = data_cls.drop(columns=[target, 'ID'])

# Hold out 20% of the rows; stratifying on the label keeps the class ratio
# the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Обучение Gradient Boosting (бейзлайн)

In [4]:
# Baseline classifier: 200 depth-3 trees with a 0.05 learning rate and a
# fixed seed.
clf = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=200,
)

clf.fit(X_train, y_train)

Оценка качества модели классификации

In [5]:
# Hard labels feed the threshold-based metrics; ROC AUC is computed from the
# predicted probability of class 1.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_proba)

# Last expression of the cell: shown as the cell output.
accuracy, precision, recall, f1, roc_auc

(0.8195,
 0.6689750692520776,
 0.36397889977392617,
 0.47144948755490484,
 0.779743934555821)

### Регрессия (Air Quality)

Загрузка датасета для регрессии

In [6]:
# The file uses ';' as the field separator and ',' as the decimal mark.
data_reg = pd.read_csv('AirQuality.csv', sep=';', decimal=',')

# Preview the first rows to check the column layout.
data_reg.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,


Очистка данных и подготовка признаков

In [7]:
# Remove the columns that contain no values at all.
empty_cols = data_reg.columns[data_reg.isna().all()]
data_reg = data_reg.drop(columns=empty_cols)

# -200 is converted to NaN, then every row with any NaN is dropped.
# NOTE(review): this discards a row if a single column is missing; check how
# many rows remain.
data_reg = data_reg.replace(-200, np.nan).dropna()

target_reg = 'CO(GT)'

# Features are all remaining columns except the target and the Date/Time
# columns.
y = data_reg[target_reg]
X = data_reg.drop(columns=['Date', 'Time', target_reg])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

Обучение Gradient Boosting Regressor (бейзлайн)

In [8]:
# Baseline regressor: 300 depth-3 trees with a 0.05 learning rate and a
# fixed seed.
reg = GradientBoostingRegressor(
    random_state=42,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=300,
)

reg.fit(X_train, y_train)

Оценка качества регрессии

In [9]:
# Evaluate the baseline regressor on the held-out part.
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE computed above. The former
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is not used here.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Last expression of the cell: shown as the cell output.
mae, mse, rmse, r2



(0.15961610584812697,
 0.0521598422031216,
 0.22838529331618881,
 0.9749429871639562)

## Улучшение бейзлайна (GridSearch + Scaling)

## Выполняется GridSearchCV с кросс-валидацией.
### Гипотеза: оптимальный подбор гиперпараметров улучшит F1-score модели.
### В качестве метрики оптимизации выбран F1-score, так как он наиболее устойчив при дисбалансе классов.

In [10]:
# Reload the dataset and drop the row identifier straight away.
data_cls = pd.read_csv('UCI_Credit_Card.csv').drop(columns=['ID'])

y = data_cls['default.payment.next.month']
X = data_cls.drop(columns=['default.payment.next.month'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# The scaler is fitted on the training part only and then applied to both
# parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Подбор гиперпараметров (классификация)

In [11]:
from sklearn.model_selection import GridSearchCV

# Small grid around the baseline settings: 2 * 2 * 2 = 8 candidates.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [2, 3]
}

# 5-fold cross-validated search that ranks candidates by F1.
grid_clf = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1
)

grid_clf.fit(X_train_scaled, y_train)

best_params = grid_clf.best_params_
print(f"Лучшие параметры: {best_params}")

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the whole training set, so reuse that estimator instead
# of fitting an identical model a second time.
best_clf = grid_clf.best_estimator_

y_pred = best_clf.predict(X_test_scaled)
y_proba = best_clf.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Last expression of the cell: shown as the cell output.
accuracy, precision, recall, f1, roc_auc

Лучшие параметры: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}


(0.8195,
 0.6689750692520776,
 0.36397889977392617,
 0.47144948755490484,
 0.7795928316253756)

In [12]:
# Load the file, drop all-empty columns, convert -200 to NaN and drop the
# rows that contain any NaN, all in one chain.
data_reg = (
    pd.read_csv('AirQuality.csv', sep=';', decimal=',')
    .dropna(axis=1, how='all')
    .replace(-200, np.nan)
    .dropna()
)

target_reg = 'CO(GT)'

y = data_reg[target_reg]
X = data_reg.drop(columns=['Date', 'Time', target_reg])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# The scaler is fitted on the training part only and then applied to both
# parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Подбор гиперпараметров (регрессия)

In [13]:
# Small grid around the baseline settings: 2 * 2 * 2 = 8 candidates.
param_grid = {
    'n_estimators': [200, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [2, 3]
}

# 5-fold cross-validated search; scikit-learn maximizes scores, hence the
# negated MSE.
grid_reg = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1
)

grid_reg.fit(X_train_scaled, y_train)

# A bare expression in the middle of a cell is evaluated and thrown away,
# so the selected parameters are printed explicitly.
print(f"Лучшие параметры: {grid_reg.best_params_}")

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the whole training set; no second fit is needed.
best_reg = grid_reg.best_estimator_

y_pred = best_reg.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; take the square root explicitly.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Last expression of the cell: shown as the cell output.
mae, rmse, r2



(0.17137192376609575, 0.23553115595159357, 0.9733504577523071)

# Собственная реализация алгоритмов

Подготовка данных

In [14]:
data_cls = pd.read_csv('UCI_Credit_Card.csv')

# Integer labels and the feature table without the identifier and the label.
y_cls = data_cls['default.payment.next.month'].astype(int)
X_cls = data_cls.drop(columns=['ID', 'default.payment.next.month'])

X_train, X_test, y_train, y_test = train_test_split(
    X_cls,
    y_cls,
    stratify=y_cls,
    random_state=42,
    test_size=0.2,
)

# The scaler is fitted on the training part only and then applied to both
# parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Load the file, drop all-empty columns, convert -200 to NaN and drop the
# rows that contain any NaN, all in one chain.
data_air = (
    pd.read_csv('AirQuality.csv', sep=';', decimal=',')
    .dropna(axis=1, how='all')
    .replace(-200, np.nan)
    .dropna()
)

target_air = 'CO(GT)'

y_air = data_air[target_air].astype(float)
X_air = data_air.drop(columns=['Date', 'Time', target_air])

X_air_train, X_air_test, y_air_train, y_air_test = train_test_split(
    X_air,
    y_air,
    random_state=42,
    test_size=0.2,
)

# The scaler is fitted on the training part only and then applied to both
# parts.
scaler_air = StandardScaler().fit(X_air_train)
X_air_train_scaled = scaler_air.transform(X_air_train)
X_air_test_scaled = scaler_air.transform(X_air_test)

## Имплементация Gradient Boosting (классификация)

In [16]:
from sklearn.tree import DecisionTreeRegressor
from scipy.special import expit

class MyGradientBoostingClassifier:
    """Binary gradient boosting classifier built from regression trees.

    The model keeps one raw score (log-odds) per sample. The score starts at
    the log-odds of the positive-class frequency; every boosting round fits a
    ``DecisionTreeRegressor`` to ``y - sigmoid(score)`` and adds the tree's
    prediction, scaled by ``learning_rate``, to the score.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds (trees).
    learning_rate : float, default=0.1
        Factor applied to every tree's contribution.
    max_depth : int, default=3
        Maximum depth of each regression tree.
    random_state : int or None, default=None
        Seed forwarded to every tree so that repeated fits give the same
        model. ``None`` keeps the trees unseeded.

    Attributes
    ----------
    models : list
        Fitted trees, in boosting order.
    init_value : float or None
        Initial log-odds; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3,
                 random_state=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.models = []
        self.init_value = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and 0/1 labels ``y``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        y = np.asarray(y, dtype=float).ravel()
        if y.size == 0:
            raise ValueError("y must contain at least one sample")

        # Start from scratch: without this a second call to fit() would
        # stack the new trees on top of the ones from the previous call.
        self.models = []

        # Clip the class frequency so the log-odds stay finite when only
        # one class is present.
        p = np.clip(y.mean(), 1e-5, 1 - 1e-5)
        self.init_value = float(np.log(p / (1 - p)))

        raw_score = np.full(y.shape[0], self.init_value)

        for _ in range(self.n_estimators):
            # Negative gradient of the log-loss with respect to the raw score.
            residuals = y - expit(raw_score)

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                random_state=self.random_state,
            )
            tree.fit(X, residuals)

            raw_score += self.learning_rate * tree.predict(X)
            self.models.append(tree)

        return self

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of ``[P(class 0), P(class 1)]``.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.init_value is None:
            raise ValueError("This model is not fitted yet; call fit() first")

        raw_score = np.full(X.shape[0], self.init_value, dtype=float)

        for tree in self.models:
            raw_score += self.learning_rate * tree.predict(X)

        prob = expit(raw_score)
        return np.column_stack([1 - prob, prob])

    def predict(self, X):
        """Return 0/1 labels: 1 where the class-1 probability exceeds 0.5."""
        return (self.predict_proba(X)[:, 1] > 0.5).astype(int)


In [None]:
# Обучим на исходных (немасштабированных) данных

In [27]:
# Custom classifier with the same values as its constructor defaults,
# trained on the unscaled features passed as plain NumPy arrays.
my_clf_base = MyGradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3
)

my_clf_base.fit(X_train.values, y_train.values)

In [28]:
# Hard labels feed the threshold-based metrics; ROC AUC is computed from the
# predicted probability of class 1.
y_pred_base = my_clf_base.predict(X_test.values)
y_proba_base = my_clf_base.predict_proba(X_test.values)[:, 1]

accuracy_base, precision_base, recall_base, f1_base = (
    metric(y_test, y_pred_base)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc_base = roc_auc_score(y_test, y_proba_base)

# Last expression of the cell: shown as the cell output.
accuracy_base, precision_base, recall_base, f1_base, roc_auc_base

(0.8193333333333334,
 0.6810730253353204,
 0.3443858327053504,
 0.4574574574574575,
 0.765158470206195)

Обучение собственной модели классификации

In [22]:
# Custom classifier with the hyperparameters of the scikit-learn baseline
# (200 trees, learning rate 0.05, depth 3), trained on standardized features.
my_clf = MyGradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3
)

my_clf.fit(X_train_scaled, y_train)


Оценка качества классификации

In [23]:
# Hard labels feed the threshold-based metrics; ROC AUC is computed from the
# predicted probability of class 1.
y_pred = my_clf.predict(X_test_scaled)
y_proba = my_clf.predict_proba(X_test_scaled)[:, 1]

accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_proba)

# Last expression of the cell: shown as the cell output.
accuracy, precision, recall, f1, roc_auc


(0.8191666666666667,
 0.6811377245508982,
 0.34287867370007535,
 0.45614035087719296,
 0.7653358589185641)

## Имплементация Gradient Boosting (регрессия)

In [29]:
from sklearn.tree import DecisionTreeRegressor

class MyGradientBoostingRegressor:
    """Gradient boosting regressor for squared error, built from trees.

    The prediction starts at the mean of the training targets; every boosting
    round fits a ``DecisionTreeRegressor`` to the current residuals and adds
    the tree's prediction, scaled by ``learning_rate``.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds (trees).
    learning_rate : float, default=0.1
        Factor applied to every tree's contribution.
    max_depth : int, default=3
        Maximum depth of each regression tree.
    random_state : int or None, default=None
        Seed forwarded to every tree so that repeated fits give the same
        model. ``None`` keeps the trees unseeded.

    Attributes
    ----------
    models : list
        Fitted trees, in boosting order.
    init_value : float or None
        Mean of the training targets; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3,
                 random_state=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.models = []
        self.init_value = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and numeric targets ``y``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        y = np.asarray(y, dtype=float).ravel()
        if y.size == 0:
            raise ValueError("y must contain at least one sample")

        # Start from scratch: without this a second call to fit() would
        # stack the new trees on top of the ones from the previous call.
        self.models = []

        self.init_value = float(y.mean())
        current = np.full(y.shape[0], self.init_value, dtype=float)

        for _ in range(self.n_estimators):
            # For squared error the negative gradient is the plain residual.
            residuals = y - current

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                random_state=self.random_state,
            )
            tree.fit(X, residuals)

            current += self.learning_rate * tree.predict(X)
            self.models.append(tree)

        return self

    def predict(self, X):
        """Return one prediction per row of ``X``.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.init_value is None:
            raise ValueError("This model is not fitted yet; call fit() first")

        y_pred = np.full(X.shape[0], self.init_value, dtype=float)

        for tree in self.models:
            y_pred += self.learning_rate * tree.predict(X)

        return y_pred


Обучим на исходных (немасштабированных) данных

In [30]:
# Custom regressor with the same values as its constructor defaults,
# trained on the unscaled features passed as plain NumPy arrays.
my_reg_base = MyGradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3
)

my_reg_base.fit(X_air_train.values, y_air_train.values)


In [31]:
# Evaluate the custom regressor trained on unscaled data.
y_pred_base = my_reg_base.predict(X_air_test.values)

mae_base = mean_absolute_error(y_air_test, y_pred_base)
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; take the square root explicitly.
rmse_base = np.sqrt(mean_squared_error(y_air_test, y_pred_base))
r2_base = r2_score(y_air_test, y_pred_base)

# Last expression of the cell: shown as the cell output.
mae_base, rmse_base, r2_base




(0.15998670940302445, 0.22558657231118712, 0.9755533408513579)

Обучение собственной модели регрессии

In [32]:
# Custom regressor with 200 trees, learning rate 0.05 and depth 3, trained
# on standardized features.
my_reg = MyGradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3
)

my_reg.fit(X_air_train_scaled, y_air_train)


Оценка качества регрессии

In [33]:
# Evaluate the custom regressor trained on standardized data.
y_pred = my_reg.predict(X_air_test_scaled)

mae = mean_absolute_error(y_air_test, y_pred)
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; take the square root explicitly.
rmse = np.sqrt(mean_squared_error(y_air_test, y_pred))
r2 = r2_score(y_air_test, y_pred)

# Last expression of the cell: shown as the cell output.
mae, rmse, r2




(0.16335066634389156, 0.234983590423007, 0.973474224040239)

## Выводы

Исходя из полученных результатов, видно, что качество классификации после подбора гиперпараметров практически не изменилось: GridSearchCV выбрал те же параметры, что и в бейзлайне, поэтому метрики совпали.

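А вот регрессия, наоборот, более чувствительна к изменениям: после подбора гиперпараметров метрики на тестовой выборке немного изменились (MAE 0.160 → 0.171, RMSE 0.228 → 0.236, R² 0.975 → 0.973), то есть улучшенный бейзлайн не превзошёл исходный на отложенной выборке.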

Собственная реализация показывает качество, сопоставимое с реализацией из scikit-learn, что подтверждает её корректность.