<a href="https://colab.research.google.com/github/Existanze54/sirius-machine-learning-2025/blob/main/Seminars/GenTech/S10_GBoost_GT25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Семинар 10. Градиентный бустинг

In [None]:
! pip -q install catboost

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBoost

from xgboost import XGBClassifier as XGBoost
from lightgbm import LGBMClassifier as LGBoost
from catboost import CatBoostClassifier as CatBoost

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

from tqdm.auto import tqdm

In [None]:
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score)

def train_test_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place, so the caller's object is trained after this call.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and labels used for evaluation.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns 'Accuracy', 'Precision',
        'Recall' and 'F1', each computed with the metric function's
        default settings.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # (column label, metric function) pairs, in the order the columns appear.
    metric_table = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )

    return pd.DataFrame({
        label: [metric(y_test, predictions)]
        for label, metric in metric_table
    })

### Сравним бустинги с лесом

In [None]:
# Load the breast-cancer dataset as pandas objects (features frame + target series).
data = load_breast_cancer(as_frame=True)

X = data.data
# Flip the binary labels (0 <-> 1) so that the dataset's class 0 becomes the
# positive class 1 that the F1 / precision / recall scores below refer to.
# NOTE(review): presumably class 0 is 'malignant' - confirm via data.target_names.
y = 1 - data.target

In [None]:
# Candidate ensembles. Every model gets the same seed so the comparison is
# reproducible; the verbose flags keep the training output quiet.
models = {
    'RF': RF(random_state=0),
    'GB': GBoost(random_state=0),
    'XGB': XGBoost(random_state=0),
    'LGB': LGBoost(verbose=-1, random_state=0),
    'Cat': CatBoost(verbose=0, random_state=0),
}

# Per-model array of F1 scores, one entry per cross-validation fold.
scores = {}
for name, model in tqdm(models.items()):
    cv_result = cross_validate(model, X, y, cv=10, scoring='f1')
    scores[name] = cv_result['test_score']

In [None]:
# One column per model, one row per cross-validation fold.
df = pd.DataFrame(scores)

# Draw on an explicit Axes and label it so the figure can be read on its own.
fig, ax = plt.subplots()
sns.boxplot(data=df, fill=False, orient='h', ax=ax)
ax.set(title='Cross-validated F1 by model',
       xlabel='F1 score',
       ylabel='Model')
plt.show()

### Тюнинг параметров бустинга

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform

In [None]:
# Hold-out split for the tuning experiments. No test_size is given, so the
# library's default split proportion applies; the seed keeps the split fixed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    random_state=42,
)

In [None]:
# Baseline: XGBoost with untuned hyperparameters, single-threaded and seeded.
model = XGBoost(random_state=0, n_jobs=1)
train_test_model(model,
                 X_train=X_train, X_test=X_val,
                 y_train=y_train, y_test=y_val)

In [None]:
# Sampling distributions for the random search.
# scipy conventions: randint(low, high) excludes `high`; uniform(loc, scale)
# covers [loc, loc + scale], so uniform(0.1, 0.9) spans 0.1 .. 1.0.
grid = dict(
    n_estimators=randint(100, 501),
    max_depth=randint(2, 11),
    subsample=uniform(0.1, 0.9),
    colsample_bytree=uniform(0.1, 0.9),
    # Log-uniform: learning rates are explored across orders of magnitude.
    learning_rate=loguniform(1e-3, 2e-1),
)

# Try 25 sampled configurations, ranking them by cross-validated F1.
searcher = RandomizedSearchCV(
    estimator=model,
    param_distributions=grid,
    n_iter=25,
    scoring='f1',
    random_state=42,
)
searcher.fit(X_train, y_train)

# Show the winning configuration as a one-column table.
# NOTE(review): rounding to 2 decimals can display a small learning rate
# (the range starts at 1e-3) as 0.0.
pd.DataFrame(searcher.best_params_, index=['value']).round(2).T

In [None]:
# Apply the best hyperparameters found by the search to the same model object
# and re-evaluate it on the hold-out split.
model.set_params(**searcher.best_params_)
train_test_model(model,
                 X_train=X_train, X_test=X_val,
                 y_train=y_train, y_test=y_val)

### Построение ансамбля с ранней остановкой


In [None]:
# Allow up to 2000 boosting rounds, with an early-stopping patience of 1000
# rounds; the other (tuned) hyperparameters on `model` are kept.
model.set_params(early_stopping_rounds=1000, n_estimators=2000)

# Track the evaluation metric on both splits during training:
# entry 0 is the training split, entry 1 the validation split.
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_val, y_val)],
          verbose=0)

# Recorded metric history for each eval_set entry.
hist = model.evals_result()

In [None]:
fig, ax = plt.subplots()

# The history keys follow the order of the eval_set passed to fit():
# 'validation_0' is the training split, 'validation_1' the validation split.
ax.plot(hist['validation_0']['logloss'], label='train')
ax.plot(hist['validation_1']['logloss'], label='validation')

# Zoom in on the low-loss range.
ax.set_ylim([0.01, 0.1])
# Label the figure so the two curves can be told apart.
ax.set(title='XGBoost learning curves',
       xlabel='Boosting iteration',
       ylabel='Log loss')
ax.legend()
plt.show()

### Обработка категориальных признаков

In [None]:
! pip -q install ucimlrepo

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI repository dataset with id=2 and keep only the rows of its
# original table that have no missing values.
adult = fetch_ucirepo(id=2)
df = adult.data.original.dropna()

df

In [None]:
# Features: every column except the target. Selecting by name (instead of
# "all but the last column") keeps the target from leaking into X if the
# column order ever differs.
X = df.drop(columns='income')
# Binary target: True where the income label contains the literal text '>50'.
y = df['income'].str.contains('>50', regex=False)
# Hold out half of the rows for validation; the seed keeps the split fixed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.5, random_state=0
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Names of the columns holding object / pandas-categorical data.
cat = X.select_dtypes(['object', 'category']).columns

# Integer-encode the categorical columns and pass the remaining columns
# through unchanged. With the encoder's default settings, a category that
# occurs only in the validation split raises an error at transform time;
# map such unseen categories to the sentinel code -1 instead.
ct = ColumnTransformer(
    [('cat',
      OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
      cat)],
    remainder='passthrough'
)

# Learn the category -> integer mapping from the training split only,
# then apply the same mapping to the validation split.
X_train_encoded = ct.fit_transform(X_train)
X_val_encoded = ct.transform(X_val)

In [None]:
# CatBoost trained on the ordinal-encoded matrices, i.e. the categorical
# columns are presented to it as plain numbers.
model = CatBoost(random_state=0, verbose=0)
train_test_model(model,
                 X_train=X_train_encoded, X_test=X_val_encoded,
                 y_train=y_train, y_test=y_val)

In [None]:
# CatBoost trained on the raw frames, with the categorical columns declared
# by name via cat_features instead of being encoded beforehand.
model = CatBoost(random_state=0,
                 verbose=0,
                 cat_features=cat.tolist())
train_test_model(model,
                 X_train=X_train, X_test=X_val,
                 y_train=y_train, y_test=y_val)