# Лабораторная работа №4 "Проведение исследований с градиентным бустингом"

### Ход работы

Импортируем библиотеки перед работой

In [57]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, KFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import randint
from sklearn.exceptions import ConvergenceWarning
from sklearn.tree import DecisionTreeRegressor
from scipy.special import softmax


##### Создание бейзлайна для модели классификации

Проведём те же манипуляции, что и ранее: выгрузим датасет и минимально его обработаем

In [16]:
# Load the classification dataset and shuffle the rows reproducibly.
c_base_df = pd.read_csv("../classification.csv").sample(frac=1, random_state=42).reset_index(drop=True)

# Remove the columns that are not used as features in this lab.
c_base_df = c_base_df.drop(columns=['instance_id', 'obtained_date', 'track_name', 'artist_name'])

# drop_duplicates() returns a new frame, so the result has to be assigned back;
# calling it without assignment leaves the duplicates in place.
c_base_df = c_base_df.drop_duplicates(ignore_index=True)

# Values of 'tempo' that cannot be parsed as numbers become NaN and are
# imputed with the column median further below.
c_base_df['tempo'] = pd.to_numeric(c_base_df['tempo'], errors='coerce')

# Integer-encode the categorical columns. The same encoder object is refit for
# every column, so after this cell `le` only holds the mapping for 'key'.
le = LabelEncoder()
c_base_df['mode'] = le.fit_transform(c_base_df['mode'])
c_base_df['music_genre'] = le.fit_transform(c_base_df['music_genre'])
c_base_df['key'] = le.fit_transform(c_base_df['key'])

# Median imputation for the missing tempo values (median is taken over the
# whole frame, i.e. before the train/test split).
median_tempo = c_base_df['tempo'].median()
c_base_df['tempo'] = c_base_df['tempo'].fillna(median_tempo)

# Features / target.
X_c_base = c_base_df.drop(columns=["music_genre"])
y_c_base = c_base_df["music_genre"]

# Stratified 80/20 hold-out split so every genre keeps its share in both parts.
X_c_base_train, X_c_base_test, y_c_base_train, y_c_base_test = train_test_split(
    X_c_base,
    y_c_base,
    test_size=0.2,
    random_state=42,
    stratify=y_c_base
)

Теперь обучим модель из sklearn

In [3]:
# Baseline configuration: 20 depth-1 trees, learning rate 0.8, each tree
# fitted on a 60% row subsample.
baseline_clf_params = {
    "n_estimators": 20,
    "learning_rate": 0.8,
    "max_depth": 1,
    "min_samples_split": 50,
    "subsample": 0.6,
    "random_state": 42,
}
gb = GradientBoostingClassifier(**baseline_clf_params)
gb.fit(X_c_base_train, y_c_base_train)

# Evaluate on the held-out part of the baseline split.
y_pred = gb.predict(X_c_base_test)
accuracy = accuracy_score(y_c_base_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5455


##### Улучшение бейзлайна для модели классификации

In [4]:
# Per-class precision / recall / F1 of the current `y_pred` against the baseline test labels.
print(classification_report(y_c_base_test, y_pred))

              precision    recall  f1-score   support

           0       0.42      0.34      0.38      1000
           1       0.72      0.68      0.70      1000
           2       0.55      0.49      0.52      1000
           3       0.82      0.76      0.79      1000
           4       0.55      0.59      0.57      1000
           5       0.58      0.57      0.58      1000
           6       0.45      0.45      0.45      1000
           7       0.47      0.45      0.46      1000
           8       0.43      0.40      0.41      1000
           9       0.49      0.72      0.58      1000

    accuracy                           0.55     10000
   macro avg       0.55      0.55      0.54     10000
weighted avg       0.55      0.55      0.54     10000



Теперь преобразуем датасет как ранее

In [34]:
# Reload the raw data, shuffle it reproducibly and remove the unused columns.
c_df = (
    pd.read_csv("../classification.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(columns=['instance_id', 'obtained_date', 'track_name', 'artist_name'])
)

# Replace loudness / acousticness / energy with two principal components
# computed on their standardized values.
sc = StandardScaler()
scaled = sc.fit_transform(c_df[['loudness', 'acousticness', 'energy']])
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled)
c_df['pc1'] = principal_components[:, 0]
c_df['pc2'] = principal_components[:, 1]
c_df = c_df.drop(columns=['loudness', 'acousticness', 'energy'])

# Unparseable tempo values turn into NaN here and are imputed below.
c_df['tempo'] = pd.to_numeric(c_df['tempo'], errors='coerce')

# Integer-encode the target and 'mode'; the encoder is refit per column,
# so afterwards `le` holds the mapping for 'mode' only.
le = LabelEncoder()
for label_col in ('music_genre', 'mode'):
    c_df[label_col] = le.fit_transform(c_df[label_col])

# One-hot encode 'key', dropping the first category.
ohe = OneHotEncoder(sparse_output=False, drop='first')
encoded_key = ohe.fit_transform(c_df[['key']])
encoded_df_key = pd.DataFrame(encoded_key, columns=ohe.get_feature_names_out(['key']))
c_df = pd.concat(
    [c_df.drop(columns=['key']).reset_index(drop=True), encoded_df_key],
    axis=1,
)

# A duration of -1 is treated as a missing value.
c_df['duration_ms'] = c_df['duration_ms'].replace(-1, np.nan)

# Binarize instrumentalness at the 0.05 threshold and drop the raw column.
c_df['instrumental_flag'] = c_df['instrumentalness'].gt(0.05).astype(int)
c_df = c_df.drop(columns=['instrumentalness'])

# Remember which rows had no tempo before the gaps are filled.
c_df['undefined_tempo'] = c_df['tempo'].isna().astype(int)

# Median imputation for tempo and duration.
median_tempo = c_df['tempo'].median()
median_duration = c_df['duration_ms'].median()
c_df['tempo'] = c_df['tempo'].fillna(median_tempo)
c_df['duration_ms'] = c_df['duration_ms'].fillna(median_duration)

# Final feature lists: continuous columns first, then flags and one-hot columns.
float_features = [
    'popularity', 'danceability', 'duration_ms',
    'liveness', 'speechiness', 'tempo',
    'valence', 'pc1', 'pc2',
]
other_features = ['mode', 'instrumental_flag', 'undefined_tempo'] + list(encoded_df_key.columns)

X_c = c_df[float_features + other_features]
y_c = c_df['music_genre']

# Stratified 80/20 hold-out split.
X_c_train, X_c_test, y_c_train, y_c_test = train_test_split(
    X_c,
    y_c,
    test_size=0.2,
    stratify=y_c,
    random_state=42,
)

Перейдём к обучению

In [None]:
# Hyperparameter grid: 2 x 2 x 2 x 2 = 16 candidates.
param_grid = {
    "max_iter": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 4],
    "min_samples_leaf": [20, 50]
}

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gb = HistGradientBoostingClassifier(random_state=42)

# Candidates are ranked by macro-averaged F1; all CPU cores are used.
grid = GridSearchCV(
    gb,
    param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_c_train, y_c_train)
print("Best params:", grid.best_params_)

# Score the best estimator found by the search on the hold-out set.
best_gb = grid.best_estimator_
y_pred = best_gb.predict(X_c_test)

print("Score:", accuracy_score(y_c_test, y_pred))
print(classification_report(y_c_test, y_pred))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 200, 'min_samples_leaf': 50}
Score: 0.5813
              precision    recall  f1-score   support

           0       0.46      0.37      0.41      1000
           1       0.78      0.74      0.76      1000
           2       0.61      0.51      0.56      1000
           3       0.84      0.83      0.84      1000
           4       0.58      0.59      0.59      1000
           5       0.65      0.62      0.63      1000
           6       0.44      0.49      0.46      1000
           7       0.55      0.52      0.53      1000
           8       0.43      0.41      0.42      1000
           9       0.51      0.73      0.60      1000

    accuracy                           0.58     10000
   macro avg       0.59      0.58      0.58     10000
weighted avg       0.59      0.58      0.58     10000



##### Создание бейзлайна для модели регрессии

Сделаем всё то же, что и ранее

In [52]:
# Load the regression dataset and shuffle the rows reproducibly.
r_base_df = (
    pd.read_csv("../regression.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Parse the day-first dates and replace the column with year / month / day parts.
r_base_df['Date'] = pd.to_datetime(r_base_df['Date'], dayfirst=True)
r_base_df = r_base_df.assign(
    Year=r_base_df['Date'].dt.year,
    Month=r_base_df['Date'].dt.month,
    Day=r_base_df['Date'].dt.day,
).drop(columns=['Date'])

# Train size = (number of stores) x (80% of the row count of the first store group).
# The frame was shuffled above, so taking the leading rows is a random split.
per_store_count = r_base_df.groupby('Store').size().iloc[0]
k = max(1, int(np.round(0.8 * per_store_count)))
store_counts = r_base_df['Store'].nunique()

n_train_rows = store_counts * k
train = r_base_df.iloc[:n_train_rows]
test = r_base_df.iloc[n_train_rows:]

# Separate the target from the features.
y_r_base_train = train['Weekly_Sales']
y_r_base_test = test['Weekly_Sales']
X_r_base_train = train.drop(columns=['Weekly_Sales'])
X_r_base_test = test.drop(columns=['Weekly_Sales'])

И теперь обучим модель

In [4]:
# Baseline regressor: 50 depth-2 trees, learning rate 0.3, 60% row subsample,
# at least 20 samples per leaf.
baseline_reg_params = {
    "n_estimators": 50,
    "learning_rate": 0.3,
    "max_depth": 2,
    "subsample": 0.6,
    "min_samples_leaf": 20,
    "random_state": 42,
}
gbr_bad = GradientBoostingRegressor(**baseline_reg_params)
gbr_bad.fit(X_r_base_train, y_r_base_train)

# Hold-out metrics.
y_pred = gbr_bad.predict(X_r_base_test)
mae = mean_absolute_error(y_r_base_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_r_base_test, y_pred))
r2 = r2_score(y_r_base_test, y_pred)

for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

MAE: 179590.59773806954
RMSE: 232642.85236654413
R2: 0.832710565668755


##### Улучшение бейзлайна для модели регрессии

Сначала повторим техники из предыдущей ЛР

In [55]:
# Reload the regression data and shuffle it reproducibly.
r_df = (
    pd.read_csv("../regression.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Keep only the year and the ISO week number of every (day-first) date.
r_df['Date'] = pd.to_datetime(r_df['Date'], dayfirst=True)
r_df = r_df.assign(
    Year=r_df['Date'].dt.year,
    Week=r_df['Date'].dt.isocalendar().week,
).drop(columns=['Date'])

# Columns that get one-hot encoded vs. columns passed through unchanged.
cat_store = ['Store', 'Week']
other_feats = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Holiday_Flag']

# Train size = (number of stores) x (80% of the row count of the first store group).
# The frame was shuffled above, so taking the leading rows is a random split.
per_store_count = r_df.groupby('Store').size().iloc[0]
k = max(1, int(np.round(0.8 * per_store_count)))
store_counts = r_df['Store'].nunique()

n_train_rows = store_counts * k
train = r_df.iloc[:n_train_rows]
test = r_df.iloc[n_train_rows:]

# Separate the target from the features.
y_r_train = train['Weekly_Sales']
y_r_test = test['Weekly_Sales']
X_r_train = train.drop(columns=['Weekly_Sales'])
X_r_test = test.drop(columns=['Weekly_Sales'])

# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# One-hot encode store / week (categories unseen at fit time are ignored),
# pass the listed numeric columns through, drop everything else.
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_store),
        ('passth', 'passthrough', other_feats),
    ],
    remainder='drop',
)

Перейдём к обучению

In [None]:
# Preprocessing and the boosting model are tuned together as one estimator.
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", GradientBoostingRegressor(random_state=42)),
])

# 2^5 = 32 candidates; the "model__" prefix routes each parameter to the
# "model" step of the pipeline.
param_grid = {
    "model__n_estimators": [200, 300],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [3, 4],
    "model__subsample": [0.8, 1.0],
    "model__min_samples_leaf": [5, 20]
}

# Shuffled 5-fold cross-validation with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by (negated) RMSE; all CPU cores are used.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_r_train, y_r_train)
print("Best params:", grid.best_params_)

# Hold-out metrics of the best pipeline found by the search.
best_model = grid.best_estimator_
y_test_pred = best_model.predict(X_r_test)

mae = mean_absolute_error(y_r_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_r_test, y_test_pred))
r2 = r2_score(y_r_test, y_test_pred)

for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best params: {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__min_samples_leaf': 20, 'model__n_estimators': 300, 'model__subsample': 1.0}
MAE: 66673.26203114892
RMSE: 107858.98478310673
R2: 0.9640414855804225


##### Базовый класс имплементации

In [None]:
class BaseGradientBoosting:
    """Shared hyperparameter storage and method interface for boosting models.

    Every method except ``__init__`` only raises ``NotImplementedError`` and
    is meant to be overridden by a concrete subclass.
    """

    def __init__(self, learning_rate=0.1, max_depth=3,
                 min_samples_leaf=1, n_estimators=100,
                 subsample=1.0, random_state=None, min_samples_split=2):
        """Store the hyperparameters and initialise the (empty) fitted state."""
        # Ensemble-level settings.
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.random_state = random_state

        # Settings of the individual trees.
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split

        # State filled in by ``fit`` of a subclass.
        self.initial_prediction = None  # starting prediction of the ensemble
        self.trees = []  # fitted weak learners

    def _init_prediction(self, y):
        """Return the starting prediction for targets ``y`` (subclass hook)."""
        raise NotImplementedError

    def _loss_gradient(self, y, y_pred):
        """Return the training signal for the next tree (subclass hook)."""
        raise NotImplementedError

    def fit(self, X, y):
        """Train the ensemble on ``X`` / ``y`` (subclass hook)."""
        raise NotImplementedError

    def predict_raw(self, X):
        """Return the raw ensemble output for ``X`` (subclass hook)."""
        raise NotImplementedError

    def predict(self, X):
        """Return the final predictions for ``X`` (subclass hook)."""
        raise NotImplementedError


##### Классификатор

In [None]:
class MyGradientBoostingClassifier(BaseGradientBoosting):
    """
    Multiclass gradient boosting with softmax and cross-entropy loss.

    At every boosting stage one regression tree per class is fitted to the
    negative gradient of the cross-entropy with respect to that class's
    logit, and the tree output (scaled by ``learning_rate``) is added to
    the logit.
    """

    def _init_prediction(self, y):
        """
        Return the initial logits: the log of the class frequencies in ``y``.

        Side effect: stores the sorted unique labels in ``self.classes_``.
        """
        self.classes_, counts = np.unique(y, return_counts=True)
        counts = counts.astype(np.float32)
        probs = counts / counts.sum()
        # Clip so that the logarithm stays finite.
        return np.log(np.clip(probs, 1e-6, 1 - 1e-6))

    def _loss_gradient(self, y, y_pred):
        """
        Negative gradient of the cross-entropy w.r.t. the logits:
        ``one_hot(y) - softmax(y_pred)``.

        ``y`` is a 1-D array of labels, ``y_pred`` has one row per sample
        and one column per entry of ``self.classes_``.
        """
        y = np.asarray(y)
        # Row i, column j is 1 when sample i belongs to class j.
        y_one_hot = (y[:, None] == self.classes_[None, :]).astype(np.float32)
        return y_one_hot - softmax(y_pred, axis=1)

    def fit(self, X, y):
        """
        Train the multiclass boosting ensemble.

        Parameters: ``X`` is array-like with one row per sample, ``y`` is a
        1-D array-like of class labels.

        Returns ``self``.

        Raises ``ValueError`` if ``subsample`` is not positive.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        if self.subsample <= 0:
            raise ValueError(
                f"subsample must be positive, got {self.subsample!r}"
            )

        # Initial prediction (logits), repeated for every sample.
        self.initial_prediction = self._init_prediction(y)
        n_classes = len(self.classes_)
        y_pred = np.tile(self.initial_prediction, (n_samples, 1))

        # trees[k] collects the trees that model the logit of class k.
        self.trees = [[] for _ in range(n_classes)]
        rng = np.random.default_rng(self.random_state)

        use_subsample = self.subsample < 1.0
        # Never ask for an empty subsample: a tree cannot be fitted on 0 rows.
        sample_size = max(1, int(n_samples * self.subsample))

        for _ in range(self.n_estimators):
            # The gradient is computed once per stage for all classes.
            gradient = self._loss_gradient(y, y_pred)

            for k in range(n_classes):
                # Row subsample for this tree (drawn separately per class).
                if use_subsample:
                    indices = rng.choice(n_samples, sample_size, replace=False)
                    X_sub, grad_sub = X[indices], gradient[indices, k]
                else:
                    X_sub, grad_sub = X, gradient[:, k]

                tree = DecisionTreeRegressor(
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    min_samples_split=self.min_samples_split,
                    random_state=self.random_state
                )
                tree.fit(X_sub, grad_sub)

                # Update the logit of class k on the full training set.
                update = tree.predict(X).astype(np.float32)
                y_pred[:, k] += self.learning_rate * update

                self.trees[k].append(tree)

        return self

    def predict_raw(self, X):
        """
        Return the logits for ``X``: initial logits plus the scaled output
        of every stored tree, one column per class.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        y_pred = np.tile(self.initial_prediction, (n_samples, 1))

        for k, trees_k in enumerate(self.trees):
            for tree in trees_k:
                y_pred[:, k] += self.learning_rate * tree.predict(X).astype(np.float32)
        return y_pred

    def predict_proba(self, X):
        """
        Return class probabilities (row-wise softmax of the logits).
        """
        y_raw = self.predict_raw(X)
        return softmax(y_raw, axis=1)

    def predict(self, X):
        """
        Return the predicted labels: the class with the highest probability.
        """
        probs = self.predict_proba(X)
        class_idx = np.argmax(probs, axis=1)
        return self.classes_[class_idx]


Обучим на данных бейзлайна

In [49]:
# Same hyperparameters as the sklearn baseline. random_state is not passed,
# so the row subsampling is unseeded and results vary between runs.
my_baseline_clf_params = {
    "n_estimators": 20,
    "learning_rate": 0.8,
    "max_depth": 1,
    "min_samples_split": 50,
    "subsample": 0.6,
}
gb = MyGradientBoostingClassifier(**my_baseline_clf_params)
gb.fit(X_c_base_train, y_c_base_train)

# Evaluate on the held-out part of the baseline split.
y_pred = gb.predict(X_c_base_test)
accuracy = accuracy_score(y_c_base_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4681


In [50]:
# Per-class precision / recall / F1 of the current `y_pred` against the baseline test labels.
print(classification_report(y_c_base_test, y_pred))

              precision    recall  f1-score   support

           0       0.38      0.28      0.32      1000
           1       0.59      0.62      0.61      1000
           2       0.38      0.37      0.38      1000
           3       0.71      0.79      0.75      1000
           4       0.55      0.44      0.49      1000
           5       0.44      0.40      0.42      1000
           6       0.42      0.42      0.42      1000
           7       0.41      0.22      0.29      1000
           8       0.42      0.31      0.35      1000
           9       0.39      0.83      0.53      1000

    accuracy                           0.47     10000
   macro avg       0.47      0.47      0.45     10000
weighted avg       0.47      0.47      0.45     10000



Обучим на данных улучшенной модели. Сразу будем использовать параметры, полученные при подборе гиперпараметров

In [51]:
# Reuse the values selected by the grid search for the sklearn model
# (n_estimators takes the place of max_iter).
my_tuned_clf_params = {
    "learning_rate": 0.1,
    "max_depth": 3,
    "n_estimators": 200,
    "min_samples_leaf": 50,
}
gb = MyGradientBoostingClassifier(**my_tuned_clf_params)
gb.fit(X_c_train, y_c_train)

# Hold-out accuracy and per-class report.
y_pred = gb.predict(X_c_test)
print("Score:", accuracy_score(y_c_test, y_pred))
print(classification_report(y_c_test, y_pred))

Score: 0.5431
              precision    recall  f1-score   support

           0       0.40      0.34      0.37      1000
           1       0.71      0.69      0.70      1000
           2       0.54      0.46      0.50      1000
           3       0.81      0.81      0.81      1000
           4       0.54      0.50      0.52      1000
           5       0.58      0.52      0.55      1000
           6       0.47      0.48      0.47      1000
           7       0.48      0.42      0.45      1000
           8       0.45      0.45      0.45      1000
           9       0.48      0.77      0.59      1000

    accuracy                           0.54     10000
   macro avg       0.54      0.54      0.54     10000
weighted avg       0.54      0.54      0.54     10000



Имеем скоры:

**Бейзлайн sklearn** - 0.5455

**Улучшенная модель sklearn** - 0.5813

**Мой бейзлайн** - 0.4681

**Мой улучшенный бейзлайн** - 0.5431

Моя улучшенная модель едва подбирается к бейзлайну sklearn. Но в целом получилось неплохо

##### Регрессор

In [None]:
class MyGradientBoostingRegressor(BaseGradientBoosting):
    """
    Gradient boosting for regression.

    Starts from the mean of the target and repeatedly fits a regression
    tree to the residuals ``y - y_pred``, adding its output scaled by
    ``learning_rate``.
    """

    def _init_prediction(self, y):
        """
        Initial prediction: the mean of the target, as a float32 scalar array.
        """
        return np.array(np.mean(y), dtype=np.float32)

    def _loss_gradient(self, y, y_pred):
        """
        Training signal for the next tree: the residual ``y - y_pred``.
        """
        return y - y_pred

    def fit(self, X, y):
        """
        Train the boosting ensemble.

        Parameters: ``X`` is array-like with one row per sample, ``y`` is
        array-like of numeric targets (converted to float32).

        Returns ``self``.

        Raises ``ValueError`` if ``subsample`` is not positive.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=np.float32)
        n_samples = X.shape[0]

        if self.subsample <= 0:
            raise ValueError(
                f"subsample must be positive, got {self.subsample!r}"
            )

        # Initial prediction (mean of the target) for every sample.
        self.initial_prediction = self._init_prediction(y)
        y_pred = np.full(y.shape, self.initial_prediction, dtype=np.float32)

        rng = np.random.default_rng(self.random_state)
        self.trees = []

        use_subsample = self.subsample < 1.0
        # Never ask for an empty subsample: a tree cannot be fitted on 0 rows.
        sample_size = max(1, int(n_samples * self.subsample))

        for _ in range(self.n_estimators):
            gradient = self._loss_gradient(y, y_pred)

            # Row subsample for this tree.
            if use_subsample:
                indices = rng.choice(n_samples, sample_size, replace=False)
                X_sub, grad_sub = X[indices], gradient[indices]
            else:
                X_sub, grad_sub = X, gradient

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                min_samples_split=self.min_samples_split,
                random_state=self.random_state
            )
            tree.fit(X_sub, grad_sub)

            # Update the running prediction on the full training set.
            update = tree.predict(X).astype(np.float32)
            y_pred += self.learning_rate * update

            self.trees.append(tree)

        return self

    def predict_raw(self, X):
        """
        Return the ensemble output for ``X``: the initial prediction plus
        the scaled output of every stored tree.
        """
        X = np.asarray(X)
        y_pred = np.full((X.shape[0],), self.initial_prediction, dtype=np.float32)
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X).astype(np.float32)
        return y_pred

    def predict(self, X):
        """
        Return the predictions for ``X`` (identical to ``predict_raw``).
        """
        return self.predict_raw(X)


Обучим на данных бейзлайна

In [54]:
# Same hyperparameters as the sklearn baseline regressor. random_state is not
# passed, so the row subsampling is unseeded and results vary between runs.
my_baseline_reg_params = {
    "n_estimators": 50,
    "learning_rate": 0.3,
    "max_depth": 2,
    "subsample": 0.6,
    "min_samples_leaf": 20,
}
gbr_bad = MyGradientBoostingRegressor(**my_baseline_reg_params)
gbr_bad.fit(X_r_base_train, y_r_base_train)

# Hold-out metrics.
y_pred = gbr_bad.predict(X_r_base_test)
mae = mean_absolute_error(y_r_base_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_r_base_test, y_pred))
r2 = r2_score(y_r_base_test, y_pred)

for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

MAE: 181877.8959808429
RMSE: 233175.45168499125
R2: 0.8319437230512261


Обучим на данных улучшенной модели. Сразу будем использовать параметры, полученные при подборе гиперпараметров

In [56]:
# Own regressor configured with the values selected by the grid search,
# placed behind the same preprocessing step as the sklearn model.
my_tuned_regressor = MyGradientBoostingRegressor(
    min_samples_split=2,
    n_estimators=300,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=20,
    subsample=1,
)
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", my_tuned_regressor),
])
pipe.fit(X_r_train, y_r_train)

# Hold-out metrics.
y_test_pred = pipe.predict(X_r_test)
mae = mean_absolute_error(y_r_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_r_test, y_test_pred))
r2 = r2_score(y_r_test, y_test_pred)

for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

MAE: 66664.9429322318
RMSE: 107814.45816134539
R2: 0.9640711684239329


Получили

| Метрика | Бейзлайн бустинга | Бейзлайн имплементации | Улучшенный бустинг | Улучшенная имплементация |
|-|-|-|-|-|
| MAE | 179591 | 181878 | 66673 | 66665 |
| RMSE | 232643 | 233175 | 107859 | 107814 |
| R2 | 0.833 | 0.832 | 0.964 | 0.964 |

Везде получились практически одинаковые метрики. Это успех