<a href="https://colab.research.google.com/github/Existanze54/sirius-machine-learning-2025/blob/main/Seminars/GenTech/S9_DT_RF_GT25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Семинар 9. Деревья решений и случайный лес

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.neighbors import KNeighborsClassifier as kNN

from sklearn.tree import DecisionTreeClassifier as TreeClass
from sklearn.tree import DecisionTreeRegressor as TreeReg

from sklearn.ensemble import BaggingClassifier as Bagg
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

### Задача 1. Визуальная оценка

#### Генерация данных

Помните домашку?

In [None]:
def f(x):
    """Noise-free target: a sine wave multiplied by a Gaussian envelope."""
    return np.sin(3*x) * np.exp(-(x/1.5)**2)


# Dense, evenly spaced grid used only to draw the noise-free curve.
x = np.arange(-3, 3.01, 0.01)
y = f(x)

# Training sample: random x positions plus Gaussian noise on the target.
# The seed is set right before the two draws (positions first, then noise),
# so re-running this cell always produces the same sample.
n_points = 75
np.random.seed(3)
x_train = np.random.uniform(low=-2.5, high=2.51, size=n_points)
noise = np.random.normal(loc=0, scale=0.15, size=n_points)
y_train = f(x_train) + noise

# Dashed line: true function; dots: noisy training points.
plt.plot(x, y, linestyle='--')
plt.scatter(x_train, y_train, s=20)
plt.xlim(-3, 3)
plt.show()

#### Построим деревянный регрессор

In [None]:
# Decision-tree regressor with no regularisation arguments passed.
# scikit-learn expects a 2-D feature matrix, so the 1-D inputs are turned
# into single-column arrays before fitting / predicting.
model = TreeReg().fit(x_train[:, np.newaxis], y_train)
y_pred = model.predict(x[:, np.newaxis])

# Dashed: noise-free function; dots: training sample; solid: tree prediction.
plt.plot(x, y, linestyle='--')
plt.scatter(x_train, y_train, s=20)
plt.plot(x, y_pred)
plt.xlim(-3, 3)
plt.show()

#### Давайте регуляризуем наше дерево

In [None]:
# Regularisation #1: cap the depth of the tree at 4 levels.
# Inputs are reshaped to single-column 2-D arrays, as scikit-learn requires.
model = TreeReg(max_depth=4).fit(x_train[:, np.newaxis], y_train)
y_pred = model.predict(x[:, np.newaxis])

# Dashed: noise-free function; dots: training sample; solid: tree prediction.
plt.plot(x, y, linestyle='--')
plt.scatter(x_train, y_train, s=20)
plt.plot(x, y_pred)
plt.xlim(-3, 3)
plt.show()

In [None]:
# Regularisation #2: require at least 3 training samples in every leaf.
# Inputs are reshaped to single-column 2-D arrays, as scikit-learn requires.
model = TreeReg(min_samples_leaf=3).fit(x_train[:, np.newaxis], y_train)
y_pred = model.predict(x[:, np.newaxis])

# Dashed: noise-free function; dots: training sample; solid: tree prediction.
plt.plot(x, y, linestyle='--')
plt.scatter(x_train, y_train, s=20)
plt.plot(x, y_pred)
plt.xlim(-3, 3)
plt.show()

In [None]:
# Regularisation #3: only split a node when the split lowers the impurity
# by at least 0.001.
# Inputs are reshaped to single-column 2-D arrays, as scikit-learn requires.
model = TreeReg(min_impurity_decrease=0.001).fit(x_train[:, np.newaxis], y_train)
y_pred = model.predict(x[:, np.newaxis])

# Dashed: noise-free function; dots: training sample; solid: tree prediction.
plt.plot(x, y, linestyle='--')
plt.scatter(x_train, y_train, s=20)
plt.plot(x, y_pred)
plt.xlim(-3, 3)
plt.show()

In [None]:
# Draw the structure of whatever tree `model` currently holds; when the
# notebook is run top to bottom that is the tree fitted in the previous cell.
from sklearn.tree import plot_tree

plot_tree(decision_tree=model)
plt.show()

In [None]:
# Random-forest regressor with the same leaf-size constraint as the single
# tree above; random_state is fixed so the forest is reproducible.
# Inputs are reshaped to single-column 2-D arrays, as scikit-learn requires.
model = RFR(random_state=0, min_samples_leaf=3)
model = model.fit(x_train[:, np.newaxis], y_train)
y_pred = model.predict(x[:, np.newaxis])

# Dashed: noise-free function; dots: training sample; solid: forest prediction.
plt.plot(x, y, linestyle='--')
plt.scatter(x_train, y_train, s=20)
plt.plot(x, y_pred)
plt.xlim(-3, 3)
plt.show()

### Задача 2. Бэггинг

In [None]:
# Silence scikit-learn's ConvergenceWarning for the rest of the kernel
# session so that solver messages do not flood the cell output.
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.simplefilter('ignore', ConvergenceWarning)

In [None]:
import seaborn as sns
from sklearn.datasets import load_breast_cancer

# Load the dataset as pandas objects (feature frame + target series).
data = load_breast_cancer(as_frame=True)

# Flip the 0/1 labels so that the class originally encoded as 0 becomes the
# positive class 1 (class 0 is "malignant" per scikit-learn's dataset docs).
y = 1 - data.target

# Standardise every feature to zero mean / unit variance; the result is a
# NumPy array, which replaces the original feature frame in `X`.
X = StandardScaler().fit_transform(data.data)

In [None]:
# Compare bagged ensembles of several base learners with a random forest.
# Every non-forest model is wrapped in a BaggingClassifier of 25 estimators;
# the random forest is already a bagged ensemble of trees and is used as is.
models = {
    'LogReg': LogReg(),
    'SVC': SVC(),
    'kNN': kNN(),
    'Tree': TreeClass(),
    'RF': RFC(random_state=0),
}

# NOTE: X and y are intentionally NOT reloaded here. The original cell called
# load_breast_cancer(return_X_y=True) at this point, which silently threw away
# the standardised features and the flipped target prepared in the previous
# cell. As a result the scale-sensitive models (LogReg, SVC, kNN) were trained
# on raw features and F1 was computed for the wrong positive class.
# Caveat: the scaler in the previous cell was fit on the full dataset, so the
# fold statistics leak slightly; put StandardScaler inside a Pipeline if a
# strictly unbiased estimate is needed.

scores = {}

for name, base in models.items():
    if name != 'RF':
        clf = Bagg(base, n_estimators=25, random_state=0)
    else:
        clf = base
    # 5-fold cross-validation; only the per-fold test F1 scores are kept.
    cv = cross_validate(clf, X, y, scoring='f1',
                        cv=5, n_jobs=-1,
                        return_train_score=False)
    scores[name] = cv['test_score']

# One column per model, one row per CV fold -> one box per model.
df = pd.DataFrame(scores)
sns.boxplot(data=df)
plt.ylabel('F1')

plt.show()

### Задача 3. Важность признаков

In [None]:
from sklearn.datasets import load_wine

# Wine dataset as a feature DataFrame and a target Series.
X, y = load_wine(return_X_y=True, as_frame=True)

# Fit a random forest with a fixed seed so the importances are reproducible.
model = RFC(random_state=0).fit(X, y)

# Label each importance with its column name and plot them in ascending
# order as a horizontal bar chart.
importances = pd.Series(model.feature_importances_, index=X.columns)
importances.sort_values().plot.barh()
plt.show()