<a href="https://colab.research.google.com/github/Existanze54/sirius-machine-learning-2025/blob/main/Seminars/GenTech/S6_CV_GT25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Семинар 6. Кросс-валидация и отбор признаков

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def train_test_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score its test predictions.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels the predictions are compared against.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``'Accuracy'``,
        ``'Precision'`` and ``'Recall'``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Column name -> scoring function; every scorer takes (y_true, y_pred).
    scorers = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
    }

    # Each value is wrapped in a list so the frame has exactly one row.
    return pd.DataFrame(
        {column: [scorer(y_test, predicted)] for column, scorer in scorers.items()}
    )

### 1. LogReg and Hold-out

#### Подготовка данных


Снова поработаем с данными о раке молочной железы — датасетом [UCI Breast Cancer Wisconsin (Diagnostic)](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29).

In [None]:
# Load the breast-cancer dataset; as_frame=True is requested so the features
# and the target can be handled with pandas methods below.
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# Class balance, reported with readable class names instead of integer codes.
y.map(lambda code: data.target_names[code]).value_counts()

Инвертируем таргет так, чтобы метка `1` обозначала злокачественную опухоль.

In [None]:
# Flip the binary 0/1 labels so that 1 marks the malignant class.
y = 1 - y

Разобьём данные на обучающую и тестовую выборки, обучим модель и оценим её качество на тесте.

In [None]:
# Hold out half of the samples for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=4,
)

# Learn the scaling statistics on the training part only and apply the same
# transformation to both parts.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline: logistic regression with the penalty switched off.
model = LogReg(penalty=None)
train_test_model(model, X_train, X_test, y_train, y_test)

Какая метрика нам важнее? Давайте отделим от обучающей выборки валидационную часть и подберём силу регуляризации так, чтобы максимизировать выбранную метрику на валидации.

In [None]:
# Keep references to the complete (already scaled) training set before
# X_train / y_train are rebound to the reduced part below.
X_train_full, y_train_full = X_train, y_train

# Split off a validation part, stratified by the labels; the split size is
# left at train_test_split's default.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    random_state=4,
)

In [None]:
# Candidate values of C (inverse regularisation strength) and the validation
# recall obtained for each of them, in the same order.
Cs = [0.01, 0.025, 0.05, 0.1, 0.5, 1, 5, 10, 100]
scores = []

# Settings shared by every candidate model; only C varies inside the loop.
l1_settings = dict(penalty='l1',
                   random_state=0,
                   max_iter=10000,
                   solver='saga',
                   n_jobs=-1)

for c in Cs:
    model = LogReg(C=c, **l1_settings)

    # Fit on the reduced training part, score on the validation part.
    result = train_test_model(model, X_train, X_val, y_train, y_val)
    val_recall = result['Recall'].item()
    scores.append(val_recall)

In [None]:
# Position of the highest validation recall (first one in case of ties).
best_idx = np.argmax(scores)
best_recall = scores[best_idx]
best_c = Cs[best_idx]

# The C values span several orders of magnitude, hence the log-scaled x axis.
plt.plot(Cs, scores)
plt.xscale('log')

plt.title(f"Best recall value is {best_recall:.2f} with C = {best_c}")
plt.show()

In [None]:
# Take the C with the highest validation recall from the search results
# instead of hard-coding it, so this cell cannot drift out of sync with the
# search if the grid, the seed or the data change (first maximum on ties).
best_c = Cs[int(np.argmax(scores))]

# Fit on the reduced training part (validation samples excluded) and evaluate
# on the held-out test set.
model = LogReg(penalty='l1', C=best_c, max_iter=10000, solver='saga', random_state=0)
train_test_model(model, X_train, X_test, y_train, y_test)

In [None]:
# Take the C with the highest validation recall from the search results
# instead of hard-coding it, so this cell cannot drift out of sync with the
# search if the grid, the seed or the data change (first maximum on ties).
best_c = Cs[int(np.argmax(scores))]

# Refit with the selected C on the complete training set (training plus
# validation parts) and evaluate on the held-out test set.
model = LogReg(penalty='l1', C=best_c, max_iter=10000, solver='saga', random_state=0)
train_test_model(model, X_train_full, X_test, y_train_full, y_test)

### 2. kNN and k-Folds

<img src="https://i.sstatic.net/m68dT.png" width='500'>

In [None]:
# Fresh 50/50 split with a different seed than the one used in the previous
# section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42,
)

# Scaling statistics come from the training part only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline kNN with 25 neighbours and otherwise default settings.
model = kNN(n_neighbors=25)
train_test_model(model, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: distance metric, neighbour weighting scheme and odd
# neighbour counts from 3 to 25.
grid = dict(
    metric=['euclidean', 'manhattan', 'cosine'],
    weights=['uniform', 'distance'],
    n_neighbors=range(3, 26, 2),
)

# Cross-validated exhaustive search that selects the combination with the
# best recall; the number of folds is left at GridSearchCV's default.
searcher = GridSearchCV(
    estimator=kNN(),
    param_grid=grid,
    scoring='recall',
    verbose=1,
    n_jobs=-1,
)
searcher.fit(X_train, y_train)

print('Best hyperparameters:', searcher.best_params_)

In [None]:
# One row per evaluated hyperparameter combination.
result = searcher.cv_results_
df = pd.DataFrame(result)

# Spread of the mean CV score across the other hyperparameters, grouped by the
# number of neighbours.
sns.boxplot(data=df, x='param_n_neighbors', y='mean_test_score')
plt.show()

In [None]:
# Build a fresh kNN with the hyperparameters chosen by the grid search, then
# fit it on the training set and evaluate on the held-out test set.
best_params = searcher.best_params_
model = kNN(**best_params)
train_test_model(model, X_train, X_test, y_train, y_test)

### 3. Задача

Работаем с тем же датасетом. Используя `GridSearchCV`, для `SVC` выберите ядро из списка `['linear', 'poly', 'rbf']` и регуляризующий параметр из `[0.01, 0.1, 1, 10, 100]`. Для полиномиального ядра также отберите степень из диапазона `range(2, 10)`. Как и прежде, максимизируйте $Recall$.

In [None]:
# your code here