Импорт библиотек

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
RANDOM_STATE = 42

In [3]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target

# 1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [64]:
# Разбиваем выборку на обучающую и тестовую с помьщью функции train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)  # добавляем random stete для повторяемости результата

# Выводим размеры тестовой и обучающей выборок и целевой переменной
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((39073, 14), (9769, 14), (39073,), (9769,))

# 2. Обучите стандартную регрессию, а также Ridge и Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

Линейная регрессия

In [5]:
linear = LinearRegression()
# обучаем модель на обучающей выборке
linear.fit(X_train, y_train)
# рассчет R2 для тестовой выборки
Linear_r2 = r2_score(y_test, linear.predict(X_test))

Линейная регрессия с регуляризацией L2 (Ridge) с параметрами по умолчанию

In [6]:
ridge = Ridge()
# обучаем модель на обучающей выборке
ridge.fit(X_train, y_train)
# рассчет R2 для тестовой выборки
Ridge_r2 = r2_score(y_test, ridge.predict(X_test))

Линейная регрессия с регуляризацией L1 (Lasso) с параметрами по умолчанию

In [7]:
lasso = Lasso()
# обучаем модель на обучающей выборке
lasso.fit(X_train, y_train)
# рассчет R2 для тестовой выборки
Lasso_r2 = r2_score(y_test, lasso.predict(X_test))

Вывод R2 (коэффициентов детерминации) для линейной регрессии, Ridge и Lasso

In [8]:
print(f"R2 Linear regression: {Linear_r2:.6f}")
print(f"R2 Ridge regression:  {Ridge_r2:.6f}")
print(f"R2 Lasso regression:  {Lasso_r2:.6f}")

R2 Linear regression: 0.668759
R2 Ridge regression:  0.666222
R2 Lasso regression:  0.667145


R2 для линейной регрессии, Ridge и Lasso с параметрами по умолчанию сопоставимы

# 3. Для Ridge и Lasso подберите коэффициент регуляризации (используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

GridSearchCV

In [9]:
parameters = {'alpha':np.logspace(-5, 5, 11)}

# подбираем коэффициент регуляризации для Ridge с помощью GridSearchCV
Ridge_GSCV = GridSearchCV(Ridge(), parameters)
Ridge_GSCV.fit(X_train, y_train)

# подбираем коэффициент регуляризации для Lasso с помощью GridSearchCV
Lasso_GSCV = GridSearchCV(Lasso(), parameters)
Lasso_GSCV.fit(X_train, y_train)

# Вывод лучших коэффициентов регуляризации
Ridge_GSCV.best_estimator_, Lasso_GSCV.best_estimator_

(Ridge(alpha=1e-05), Lasso(alpha=1e-05))

In [10]:
# Ridge регрессия с лучшим коэффициентом регуляризации
Ridge_GSCV_best = Ridge_GSCV.best_estimator_
# Рассчет R2 для тестовой выборки
Ridge_GSCV_r2 = r2_score(y_test, Ridge_GSCV_best.predict(X_test))

# Lasso регрессия с лучшим коэффициентом регуляризации
Lasso_GSCV_best = Lasso_GSCV.best_estimator_
# Рассчет R2 для тестовой выборки
Lasso_GSCV_r2 = r2_score(y_test, Lasso_GSCV_best.predict(X_test))

print(f"R2 Ridge regression with GridSearchCV: {Ridge_GSCV_r2:.6f}")
print(f"R2 Lasso regression with GridSearchCV: {Lasso_GSCV_r2:.6f}")

R2 Ridge regression with GridSearchCV: 0.668759
R2 Lasso regression with GridSearchCV: 0.668760


RidgeCV

In [11]:
# подбираем коэффициент регуляризации для Ridge с помощью RidgeCV
RidCV = RidgeCV(alphas = np.logspace(-5, 5, 11))
RidCV.fit(X_train, y_train)
print(f"alpha={RidCV.alpha_}")

alpha=0.01


В ходе теста определено, что предсказание происходит не по лучшей модели, а по какой-то средней

In [12]:
# Обучим модель с подобранным коэффициентом регуляризации с помощью RidgeCV
RidCV_best = Ridge(alpha=RidCV.alpha_)
RidCV_best.fit(X_train, y_train)
# Рассчет R2 для тестовой выборки
Ridge_RidCV_r2 = r2_score(y_test, RidCV_best.predict(X_test))
print(f"R2 Ridge regression with RidgeCV: {Ridge_RidCV_r2:.6f}")

R2 Ridge regression with RidgeCV: 0.668751


LassoCV

In [13]:
# подбираем коэффициент регуляризации для Lasso с помощью LassoCV
LassCV = LassoCV(alphas = np.logspace(-5, 5, 11))
LassCV.fit(X_train, y_train)
print(f"alpha={LassCV.alpha_:.5f}")

alpha=0.00001


В ходе теста определено, что предсказание происходит по лучшей модели

In [14]:
# Расчет R2 для тестовой выборки
Lasso_LassCV_r2 = r2_score(y_test, LassCV.predict(X_test))
print(f"R2 Lasso regression with LassoCV: {Lasso_LassCV_r2:.6f}")

R2 Lasso regression with LassoCV: 0.668760


Сравнение R2

In [15]:
print(f"R2 Linear regression:                 {Linear_r2:.6f}")
print()
print(f"R2 Ridge regression Default:          {Ridge_r2:.6f}")
print(f"R2 Ridge regression withGridSearchCV: {Ridge_GSCV_r2:.6f}")
print(f"R2 Ridge regression withRidgeCV:      {Ridge_RidCV_r2:.6f}")
print()
print(f"R2 Lasso regression Default:          {Lasso_r2:.6f}")
print(f"R2 Lasso regression withGridSearchCV: {Lasso_GSCV_r2:.6f}")
print(f"R2 Lasso regression withLassoCV:      {Lasso_LassCV_r2:.6f}")

R2 Linear regression:                 0.668759

R2 Ridge regression Default:          0.666222
R2 Ridge regression withGridSearchCV: 0.668759
R2 Ridge regression withRidgeCV:      0.668751

R2 Lasso regression Default:          0.667145
R2 Lasso regression withGridSearchCV: 0.668760
R2 Lasso regression withLassoCV:      0.668760


Выводы:  
- R2 для Ridge с подобранными коэффициентами регуляризации с помощью GridSearchCV и RidgeCV незначительно различаются (коэффициенты отличаются);  
- R2 для Lasso с подобранными коэффициентами регуляризации с помощью GridSearchCV и LassoCV не различаются (коэффициенты одинаковы);
- R2 и для Ridge и для Lasso с подобранными коэффициентами регуляризации несколько лучше, чем для Ridge и для Lasso с параметрами по умолчанию (для данных выборок) - на 0,25% для Ridge  и на 0,15% для Lasso  
- R2 и для Ridge и для Lasso с подобранными коэффициентами регуляризации практачески равен R2 для линейной регрессии (аналитический метод)

# 4. Проведите масштабирование выборки (используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

StandardScaler

In [16]:
# Линейная регрессия на масштабированных данных (StandardScaler)
Linear_StScal = Pipeline(steps = [("scaler", StandardScaler()), ("regression", LinearRegression())])
Linear_StScal.fit(X_train, y_train)

# Расчет R2 для тестовой выборки
Linear_StScal_r2 = r2_score(y_test, Linear_StScal.predict(X_test))
print(f"R2 Linear regression with StandardScaler: {Linear_StScal_r2:.6f}")

R2 Linear regression with StandardScaler: 0.668759


In [17]:
# Ridge регрессия на масштабированных данных (StandardScaler)
Ridge_StScal = Pipeline(steps = [("scaler", StandardScaler()), ("regression", Ridge())])
Ridge_StScal.fit(X_train, y_train)

# Расчет R2 для тестовой выборки
Ridge_StScal_r2 = r2_score(y_test, Ridge_StScal.predict(X_test))
print(f"R2 Ridge regression with StandardScaler: {Ridge_StScal_r2:.6f}")

R2 Ridge regression with StandardScaler: 0.668462


In [18]:
# Lasso регрессия на масштабированных данных (StandardScaler)
Lasso_StScal = Pipeline(steps = [("scaler", StandardScaler()), ("regression", Lasso())])
Lasso_StScal.fit(X_train, y_train)

# Расчет R2 для тестовой выборки
Lasso_StScal_r2 = r2_score(y_test, Lasso_StScal.predict(X_test))
print(f"R2 Lasso regression with StandardScaler: {Lasso_StScal_r2:.6f}")

R2 Lasso regression with StandardScaler: 0.623943


MinMaxScaler

In [19]:
# Линейная регрессия на масштабированных данных (MinMaxScaler)
Linear_MinMaxScal = Pipeline(steps = [("scaler", MinMaxScaler()), ("regression", LinearRegression())])
Linear_MinMaxScal.fit(X_train, y_train)

# Расчет R2 для тестовой выборки
Linear_MinMaxScal_r2 = r2_score(y_test, Linear_MinMaxScal.predict(X_test))
print(f"R2 Linear regression with MinMaxScaler: {Linear_MinMaxScal_r2:.6f}")

R2 Linear regression with MinMaxScaler: 0.668759


In [20]:
# Ridge регрессия на масштабированных данных (MinMaxScaler)
Ridge_MinMaxScal = Pipeline(steps = [("scaler", MinMaxScaler()), ("regression", Ridge())])
Ridge_MinMaxScal.fit(X_train, y_train)

# Расчет R2 для тестовой выборки
Ridge_MinMaxScal_r2 = r2_score(y_test, Ridge_MinMaxScal.predict(X_test))
print(f"R2 Ridge regression with MinMaxScaler: {Ridge_MinMaxScal_r2:.6f}")

R2 Ridge regression with MinMaxScaler: 0.676410


In [21]:
# Lasso регрессия на масштабированных данных (MinMaxScaler)
Lasso_MinMaxScal = Pipeline(steps = [("scaler", MinMaxScaler()), ("regression", Lasso())])
Lasso_MinMaxScal.fit(X_train, y_train)

# Расчет R2 для тестовой выборки
Lasso_MinMaxScal_r2 = r2_score(y_test, Lasso_MinMaxScal.predict(X_test))
print(f"R2 Lasso regression with MinMaxScaler: {Lasso_MinMaxScal_r2:.6f}")

R2 Lasso regression with MinMaxScaler: 0.257392


Сравнение R2

In [22]:
print(f"R2 Linear regression:                     {Linear_r2:.6f}")
print(f"R2 Linear regression with StandardScaler: {Linear_StScal_r2:.6f}")
print(f"R2 Linear regression with MinMaxScaler:   {Linear_MinMaxScal_r2:.6f}")
print()
print(f"R2 Ridge regression Default:              {Ridge_r2:.6f}")
print(f"R2 Ridge regression with GridSearchCV:    {Ridge_GSCV_r2:.6f}")
print(f"R2 Ridge regression with RidgeCV:         {Ridge_RidCV_r2:.6f}")
print(f"R2 Ridge regression with StandardScaler:  {Ridge_StScal_r2:.6f}")
print(f"R2 Ridge regression with MinMaxScaler:    {Ridge_MinMaxScal_r2:.6f}")
print()
print(f"R2 Lasso regression Default:              {Lasso_r2:.6f}")
print(f"R2 Lasso regression with GridSearchCV:    {Lasso_GSCV_r2:.6f}")
print(f"R2 Lasso regression with LassoCV:         {Lasso_LassCV_r2:.6f}")
print(f"R2 Lasso regression with StandardScaler:  {Lasso_StScal_r2:.6f}")
print(f"R2 Lasso regression with MinMaxScaler:    {Lasso_MinMaxScal_r2:.6f}")

R2 Linear regression:                     0.668759
R2 Linear regression with StandardScaler: 0.668759
R2 Linear regression with MinMaxScaler:   0.668759

R2 Ridge regression Default:              0.666222
R2 Ridge regression with GridSearchCV:    0.668759
R2 Ridge regression with RidgeCV:         0.668751
R2 Ridge regression with StandardScaler:  0.668462
R2 Ridge regression with MinMaxScaler:    0.676410

R2 Lasso regression Default:              0.667145
R2 Lasso regression with GridSearchCV:    0.668760
R2 Lasso regression with LassoCV:         0.668760
R2 Lasso regression with StandardScaler:  0.623943
R2 Lasso regression with MinMaxScaler:    0.257392


Выводы:
- на линейной регрессии масштабирование не сказалось;
- для Ridge регрессии с масштабированием результат лучше, чем без масштабирования, особенно с помощью MinMaxScaler (R2 выше почти на 0,01);
- для Lasso регрессии с масштабированием результат оказался гораздо хуже, чем без масштабирования, особенно с помощью MinMaxScaler.

# 5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

Подбор коэффициентов регуляризации для Ridge и Lasso на масштабированных данных с помощью GridSearchCV

In [23]:
parameters = {'regression__alpha':np.logspace(-5, 5, 11)}


# создаем pipline (StandardScaler и Ridge регрессия)
Ridge_StScal = Pipeline(steps = [("scaler", StandardScaler()), ("regression", Ridge())])
# подбираем коэффициент регуляризации для pipline с помощью GridSearchCV
Ridge_StScal_GSCV = GridSearchCV(Ridge_StScal, parameters)
Ridge_StScal_GSCV.fit(X_train, y_train)
# расчет R2 на тестовых данных
Ridge_StScal_GSCV_r2 = r2_score(y_test, Ridge_StScal_GSCV.predict(X_test))


# создаем pipline (MinMaxScaler и Ridge регрессия)
Ridge_MinMaxScal = Pipeline(steps = [("scaler", MinMaxScaler()), ("regression", Ridge())])
# подбираем коэффициент регуляризации для pipline с помощью GridSearchCV
Ridge_MinMaxScal_GSCV = GridSearchCV(Ridge_MinMaxScal, parameters)
Ridge_MinMaxScal_GSCV.fit(X_train, y_train)
# расчет R2 на тестовых данных
Ridge_MinMaxScal_GSCV_r2 = r2_score(y_test, Ridge_MinMaxScal_GSCV.predict(X_test))


# создаем pipline (StandardScaler и Lasso регрессия)
Lasso_StScal = Pipeline(steps = [("scaler", StandardScaler()), ("regression", Lasso())])
# подбираем коэффициент регуляризации для pipline с помощью GridSearchCV
Lasso_StScal_GSCV = GridSearchCV(Lasso_StScal, parameters)
Lasso_StScal_GSCV.fit(X_train, y_train)
# расчет R2 на тестовых данных
Lasso_StScal_GSCV_r2 = r2_score(y_test, Lasso_StScal_GSCV.predict(X_test))


# создаем pipline (MinMaxScaler и Lasso регрессия)
Lasso_MinMaxScal = Pipeline(steps = [("scaler", MinMaxScaler()), ("regression", Lasso())])
# подбираем коэффициент регуляризации для pipline с помощью GridSearchCV
Lasso_MinMaxScal_GSCV = GridSearchCV(Lasso_MinMaxScal, parameters)
Lasso_MinMaxScal_GSCV.fit(X_train, y_train)
# расчет R2 на тестовых данных
Lasso_MinMaxScal_GSCV_r2 = r2_score(y_test, Lasso_MinMaxScal_GSCV.predict(X_test))


# Вывод лучших коэффициентов регуляризации
print(f"Подобранный с помощью GridSearchCV коэффициент регулярязации для Ridge на масштабированных данных StandardScaler: {Ridge_StScal_GSCV.best_params_.get('regression__alpha')}")
print(f"Подобранный с помощью GridSearchCV коэффициент регулярязации для Ridge на масштабированных данных MinMaxScaler:   {Ridge_MinMaxScal_GSCV.best_params_.get('regression__alpha')}")
print(f"Подобранный с помощью GridSearchCV коэффициент регулярязации для Lasso на масштабированных данных StandardScaler: {Lasso_StScal_GSCV.best_params_.get('regression__alpha'):.5f}")
print(f"Подобранный с помощью GridSearchCV коэффициент регулярязации для Lasso на масштабированных данных MinMaxScaler:   {Lasso_MinMaxScal_GSCV.best_params_.get('regression__alpha'):.5f}")
print()

# Вывод R2
print(f"R2 Ridge regression with StandardScaler & GridSearchCV: {Ridge_StScal_GSCV_r2:.6f}")
print(f"R2 Ridge regression with MinMaxScaler   & GridSearchCV: {Ridge_MinMaxScal_GSCV_r2:.6f}")
print(f"R2 Lasso regression with StandardScaler & GridSearchCV: {Lasso_StScal_GSCV_r2:.6f}")
print(f"R2 Lasso regression with MinMaxScaler   & GridSearchCV: {Lasso_MinMaxScal_GSCV_r2:.6f}")

Подобранный с помощью GridSearchCV коэффициент регулярязации для Ridge на масштабированных данных StandardScaler: 1.0
Подобранный с помощью GridSearchCV коэффициент регулярязации для Ridge на масштабированных данных MinMaxScaler:   0.1
Подобранный с помощью GridSearchCV коэффициент регулярязации для Lasso на масштабированных данных StandardScaler: 0.00001
Подобранный с помощью GridSearchCV коэффициент регулярязации для Lasso на масштабированных данных MinMaxScaler:   0.00001

R2 Ridge regression with StandardScaler & GridSearchCV: 0.668462
R2 Ridge regression with MinMaxScaler   & GridSearchCV: 0.670031
R2 Lasso regression with StandardScaler & GridSearchCV: 0.668759
R2 Lasso regression with MinMaxScaler   & GridSearchCV: 0.668761


Подбор коэффициентов регуляризации для Ridge на масштабированных данных с помощью RidgeCV

In [24]:
# создаем pipline (StandardScaler и RidgeCV регрессия)
Ridge_StScal_RidCV = Pipeline(steps = [("scaler", StandardScaler()), ("regression", RidgeCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью RidgeCV
Ridge_StScal_RidCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Ridge_StScal_RidCV_r2 = r2_score(y_test, Ridge_StScal_RidCV.predict(X_test))

# создаем pipline (MinMaxScaler и RidgeCV регрессия)
Ridge_MinMaxScal_RidCV = Pipeline(steps = [("scaler", MinMaxScaler()), ("regression", RidgeCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью RidgeCV
Ridge_MinMaxScal_RidCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Ridge_MinMaxScal_RidCV_r2 = r2_score(y_test, Ridge_MinMaxScal_RidCV.predict(X_test))

# выводим коэффициенты регуляризации и R2
print("Опытным путем определено, что RidgeCV на масштабированных данных с помощью StandardScaler подобран коэффициент регуляризации 10")
print("Опытным путем определено, что RidgeCV на масштабированных данных с помощью MinMaxScaler подобран коэффициент регуляризации  0.1")
print()
print(f"R2 Ridge regression with StandardScaler & RidgeCV: {Ridge_StScal_RidCV_r2:.6f}")
print(f"R2 Ridge regression with MinMaxScaler   & RidgeCV: {Ridge_MinMaxScal_RidCV_r2:.6f}")

Опытным путем определено, что RidgeCV на масштабированных данных с помощью StandardScaler подобран коэффициент регуляризации 10
Опытным путем определено, что RidgeCV на масштабированных данных с помощью MinMaxScaler подобран коэффициент регуляризации  0.1

R2 Ridge regression with StandardScaler & RidgeCV: 0.665968
R2 Ridge regression with MinMaxScaler   & RidgeCV: 0.670031


Подбор коэффициентов регуляризации для Lasso на масштабированных данных с помощью LassoCV

In [25]:
# создаем pipline (StandardScaler и LassoCV регрессия)
Lasso_StScal_LassCV = Pipeline(steps = [("scaler", StandardScaler()), ("regression", LassoCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью LassoCV
Lasso_StScal_LassCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Lasso_StScal_LassCV_r2 = r2_score(y_test, Lasso_StScal_LassCV.predict(X_test))

# создаем pipline (MinMaxScaler и LassoCV регрессия)
Lasso_MinMaxScal_LassCV = Pipeline(steps = [("scaler", MinMaxScaler()), ("regression", LassoCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью LassoCV
Lasso_MinMaxScal_LassCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Lasso_MinMaxScal_LassCV_r2 = r2_score(y_test, Lasso_MinMaxScal_LassCV.predict(X_test))

# выводим коэффициенты регуляризации и R2
print("Опытным путем определено, что LassoCV на масштабированных данных с помощью StandardScaler подобран коэффициент регуляризации 0.00001")
print("Опытным путем определено, что LassoCV на масштабированных данных с помощью MinMaxScaler подобран коэффициент регуляризации   0.00001")
print()
print(f"R2 Lasso regression with StandardScaler & LassoCV: {Lasso_StScal_LassCV_r2:.6f}")
print(f"R2 Lasso regression with MinMaxScaler   & LassoCV: {Lasso_MinMaxScal_LassCV_r2:.6f}")

Опытным путем определено, что LassoCV на масштабированных данных с помощью StandardScaler подобран коэффициент регуляризации 0.00001
Опытным путем определено, что LassoCV на масштабированных данных с помощью MinMaxScaler подобран коэффициент регуляризации   0.00001

R2 Lasso regression with StandardScaler & LassoCV: 0.668759
R2 Lasso regression with MinMaxScaler   & LassoCV: 0.668761


In [26]:
# сведем результаты расчета r2 в таблицу
r2_result = pd.DataFrame(data = [[np.NaN, Linear_r2, np.NaN, Linear_StScal_r2, np.NaN, Linear_MinMaxScal_r2],
                                 [1, Ridge_r2, 1, Ridge_StScal_r2, 1, Ridge_MinMaxScal_r2],
                                 [Ridge_GSCV.best_params_.get('alpha'), Ridge_GSCV_r2, Ridge_StScal_GSCV.best_params_.get('regression__alpha'), Ridge_StScal_GSCV_r2, Ridge_MinMaxScal_GSCV.best_params_.get('regression__alpha'), Ridge_MinMaxScal_GSCV_r2],
                                 [RidCV.alpha_, Ridge_RidCV_r2, 10, Ridge_StScal_RidCV_r2, 0.1, Ridge_MinMaxScal_RidCV_r2],
                                 [1, Lasso_r2, 1, Lasso_StScal_r2, 1, Lasso_MinMaxScal_r2],
                                 [Lasso_GSCV.best_params_.get('alpha'), Lasso_GSCV_r2, Lasso_StScal_GSCV.best_params_.get('regression__alpha'), Lasso_StScal_GSCV_r2, Lasso_MinMaxScal_GSCV.best_params_.get('regression__alpha'), Lasso_MinMaxScal_GSCV_r2],
                                 [LassCV.alpha_, Lasso_LassCV_r2, 0.00001, Lasso_StScal_LassCV_r2, 0.00001, Lasso_MinMaxScal_LassCV_r2]],
                         index=pd.MultiIndex.from_tuples([('Linear', 'No_alpha'),
                                                          ('Ridge', 'Default'), ('Ridge', 'GridSearchCV_best'), ('Ridge', 'RidgeCV_best'),
                                                          ('Lasso', 'Default'), ('Lasso', 'GridSearchCV_best'), ('Lasso', 'LassoCV_best')],
                                                         names=['Regression', 'Select_alpha']),
                         columns = pd.MultiIndex.from_tuples([('No_scaler', 'Alpha'), ('No_scaler', 'R2'),
                                                              ('StandardScaler', 'Alpha'), ('StandardScaler', 'R2'),
                                                              ('MinMaxScaler', 'Alpha'), ('MinMaxScaler', 'R2')],
                                                             names=['Scaler', 'Params']))
r2_result

Unnamed: 0_level_0,Scaler,No_scaler,No_scaler,StandardScaler,StandardScaler,MinMaxScaler,MinMaxScaler
Unnamed: 0_level_1,Params,Alpha,R2,Alpha,R2,Alpha,R2
Regression,Select_alpha,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Linear,No_alpha,,0.668759,,0.668759,,0.668759
Ridge,Default,1.0,0.666222,1.0,0.668462,1.0,0.67641
Ridge,GridSearchCV_best,1e-05,0.668759,1.0,0.668462,0.1,0.670031
Ridge,RidgeCV_best,0.01,0.668751,10.0,0.665968,0.1,0.670031
Lasso,Default,1.0,0.667145,1.0,0.623943,1.0,0.257392
Lasso,GridSearchCV_best,1e-05,0.66876,1e-05,0.668759,1e-05,0.668761
Lasso,LassoCV_best,1e-05,0.66876,1e-05,0.668759,1e-05,0.668761


Выводы:
- для Ridge регрессии на масштабированных данных подбор коэффициентов регуляризации назначительно ухудшил качество модели;
- для Lasso регрессии на масштабированных данных подбор коэффициентов регуляризации значительно улучшил качество модели (стало сопоставимо с подбором коэффициентов без масштабирования);

# 6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

Добавление попарных произведений масштабированных признаков и их квадратов с коэффициентом регуляризации по умолчанию

In [27]:
# создаем pipline (StandardScaler, PolynomialFeatures и линейная регрессия)
Linear_StScal_Polynom = Pipeline(steps = [("scaler", StandardScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", LinearRegression())])
Linear_StScal_Polynom.fit(X_train, y_train)
Linear_StScal_Polynom.predict(X_test)
Linear_StScal_Polynom_r2 = r2_score(y_test, Linear_StScal_Polynom.predict(X_test))

# создаем pipline (MinMaxScaler, PolynomialFeatures и линейная регрессия)
Linear_MinMaxScal_Polynom = Pipeline(steps = [("scaler", MinMaxScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", LinearRegression())])
Linear_MinMaxScal_Polynom.fit(X_train, y_train)
Linear_MinMaxScal_Polynom.predict(X_test)
Linear_MinMaxScal_Polynom_r2 = r2_score(y_test, Linear_MinMaxScal_Polynom.predict(X_test))

# создаем pipline (StandardScaler, PolynomialFeatures и Ridge регрессия)
Ridge_StScal_Polynom = Pipeline(steps = [("scaler", StandardScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", Ridge())])
Ridge_StScal_Polynom.fit(X_train, y_train)
Ridge_StScal_Polynom.predict(X_test)
Ridge_StScal_Polynom_r2 = r2_score(y_test, Ridge_StScal_Polynom.predict(X_test))
# создаем pipline (MinMaxScaler, PolynomialFeatures и Ridge регрессия)
Ridge_MinMaxScal_Polynom = Pipeline(steps = [("scaler", MinMaxScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", Ridge())])
Ridge_MinMaxScal_Polynom.fit(X_train, y_train)
Ridge_MinMaxScal_Polynom.predict(X_test)
Ridge_MinMaxScal_Polynom_r2 = r2_score(y_test, Ridge_MinMaxScal_Polynom.predict(X_test))

# создаем pipline (StandardScaler, PolynomialFeatures и Lasso регрессия)
Lasso_StScal_Polynom = Pipeline(steps = [("scaler", StandardScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", Lasso())])
Lasso_StScal_Polynom.fit(X_train, y_train)
Lasso_StScal_Polynom.predict(X_test)
Lasso_StScal_Polynom_r2 = r2_score(y_test, Lasso_StScal_Polynom.predict(X_test))

# создаем pipline (MinMaxScaler, PolynomialFeatures и Lasso регрессия)
Lasso_MinMaxScal_Polynom = Pipeline(steps = [("scaler", MinMaxScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", Lasso())])
Lasso_MinMaxScal_Polynom.fit(X_train, y_train)
Lasso_MinMaxScal_Polynom.predict(X_test)
Lasso_MinMaxScal_Polynom_r2 = r2_score(y_test, Lasso_MinMaxScal_Polynom.predict(X_test))

# вывод R2
print(f"R2 Linear regression with StandardScaler & PolynomialFeatures(degree=2): {Linear_StScal_Polynom_r2:.6f}")
print(f"R2 Linear regression with MinMaxScaler   & PolynomialFeatures(degree=2): {Linear_MinMaxScal_Polynom_r2:.6f}")
print(f"R2 Ridge regression with StandardScaler  & PolynomialFeatures(degree=2): {Ridge_StScal_Polynom_r2:.6f}")
print(f"R2 Ridge regression with MinMaxScaler    & PolynomialFeatures(degree=2): {Ridge_MinMaxScal_Polynom_r2:.6f}")
print(f"R2 Lasso regression with StandardScaler  & PolynomialFeatures(degree=2): {Lasso_StScal_Polynom_r2:.6f}")
print(f"R2 Lasso regression with MinMaxScaler    & PolynomialFeatures(degree=2): {Lasso_MinMaxScal_Polynom_r2:.6f}")

R2 Linear regression with StandardScaler & PolynomialFeatures(degree=2): 0.806307
R2 Linear regression with MinMaxScaler   & PolynomialFeatures(degree=2): 0.803658
R2 Ridge regression with StandardScaler  & PolynomialFeatures(degree=2): 0.816295
R2 Ridge regression with MinMaxScaler    & PolynomialFeatures(degree=2): 0.829934
R2 Lasso regression with StandardScaler  & PolynomialFeatures(degree=2): 0.732276
R2 Lasso regression with MinMaxScaler    & PolynomialFeatures(degree=2): 0.261126


### Расширим задание - посчитаем R2 на масштабированных и немасштабированных признаках с подбором коэффициента регуляризации

In [28]:
parameters = {'regression__alpha':np.logspace(-5, 5, 11)}

# создаем pipline (PolynomialFeatures и линейная регрессия)
Linear_Polynom = Pipeline(steps = [("polynomial", PolynomialFeatures(degree=2)), ("regression", LinearRegression())])
Linear_Polynom.fit(X_train, y_train)
Linear_Polynom.predict(X_test)
Linear_Polynom_r2 = r2_score(y_test, Linear_Polynom.predict(X_test))

# создаем pipline (PolynomialFeatures и Ridge регрессия)
Ridge_Polynom = Pipeline(steps = [("polynomial", PolynomialFeatures(degree=2)), ("regression", Ridge())])
Ridge_Polynom.fit(X_train, y_train)
Ridge_Polynom.predict(X_test)
Ridge_Polynom_r2 = r2_score(y_test, Ridge_Polynom.predict(X_test))

# создаем pipline (PolynomialFeatures и Ridge регрессия) с подбором коэффициента с помощью GridSearchCV
Ridge_Polynom = Pipeline(steps = [("polynomial", PolynomialFeatures(degree=2)), ("regression", Ridge())])
Ridge_Polynom_GSCV = GridSearchCV(Ridge_Polynom, parameters)
Ridge_Polynom_GSCV.fit(X_train, y_train)
Ridge_Polynom_GSCV_r2 = r2_score(y_test, Ridge_Polynom_GSCV.predict(X_test))

# создаем pipline (StandardScaler, PolynomialFeatures и Ridge регрессия) с подбором коэффициента с помощью GridSearchCV
Ridge_StScal_Polynom = Pipeline(steps = [("scaler", StandardScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", Ridge())])
Ridge_StScal_Polynom_GSCV = GridSearchCV(Ridge_StScal_Polynom, parameters)
Ridge_StScal_Polynom_GSCV.fit(X_train, y_train)
Ridge_StScal_Polynom_GSCV_r2 = r2_score(y_test, Ridge_StScal_Polynom_GSCV.predict(X_test))

# создаем pipline (MinMaxScaler, PolynomialFeatures и Ridge регрессия) с подбором коэффициента с помощью GridSearchCV
Ridge_MinMaxScal_Polynom = Pipeline(steps = [("scaler", MinMaxScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", Ridge())])
Ridge_MinMaxScal_Polynom_GSCV = GridSearchCV(Ridge_MinMaxScal_Polynom, parameters)
Ridge_MinMaxScal_Polynom_GSCV.fit(X_train, y_train)
Ridge_MinMaxScal_Polynom_GSCV_r2 = r2_score(y_test, Ridge_MinMaxScal_Polynom_GSCV.predict(X_test))

# создаем pipline (PolynomialFeatures и RidgeCV для подбора коэффициента)
Ridge_Polynom_RidCV = Pipeline(steps = [("polynomial", PolynomialFeatures(degree=2)), ("regression", RidgeCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью RidgeCV
Ridge_Polynom_RidCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Ridge_Polynom_RidCV_r2 = r2_score(y_test, Ridge_Polynom_RidCV.predict(X_test))

# создаем pipline (StandardScaler, PolynomialFeatures и RidgeCV для подбора коэффициента)
Ridge_StScal_Polynom_RidCV = Pipeline(steps = [("scaler", StandardScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", RidgeCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью RidgeCV
Ridge_StScal_Polynom_RidCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Ridge_StScal_Polynom_RidCV_r2 = r2_score(y_test, Ridge_StScal_Polynom_RidCV.predict(X_test))

# создаем pipline (StandardScaler, PolynomialFeatures и RidgeCV для подбора коэффициента)
Ridge_MinMaxScal_Polynom_RidCV = Pipeline(steps = [("scaler", MinMaxScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", RidgeCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью RidgeCV
Ridge_MinMaxScal_Polynom_RidCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Ridge_MinMaxScal_Polynom_RidCV_r2 = r2_score(y_test, Ridge_MinMaxScal_Polynom_RidCV.predict(X_test))

# создаем pipline (PolynomialFeatures и Lasso регрессия)
Lasso_Polynom = Pipeline(steps = [("polynomial", PolynomialFeatures(degree=2)), ("regression", Lasso())])
Lasso_Polynom.fit(X_train, y_train)
Lasso_Polynom.predict(X_test)
Lasso_Polynom_r2 = r2_score(y_test, Lasso_Polynom.predict(X_test))

# создаем pipline (PolynomialFeatures и Lasso регрессия) с подбором коэффициента с помощью GridSearchCV
Lasso_Polynom = Pipeline(steps = [("polynomial", PolynomialFeatures(degree=2)), ("regression", Lasso())])
Lasso_Polynom_GSCV = GridSearchCV(Lasso_Polynom, parameters)
Lasso_Polynom_GSCV.fit(X_train, y_train)
Lasso_Polynom_GSCV_r2 = r2_score(y_test, Lasso_Polynom_GSCV.predict(X_test))

# создаем pipline (StandardScaler, PolynomialFeatures и Lasso регрессия) с подбором коэффициента с помощью GridSearchCV
Lasso_StScal_Polynom = Pipeline(steps = [("scaler", StandardScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", Lasso())])
Lasso_StScal_Polynom_GSCV = GridSearchCV(Lasso_StScal_Polynom, parameters)
Lasso_StScal_Polynom_GSCV.fit(X_train, y_train)
Lasso_StScal_Polynom_GSCV_r2 = r2_score(y_test, Lasso_StScal_Polynom_GSCV.predict(X_test))

# создаем pipline (MinMaxScaler, PolynomialFeatures и Lasso регрессия) с подбором коэффициента с помощью GridSearchCV
Lasso_MinMaxScal_Polynom = Pipeline(steps = [("scaler", MinMaxScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", Lasso())])
Lasso_MinMaxScal_Polynom_GSCV = GridSearchCV(Lasso_MinMaxScal_Polynom, parameters)
Lasso_MinMaxScal_Polynom_GSCV.fit(X_train, y_train)
Lasso_MinMaxScal_Polynom_GSCV_r2 = r2_score(y_test, Lasso_MinMaxScal_Polynom_GSCV.predict(X_test))

# создаем pipline (PolynomialFeatures и LassoCV для подбора коэффициента)
Lasso_Polynom_RidCV = Pipeline(steps = [("polynomial", PolynomialFeatures(degree=2)), ("regression", LassoCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью LassoCV
Lasso_Polynom_RidCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Lasso_Polynom_RidCV_r2 = r2_score(y_test, Lasso_Polynom_RidCV.predict(X_test))

# создаем pipline (StandardScaler, PolynomialFeatures и LassoCV для подбора коэффициента)
Lasso_StScal_Polynom_RidCV = Pipeline(steps = [("scaler", StandardScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", LassoCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью LassoCV
Lasso_StScal_Polynom_RidCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Lasso_StScal_Polynom_RidCV_r2 = r2_score(y_test, Lasso_StScal_Polynom_RidCV.predict(X_test))

# создаем pipline (StandardScaler, PolynomialFeatures и LassoCV для подбора коэффициента)
Lasso_MinMaxScal_Polynom_RidCV = Pipeline(steps = [("scaler", MinMaxScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("regression", LassoCV(alphas=np.logspace(-5, 5, 11)))])
# подбираем коэффициент регуляризации для pipline с помощью LassoCV
Lasso_MinMaxScal_Polynom_RidCV.fit(X_train, y_train)
# рассчет R2 на тестовых данных
Lasso_MinMaxScal_Polynom_RidCV_r2 = r2_score(y_test, Lasso_MinMaxScal_Polynom_RidCV.predict(X_test))

# вывод R2
print(f"R2 Linear regression with PolynomialFeatures(degree=2):                                {Linear_Polynom_r2:.6f}")
print(f"R2 Ridge regression with PolynomialFeatures(degree=2):                                 {Ridge_Polynom_r2:.6f}")
print(f"R2 Ridge regression with PolynomialFeatures(degree=2) & GridSearchCV:                  {Ridge_Polynom_GSCV_r2:.6f}")
print(f"R2 Ridge regression with StandardScaler & PolynomialFeatures(degree=2) & GridSearchCV: {Ridge_StScal_Polynom_GSCV_r2:.6f}")
print(f"R2 Ridge regression with MinMaxScaler & PolynomialFeatures(degree=2) & GridSearchCV:   {Ridge_MinMaxScal_Polynom_GSCV_r2:.6f}")
print(f"R2 Ridge regression with PolynomialFeatures(degree=2) & RidgeCV:                       {Ridge_Polynom_RidCV_r2:.6f}")
print(f"R2 Ridge regression with StandardScaler & PolynomialFeatures(degree=2) & RidgeCV:      {Ridge_StScal_Polynom_RidCV_r2:.6f}")
print(f"R2 Ridge regression with MinMaxScaler & PolynomialFeatures(degree=2) & RidgeCV:        {Ridge_MinMaxScal_Polynom_RidCV_r2:.6f}")
print(f"R2 Lasso regression with PolynomialFeatures(degree=2):                                 {Lasso_Polynom_r2:.6f}")
print(f"R2 Lasso regression with PolynomialFeatures(degree=2) & GridSearchCV:                  {Lasso_Polynom_GSCV_r2:.6f}")
print(f"R2 Lasso regression with StandardScaler & PolynomialFeatures(degree=2) & GridSearchCV: {Lasso_StScal_Polynom_GSCV_r2:.6f}")
print(f"R2 Lasso regression with MinMaxScaler & PolynomialFeatures(degree=2) & GridSearchCV:   {Lasso_MinMaxScal_Polynom_GSCV_r2:.6f}")
print(f"R2 Lasso regression with PolynomialFeatures(degree=2) & LassoCV:                       {Lasso_Polynom_RidCV_r2:.6f}")
print(f"R2 Lasso regression with StandardScaler & PolynomialFeatures(degree=2) & LassoCV:      {Lasso_StScal_Polynom_RidCV_r2:.6f}")
print(f"R2 Lasso regression with MinMaxScaler & PolynomialFeatures(degree=2) & LassoCV:        {Lasso_MinMaxScal_Polynom_RidCV_r2:.6f}")

R2 Linear regression with PolynomialFeatures(degree=2):                                0.806589
R2 Ridge regression with PolynomialFeatures(degree=2):                                 0.807154
R2 Ridge regression with PolynomialFeatures(degree=2) & GridSearchCV:                  0.811453
R2 Ridge regression with StandardScaler & PolynomialFeatures(degree=2) & GridSearchCV: 0.818047
R2 Ridge regression with MinMaxScaler & PolynomialFeatures(degree=2) & GridSearchCV:   0.850063
R2 Ridge regression with PolynomialFeatures(degree=2) & RidgeCV:                       0.811453
R2 Ridge regression with StandardScaler & PolynomialFeatures(degree=2) & RidgeCV:      0.818047
R2 Ridge regression with MinMaxScaler & PolynomialFeatures(degree=2) & RidgeCV:        0.850063
R2 Lasso regression with PolynomialFeatures(degree=2):                                 0.817783
R2 Lasso regression with PolynomialFeatures(degree=2) & GridSearchCV:                  0.792634
R2 Lasso regression with StandardScaler 

In [29]:
# сведем результаты расчета r2 в таблицу
r2_result = pd.DataFrame(data = [[Linear_r2, Linear_Polynom_r2, Linear_StScal_r2, Linear_StScal_Polynom_r2, Linear_MinMaxScal_r2, Linear_MinMaxScal_Polynom_r2],
                                 [Ridge_r2, Ridge_Polynom_r2, Ridge_StScal_r2, Ridge_StScal_Polynom_r2, Ridge_MinMaxScal_r2, Ridge_MinMaxScal_Polynom_r2],
                                 [Ridge_GSCV_r2, Ridge_Polynom_GSCV_r2, Ridge_StScal_GSCV_r2, Ridge_StScal_Polynom_GSCV_r2, Ridge_MinMaxScal_GSCV_r2, Ridge_MinMaxScal_Polynom_GSCV_r2],
                                 [Ridge_RidCV_r2, Ridge_Polynom_RidCV_r2, Ridge_StScal_RidCV_r2, Ridge_StScal_Polynom_RidCV_r2, Ridge_MinMaxScal_RidCV_r2, Ridge_MinMaxScal_Polynom_RidCV_r2],
                                 [Lasso_r2, Lasso_Polynom_r2, Lasso_StScal_r2, Lasso_StScal_Polynom_r2, Lasso_MinMaxScal_r2, Lasso_MinMaxScal_Polynom_r2],
                                 [Lasso_GSCV_r2, Lasso_Polynom_GSCV_r2, Lasso_StScal_GSCV_r2, Lasso_StScal_Polynom_GSCV_r2, Lasso_MinMaxScal_GSCV_r2, Lasso_MinMaxScal_Polynom_GSCV_r2],
                                 [Lasso_LassCV_r2, Lasso_Polynom_RidCV_r2, Lasso_StScal_LassCV_r2, Lasso_StScal_Polynom_RidCV_r2, Lasso_MinMaxScal_LassCV_r2, Lasso_MinMaxScal_Polynom_RidCV_r2]],
                         index=pd.MultiIndex.from_tuples([('Linear', 'No_alpha'),
                                                            ('Ridge', 'Default'), ('Ridge', 'GridSearchCV_best'), ('Ridge', 'RidgeCV_best'),
                                                            ('Lasso', 'Default'), ('Lasso', 'GridSearchCV_best'), ('Lasso', 'LassoCV_best')],
                                                          names=['Regression', 'Select_alpha']),
                         columns = pd.MultiIndex.from_tuples([('No_scaler', 'No_polynom'), ('No_scaler', 'Degree_2'),
                                                              ('StandardScaler', 'No_polynom'), ('StandardScaler', 'Degree_2'),
                                                              ('MinMaxScaler', 'No_polynom'), ('MinMaxScaler', 'Degree_2')],
                                                             names=['Scaler', 'Polynom']))
r2_result

Unnamed: 0_level_0,Scaler,No_scaler,No_scaler,StandardScaler,StandardScaler,MinMaxScaler,MinMaxScaler
Unnamed: 0_level_1,Polynom,No_polynom,Degree_2,No_polynom,Degree_2,No_polynom,Degree_2
Regression,Select_alpha,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Linear,No_alpha,0.668759,0.806589,0.668759,0.806307,0.668759,0.803658
Ridge,Default,0.666222,0.807154,0.668462,0.816295,0.67641,0.829934
Ridge,GridSearchCV_best,0.668759,0.811453,0.668462,0.818047,0.670031,0.850063
Ridge,RidgeCV_best,0.668751,0.811453,0.665968,0.818047,0.670031,0.850063
Lasso,Default,0.667145,0.817783,0.623943,0.732276,0.257392,0.261126
Lasso,GridSearchCV_best,0.66876,0.792634,0.668759,0.812217,0.668761,0.839058
Lasso,LassoCV_best,0.66876,0.792634,0.668759,0.812217,0.668761,0.839058


Выводы:
- при добавлении попарных произведений признаков и их квадратов очень существенно улучшилось качество моделей;
- для рассматриваемых данных самое высокое значение метрики r2 для Ridge регрессии на масштабированных признаках с помощью MinMaxScaler и добавлением попарных произведений признаков и их квадратов с подбором коэффициента регуляризации.

# 7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [33]:
pipeline = Pipeline([('imputer', StandardScaler()),
                     ('polynomial', PolynomialFeatures()),
                     ('regression', Lasso())])
parameters = {'imputer':[StandardScaler(), MinMaxScaler()],
              'polynomial__degree':np.arange(1,6,1),
              'regression':[Lasso(), Ridge()],
              'regression__alpha':np.logspace(-5, 5, 11)}
select_model = GridSearchCV(pipeline, parameters, scoring='r2')
select_model.fit(X_train, y_train)
select_model.best_estimator_

In [35]:
# Лучшие подобранные параметры
select_model.best_params_

{'imputer': MinMaxScaler(),
 'polynomial__degree': 5,
 'regression': Ridge(),
 'regression__alpha': 1.0}

In [36]:
Best_model_r2 = r2_score(y_test, select_model.predict(X_test))
print(f"R2 Best model (MinMaxScaler, PolynomialFeatures(degree=5), Ridge(alpha=1)): {Best_model_r2:.6f}")

R2 Best model (MinMaxScaler, PolynomialFeatures(degree=5), Ridge(alpha=1)): 0.859305


Выводы:
- для рассматриваемых данных наилучшими параметрами (из предложенных) оказались метод масштабирования MinMaxScaler, 5-я степень полинома, Ridge регрессия с коэффициентом регуляризации 1;
- качество модели улучшилось по сравнению с предыдущими вариантами;
- существенно снизились трудозатраты на поиск оптимальных параметров.

http://archive.ics.uci.edu/ml/datasets/Adult

In [37]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [38]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# 8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [39]:
# создадим копию датафрейма без целевой переменной (признаки)
X = data.loc[:,0:13].copy()
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [40]:
# создадим копию столбца 14 датафрейма (целевая переменная)
y = data[14].copy()
# заменим значения менее 50к на 0, значения более 50к на 1
y[y == '<=50K'] = 0
y[y == '>50K'] = 1
y.head(3)

0    0
1    0
2    0
Name: 14, dtype: object

In [41]:
# преобразуем тип данных целевой переменной в int
y = y.astype(int)
y.value_counts()

0    37155
1    11687
Name: 14, dtype: int64

### результаты расчета метрик f1 будет отличаться исходя из того, что в целевой переменной будет True, а что False (<=50K или >50K), так как при разном выборе самым частым классом будет 0 или 1.

В последующих заданиях будем оперировать 2 вариантами целевой переменной:
- самое частое значение 0 (<=50K -> 0; >50K -> 1) - соответствует исходному y;
- самое частое значение 1 (<=50K -> 1; >50K -> 0) - соответствует инвертированному y (abs(y-1))

# 9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [42]:
X.isnull().sum(axis=0)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

В датасете отсутствуют пропущенные значения (знак "?" пока не трогаем).

In [43]:
# если бы были пропущенные значения (NaN)
imp = SimpleImputer(missing_values=np.NaN, strategy="most_frequent")
X_imp = pd.DataFrame(imp.fit_transform(X))
X_imp.loc[48837:48839]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States


# 10. Выберите колонки с числовыми и категориальными переменными.

In [44]:
# типы колонок
X.dtypes

0      int64
1     object
2      int64
3     object
4      int64
5     object
6     object
7     object
8     object
9     object
10     int64
11     int64
12     int64
13    object
dtype: object

In [47]:
# колонки с числовыми переменными
numerical = list(X.select_dtypes('number').columns)
X[numerical].head(3)

Unnamed: 0,0,2,4,10,11,12
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40


In [48]:
# колонки с категориальными переменными
categorical = list(X.select_dtypes('object').columns)
X[categorical].head(3)

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States


# 11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [49]:
# формирование пайплайна по предобработке данных
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])

In [50]:
# Выведим датасет с предобработанными данными
preprocessor_base.fit(X)

# числовые названия колонок извлекаются с префиксом 'x' - добавим x перед числами в списке имен категорийных колонок
categorical_extract = []
for i in categorical:
    categorical_extract.append('x' + str(i))

# извлечем названия создаваемых колонок с помощью OneHotEncoder
categorical_columns = preprocessor_base.named_transformers_['cat']['encoder'].get_feature_names(categorical_extract)
columns_base = np.append(categorical_columns, numerical)

# выведим полученный датасет (в принципе названия столбцов особо не нужны)
display(pd.DataFrame(preprocessor_base.transform(X), columns=columns_base))

Unnamed: 0,x1_?,x1_Federal-gov,x1_Local-gov,x1_Never-worked,x1_Private,x1_Self-emp-inc,x1_Self-emp-not-inc,x1_State-gov,x1_Without-pay,x3_10th,...,x13_Trinadad&Tobago,x13_United-States,x13_Vietnam,x13_Yugoslavia,0,2,4,10,11,12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.452055,0.048052,0.800000,0.000000,0.0,0.122449
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.000000,0.0,0.397959
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.493151,0.150486,0.400000,0.000000,0.0,0.397959
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150685,0.220635,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.301370,0.137428,0.800000,0.000000,0.0,0.357143
48838,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.643836,0.209130,0.533333,0.000000,0.0,0.397959
48839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.245379,0.800000,0.000000,0.0,0.500000
48840,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.369863,0.048444,0.800000,0.054551,0.0,0.397959


# 12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [51]:
# определим самый частый класс целевой переменной (для <=50K -> 0; >50K -> 1)
y.value_counts()

0    37155
1    11687
Name: 14, dtype: int64

Самое частое значение целевой переменной 0 (<=50K -> 0; >50K -> 1) - соответствует исходному y

In [52]:
# создадим модель, предсказывающую наиболее частый класс
Dummy_clf = DummyClassifier(strategy="most_frequent")
# инвертируем значения целевой переменной (иначе не считается f1_score)
Dummy_clf.fit(X, y)

# расчет accurancy и f1 score
DummyCl_accuracy = accuracy_score(y, Dummy_clf.predict(X))
DummyCl_0mf_f1 = f1_score(y, Dummy_clf.predict(X))

# выведем значения accurancy и f1 score
print(f"accuracy DummyClassifier (0 most frequent): {DummyCl_accuracy:.4f}")
print(f"f1_score DummyClassifier (0 most frequent): {DummyCl_0mf_f1:.4f}")

accuracy DummyClassifier (0 most frequent): 0.7607
f1_score DummyClassifier (0 most frequent): 0.0000


Самое частое значение целевой переменной 1 (<=50K -> 1; >50K -> 0) - соответствует соответствует инвертированному y (abs(y-1))

In [53]:
# создадим модель, предсказывающую наиболее частый класс
Dummy_clf = DummyClassifier(strategy="most_frequent")
# инвертируем значения целевой переменной (иначе не считается f1_score)
Dummy_clf.fit(X, abs(y-1))

# расчет accurancy и f1 score
DummyCl_accuracy = accuracy_score(abs(y-1), Dummy_clf.predict(X))
DummyCl_1mf_f1 = f1_score(abs(y-1), Dummy_clf.predict(X))

# выведем значения accurancy и f1 score
print(f"accuracy DummyClassifier (1 most frequent): {DummyCl_accuracy:.4f}")
print(f"f1_score DummyClassifier (1 most frequent): {DummyCl_1mf_f1:.4f}")

accuracy DummyClassifier (1 most frequent): 0.7607
f1_score DummyClassifier (1 most frequent): 0.8641


### посчитаем accurancy и f1 score аналитически

Самое частое значение целевой переменной 0 (<=50K -> 0; >50K -> 1) - соответствует исходному y

In [54]:
# Сформируем датасет с предсказанным значением и истинным
df = pd.DataFrame(data = {'y_predict':0, 'y_real':y})
# Добавим колонку confusion_matrix
df['confusion_matrix'] = np.NaN
for i in range(len(df)):
    if (df['y_predict'][i] == 1) & (df['y_real'][i] == 1): df.iat[i,2] = 'TP'
    if (df['y_predict'][i] == 1) & (df['y_real'][i] == 0): df.iat[i,2] = 'FP'
    if (df['y_predict'][i] == 0) & (df['y_real'][i] == 1): df.iat[i,2] = 'FN'
    if (df['y_predict'][i] == 0) & (df['y_real'][i] == 0): df.iat[i,2] = 'TN'
df.tail(2)

Unnamed: 0,y_predict,y_real,confusion_matrix
48840,0,0,TN
48841,0,1,FN


Доля правильных ответов(аккуратность): $ 𝑎𝑐𝑐𝑢𝑟𝑎𝑐𝑦(𝑎,𝑋) = \frac{1}{l} \sum_{i=1}^l[𝑎(𝑥_𝑖)=𝑦_𝑖  ] $  
$ Точность (precision) = \frac{𝑇𝑃}{(𝑇𝑃+𝐹𝑃)} $  
$ Полнота (recall) = \frac{𝑇𝑃}{(𝑇𝑃+𝐹N)} $  
F1-мера $ F1 = \frac {2∗precision∗recall}{precision+recall} $

In [56]:
# Доля правильных ответов (accurancy) будет равна отношению суммы true positive (TP) и true negative (TP) к количеству предсказанных ответов
most_freq_accuracy = df[(df.confusion_matrix == 'TP') | (df.confusion_matrix == 'TN')]['confusion_matrix'].count()/df[df.confusion_matrix.notnull()]['confusion_matrix'].count()
print(f"accuracy most frequent (0 most frequent):  {most_freq_accuracy:.4f}")

# точность и полнота
most_freq_0mf_precision = df[(df.confusion_matrix == 'TP')]['confusion_matrix'].count()/df[(df.confusion_matrix == 'TP') | (df.confusion_matrix == 'FP')]['confusion_matrix'].count()
most_freq_0mf_recall = df[(df.confusion_matrix == 'TP')]['confusion_matrix'].count()/df[(df.confusion_matrix == 'TP') | (df.confusion_matrix == 'FN')]['confusion_matrix'].count()
print(f"precision most frequent (0 most frequent): {most_freq_0mf_precision:.4f}")
print(f"recall most frequent (0 most frequent):    {most_freq_0mf_recall:.4f}")

# f1
most_freq_0mf_f1 = (2*most_freq_0mf_precision*most_freq_0mf_recall)/(most_freq_0mf_precision+most_freq_0mf_recall)
print(f"f1 most frequent (0 most frequent):        {most_freq_0mf_f1:.4f}")

accuracy most frequent (0 most frequent):  0.7607
precision most frequent (0 most frequent): nan
recall most frequent (0 most frequent):    0.0000
f1 most frequent (0 most frequent):        nan


Самое частое значение целевой переменной 1 (<=50K -> 1; >50K -> 0) - соответствует инвертированному y (abs(y-1))

In [57]:
# Сформируем датасет с предсказанным значением и истинным
df = pd.DataFrame(data = {'y_predict':1, 'y_real':abs(y-1)})
# Добавим колонку confusion_matrix
df['confusion_matrix'] = np.NaN
for i in range(len(df)):
    if (df['y_predict'][i] == 1) & (df['y_real'][i] == 1): df.iat[i,2] = 'TP'
    if (df['y_predict'][i] == 1) & (df['y_real'][i] == 0): df.iat[i,2] = 'FP'
    if (df['y_predict'][i] == 0) & (df['y_real'][i] == 1): df.iat[i,2] = 'FN'
    if (df['y_predict'][i] == 0) & (df['y_real'][i] == 0): df.iat[i,2] = 'TN'
df.tail(2)

Unnamed: 0,y_predict,y_real,confusion_matrix
48840,1,1,TP
48841,1,0,FP


In [58]:
# Доля правильных ответов (accurancy) будет равна отношению суммы true positive (TP) и true negative (TP) к количеству предсказанных ответов
most_freq_accuracy = df[(df.confusion_matrix == 'TP') | (df.confusion_matrix == 'TN')]['confusion_matrix'].count()/df[df.confusion_matrix.notnull()]['confusion_matrix'].count()
print(f"accuracy most frequent (1 most frequent):  {most_freq_accuracy:.4f}")

# точность и полнота
most_freq_precision = df[(df.confusion_matrix == 'TP')]['confusion_matrix'].count()/df[(df.confusion_matrix == 'TP') | (df.confusion_matrix == 'FP')]['confusion_matrix'].count()
most_freq_recall = df[(df.confusion_matrix == 'TP')]['confusion_matrix'].count()/df[(df.confusion_matrix == 'TP') | (df.confusion_matrix == 'FN')]['confusion_matrix'].count()
print(f"precision most frequent (1 most frequent): {most_freq_precision:.4f}")
print(f"recall most frequent (1 most frequent):    {most_freq_recall:.4f}")

# f1
most_freq_1mf_f1 = (2*most_freq_precision*most_freq_recall)/(most_freq_precision+most_freq_recall)
print(f"f1 most frequent (1 most frequent):        {most_freq_1mf_f1:.4f}")

accuracy most frequent (1 most frequent):  0.7607
precision most frequent (1 most frequent): 0.7607
recall most frequent (1 most frequent):    1.0000
f1 most frequent (1 most frequent):        0.8641


Выоды:
- расчеты совпали;
- при предсказании только самого частого класса в целевой переменной accurancy = 0.7607 независимо от того, 0 самое частое значение или 1;
- при предсказании только самого частого класса в целевой переменной f1 зависит от того, 0 или 1 самое частое значение целевой переменной;
- при самом частом значении целевой переменной 0 (<=50K -> 0; >50K -> 1) f1 = 0;
- при самом частом значении целевой переменной 1 (<=50K -> 1; >50K -> 0) f1 = 0.8641

# 13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score. Напишите удалось ли превзойти предыдущий результат.

In [59]:
# предобработка данных
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])

для LogisticRegression

In [60]:
# создание пайплайна
pipe_LogReg = Pipeline([('preprocessor', preprocessor_base), ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))])

# расчет cross_val_score по метрикам accurancy и f1
LogReg_accuracy = cross_val_score(pipe_LogReg, X, y, cv=5, scoring='accuracy').mean()
LogReg_0mf_f1 = cross_val_score(pipe_LogReg, X, y, cv=5, scoring='f1').mean()
LogReg_1mf_f1 = cross_val_score(pipe_LogReg, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy LogisticRegression cross_val_score:             {LogReg_accuracy:.4f}")
print(f"f1 LogisticRegression cross_val_score (0 most frequent): {LogReg_0mf_f1:.4f}")
print(f"f1 LogisticRegression cross_val_score (1 most frequent): {LogReg_1mf_f1:.4f}")

accuracy LogisticRegression cross_val_score:             0.8509
f1 LogisticRegression cross_val_score (0 most frequent): 0.6558
f1 LogisticRegression cross_val_score (1 most frequent): 0.9048


для SVC

In [63]:
# создание пайплайна
pipe_SVC = Pipeline([('preprocessor', preprocessor_base), ('classifier', SVC(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
SVC_accuracy = cross_val_score(pipe_SVC, X, y, cv=5, scoring='accuracy').mean()
SVC_0mf_f1 = cross_val_score(pipe_SVC, X, y, cv=5, scoring='f1').mean()
SVC_1mf_f1 = cross_val_score(pipe_SVC, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy SVC cross_val_score:             {SVC_accuracy:.4f}")
print(f"f1 SVC cross_val_score (0 most frequent): {SVC_0mf_f1:.4f}")
print(f"f1 SVC cross_val_score (1 most frequent): {SVC_1mf_f1:.4f}")

accuracy SVC cross_val_score:             0.8400
f1 SVC cross_val_score (0 most frequent): 0.6201
f1 SVC cross_val_score (1 most frequent): 0.8986


для LinearSVC

In [65]:
# создание пайплайна
pipe_LinSVC = Pipeline([('preprocessor', preprocessor_base), ('classifier', LinearSVC(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
LinSVC_accuracy = cross_val_score(pipe_LinSVC, X, y, cv=5, scoring='accuracy').mean()
LinSVC_0mf_f1 = cross_val_score(pipe_LinSVC, X, y, cv=5, scoring='f1').mean()
LinSVC_1mf_f1 = cross_val_score(pipe_LinSVC, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy LinearSVC cross_val_score:             {LinSVC_accuracy:.4f}")
print(f"f1 LinearSVC cross_val_score (0 most frequent): {LinSVC_0mf_f1:.4f}")
print(f"f1 LinearSVC cross_val_score (1 most frequent): {LinSVC_1mf_f1:.4f}")

accuracy LinearSVC cross_val_score:             0.8529
f1 LinearSVC cross_val_score (0 most frequent): 0.6578
f1 LinearSVC cross_val_score (1 most frequent): 0.9063


In [66]:
# сведем результаты в таблицу
cross_val_score_result = pd.DataFrame(data = [[DummyCl_accuracy, LogReg_accuracy, SVC_accuracy, LinSVC_accuracy],
                                              [DummyCl_0mf_f1, LogReg_0mf_f1, SVC_0mf_f1, LinSVC_0mf_f1],
                                              [DummyCl_1mf_f1, LogReg_1mf_f1, SVC_1mf_f1, LinSVC_1mf_f1],
                                              ],
                                      index = ['accuracy', 'f1_0_most_frequent', 'f1_1_most_frequent'],
                                      columns = ['DummyClassifier', 'LogisticRegression', 'SVC', 'LinearSVC'])
cross_val_score_result

Unnamed: 0,DummyClassifier,LogisticRegression,SVC,LinearSVC
accuracy,0.760718,0.850907,0.839974,0.852914
f1_0_most_frequent,0.0,0.655816,0.620083,0.657816
f1_1_most_frequent,0.8641,0.904843,0.898648,0.906322


Выводы:
- cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score значительно выше, чем по алгоритму DummyClassifier (предсказание наиболее частого класса);
- результаты по алгоритмам LogisticRegression и LinearSVC сопоставимы, LinearSVC в отработал быстрее (в моем случае примерно в 2 раза)
- результаты по алгоритму SVC (метод опорных векторов) несколько хуже, чем по алгоритмам LogisticRegression и LinearSVC;
- SVC отработал в разы дольше, чем LogisticRegression и LinearSVC (в моем случае в 57 и 118 раз соответственно);
- во всех случаях метрика f1 лучше, если наиболее частым классом является положительный класс (1)

самое частое значение целевой переменной  0 (<=50K -> 0; >50K -> 1) - соответствует исходному y;
самое частое значение целевой переменной  1 (<=50K -> 1; >50K -> 0) - соответствует инвертированному y (abs(y-1))

# 14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [71]:
# пайплайн по предобработке данных без обработки пропусков
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])

# Выведим датасет с предобработанными данными
preprocessor_base.fit(X)

# числовые названия колонок извлекаются с префиксом 'x' - добавим x перед числами в списке имен категорийных колонок
categorical_extract = []
for i in categorical:
    categorical_extract.append('x' + str(i))

# извлечем названия создаваемых колонок с помощью OneHotEncoder
categorical_columns = preprocessor_base.named_transformers_['cat']['encoder'].get_feature_names(categorical_extract)
columns_base = np.append(categorical_columns, numerical)

In [69]:
# формирование пайплайна по предобработке данных с заменой пропусков (?) на самые частые значения
# ? может присутствовать только в нечисловых столбцах -  добавим в preprocessor для категорийных данных SimpleImputer
preprocessor_imputer = ColumnTransformer([('cat', Pipeline([('imputer', SimpleImputer(missing_values='?', strategy="most_frequent")), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                          ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                          ])

Сравним датасеты

In [72]:
print('Датасет без обработки пропусков:')
# исходный датасет
display(pd.DataFrame(preprocessor_base.transform(X), columns=columns_base))

print('Датасет, в котором пропуски (?) заменены на самые частые значения:')
# Выведим датасет с предобработанными данными, в котором пропуски (?) заменены на самые частые значения
preprocessor_imputer.fit(X)
# числовые названия колонок извлекаются с префиксом 'x' - добавим x перед числами в списке имен категорийных колонок
categorical_extract = []
for i in categorical:
    categorical_extract.append('x' + str(i))

# извлечем названия создаваемых колонок с помощью OneHotEncoder
categorical_columns = preprocessor_imputer.named_transformers_['cat']['encoder'].get_feature_names(categorical_extract)
columns_imputer = np.append(categorical_columns, numerical)

# датасет, в котором пропуски (?) замнены на самые частые значения
display(pd.DataFrame(preprocessor_imputer.transform(X), columns=columns_imputer))

Датасет без обработки пропусков:


Unnamed: 0,x1_?,x1_Federal-gov,x1_Local-gov,x1_Never-worked,x1_Private,x1_Self-emp-inc,x1_Self-emp-not-inc,x1_State-gov,x1_Without-pay,x3_10th,...,x13_Trinadad&Tobago,x13_United-States,x13_Vietnam,x13_Yugoslavia,0,2,4,10,11,12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.452055,0.048052,0.800000,0.000000,0.0,0.122449
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.000000,0.0,0.397959
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.493151,0.150486,0.400000,0.000000,0.0,0.397959
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150685,0.220635,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.301370,0.137428,0.800000,0.000000,0.0,0.357143
48838,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.643836,0.209130,0.533333,0.000000,0.0,0.397959
48839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.245379,0.800000,0.000000,0.0,0.500000
48840,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.369863,0.048444,0.800000,0.054551,0.0,0.397959


Датасет, в котором пропуски (?) заменены на самые частые значения:


Unnamed: 0,x1_Federal-gov,x1_Local-gov,x1_Never-worked,x1_Private,x1_Self-emp-inc,x1_Self-emp-not-inc,x1_State-gov,x1_Without-pay,x3_10th,x3_11th,...,x13_Trinadad&Tobago,x13_United-States,x13_Vietnam,x13_Yugoslavia,0,2,4,10,11,12
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.452055,0.048052,0.800000,0.000000,0.0,0.122449
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.000000,0.0,0.397959
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.493151,0.150486,0.400000,0.000000,0.0,0.397959
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150685,0.220635,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.301370,0.137428,0.800000,0.000000,0.0,0.357143
48838,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.643836,0.209130,0.533333,0.000000,0.0,0.397959
48839,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.245379,0.800000,0.000000,0.0,0.500000
48840,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.369863,0.048444,0.800000,0.054551,0.0,0.397959


Вывод:
- стало на 3 столбца меньше, так как больше нет значения "?";
- пропуски были в 3 категорийных столбцах.

# 15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [73]:
# предобработка данных (? может присутствовать только в текстовых данных)#
# добавим в preprocessor для категорийных данных SimpleImputer
preprocessor_imputer = ColumnTransformer([('cat', Pipeline([('imputer', SimpleImputer(missing_values='?', strategy="most_frequent")), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                          ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                          ])

для LogisticRegression (в данных пропуски (?) заменены самыми частыми значениями)

In [74]:
# создание пайплайна
pipe_LogReg_SimpImp = Pipeline([('preprocessor', preprocessor_imputer), ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))])

# расчет cross_val_score по метрикам accurancy и f1
LogReg_SimpImp_accuracy = cross_val_score(pipe_LogReg_SimpImp, X, y, cv=5, scoring='accuracy').mean()
LogReg_SimpImp_0mf_f1 = cross_val_score(pipe_LogReg_SimpImp, X, y, cv=5, scoring='f1').mean()
LogReg_SimpImp_1mf_f1 = cross_val_score(pipe_LogReg_SimpImp, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy LogisticRegression with SimpleImputer cross_val_score:             {LogReg_SimpImp_accuracy:.4f}")
print(f"f1 LogisticRegression with SimpleImputer cross_val_score (0 most frequent): {LogReg_SimpImp_0mf_f1:.4f}")
print(f"f1 LogisticRegression with SimpleImputer cross_val_score (1 most frequent): {LogReg_SimpImp_1mf_f1:.4f}")

accuracy LogisticRegression with SimpleImputer cross_val_score:             0.8507
f1 LogisticRegression with SimpleImputer cross_val_score (0 most frequent): 0.6544
f1 LogisticRegression with SimpleImputer cross_val_score (1 most frequent): 0.9048


для SVC (в данных пропуски (?) заменены самыми частыми значениями)

In [75]:
# создание пайплайна
pipe_SVC_SimpImp = Pipeline([('preprocessor', preprocessor_imputer), ('classifier', SVC(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
SVC_SimpImp_accuracy = cross_val_score(pipe_SVC_SimpImp, X, y, cv=5, scoring='accuracy').mean()
SVC_SimpImp_0mf_f1 = cross_val_score(pipe_SVC_SimpImp, X, y, cv=5, scoring='f1').mean()
SVC_SimpImp_1mf_f1 = cross_val_score(pipe_SVC_SimpImp, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy SVC with SimpleImputer cross_val_score:             {SVC_SimpImp_accuracy:.4f}")
print(f"f1 SVC with SimpleImputer cross_val_score (0 most frequent): {SVC_SimpImp_0mf_f1:.4f}")
print(f"f1 SVC with SimpleImputer cross_val_score (1 most frequent): {SVC_SimpImp_1mf_f1:.4f}")

accuracy SVC with SimpleImputer cross_val_score:             0.8395
f1 SVC with SimpleImputer cross_val_score (0 most frequent): 0.6167
f1 SVC with SimpleImputer cross_val_score (1 most frequent): 0.8985


для LinearSVC (в данных пропуски (?) заменены самыми частыми значениями)

In [76]:
# создание пайплайна
pipe_LinSVC_SimpImp = Pipeline([('preprocessor', preprocessor_imputer), ('classifier', LinearSVC(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
LinSVC_SimpImp_accuracy = cross_val_score(pipe_LinSVC_SimpImp, X, y, cv=5, scoring='accuracy').mean()
LinSVC_SimpImp_0mf_f1 = cross_val_score(pipe_LinSVC_SimpImp, X, y, cv=5, scoring='f1').mean()
LinSVC_SimpImp_1mf_f1 = cross_val_score(pipe_LinSVC_SimpImp, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy LinearSVC with SimpleImputer cross_val_score:             {LinSVC_SimpImp_accuracy:.4f}")
print(f"f1 LinearSVC with SimpleImputer cross_val_score (0 most frequent): {LinSVC_SimpImp_0mf_f1:.4f}")
print(f"f1 LinearSVC with SimpleImputer cross_val_score (1 most frequent): {LinSVC_SimpImp_1mf_f1:.4f}")

accuracy LinearSVC with SimpleImputer cross_val_score:             0.8513
f1 LinearSVC with SimpleImputer cross_val_score (0 most frequent): 0.6519
f1 LinearSVC with SimpleImputer cross_val_score (1 most frequent): 0.9054


In [77]:
# Сведем все данные в таблицу
cross_val_score_result = pd.DataFrame(data = [[DummyCl_accuracy, LogReg_accuracy, LogReg_SimpImp_accuracy, SVC_accuracy, SVC_SimpImp_accuracy, LinSVC_accuracy, LinSVC_SimpImp_accuracy],
                                              [DummyCl_0mf_f1,   LogReg_0mf_f1,   LogReg_SimpImp_0mf_f1,   SVC_0mf_f1,   SVC_SimpImp_0mf_f1,   LinSVC_0mf_f1,   LinSVC_SimpImp_0mf_f1],
                                              [DummyCl_1mf_f1,   LogReg_1mf_f1,   LogReg_SimpImp_1mf_f1,   SVC_1mf_f1,   SVC_SimpImp_1mf_f1,   LinSVC_1mf_f1,   LinSVC_SimpImp_1mf_f1]],
                                      index = ['accuracy', 'f1_0_most_frequent', 'f1_1_most_frequent'],
                                      columns = pd.MultiIndex.from_tuples([('Dummy_classifier', 'No_modified'),
                                                                           ('LogisticRegression', 'No_modified'), ('LogisticRegression', 'SimpleImputer'),
                                                                           ('SVC', 'No_modified'), ('SVC', 'SimpleImputer'),
                                                                           ('LinearSVC', 'No_modified'), ('LinearSVC', 'SimpleImputer'),
                                                                          ],
                                                                          names=['Algorithmm', 'Data_preparation']))
cross_val_score_result

Algorithmm,Dummy_classifier,LogisticRegression,LogisticRegression,SVC,SVC,LinearSVC,LinearSVC
Data_preparation,No_modified,No_modified,SimpleImputer,No_modified,SimpleImputer,No_modified,SimpleImputer
accuracy,0.760718,0.850907,0.850682,0.839974,0.839503,0.852914,0.851255
f1_0_most_frequent,0.0,0.655816,0.654437,0.620083,0.616722,0.657816,0.651865
f1_1_most_frequent,0.8641,0.904843,0.904765,0.898648,0.898483,0.906322,0.905422


Вывод: замена пропущенных значений на самые частые несколько ухудшило метрики для рассмотренных алгоритмов.

# 16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

для избежания перекоса индексов в X и y нужно в исходном датасете удалить строки, содержащие '?' и заново разбить его на признаки и целевую переменную

In [78]:
# заменим знак ? на NaN и удалим строки, содержащие хотя бы 1 пропуск
# создадим копию датафрейма без целевой переменной (признаки)
X_dropna = data.replace({'?': np.nan}).dropna().loc[:,0:13].copy()
X_dropna.tail(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States
48841,35,Self-emp-inc,182148,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,60,United-States


In [79]:
# заменим знак ? на NaN и удалим строки, содержащие хотя бы 1 пропуск
# создадим копию столбца 14 датафрейма (целевая переменная)
y_dropna = data.replace({'?': np.nan}).dropna()[14].copy()
# заменим значения менее 50к на 0, значения более 50к на 1
y_dropna[y_dropna == '<=50K'] = 0
y_dropna[y_dropna == '>50K'] = 1
# преобразуем тип данных целевой переменной в int
y_dropna = y_dropna.astype(int)
y_dropna.value_counts()

0    34014
1    11208
Name: 14, dtype: int64

In [80]:
# альтернативная запись - короче код, но дольше выполняется
# определим индексы строк, в которых нет пропуска (?)
index_notnan = (X != '?').all(axis=1)
# создадим копии X и y со строками без пропусков
X_dropna = X[index_notnan]
y_dropna = y[index_notnan]
display(X_dropna.tail(3))
display(y_dropna.value_counts())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States
48841,35,Self-emp-inc,182148,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,60,United-States


0    34014
1    11208
Name: 14, dtype: int64

In [81]:
# предобработка данных
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])

для LogisticRegression (в данных строки с пропусками (?) удалены)

In [82]:
# создание пайплайна
pipe_LogReg_DropNA = Pipeline([('preprocessor', preprocessor_base), ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))])

# расчет cross_val_score по метрикам accurancy и f1
LogReg_DropNA_accuracy = cross_val_score(pipe_LogReg_DropNA, X_dropna, y_dropna, cv=5, scoring='accuracy').mean()
LogReg_DropNA_0mf_f1 = cross_val_score(pipe_LogReg_DropNA, X_dropna, y_dropna, cv=5, scoring='f1').mean()
LogReg_DropNA_1mf_f1 = cross_val_score(pipe_LogReg_DropNA, X_dropna, abs(y_dropna-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy LogisticRegression with DropNA cross_val_score:             {LogReg_DropNA_accuracy:.4f}")
print(f"f1 LogisticRegression with DropNA cross_val_score (0 most frequent): {LogReg_DropNA_0mf_f1:.4f}")
print(f"f1 LogisticRegression with DropNA cross_val_score (1 most frequent): {LogReg_DropNA_1mf_f1:.4f}")

accuracy LogisticRegression with DropNA cross_val_score:             0.8468
f1 LogisticRegression with DropNA cross_val_score (0 most frequent): 0.6602
f1 LogisticRegression with DropNA cross_val_score (1 most frequent): 0.9011


для SVC (в данных строки с пропусками (?) удалены)

In [83]:
# создание пайплайна
pipe_SVC_DropNA = Pipeline([('preprocessor', preprocessor_base), ('classifier', SVC(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
SVC_DropNA_accuracy = cross_val_score(pipe_SVC_DropNA, X_dropna, y_dropna, cv=5, scoring='accuracy').mean()
SVC_DropNA_0mf_f1 = cross_val_score(pipe_SVC_DropNA, X_dropna, y_dropna, cv=5, scoring='f1').mean()
SVC_DropNA_1mf_f1 = cross_val_score(pipe_SVC_DropNA, X_dropna, abs(y_dropna-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy SVC with DropNA cross_val_score:             {SVC_DropNA_accuracy:.4f}")
print(f"f1 SVC with DropNA cross_val_score (0 most frequent): {SVC_DropNA_0mf_f1:.4f}")
print(f"f1 SVC with DropNA cross_val_score (1 most frequent): {SVC_DropNA_1mf_f1:.4f}")

accuracy SVC with DropNA cross_val_score:             0.8357
f1 SVC with DropNA cross_val_score (0 most frequent): 0.6267
f1 SVC with DropNA cross_val_score (1 most frequent): 0.8947


для LinearSVC (в данных строки с пропусками (?) удалены)

In [84]:
# создание пайплайна
pipe_LinSVC_DropNA = Pipeline([('preprocessor', preprocessor_base), ('classifier', LinearSVC(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
LinSVC_DropNA_accuracy = cross_val_score(pipe_LinSVC_DropNA, X_dropna, y_dropna, cv=5, scoring='accuracy').mean()
LinSVC_DropNA_0mf_f1 = cross_val_score(pipe_LinSVC_DropNA, X_dropna, y_dropna, cv=5, scoring='f1').mean()
LinSVC_DropNA_1mf_f1 = cross_val_score(pipe_LinSVC_DropNA, X_dropna, abs(y_dropna-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy LinearSVC with DropNA cross_val_score:             {LinSVC_DropNA_accuracy:.4f}")
print(f"f1 LinearSVC with DropNA cross_val_score (0 most frequent): {LinSVC_DropNA_0mf_f1:.4f}")
print(f"f1 LinearSVC with DropNA cross_val_score (1 most frequent): {LinSVC_DropNA_1mf_f1:.4f}")

accuracy LinearSVC with DropNA cross_val_score:             0.8485
f1 LinearSVC with DropNA cross_val_score (0 most frequent): 0.6616
f1 LinearSVC with DropNA cross_val_score (1 most frequent): 0.9024


In [85]:
# Сведем все данные в таблицу
cross_val_score_result = pd.DataFrame(data = [[DummyCl_accuracy, LogReg_accuracy, LogReg_SimpImp_accuracy, LogReg_DropNA_accuracy, SVC_accuracy, SVC_SimpImp_accuracy, SVC_DropNA_accuracy, LinSVC_accuracy, LinSVC_SimpImp_accuracy, LinSVC_DropNA_accuracy],
                                              [DummyCl_0mf_f1,   LogReg_0mf_f1,   LogReg_SimpImp_0mf_f1,   LogReg_DropNA_0mf_f1,   SVC_0mf_f1,   SVC_SimpImp_0mf_f1,   SVC_DropNA_0mf_f1,   LinSVC_0mf_f1,   LinSVC_SimpImp_0mf_f1,   LinSVC_DropNA_0mf_f1],
                                              [DummyCl_1mf_f1,   LogReg_1mf_f1,   LogReg_SimpImp_1mf_f1,   LogReg_DropNA_1mf_f1,   SVC_1mf_f1,   SVC_SimpImp_1mf_f1,   SVC_DropNA_1mf_f1,   LinSVC_1mf_f1,   LinSVC_SimpImp_1mf_f1,   LinSVC_DropNA_1mf_f1]],
                                      index = ['accuracy', 'f1_0_most_frequent', 'f1_1_most_frequent'],
                                      columns = pd.MultiIndex.from_tuples([('Dummy_classifier', 'No_modified'),
                                                                           ('LogisticRegression', 'No_modified'), ('LogisticRegression', 'SimpleImputer'), ('LogisticRegression', 'DropNA'),
                                                                           ('SVC', 'No_modified'), ('SVC', 'SimpleImputer'), ('SVC', 'DropNA'),
                                                                           ('LinearSVC', 'No_modified'), ('LinearSVC', 'SimpleImputer'), ('LinearSVC', 'DropNA'),
                                                                          ],
                                                                          names=['Algorithmm', 'Data_preparation']))
cross_val_score_result

Algorithmm,Dummy_classifier,LogisticRegression,LogisticRegression,LogisticRegression,SVC,SVC,SVC,LinearSVC,LinearSVC,LinearSVC
Data_preparation,No_modified,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA
accuracy,0.760718,0.850907,0.850682,0.846845,0.839974,0.839503,0.8357,0.852914,0.851255,0.848503
f1_0_most_frequent,0.0,0.655816,0.654437,0.660159,0.620083,0.616722,0.626712,0.657816,0.651865,0.661613
f1_1_most_frequent,0.8641,0.904843,0.904765,0.901146,0.898648,0.898483,0.894667,0.906322,0.905422,0.902403


Выводы:
- удаление данных, содержащих пропуски, незначительно ухудшило метрики для рассмотренных алгоритмов;
- при этом метрика f1 для данных, в котором наиболее частое значение 0, немного улучшилась для рассмотренных алгоритмов.

 # 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

cross_val_score для RandomForestClassifier

In [86]:
# на исходных данных

# предобработка данных
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])
                                  
# создание пайплайна
pipe_RandFrst = Pipeline([('preprocessor', preprocessor_base), ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
RandFrst_accuracy = cross_val_score(pipe_RandFrst, X, y, cv=5, scoring='accuracy').mean()
RandFrst_0mf_f1 = cross_val_score(pipe_RandFrst, X, y, cv=5, scoring='f1').mean()
RandFrst_1mf_f1 = cross_val_score(pipe_RandFrst, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy RandomForestClassifier cross_val_score:             {RandFrst_accuracy:.4f}")
print(f"f1 RandomForestClassifier cross_val_score (0 most frequent): {RandFrst_0mf_f1:.4f}")
print(f"f1 RandomForestClassifier cross_val_score (1 most frequent): {RandFrst_1mf_f1:.4f}")

accuracy RandomForestClassifier cross_val_score:             0.8526
f1 RandomForestClassifier cross_val_score (0 most frequent): 0.6666
f1 RandomForestClassifier cross_val_score (1 most frequent): 0.9047


In [87]:
# на данных, в которых пропуски (?) заполнены сымым частым значением

# предобработка данных (? может присутствовать только в текстовых данных)#
# добавим в preprocessor для категорийных данных SimpleImputer
preprocessor_imputer = ColumnTransformer([('cat', Pipeline([('SimpleImputer', SimpleImputer(missing_values='?', strategy="most_frequent")), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])

# создание пайплайна
pipe_RandFrst_SimpImp = Pipeline([('preprocessor', preprocessor_imputer), ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
RandFrst_SimpImp_accuracy = cross_val_score(pipe_RandFrst_SimpImp, X, y, cv=5, scoring='accuracy').mean()
RandFrst_SimpImp_0mf_f1 = cross_val_score(pipe_RandFrst_SimpImp, X, y, cv=5, scoring='f1').mean()
RandFrst_SimpImp_1mf_f1 = cross_val_score(pipe_RandFrst_SimpImp, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy RandomForestClassifier with SimpleImputer cross_val_score:             {RandFrst_SimpImp_accuracy:.4f}")
print(f"f1 RandomForestClassifier with SimpleImputer cross_val_score (0 most frequent): {RandFrst_SimpImp_0mf_f1:.4f}")
print(f"f1 RandomForestClassifier with SimpleImputer cross_val_score (1 most frequent): {RandFrst_SimpImp_1mf_f1:.4f}")

accuracy RandomForestClassifier with SimpleImputer cross_val_score:             0.8533
f1 RandomForestClassifier with SimpleImputer cross_val_score (0 most frequent): 0.6684
f1 RandomForestClassifier with SimpleImputer cross_val_score (1 most frequent): 0.9050


In [88]:
# на данных, в которых строки с пропусками (?) удалены

# предобработка данных
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])
                                  
# создание пайплайна
pipe_RandFrst_DropNA = Pipeline([('preprocessor', preprocessor_base), ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
RandFrst_DropNA_accuracy = cross_val_score(pipe_RandFrst_DropNA, X_dropna, y_dropna, cv=5, scoring='accuracy').mean()
RandFrst_DropNA_0mf_f1 = cross_val_score(pipe_RandFrst_DropNA, X_dropna, y_dropna, cv=5, scoring='f1').mean()
RandFrst_DropNA_1mf_f1 = cross_val_score(pipe_RandFrst_DropNA, X_dropna, abs(y_dropna-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy RandomForestClassifier with DropNA cross_val_score:             {RandFrst_DropNA_accuracy:.4f}")
print(f"f1 RandomForestClassifier with DropNA cross_val_score (0 most frequent): {RandFrst_DropNA_0mf_f1:.4f}")
print(f"f1 RandomForestClassifier with DropNA cross_val_score (1 most frequent): {RandFrst_DropNA_1mf_f1:.4f}")

accuracy RandomForestClassifier with DropNA cross_val_score:             0.8490
f1 RandomForestClassifier with DropNA cross_val_score (0 most frequent): 0.6726
f1 RandomForestClassifier with DropNA cross_val_score (1 most frequent): 0.9013


cross_val_score для GradientBoostingClassifier

In [89]:
# на исходных данных

# предобработка данных
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])
                                  
# создание пайплайна
pipe_GradBst = Pipeline([('preprocessor', preprocessor_base), ('classifier', GradientBoostingClassifier(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
GradBst_accuracy = cross_val_score(pipe_GradBst, X, y, cv=5, scoring='accuracy').mean()
GradBst_0mf_f1 = cross_val_score(pipe_GradBst, X, y, cv=5, scoring='f1').mean()
GradBst_1mf_f1 = cross_val_score(pipe_GradBst, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy GradientBoostingClassifier cross_val_score:             {GradBst_accuracy:.4f}")
print(f"f1 GradientBoostingClassifier cross_val_score (0 most frequent): {GradBst_0mf_f1:.4f}")
print(f"f1 GradientBoostingClassifier cross_val_score (1 most frequent): {GradBst_1mf_f1:.4f}")

accuracy GradientBoostingClassifier cross_val_score:             0.8676
f1 GradientBoostingClassifier cross_val_score (0 most frequent): 0.6869
f1 GradientBoostingClassifier cross_val_score (1 most frequent): 0.9160


In [90]:
# на данных, в которых пропуски (?) заполнены сымым частым значением

# предобработка данных (? может присутствовать только в текстовых данных)#
# добавим в preprocessor для категорийных данных SimpleImputer
preprocessor_imputer = ColumnTransformer([('cat', Pipeline([('SimpleImputer', SimpleImputer(missing_values='?', strategy="most_frequent")), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])

# создание пайплайна
pipe_GradBst_SimpImp = Pipeline([('preprocessor', preprocessor_imputer), ('classifier', GradientBoostingClassifier(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
GradBst_SimpImp_accuracy = cross_val_score(pipe_GradBst_SimpImp, X, y, cv=5, scoring='accuracy').mean()
GradBst_SimpImp_0mf_f1 = cross_val_score(pipe_GradBst_SimpImp, X, y, cv=5, scoring='f1').mean()
GradBst_SimpImp_1mf_f1 = cross_val_score(pipe_GradBst_SimpImp, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy GradientBoostingClassifier with SimpleImputer cross_val_score:             {GradBst_SimpImp_accuracy:.4f}")
print(f"f1 GradientBoostingClassifier with SimpleImputer cross_val_score (0 most frequent): {GradBst_SimpImp_0mf_f1:.4f}")
print(f"f1 GradientBoostingClassifier with SimpleImputer cross_val_score (1 most frequent): {GradBst_SimpImp_1mf_f1:.4f}")

accuracy GradientBoostingClassifier with SimpleImputer cross_val_score:             0.8666
f1 GradientBoostingClassifier with SimpleImputer cross_val_score (0 most frequent): 0.6832
f1 GradientBoostingClassifier with SimpleImputer cross_val_score (1 most frequent): 0.9155


In [91]:
# на данных, в которых строки с пропусками (?) удалены

# предобработка данных
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])
                                  
# создание пайплайна
pipe_GradBst_DropNA = Pipeline([('preprocessor', preprocessor_base), ('classifier', GradientBoostingClassifier(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
GradBst_DropNA_accuracy = cross_val_score(pipe_GradBst_DropNA, X_dropna, y_dropna, cv=5, scoring='accuracy').mean()
GradBst_DropNA_0mf_f1 = cross_val_score(pipe_GradBst_DropNA, X_dropna, y_dropna, cv=5, scoring='f1').mean()
GradBst_DropNA_1mf_f1 = cross_val_score(pipe_GradBst_DropNA, X_dropna, abs(y_dropna-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy GradientBoostingClassifier with DropNA cross_val_score:             {GradBst_DropNA_accuracy:.4f}")
print(f"f1 GradientBoostingClassifier with DropNA cross_val_score (0 most frequent): {GradBst_DropNA_0mf_f1:.4f}")
print(f"f1 GradientBoostingClassifier with DropNA cross_val_score (1 most frequent): {GradBst_DropNA_1mf_f1:.4f}")

accuracy GradientBoostingClassifier with DropNA cross_val_score:             0.8629
f1 GradientBoostingClassifier with DropNA cross_val_score (0 most frequent): 0.6871
f1 GradientBoostingClassifier with DropNA cross_val_score (1 most frequent): 0.9123


In [93]:
# Сведем все результаты в таблицу
cross_val_score_result = pd.DataFrame(data = [[DummyCl_accuracy, LogReg_accuracy, LogReg_SimpImp_accuracy, LogReg_DropNA_accuracy, SVC_accuracy, SVC_SimpImp_accuracy, SVC_DropNA_accuracy, LinSVC_accuracy, LinSVC_SimpImp_accuracy, LinSVC_DropNA_accuracy, RandFrst_accuracy, RandFrst_SimpImp_accuracy, RandFrst_DropNA_accuracy, GradBst_accuracy, GradBst_SimpImp_accuracy, GradBst_DropNA_accuracy],
                                              [DummyCl_0mf_f1,   LogReg_0mf_f1,   LogReg_SimpImp_0mf_f1,   LogReg_DropNA_0mf_f1,   SVC_0mf_f1,   SVC_SimpImp_0mf_f1,   SVC_DropNA_0mf_f1,   LinSVC_0mf_f1,   LinSVC_SimpImp_0mf_f1,   LinSVC_DropNA_0mf_f1, RandFrst_0mf_f1,   RandFrst_SimpImp_0mf_f1,   RandFrst_DropNA_0mf_f1,   GradBst_0mf_f1, GradBst_SimpImp_0mf_f1, GradBst_DropNA_0mf_f1],
                                              [DummyCl_1mf_f1,   LogReg_1mf_f1,   LogReg_SimpImp_1mf_f1,   LogReg_DropNA_1mf_f1,   SVC_1mf_f1,   SVC_SimpImp_1mf_f1,   SVC_DropNA_1mf_f1,   LinSVC_1mf_f1,   LinSVC_SimpImp_1mf_f1,   LinSVC_DropNA_1mf_f1, RandFrst_1mf_f1,   RandFrst_SimpImp_1mf_f1,   RandFrst_DropNA_1mf_f1,   GradBst_1mf_f1, GradBst_SimpImp_1mf_f1, GradBst_DropNA_1mf_f1]],
                                      index = ['accuracy', 'f1_0_most_frequent', 'f1_1_most_frequent'],
                                      columns = pd.MultiIndex.from_tuples([('Dummy_classifier', 'No_modified'),
                                                                           ('LogisticRegression', 'No_modified'), ('LogisticRegression', 'SimpleImputer'), ('LogisticRegression', 'DropNA'),
                                                                           ('SVC', 'No_modified'), ('SVC', 'SimpleImputer'), ('SVC', 'DropNA'),
                                                                           ('LinearSVC', 'No_modified'), ('LinearSVC', 'SimpleImputer'), ('LinearSVC', 'DropNA'),
                                                                           ('RandomForestClassifier', 'No_modified'), ('RandomForestClassifier', 'SimpleImputer'), ('RandomForestClassifier', 'DropNA'),
                                                                           ('GradientBoostingClassifier', 'No_modified'), ('GradientBoostingClassifier', 'SimpleImputer'), ('GradientBoostingClassifier', 'DropNA'),
                                                                          ],
                                                                          names=['Algorithm', 'Data_preparation']))
cross_val_score_result


Algorithm,Dummy_classifier,LogisticRegression,LogisticRegression,LogisticRegression,SVC,SVC,SVC,LinearSVC,LinearSVC,LinearSVC,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,GradientBoostingClassifier,GradientBoostingClassifier,GradientBoostingClassifier
Data_preparation,No_modified,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA
accuracy,0.760718,0.850907,0.850682,0.846845,0.839974,0.839503,0.8357,0.852914,0.851255,0.848503,0.852565,0.853303,0.848967,0.867573,0.86659,0.862943
f1_0_most_frequent,0.0,0.655816,0.654437,0.660159,0.620083,0.616722,0.626712,0.657816,0.651865,0.661613,0.666582,0.668446,0.672592,0.686905,0.683188,0.687097
f1_1_most_frequent,0.8641,0.904843,0.904765,0.901146,0.898648,0.898483,0.894667,0.906322,0.905422,0.902403,0.90475,0.905036,0.901342,0.916026,0.915502,0.912252


Вывод: лучшие метрики получены при использовании GradientBoostingClassifier без обработки пропущенных значений (?)

# 18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [94]:
# Попытка написать функцию по удалению строк с пропусками из датасета для пайплайна
# функция не заработала - transform удаляет строки в X, а в y - нет и возникает ошибка

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer
class Drop_str_NA(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._estimator = PowerTransformer()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y):
        index_notnan = (X != '?').all(axis=1)
        X_copy = X[index_notnan]
        y_copy = y[index_notnan]
        return self._estimator.transform(X_copy, y_copy)

Алгоритм выбора наилучшей модели (подбора параметров)

In [98]:
# предобработка данных
preprocessor_imputer = ColumnTransformer([('cat', Pipeline([('imputer', SimpleImputer(missing_values='?', strategy="most_frequent")), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                  ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                  ])

# методы кодирования признаков
encoders = [OneHotEncoder(handle_unknown='ignore'),
            LabelEncoder()
            ]
# методы масштабирования признаков
scalers = [StandardScaler(), MinMaxScaler()]
# рассматриваемые модели
models = [LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
          SVC(random_state=RANDOM_STATE),
          LinearSVC(random_state=RANDOM_STATE),
          RandomForestClassifier(random_state=RANDOM_STATE),
          GradientBoostingClassifier(random_state=RANDOM_STATE)
          ]

search_model = Pipeline(steps=[('preprocessor', preprocessor_imputer), ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))])
# Задание параметров
params = {
          'preprocessor__cat__imputer__strategy': ['mean','median','most_frequent', 'constant'],
          'preprocessor__cat__encoder': encoders,
          'preprocessor__num': scalers,
          'classifier': models
}

BestModel_SimpImp = GridSearchCV(search_model,
                                 param_grid=params,
                                 scoring=('accuracy', 'f1'),
                                 refit='f1')

In [99]:
# Подбор наилучшей модели (параметоров) для случая, когда самое частое значение целевой переменной 0 (<=50K -> 0; >50K -> 1) - соответствует исходному y
# Разбиваем выборку на обучающую и тестовую с помощью функции train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)  # добавляем random stete для повторяемости результата
# Обучаем модель
BestModel_SimpImp.fit(X_test, y_test)
# Посчитаем метрики accurancy и f1 для лучшей модели
BestModel_SimpImp_0mf_tst_accuracy = accuracy_score(y_test, BestModel_SimpImp.predict(X_test))
BestModel_SimpImp_0mf_tst_f1 = f1_score(y_test, BestModel_SimpImp.predict(X_test))

# Подбор наилучшей модели (параметоров) для случая, когда самое частое значение целевой переменной 1 (<=50K -> 1; >50K -> 0) - соответствует инвертированному y (abs(y-1))
# Разбиваем выборку на обучающую и тестовую с помощью функции train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, abs(y-1), train_size=0.8, random_state=RANDOM_STATE)  # добавляем random stete для повторяемости результата
# Обучаем модель
BestModel_SimpImp.fit(X_test, y_test)
# Посчитаем метрики accurancy и f1 для лучшей модели
BestModel_SimpImp_1mf_tst_accuracy = accuracy_score(y_test, BestModel_SimpImp.predict(X_test))
BestModel_SimpImp_1mf_tst_f1 = f1_score(y_test, BestModel_SimpImp.predict(X_test))

# Выведем параметры лучшей модули, метрики accurancy и f1
display(BestModel_SimpImp.best_params_)
print(f"accuracy best model (0 most frequent): {BestModel_SimpImp_0mf_tst_accuracy:.4f}")
print(f"f1 best model (0 most frequent):       {BestModel_SimpImp_0mf_tst_f1:.4f}")
display(BestModel_SimpImp.best_params_)
print(f"accuracy best model (1 most frequent): {BestModel_SimpImp_1mf_tst_accuracy:.4f}")
print(f"f1 best model (1 most frequent):       {BestModel_SimpImp_1mf_tst_f1:.4f}")

{'classifier': GradientBoostingClassifier(random_state=42),
 'preprocessor__cat__encoder': OneHotEncoder(handle_unknown='ignore'),
 'preprocessor__cat__imputer__strategy': 'constant',
 'preprocessor__num': StandardScaler()}

accuracy best model (0 most frequent): 0.8758
f1 best model (0 most frequent):       0.7125


{'classifier': GradientBoostingClassifier(random_state=42),
 'preprocessor__cat__encoder': OneHotEncoder(handle_unknown='ignore'),
 'preprocessor__cat__imputer__strategy': 'constant',
 'preprocessor__num': StandardScaler()}

accuracy best model (1 most frequent): 0.8758
f1 best model (1 most frequent):       0.9208


Алгоритм выбора наилучшей модели (подбора параметров) без обработки пропущенных значений

In [100]:
# предобработка данных
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical),
                                       ('num', Pipeline([('scaler', MinMaxScaler())]), numerical)
                                       ])

# методы кодирования признаков
encoders = [OneHotEncoder(handle_unknown='ignore'),
            LabelEncoder()
            ]
# методы масштабирования признаков
scalers = [StandardScaler(), MinMaxScaler()]
# рассматриваемые модели
models = [LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
          SVC(random_state=RANDOM_STATE),
          LinearSVC(random_state=RANDOM_STATE),
          RandomForestClassifier(random_state=RANDOM_STATE),
          GradientBoostingClassifier(random_state=RANDOM_STATE)
          ]

search_model = Pipeline(steps=[('preprocessor', preprocessor_base), ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))])
# Задание параметров
params = {
          'preprocessor__cat__encoder': encoders,
          'preprocessor__num': scalers,
          'classifier': models
          }

BestModel = GridSearchCV(search_model,
                         param_grid=params,
                         scoring=('accuracy', 'f1'),
                         refit='f1')

In [101]:
# Подбор наилучшей модели (параметоров) для случая, когда самое частое значение целевой переменной 0 (<=50K -> 0; >50K -> 1) - соответствует исходному y
# Разбиваем выборку на обучающую и тестовую с помощью функции train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)  # добавляем random stete для повторяемости результата
# Обучаем модель
BestModel.fit(X_test, y_test)
# Посчитаем метрики accurancy и f1 для лучшей модели
BestModel_0mf_tst_accuracy = accuracy_score(y_test, BestModel.predict(X_test))
BestModel_0mf_tst_f1 = f1_score(y_test, BestModel.predict(X_test))

# Подбор наилучшей модели (параметоров) для случая, когда самое частое значение целевой переменной 1 (<=50K -> 1; >50K -> 0) - соответствует инвертированному y (abs(y-1))
# Разбиваем выборку на обучающую и тестовую с помощью функции train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, abs(y-1), train_size=0.8, random_state=RANDOM_STATE)  # добавляем random stete для повторяемости результата
# Обучаем модель
BestModel.fit(X_test, y_test)
# Посчитаем метрики accurancy и f1 для лучшей модели
BestModel_1mf_tst_accuracy = accuracy_score(y_test, BestModel.predict(X_test))
BestModel_1mf_tst_f1 = f1_score(y_test, BestModel.predict(X_test))

# Выведем параметры лучшей модули, метрики accurancy и f1
display(BestModel.best_params_)
print(f"accuracy best model (0 most frequent): {BestModel_0mf_tst_accuracy:.4f}")
print(f"f1 best model (0 most frequent):       {BestModel_0mf_tst_f1:.4f}")
# Выведем параметры лучшей модули, метрики accurancy и f1
display(BestModel.best_params_)
print(f"accuracy best model (1 most frequent): {BestModel_1mf_tst_accuracy:.4f}")
print(f"f1 best model (1 most frequent):       {BestModel_1mf_tst_f1:.4f}")

{'classifier': GradientBoostingClassifier(random_state=42),
 'preprocessor__cat__encoder': OneHotEncoder(handle_unknown='ignore'),
 'preprocessor__num': StandardScaler()}

accuracy best model (0 most frequent): 0.8758
f1 best model (0 most frequent):       0.7125


{'classifier': GradientBoostingClassifier(random_state=42),
 'preprocessor__cat__encoder': OneHotEncoder(handle_unknown='ignore'),
 'preprocessor__num': StandardScaler()}

accuracy best model (1 most frequent): 0.8758
f1 best model (1 most frequent):       0.9208


Лучшими параметрами модели оказались:
- модель: GradientBoostingClassifier
- метод масштабирование признаков: StandardScaler;
- метод кодирования признаков: OneHotEncoder;
- заполнение пропусков константой либо без заполнения пропусков (видимо, замена "?" на другую константу сути пропуска не меняет).

Выводы:
- параметры модели в данном случае не зависят от самого частого значения целевой переменной;
- метрика accuracy в данном случае не зависят от самого частого значения целевой переменной (0.8758);
- метрика f1 зависит от самого частого значения целевой переменной: (0.7125 при самом частом значении 0 и 0.9208 при самом частом значении 1);
- значения метрик существенно лучше при подборе параметров модели.

Посчитаем cross_val_score для лучшей модели по метрикам accuracy и f1_score

In [102]:
# без обработки пропусков

# предобработка данных
preprocessor_best = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore'))]), categorical),
                                       ('num', Pipeline([('scaler', StandardScaler())]), numerical)
                                       ])
                                  
# создание пайплайна
pipe_GradBst = Pipeline([('preprocessor', preprocessor_best), ('classifier', GradientBoostingClassifier(random_state=RANDOM_STATE))])

# расчет cross_val_score по метрикам accurancy и f1
BestModel_1mf_accuracy = cross_val_score(pipe_GradBst, X, y, cv=5, scoring='accuracy').mean()
BestModel_0mf_f1 = cross_val_score(pipe_GradBst, X, y, cv=5, scoring='f1').mean()
BestModel_1mf_f1 = cross_val_score(pipe_GradBst, X, abs(y-1), cv=5, scoring='f1').mean()

# вывод cross_val_score по метрикам accurancy и f1
print(f"accuracy best model cross_val_score:             {BestModel_1mf_accuracy:.4f}")
print(f"f1 best model cross_val_score (0 most frequent): {BestModel_0mf_f1:.4f}")
print(f"f1 best model cross_val_score (1 most frequent): {BestModel_1mf_f1:.4f}")

accuracy best model cross_val_score:             0.8676
f1 best model cross_val_score (0 most frequent): 0.6869
f1 best model cross_val_score (1 most frequent): 0.9160


In [106]:
# Сведем все результаты в таблицу
cross_val_score_result = pd.DataFrame(data = [[DummyCl_accuracy, LogReg_accuracy, LogReg_SimpImp_accuracy, LogReg_DropNA_accuracy, SVC_accuracy, SVC_SimpImp_accuracy, SVC_DropNA_accuracy, LinSVC_accuracy, LinSVC_SimpImp_accuracy, LinSVC_DropNA_accuracy, RandFrst_accuracy, RandFrst_SimpImp_accuracy, RandFrst_DropNA_accuracy, GradBst_accuracy, GradBst_SimpImp_accuracy, GradBst_DropNA_accuracy, BestModel_1mf_accuracy],
                                              [DummyCl_0mf_f1,   LogReg_0mf_f1,   LogReg_SimpImp_0mf_f1,   LogReg_DropNA_0mf_f1,   SVC_0mf_f1,   SVC_SimpImp_0mf_f1,   SVC_DropNA_0mf_f1,   LinSVC_0mf_f1,   LinSVC_SimpImp_0mf_f1,   LinSVC_DropNA_0mf_f1, RandFrst_0mf_f1,   RandFrst_SimpImp_0mf_f1,   RandFrst_DropNA_0mf_f1,   GradBst_0mf_f1, GradBst_SimpImp_0mf_f1, GradBst_DropNA_0mf_f1, BestModel_0mf_f1],
                                              [DummyCl_1mf_f1,   LogReg_1mf_f1,   LogReg_SimpImp_1mf_f1,   LogReg_DropNA_1mf_f1,   SVC_1mf_f1,   SVC_SimpImp_1mf_f1,   SVC_DropNA_1mf_f1,   LinSVC_1mf_f1,   LinSVC_SimpImp_1mf_f1,   LinSVC_DropNA_1mf_f1, RandFrst_1mf_f1,   RandFrst_SimpImp_1mf_f1,   RandFrst_DropNA_1mf_f1,   GradBst_1mf_f1, GradBst_SimpImp_1mf_f1, GradBst_DropNA_1mf_f1, BestModel_1mf_f1]],
                                      index = ['accuracy', 'f1_0_most_frequent', 'f1_1_most_frequent'],
                                      columns = pd.MultiIndex.from_tuples([('Dummy_classifier', 'No_modified'),
                                                                           ('LogisticRegression', 'No_modified'), ('LogisticRegression', 'SimpleImputer'), ('LogisticRegression', 'DropNA'),
                                                                           ('SVC', 'No_modified'), ('SVC', 'SimpleImputer'), ('SVC', 'DropNA'),
                                                                           ('LinearSVC', 'No_modified'), ('LinearSVC', 'SimpleImputer'), ('LinearSVC', 'DropNA'),
                                                                           ('RandomForestClassifier', 'No_modified'), ('RandomForestClassifier', 'SimpleImputer'), ('RandomForestClassifier', 'DropNA'),
                                                                           ('GradientBoostingClassifier', 'No_modified'), ('GradientBoostingClassifier', 'SimpleImputer'), ('GradientBoostingClassifier', 'DropNA'),
                                                                           ('GridSearchCV', 'BestModel'),
                                                                           ],
                                                                          names=['Algorithm', 'Data_preparation']))
cross_val_score_result

Algorithm,Dummy_classifier,LogisticRegression,LogisticRegression,LogisticRegression,SVC,SVC,SVC,LinearSVC,LinearSVC,LinearSVC,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,GradientBoostingClassifier,GradientBoostingClassifier,GradientBoostingClassifier,GridSearchCV
Data_preparation,No_modified,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA,No_modified,SimpleImputer,DropNA,BestModel
accuracy,0.760718,0.850907,0.850682,0.846845,0.839974,0.839503,0.8357,0.852914,0.851255,0.848503,0.852565,0.853303,0.848967,0.867573,0.86659,0.862943,0.867573
f1_0_most_frequent,0.0,0.655816,0.654437,0.660159,0.620083,0.616722,0.626712,0.657816,0.651865,0.661613,0.666582,0.668446,0.672592,0.686905,0.683188,0.687097,0.686874
f1_1_most_frequent,0.8641,0.904843,0.904765,0.901146,0.898648,0.898483,0.894667,0.906322,0.905422,0.902403,0.90475,0.905036,0.901342,0.916026,0.915502,0.912252,0.916028


Выводы:
- по cross_val_score метрики accuracy и f1 немного хуже, чем просто при расчете их по тестовой выборке (может, так совпали данные);
- путем подбора параметров модели не удалось преодолеть значения метрик, полученных для градиентного бустинга без обработки пропушенных значений.