## Неделя 2. Понедельник
### Обучение с учителем

### Применение базовых методов классификации

In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn

sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    OrdinalEncoder,
    TargetEncoder,
)
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold

# for model learning
from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
    cross_val_score,
)

# models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from catboost import CatBoostRegressor

# Metrics
from sklearn.metrics import accuracy_score


# tunning hyperparamters model
import optuna

#### 0. Ознакомьтесь с датасетом

In [134]:
df = pd.read_csv("./../data/heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40.0,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49.0,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37.0,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48.0,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54.0,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


* __Age__: age of the patient [years]
* __Sex__: sex of the patient [M: Male, F: Female]
* __ChestPainType__: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
* __RestingBP__: resting blood pressure [mm Hg]
* __Cholesterol__: serum cholesterol [mm/dl]
* __FastingBS__: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
* __RestingECG__: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite * left ventricular hypertrophy by Estes' criteria]
* __MaxHR__: maximum heart rate achieved [Numeric value between 60 and 202]
* __ExerciseAngina__: exercise-induced angina [Y: Yes, N: No]
* __Oldpeak__: oldpeak = ST [Numeric value measured in depression]
* __ST_Slope__: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
* __HeartDisease__: output class [1: heart disease, 0: Normal]

* Таргетом является столбец `HeartDisease`. Необходимо предсказать по имеющимся данным, есть ли проблемы с сердцем

In [135]:
df.shape

(918, 12)

In [136]:
pd.DataFrame(data={"NaN_count": df.isna().sum(), "data_type": df.dtypes})

Unnamed: 0,NaN_count,data_type
Age,10,float64
Sex,0,object
ChestPainType,0,object
RestingBP,0,int64
Cholesterol,0,int64
FastingBS,0,int64
RestingECG,0,object
MaxHR,0,int64
ExerciseAngina,0,object
Oldpeak,0,float64


In [137]:
num_features = df.select_dtypes(exclude="object")
cat_features = df.select_dtypes(include="object")

In [138]:
df["MaxHR"].min()

np.int64(60)

In [139]:
# Разделяем датасет на матрицу признаков и столбец target

X, y = df.drop("HeartDisease", axis=1), df["HeartDisease"]

# Делим датасеты на train и valid в соотношении 80-20

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

#### 1. Небольшие рекомендации ниже 


* __Baseline pipeline (базовый пайплайн)__ - это простой пайплайн, который используется как отправная точка или точка сравнения при разработке и оценке более сложных моделей или алгоритмов. 

* Для этого сначала используйте самые простые идеи по заполнению пропусков(средними, медианами, модами) и кодированию категориальных данных, которые вам приходят в голову. 

* После того, как вы построите модели провалидируете их. Можно будет приступать к попыткам улучшить свою модель с помощью ваших идей - пробовать создавать новые фичи, кодировать данные по-другому, заполнять иначе NaN и тд

#### 2. Заполните пропущенные значения(`Imputing`), как считаете нужным.  

- Не забывайте памятку выше, сначала заполняйте самыми тривиальными идеями. Наприсер, средними, медианами и т.д

##### 2.1 Оберните в `ColumnTransformer` свой `Imputing` данных. Проверьте корректность его работы. Для этого необходимо сделать:

1. Обучить и трансформировать свой `Imputer` с помощью `your_imputer.fit_transform` - на тренировочных данных
2. Заполнить с помощью `your_imputer.transform` - на тестовых данных

Убедитесь, что данные прошли через этап `Imputing'а` и пропусков в них больше нет

In [140]:
my_imputer = ColumnTransformer(
    transformers=[("num_imputer", SimpleImputer(strategy="mean"), ["Age"])],
    verbose_feature_names_out=False,
    remainder="passthrough",
)

In [141]:
filled_data = my_imputer.fit_transform(X_train)
filled_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
485,63.0,M,ATA,139,217,1,ST,128,Y,1.2,Flat
486,55.0,M,ATA,110,214,1,ST,180,N,0.4,Up
117,59.0,F,ASY,130,338,1,ST,130,Y,1.5,Flat
361,47.0,M,ASY,160,0,0,Normal,124,Y,0.0,Flat
296,50.0,M,ASY,145,0,1,Normal,139,Y,0.7,Flat
...,...,...,...,...,...,...,...,...,...,...,...
276,51.0,M,NAP,135,160,0,Normal,150,N,2.0,Flat
201,46.0,M,NAP,120,230,0,Normal,150,N,0.0,Up
462,59.0,M,ASY,122,233,0,Normal,117,Y,1.3,Down
252,61.0,M,ASY,125,292,0,ST,115,Y,0.0,Up


In [142]:
pd.DataFrame(
    data={"NaN_count": filled_data.isna().sum(), "data_type": filled_data.dtypes}
)

Unnamed: 0,NaN_count,data_type
Age,0,float64
Sex,0,object
ChestPainType,0,object
RestingBP,0,int64
Cholesterol,0,int64
FastingBS,0,int64
RestingECG,0,object
MaxHR,0,int64
ExerciseAngina,0,object
Oldpeak,0,float64


In [143]:
# my_imputer.transform(X_valid).isna().sum()

#### 3. Закодируйте категориальные переменные, как считаете нужным

* `OneHotEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)  
* `TargetEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.TargetEncoder.html)  
* `OrdinalEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)  
* `CatBoostEncoding` (https://www.geeksforgeeks.org/categorical-encoding-with-catboost-encoder/)  

In [144]:
ordinal_encoding_columns = list(cat_features.columns)
ordinal_encoding_columns

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

##### 3.1 Оберните в `ColumnTransformer` свой `Encoding` данных. Проверьте корректность его работы. 

In [145]:
# my_encoder = ColumnTransformer(
#     [("ordinal_enc", OrdinalEncoder(), ordinal_encoding_columns)],
#     verbose_feature_names_out=False,
#     remainder="passthrough",
# )

#### 4. То же самое проделать с нормализацией данных

* `StandardScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* `MinMaxScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)
* `RobustScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)

#### 4.1 Оберните в `ColumnTransformer` свой `Scaling` данных, проверьте корректность работы.

In [146]:
standart_scaler_columns = ["RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
# my_scaler = ColumnTransformer(
#     ["scaling_num_columns", StandardScaler(), standart_scaler_columns],
#     verbose_feature_names_out=False,
#     remainder="passthrough",
# )

In [147]:
# Объединим encoding и scaling

encode_and_scale = ColumnTransformer(
    [
        ("ordinal_enc", OrdinalEncoder(), ordinal_encoding_columns),
        ("scaling_num_columns", StandardScaler(), standart_scaler_columns),
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
)

In [148]:
# Применяем к ТРЕНИРОВОЧНЫМ данным

processed_data = encode_and_scale.fit_transform(filled_data, y)
processed_data

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,RestingBP,Cholesterol,MaxHR,Oldpeak,Age,FastingBS
485,1.0,1.0,2.0,1.0,1.0,0.339016,0.127137,-0.324520,0.317046,63.0,1
486,1.0,1.0,2.0,0.0,2.0,-1.266031,0.099443,1.689837,-0.440356,55.0,1
117,0.0,0.0,2.0,1.0,1.0,-0.159102,1.244113,-0.247045,0.601071,59.0,1
361,1.0,0.0,1.0,1.0,1.0,1.501291,-1.876035,-0.479470,-0.819056,47.0,0
296,1.0,0.0,1.0,1.0,1.0,0.671094,-1.876035,0.101594,-0.156330,50.0,1
...,...,...,...,...,...,...,...,...,...,...,...
276,1.0,2.0,1.0,0.0,1.0,0.117630,-0.399042,0.527708,1.074447,51.0,0
201,1.0,2.0,1.0,0.0,2.0,-0.712567,0.247142,0.527708,-0.819056,46.0,0
462,1.0,0.0,1.0,1.0,0.0,-0.601874,0.274836,-0.750634,0.411721,59.0,0
252,1.0,0.0,2.0,1.0,2.0,-0.435834,0.819477,-0.828109,-0.819056,61.0,0


In [149]:
pd.DataFrame(
    data={"NaN_count": processed_data.isna().sum(), "data_type": processed_data.dtypes}
)

Unnamed: 0,NaN_count,data_type
Sex,0,float64
ChestPainType,0,float64
RestingECG,0,float64
ExerciseAngina,0,float64
ST_Slope,0,float64
RestingBP,0,float64
Cholesterol,0,float64
MaxHR,0,float64
Oldpeak,0,float64
Age,0,float64


#### 5. Соберите весь препроцессинг в общий Pipeline.

In [150]:
preprocessor = Pipeline(
    [("imputer", my_imputer), ("scaler_and_encoder", encode_and_scale)]
)

##### 5.1 Прогоните свои данные через `preprocessor` и убедитесь, что ваши данные проходят через него корректно и уже готовы к ML-модели

In [151]:
X_train = preprocessor.fit_transform(X_train)
X_train[X_train["Age"].isna()]

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,RestingBP,Cholesterol,MaxHR,Oldpeak,Age,FastingBS


In [152]:
pd.DataFrame(data={"NaN_count": X_train.isna().sum(), "data_type": X_train.dtypes})

Unnamed: 0,NaN_count,data_type
Sex,0,float64
ChestPainType,0,float64
RestingECG,0,float64
ExerciseAngina,0,float64
ST_Slope,0,float64
RestingBP,0,float64
Cholesterol,0,float64
MaxHR,0,float64
Oldpeak,0,float64
Age,0,float64


In [153]:
X_valid = preprocessor.transform(X_valid)

In [158]:
X_valid[X_valid["Age"].isna()]

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,RestingBP,Cholesterol,MaxHR,Oldpeak,Age,FastingBS


#### 6.ML-модели

* `LogisticRegression` (из `sklearn.linear_model`)  
* `LogisticRegression with regularization` (из `sklearn.linear_model`)  
* `KNeighborsClassifier` (из `sklearn.neighbors`)  
* `DecisionTree` (из `sklearn.tree`)  

##### 6.1 Обучите свой `Pipeline` с помощью метода `.fit()` с разными моделями.

In [170]:
ml_pipeline_neighbors = Pipeline(
    [("preprocessor", preprocessor), ("model", KNeighborsClassifier())]
)
ml_pipeline_tree = Pipeline(
    [("preprocessor", preprocessor), ("model", DecisionTreeClassifier())]
)
ml_pipeline_LogReg = Pipeline(
    [("preprocessor", preprocessor), ("model", LogisticRegression(penalty=None))]
)
ml_pipeline_LogReg_with_l2 = Pipeline(
    [("preprocessor", preprocessor), ("model", LogisticRegression())]
)
ml_pipeline_LogReg_with_l1 = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(penalty="l1", solver="liblinear")),
    ]
)

In [173]:
ml_pipeline_neighbors.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler_and_encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_imputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinal_enc', ...), ('scaling_num_columns', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [174]:
ml_pipeline_tree.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler_and_encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_imputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinal_enc', ...), ('scaling_num_columns', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [175]:
ml_pipeline_LogReg.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler_and_encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_imputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinal_enc', ...), ('scaling_num_columns', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [176]:
ml_pipeline_LogReg_with_l2.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler_and_encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_imputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinal_enc', ...), ('scaling_num_columns', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [177]:
ml_pipeline_LogReg_with_l1.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler_and_encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_imputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinal_enc', ...), ('scaling_num_columns', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


#### 7. С помощью метода `.predict()` (на вход поступают только матрица признаков, без целевой переменной) предсказать значения на обучающей выборке (`X_train`) и валидационной выборке (`X_valid`).

In [178]:
model_train_predict_1 = ml_pipeline_LogReg.predict(X_train)
model_valid_predict_1 = ml_pipeline_LogReg.predict(X_valid)

model_valid_predict_2 = ml_pipeline_LogReg_with_l1.predict(X_valid)
model_train_predict_2 = ml_pipeline_LogReg_with_l1.predict(X_train)

model_train_predict_3 = ml_pipeline_LogReg_with_l2.predict(X_train)
model_valid_predict_3 = ml_pipeline_LogReg_with_l2.predict(X_valid)

model_train_predict_4 = ml_pipeline_neighbors.predict(X_train)
model_valid_predict_4 = ml_pipeline_neighbors.predict(X_valid)

model_train_predict_5 = ml_pipeline_tree.predict(X_train)
model_valid_predict_5 = ml_pipeline_tree.predict(X_valid)

##### 7.1 С помощью функции оценки качества (`accuracy_score`) собрать следующую таблицу ниже

* значение функции на обучающих данных
* значение функции на валидационных данных 
    
Результатом выполнения этого пункта будет `DataFrame` формата: 
    
|  |train|valid|
|--|-----|-----|
|**LogReg**|  train_score  | valid_score    |
|**LogReg with l1**|  train_score  | valid_score    |
|**LogReg with l2**|  train_score  | valid_score    |
|**KNN**| train_score  |  valid_score   |
|**Tree**| train_score | valid_score    |

In [180]:
pd.DataFrame(
    data={
        "train": [
            accuracy_score(y_train, model_train_predict_1),
            accuracy_score(y_train, model_train_predict_2),
            accuracy_score(y_train, model_train_predict_3),
            accuracy_score(y_train, model_train_predict_4),
            accuracy_score(y_train, model_train_predict_5),
        ],
        "valid": [
            accuracy_score(y_valid, model_valid_predict_1),
            accuracy_score(y_valid, model_valid_predict_2),
            accuracy_score(y_valid, model_valid_predict_3),
            accuracy_score(y_valid, model_valid_predict_4),
            accuracy_score(y_valid, model_valid_predict_5),
        ],
    },
    index=["LogReg", "LogReg_with_l1", "LogReg_with_l2", "KNN", "Tree"],
)

Unnamed: 0,train,valid
LogReg,0.858311,0.86413
LogReg_with_l1,0.855586,0.880435
LogReg_with_l2,0.856948,0.875
KNN,0.866485,0.820652
Tree,1.0,0.777174


#### 8. Теперь реализуйте __кросс-валидацию__ с KFold=5 и выведите средний __score__

In [184]:
cv = StratifiedKFold(n_splits=5, random_state=23, shuffle=True)

cv_res_1 = cross_val_score(ml_pipeline_LogReg, X, y, cv=cv)

cv_res_2 = cross_val_score(ml_pipeline_LogReg_with_l1, X, y, cv=cv)

cv_res_3 = cross_val_score(ml_pipeline_LogReg_with_l2, X, y, cv=cv)

cv_res_4 = cross_val_score(ml_pipeline_neighbors, X, y, cv=cv)

cv_res_5 = cross_val_score(ml_pipeline_tree, X, y, cv=cv)

print(cv_res_1)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

[0.88043478 0.81521739 0.89673913 0.80327869 0.83606557]


In [185]:
pd.DataFrame(
    data={
        "cross_val_score": [
            cv_res_1.mean(),
            cv_res_2.mean(),
            cv_res_3.mean(),
            cv_res_4.mean(),
            cv_res_5.mean(),
        ],
    },
    index=["LogReg", "LogReg_with_l1", "LogReg_with_l2", "KNN", "Tree"],
)

Unnamed: 0,cross_val_score
LogReg,0.846347
LogReg_with_l1,0.848527
LogReg_with_l2,0.84962
KNN,0.823527
Tree,0.798503


|  |cross_val_score|
|--|-----|
|**LogReg**|  your_score |
|**LogReg with l1**|  your_score  |
|**LogReg with l2**|  your_score  |
|**KNN**| your_score  |
|**SVC**| your_score  |
|**Tree**| your_score |

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Пора сохранить изменения для __github__. 

1. Перейди в командной строке в папку, в которой расположен этот нотбук. 
2. Выполни команду `git add 06-01-task.ipynb`
3. Выполни команду `git commit -m "base models in progress"`
4. Выполни команду `git push`

##### 9. Теперь, когда вы проделали весь pipeline и обучили базовую модель, можно вернуться к началу и пробовать новые идеи и искать точки роста для ваших моделей, в том числе и добавление новых фичей

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Сохрани файл для __github__ и выполни команду `!git status` в ячейке ниже.


In [None]:
# code