## Part 3 – Model Selection

## Initial

### Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os
import json
import warnings
from datetime import date, time, datetime
from tqdm.notebook import tqdm

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from imblearn.pipeline import Pipeline, make_pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
# from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score, matthews_corrcoef, cohen_kappa_score, accuracy_score
from sklearn.inspection import permutation_importance

#!pip install optuna
import optuna
from optuna.integration import OptunaSearchCV
from optuna.distributions import FloatDistribution, IntDistribution, CategoricalDistribution

### Constants

In [2]:
PATH_LOCAL = 'datasets/'                                           # local path to data
PATH_REMOTE = '/kaggle/input/yap15-heart-diseases-predictions/'    # remote path to data

CR = '\n'                                                          # new line
RANDOM_STATE = RS = 88                                             # random_state
N_CV = 3                                                           # num of cross-val batches

N_TRIALS = 5                  # количество попыток для Optuna
TIMEOUT = 5                   # максимальный порог времени для Optuna
SCORING = 'roc_auc'           # основная метрика

### Functions

In [3]:
def custom_read_csv(file_name, separator=','):
    """
    чтение датасета в формате CSV:
      сначала из локального хранилища;
      при неудаче — из удаленного хранилища Kaggle.
    """

    path_local = f'{PATH_LOCAL}{file_name}'
    path_remote = f'{PATH_REMOTE}{file_name}'
    
    if os.path.exists(path_local):
        return pd.read_csv(path_local, sep=separator)

    elif os.path.exists(path_remote):
        return pd.read_csv(path_remote, sep=separator)

    else:
        print(f'File "{file_name}" not found at the specified path ')

### Settings

In [4]:
# text styles
class f:
    BOLD = "\033[1m"
    ITALIC = "\033[3m"
    END = "\033[0m"

In [5]:
# defaults for charts

# Matplotlib, Seaborn
PLOT_DPI = 150  # dpi for charts rendering 
sns.set_style('whitegrid', {'axes.facecolor': '0.98', 'grid.color': '0.9', 'axes.edgecolor': '1.0'})
plt.rc(
       'axes',
       labelweight='bold',
       titlesize=14,
       titlepad=10,
      )

In [6]:
# Pandas defaults
pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100
pd.options.display.float_format = '{:.3f}'.format
pd.options.display.colheader_justify = 'left'

In [7]:
# оформление Optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)  # отключение вывода логов при работе optuna

In [8]:
# others
warnings.filterwarnings('ignore')

## Read Data

In [9]:
data_train = pd.read_csv('C:\\Users\\Admin\\Desktop\\DS studies\\Data\\Heart diseases\\PROJECT\\For work\\FE_train.csv')  # обучающая выборка после этапа EDA
data_test = pd.read_csv('C:\\Users\\Admin\\Desktop\\DS studies\\Data\\Heart diseases\\PROJECT\\For work\\FE_test.csv')    # тестовая выборка после этапа EDA

In [10]:
data_train.head()

Unnamed: 0,cardio,age,cholesterol,gluc,gender_aphi_bined_aplo_bined_TRG_mean,gender_age_bined_weight_bined_TRG_mean,gender_cholesterol_gluc_bmi_TRG_mean,gender_smoke_alco_active_TRG_mean,gender_height_TRG_mean
0,0,50,1,1,0.263,0.373,0.303,0.496,0.502
1,1,55,3,1,0.821,0.631,1.0,0.485,0.513
2,1,52,3,1,0.529,0.462,0.667,0.523,0.466
3,1,48,1,1,0.871,0.489,0.562,0.496,0.501
4,0,48,1,1,0.194,0.264,0.299,0.523,0.513


## Данные для моделей

### Выделение признаков и целевой переменной

In [11]:
X = data_train.drop('cardio', axis=1)
Y = data_train.cardio

X.shape, Y.shape

((68753, 8), (68753,))

### Разделение на обучающую и валидационную выборки

Валидационная выборка – часть, отрезанная от train. Нужна для локальной проверки модели.

In [12]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=RS)

X_train.shape, Y_train.shape, X_valid.shape, Y_valid.shape

((55002, 8), (55002,), (13751, 8), (13751,))

## Модель

### Preprocessing

#### Селекторы числовых и категориальных признаков

In [13]:
selector_num = make_column_selector(dtype_include=np.number)
selector_cat = make_column_selector(dtype_exclude=np.number)

#### Предбработка числовых признаков

In [14]:
num_preprocessor = make_pipeline(
                                 StandardScaler(),
#                                IterativeImputer(initial_strategy='mean', random_state=RS),  # в данных нет пропусков
                                )

#### Предбработка категориальных признаков

Раздельно для линейных моделей и моделей на базе деревьев.

In [15]:
# for linear models
cat_preprocessor_linr = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')

# for tree models
cat_preprocessor_tree = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)

#### Объединение предобработки

In [16]:
# for linear models
preprocessing_linr = make_column_transformer(
                                             (num_preprocessor, selector_num),
                                             (cat_preprocessor_linr, selector_cat),
                                             remainder='passthrough'
                                            )
# for tree models
preprocessing_tree = make_column_transformer(
                                             (num_preprocessor, selector_num),
                                             (cat_preprocessor_tree, selector_cat),
                                             remainder='passthrough'
                                            )

#### Models pipelines

In [17]:
pipelines_list = [
                  Pipeline([('PT', preprocessing_tree), ('RFC', RandomForestClassifier(random_state=RS)) ]),
                  Pipeline([('PT', preprocessing_tree), ('HGBC', HistGradientBoostingClassifier(random_state=RS)) ]),
                  Pipeline([('PL', preprocessing_linr), ('SGDC', SGDClassifier(random_state=RS)) ]),
                  Pipeline([('PL', preprocessing_linr), ('LSVC', LinearSVC(dual=False, random_state=RS)) ]),
                 ]

names_list = ['RandomForestClassifier', 'HistGradientBoostingClassifier', 'SGDClassifier', 'LinearSVC']
short_names_list = ['RFC', 'HGBC', 'SGDC', 'LSVC']

models = pd.DataFrame(data={'name': names_list,
                            'short_name': short_names_list,
                            'model': pipelines_list,
                           },
                     )
models

Unnamed: 0,name,short_name,model
0,RandomForestClassifier,RFC,"(ColumnTransformer(remainder='passthrough',\n transformers=[('pipeline',\n ..."
1,HistGradientBoostingClassifier,HGBC,"(ColumnTransformer(remainder='passthrough',\n transformers=[('pipeline',\n ..."
2,SGDClassifier,SGDC,"(ColumnTransformer(remainder='passthrough',\n transformers=[('pipeline',\n ..."
3,LinearSVC,LSVC,"(ColumnTransformer(remainder='passthrough',\n transformers=[('pipeline',\n ..."


### Подбор гиперпараметров

#### Objective functions для Optuna

In [18]:
def objective_RFC(trial):

    params = {
              'RFC__n_estimators': trial.suggest_int('RFC__n_estimators', 50, 1000, log=True),
              'RFC__max_depth': trial.suggest_int('RFC__max_depth', 1, 50),
              'RFC__class_weight': trial.suggest_categorical('RFC__class_weight', ['balanced', 'balanced_subsample', None]),
             }
    model.set_params(**params)
    cv_SKF = StratifiedKFold(n_splits=5)
    
    return cross_val_score(model, X_train, Y_train, scoring=SCORING, cv=cv_SKF, n_jobs=-1).mean()

In [19]:
def objective_HGBC(trial):

    params = {
              'HGBC__max_depth': trial.suggest_int('HGBC__max_depth', 10, 100),
              'HGBC__learning_rate': trial.suggest_float('HGBC__learning_rate', 0.1, 10.0, log=True),
              'HGBC__max_iter': trial.suggest_int('HGBC__max_iter', 10, 500, log=True),
              'HGBC__l2_regularization': trial.suggest_float('HGBC__l2_regularization', 0.001, 1000),
              'HGBC__max_bins': trial.suggest_int('HGBC__max_bins', 5, 255),
             }
    model.set_params(**params)
    cv_SKF = StratifiedKFold(n_splits=5)
    
    return cross_val_score(model, X_train, Y_train, scoring=SCORING, cv=cv_SKF, n_jobs=-1).mean()

In [20]:
def objective_SGDC(trial):

    params = {
              'SGDC__loss': trial.suggest_categorical('SGDC__loss', ['hinge','log_loss','modified_huber','squared_hinge','perceptron']),
              'SGDC__penalty': trial.suggest_categorical('SGDC__penalty', ['l1', 'l2', 'elasticnet']),
              'SGDC__max_iter': trial.suggest_int('SGDC__max_iter', 5, 1000, log=True),
              'SGDC__validation_fraction': trial.suggest_float('SGDC__validation_fraction', 0.001, 0.999999),
              'SGDC__class_weight': trial.suggest_categorical('SGDC__class_weight', ['balanced', None]),
             }
    model.set_params(**params)
    cv_SKF = StratifiedKFold(n_splits=5)
    
    return cross_val_score(model, X_train, Y_train, scoring=SCORING, cv=cv_SKF, n_jobs=-1).mean()

In [21]:
def objective_LSVC(trial):

    params = {
              'LSVC__penalty': trial.suggest_categorical('LSVC__penalty', ['l1', 'l2']),
              'LSVC__max_iter': trial.suggest_int('LSVC__max_iter', 10, 1000, log=True),
              'LSVC__C': trial.suggest_float('LSVC__C', 0.01, 100, log=True),
              'LSVC__class_weight': trial.suggest_categorical('LSVC__class_weight', ['balanced', None]),
             }
    model.set_params(**params)
    cv_SKF = StratifiedKFold(n_splits=5)
    
    return cross_val_score(model, X_train, Y_train, scoring=SCORING, cv=cv_SKF, n_jobs=-1).mean()

#### Вызов Optuna

In [22]:
for item in range(models.shape[0]):
    
    model = models.loc[item,'model']
    
    # создание объекта optuna.study
    study = optuna.create_study(
                                study_name=models.loc[item,'name'],
                                direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=RS)
                               )

    # оптимизация (подбор гиперпараметров)
    if models.loc[item,'short_name'] == 'RFC':
        study.optimize(objective_RFC, n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True, n_jobs=-1)
    elif models.loc[item,'short_name'] == 'HGBC':
        study.optimize(objective_HGBC, n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True, n_jobs=-1)
    elif models.loc[item,'short_name'] == 'SGDC':
        study.optimize(objective_SGDC, n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True, n_jobs=-1)
    elif models.loc[item,'short_name'] == 'LSVC':
        study.optimize(objective_LSVC, n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True, n_jobs=-1)

    # извлечение и обучение лучшей модели
    model.set_params(**study.best_params).fit(X_train, Y_train)

    # сохранение результатов в таблице моделей
    models.loc[item,'model'] = model
    models.loc[item,'study'] = study
    
    print(f'{CR}{f.BOLD}{study.study_name}{f.END}{CR}')
    print(f'Количество попыток: {len(study.trials)}')
    print(f'Лучший результат: {f.BOLD}{study.best_value:0.4f}{f.END}{CR}')
    print('Комбинация гиперпараметров:')
    print(json.dumps(study.best_params, indent=1, sort_keys=True), f'{CR}')

  0%|          | 0/5 [00:00<?, ?it/s]


[1mRandomForestClassifier[0m

Количество попыток: 5
Лучший результат: [1m0.8500[0m

Комбинация гиперпараметров:
{
 "RFC__class_weight": "balanced_subsample",
 "RFC__max_depth": 38,
 "RFC__n_estimators": 222
} 



  0%|          | 0/5 [00:00<?, ?it/s]


[1mHistGradientBoostingClassifier[0m

Количество попыток: 5
Лучший результат: [1m0.6921[0m

Комбинация гиперпараметров:
{
 "HGBC__l2_regularization": 864.0188752714649,
 "HGBC__learning_rate": 8.54197067415154,
 "HGBC__max_bins": 201,
 "HGBC__max_depth": 42,
 "HGBC__max_iter": 29
} 



  0%|          | 0/5 [00:00<?, ?it/s]


[1mSGDClassifier[0m

Количество попыток: 5
Лучший результат: [1m0.8633[0m

Комбинация гиперпараметров:
{
 "SGDC__class_weight": "balanced",
 "SGDC__loss": "perceptron",
 "SGDC__max_iter": 653,
 "SGDC__penalty": "l2",
 "SGDC__validation_fraction": 0.7034347025075354
} 



  0%|          | 0/5 [00:00<?, ?it/s]


[1mLinearSVC[0m

Количество попыток: 5
Лучший результат: [1m0.8673[0m

Комбинация гиперпараметров:
{
 "LSVC__C": 9.55503969385683,
 "LSVC__class_weight": null,
 "LSVC__max_iter": 504,
 "LSVC__penalty": "l2"
} 

