In [87]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.linear_model import LinearRegressiont
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, \
    mean_squared_log_error, r2_score

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool

import optuna

import pickle

import warnings
warnings.filterwarnings('ignore')

RAND = 10
N_FOLDS = 4
alpha = 0.001

In [88]:
%run C:\Users\main6\OneDrive\Документы\jupyter\Pet_pro\notebooks\02_Baseline.ipynb

Mean absolute error train: 8.317
Mean absolute error test: 8.347
delta = 0.4 %
Mean absolute error train: 0.497
Mean absolute error test: 10.134
delta = 95.1 %
Mean absolute error train: 2.636
Mean absolute error test: 8.258
delta = 68.1 %
Mean absolute error train: 5.710
Mean absolute error test: 7.174
delta = 20.4 %
Mean absolute error train: 6.203
Mean absolute error test: 7.121
delta = 12.9 %
Mean absolute error train: 5.770
Mean absolute error test: 6.454
delta = 10.6 %


# Tuning

## 5. LightGBM

In [18]:
def objective_lgbm(trial, X, y, N_FOLDS, random_state):
    lgb_params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'n_estimators': trial.suggest_categorical('n_estimators', [2000]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.028673457194108982]),        
        'num_leaves': trial.suggest_int('num_leaves', 20, 1000, step=20),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        # борьба с переобучением
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 100),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 100),
        'min_split_gain': trial.suggest_int('min_split_gain', 0, 20),
        # доля объектов при обучении в дереве
        'subsample': trial.suggest_float('bagging_fraction', 0.2, 1.0),
        'subsample_freq': trial.suggest_categorical('bagging_freq', [1]),
        # доля признаков при обучении в дереве
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        # константы
        'objective': trial.suggest_categorical('objective', ['mae']),
        'random_state': trial.suggest_categorical('random_state', [random_state])
    }

    cv = KFold(n_splits=N_FOLDS, shuffle=True)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
           trial, metric='l1')
        model = LGBMRegressor(**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric='mae',
                  callbacks=[pruning_callback],
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict(X_test)
        cv_predicts[idx] = mean_absolute_error(y_test, preds)

    return np.mean(cv_predicts)

In [19]:
study_lgbm = optuna.create_study(direction='minimize', study_name='LGB_00')
func = lambda trial: objective_lgbm(
    trial, X_train_ct, y_train_ct, N_FOLDS=N_FOLDS, random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_lgbm.optimize(func, n_trials=5, show_progress_bar=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [89]:
# смотрим на параметры
study_lgbm.best_params

{'n_estimators': 2000,
 'learning_rate': 0.028673457194108982,
 'num_leaves': 320,
 'max_depth': 13,
 'reg_alpha': 16,
 'reg_lambda': 92,
 'min_split_gain': 0,
 'bagging_fraction': 0.7836616921026176,
 'bagging_freq': 1,
 'colsample_bytree': 0.8661455708217822,
 'objective': 'mae',
 'random_state': 10}

In [90]:
lgbm_opt = LGBMRegressor(**study_lgbm.best_params)
lgbm_opt.fit(X_ct,
             y_ct,
             eval_metric='mae',
             eval_set=eval_ct,
             verbose=False,
             early_stopping_rounds=100)

y_pred = lgbm_opt.predict(X_test_ct)
y_pred_exp = np.exp(y_pred) - 1

metrics = metrics.append(
    get_metrics_regression(y_test=y_test_ct_exp,
                           y_pred=y_pred_exp,
                           X_test=X_test_ct,
                           name='LightGBM_Optuna'))
metrics



Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted,MPE_%,MAPE_%,WAPE_%
0,LinearRegression_baseline,8.347072,156.85872,12.524325,0.455343,0.451267,-inf,inf,34.324438
0,DecisonTreeRegressor_baseline,10.133774,228.179589,15.105614,0.607437,0.201768,-inf,inf,41.671629
0,RandomForestRegressor_baseline,8.258138,176.019952,13.267251,0.476677,0.384236,-inf,inf,33.958726
0,XGBoost_baseline,7.173608,121.679612,11.030848,0.398242,0.574333,-inf,inf,29.498972
0,LightGBM_baseline,7.120787,122.711309,11.077514,0.393043,0.573203,-inf,inf,29.304653
0,CatBoost_baseline,6.453979,99.260909,9.962977,0.359331,0.654765,-inf,inf,26.560497
0,LightGBM_Optuna,6.44347,102.503548,10.124404,0.364783,0.643487,-inf,inf,26.517247


In [91]:
check_overfitting(lgbm_opt,
                  X_ct,
                  y_ct_exp,
                  X_test_ct,
                  y_test_ct_exp)

Mean absolute error train: 4.014
Mean absolute error test: 6.443
delta = 37.7 %


- удалось улучшить MAE в сранении с бэйзлайном самой модели и в целом
- однако процент overfitting довольно большой

## 6. CatBoost

In [29]:
def objective_cat(trial, X, y, N_FOLDS, random_state, cat_feat):
    cat_params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'n_estimators': trial.suggest_categorical('n_estimators', [1148]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03993043117456255]),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5,1.0),
        'l2_leaf_reg': trial.suggest_uniform('l2_leaf_reg', 1e-5, 1e2),
        'random_strength': trial.suggest_uniform('random_strength', 10, 50),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS', 'No']),
        'border_count': trial.suggest_categorical('border_count', [128, 254]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'loss_function': trial.suggest_categorical('loss_function', ['MAE']),
        'use_best_model': trial.suggest_categorical('use_best_model', [True]),
        'eval_metric': trial.suggest_categorical('eval_metric', ['MAE']),
        'random_state': trial.suggest_categorical('random_state', [random_state])
    }

    if cat_params['bootstrap_type'] == 'Bayesian':
        cat_params['bagging_temperature'] = trial.suggest_float(
            'bagging_temperature', 0, 100)
    elif cat_params['bootstrap_type'] == 'Bernoulli':
        cat_params['subsample'] = trial.suggest_float(
            'subsample', 0.1, 1, log=True)

    cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_feat)
        eval_data = Pool(data=X_test, label=y_test, cat_features=cat_feat)

        model = CatBoostRegressor(**cat_params)
        model.fit(train_data,
                  eval_set=eval_data,
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict(X_test)
        cv_predicts[idx] = mean_absolute_error(y_test, preds)

    return np.mean(cv_predicts)

In [30]:
study_cat = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.SuccessiveHalvingPruner(),
    study_name='Cat_00')
func = lambda trial: objective_cat(trial,
                                   X_train_ct,
                                   y_train_ct,
                                   N_FOLDS=N_FOLDS,
                                   random_state=RAND,
                                   cat_feat=cat_features)
study_cat.optimize(func, n_trials=2, show_progress_bar=True)

  0%|          | 0/2 [00:00<?, ?it/s]

In [31]:
# смотрим на параметры
study_cat.best_params

{'n_estimators': 1148,
 'learning_rate': 0.03993043117456255,
 'max_depth': 8,
 'colsample_bylevel': 0.7825642278921418,
 'l2_leaf_reg': 11.435953759453188,
 'random_strength': 33.22518563290565,
 'bootstrap_type': 'No',
 'border_count': 128,
 'grow_policy': 'Lossguide',
 'od_wait': 739,
 'leaf_estimation_iterations': 2,
 'loss_function': 'MAE',
 'use_best_model': True,
 'eval_metric': 'MAE',
 'random_state': 10}

In [92]:
cb_opt = CatBoostRegressor(**study_cat.best_params)
cb_opt.fit(X_ct,
           y_ct,
           cat_features=cat_features,
           eval_set=eval_ct,
           verbose=False,
           early_stopping_rounds=100)

y_pred = cb_opt.predict(X_test_ct)
y_pred_exp = np.exp(y_pred) - 1



metrics = metrics.append(
    get_metrics_regression(y_test=y_test_ct_exp,
                           y_pred=y_pred_exp,
                           X_test=X_test_ct,
                           name='CatBoost_Optuna'))
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted,MPE_%,MAPE_%,WAPE_%
0,LinearRegression_baseline,8.347072,156.85872,12.524325,0.455343,0.451267,-inf,inf,34.324438
0,DecisonTreeRegressor_baseline,10.133774,228.179589,15.105614,0.607437,0.201768,-inf,inf,41.671629
0,RandomForestRegressor_baseline,8.258138,176.019952,13.267251,0.476677,0.384236,-inf,inf,33.958726
0,XGBoost_baseline,7.173608,121.679612,11.030848,0.398242,0.574333,-inf,inf,29.498972
0,LightGBM_baseline,7.120787,122.711309,11.077514,0.393043,0.573203,-inf,inf,29.304653
0,CatBoost_baseline,6.453979,99.260909,9.962977,0.359331,0.654765,-inf,inf,26.560497
0,LightGBM_Optuna,6.44347,102.503548,10.124404,0.364783,0.643487,-inf,inf,26.517247
0,CatBoost_Optuna,6.445009,97.164593,9.85721,0.366083,0.662056,-inf,inf,26.523582


In [93]:
check_overfitting(cb_opt,
                  X_ct,
                  y_ct_exp,
                  X_test_ct,
                  y_test_ct_exp)

Mean absolute error train: 5.859
Mean absolute error test: 6.445
delta = 9.1 %


- тюнинг CatBoost привел к чуть более плохой метрике
- но зато переобучение значительно ниже

# Stacking

Возьмем комбинацию нескольких LightGBM и CatBoost, так как среди сложных моделей в них самые низкие метрики

## LGBM tune 1

In [41]:
def objective_lgbm(trial, X, y, N_FOLDS, random_state=10):
    lgb_params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'n_estimators': trial.suggest_categorical('n_estimators', [2983]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.08256755427823]),        
        'num_leaves': trial.suggest_int('num_leaves', 20, 1000, step=20),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        # борьба с переобучением
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 100),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 100),
        'min_split_gain': trial.suggest_int('min_split_gain', 0, 20),
        # доля объектов при обучении в дереве
        'subsample': trial.suggest_float('bagging_fraction', 0.2, 1.0),
        'subsample_freq': trial.suggest_categorical('bagging_freq', [1]),
        # доля признаков при обучении в дереве
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        # константы
        'objective': trial.suggest_categorical('objective', ['mae']),
        'random_state': trial.suggest_categorical('random_state', [random_state])
    }

    cv = KFold(n_splits=N_FOLDS, shuffle=True)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
           trial, metric='l1')
        model = LGBMRegressor(**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric='mae',
                  callbacks=[pruning_callback],
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict(X_test)
        cv_predicts[idx] = mean_absolute_error(y_test, preds)

    return np.mean(cv_predicts)

In [42]:
study_lgbm_01 = optuna.create_study(direction='minimize', study_name='LGB_01')
func = lambda trial: objective_lgbm(
    trial, X_train_ct, y_train_ct, N_FOLDS=N_FOLDS, random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_lgbm_01.optimize(func, n_trials=5, show_progress_bar=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [94]:
# смотрим на параметры
study_lgbm_01.best_params

{'n_estimators': 2983,
 'learning_rate': 0.08256755427823,
 'num_leaves': 420,
 'max_depth': 13,
 'reg_alpha': 20,
 'reg_lambda': 45,
 'min_split_gain': 20,
 'bagging_fraction': 0.9763861100885414,
 'bagging_freq': 1,
 'colsample_bytree': 0.9089776601002373,
 'objective': 'mae',
 'random_state': 10}

In [95]:
meta_X = pd.DataFrame()
meta_X_test = pd.DataFrame()
    
pred_val = []
    
folds = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)

for fold, (train_index, test_index) in enumerate(folds.split(X_train_ct, y_train_ct)):
    X_train_, X_val = X_train_ct.iloc[train_index], X_train_ct.iloc[test_index]
    y_train_, y_val = y_train_ct.iloc[train_index], y_train_ct.iloc[test_index]
    
    y_val_exp = np.exp(y_val) - 1

    model = LGBMRegressor(**study_lgbm_01.best_params)

    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              eval_metric='mae',
              early_stopping_rounds=100,
              verbose=0)

    y_pred_val = model.predict(X_val)
    y_pred_val_exp = np.exp(y_pred_val) - 1

    print('Fold:', fold + 1,
          'MAE SCORE %.3f' % mean_absolute_error(y_val_exp, y_pred_val_exp))
    print('---')

    pred_val.append(y_pred_val_exp)
    
model.fit(X_ct,
          y_ct,
          eval_set=eval_ct,
          eval_metric='mae',
          early_stopping_rounds=100,
          verbose=0)

meta_X['lgbm_01'] = np.concatenate(pred_val)
meta_X_test['lgbm_01'] = np.exp(model.predict(X_test_ct)) - 1

Fold: 1 MAE SCORE 6.906
---
Fold: 2 MAE SCORE 6.852
---
Fold: 3 MAE SCORE 6.845
---
Fold: 4 MAE SCORE 6.992
---


## LGBM tune 2

In [53]:
def objective_lgbm(trial, X, y, N_FOLDS, random_state=10):
    lgb_params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'n_estimators': trial.suggest_categorical('n_estimators', [3902]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03396563179672953]),        
        'num_leaves': trial.suggest_int('num_leaves', 20, 1000, step=20),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        # борьба с переобучением
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 100),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 100),
        'min_split_gain': trial.suggest_int('min_split_gain', 0, 20),
        # доля объектов при обучении в дереве
        'subsample': trial.suggest_float('bagging_fraction', 0.2, 1.0),
        'subsample_freq': trial.suggest_categorical('bagging_freq', [1]),
        # доля признаков при обучении в дереве
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        # константы
        'objective': trial.suggest_categorical('objective', ['mae']),
        'random_state': trial.suggest_categorical('random_state', [random_state])
    }

    cv = KFold(n_splits=N_FOLDS, shuffle=True)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
           trial, metric='l1')
        model = LGBMRegressor(**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric='mae',
                  callbacks=[pruning_callback],
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict(X_test)
        cv_predicts[idx] = mean_absolute_error(y_test, preds)

    return np.mean(cv_predicts)

In [54]:
study_lgbm_02 = optuna.create_study(direction='minimize', study_name='LGB_02')
func = lambda trial: objective_lgbm(
    trial, X_train_ct, y_train_ct, N_FOLDS=N_FOLDS, random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_lgbm_02.optimize(func, n_trials=5, show_progress_bar=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [96]:
# смотрим на параметры
study_lgbm_02.best_params

{'n_estimators': 3902,
 'learning_rate': 0.03396563179672953,
 'num_leaves': 620,
 'max_depth': 11,
 'reg_alpha': 14,
 'reg_lambda': 99,
 'min_split_gain': 4,
 'bagging_fraction': 0.9669453988823131,
 'bagging_freq': 1,
 'colsample_bytree': 0.49260769967158996,
 'objective': 'mae',
 'random_state': 10}

In [97]:
pred_val = []
    
folds = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)

for fold, (train_index, test_index) in enumerate(folds.split(X_train_ct, y_train_ct)):
    X_train_, X_val = X_train_ct.iloc[train_index], X_train_ct.iloc[test_index]
    y_train_, y_val = y_train_ct.iloc[train_index], y_train_ct.iloc[test_index]
    
    y_val_exp = np.exp(y_val) - 1

    model = LGBMRegressor(**study_lgbm_02.best_params)

    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              eval_metric='mae',
              early_stopping_rounds=100,
              verbose=0)

    y_pred_val = model.predict(X_val)
    y_pred_val_exp = np.exp(y_pred_val) - 1

    print('Fold:', fold + 1,
          'MAE SCORE %.3f' % mean_absolute_error(y_val_exp, y_pred_val_exp))
    print('---')

    pred_val.append(y_pred_val_exp)
    
model.fit(X_ct,
          y_ct,
          eval_set=eval_ct,
          eval_metric='mae',
          early_stopping_rounds=100,
          verbose=0)

meta_X['lgbm_02'] = np.concatenate(pred_val)
meta_X_test['lgbm_02'] = np.exp(model.predict(X_test_ct)) - 1

Fold: 1 MAE SCORE 6.595
---
Fold: 2 MAE SCORE 6.542
---
Fold: 3 MAE SCORE 6.562
---
Fold: 4 MAE SCORE 6.675
---


## LGBM tune 3

In [65]:
def objective_lgbm(trial, X, y, N_FOLDS, random_state=10):
    lgb_params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'n_estimators': trial.suggest_categorical('n_estimators', [2482]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.05380382705463601]),        
        'num_leaves': trial.suggest_int('num_leaves', 20, 1000, step=20),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        # борьба с переобучением
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 100),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 100),
        'min_split_gain': trial.suggest_int('min_split_gain', 0, 20),
        # доля объектов при обучении в дереве
        'subsample': trial.suggest_float('bagging_fraction', 0.2, 1.0),
        'subsample_freq': trial.suggest_categorical('bagging_freq', [1]),
        # доля признаков при обучении в дереве
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        # константы
        'objective': trial.suggest_categorical('objective', ['mae']),
        'random_state': trial.suggest_categorical('random_state', [random_state])
    }

    cv = KFold(n_splits=N_FOLDS, shuffle=True)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
           trial, metric='l1')
        model = LGBMRegressor(**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric='mae',
                  callbacks=[pruning_callback],
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict(X_test)
        cv_predicts[idx] = mean_absolute_error(y_test, preds)

    return np.mean(cv_predicts)

In [66]:
study_lgbm_03 = optuna.create_study(direction='minimize', study_name='LGB_03')
func = lambda trial: objective_lgbm(
    trial, X_train_ct, y_train_ct, N_FOLDS=N_FOLDS, random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_lgbm_03.optimize(func, n_trials=5, show_progress_bar=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [98]:
# смотрим на параметры
study_lgbm_03.best_params

{'n_estimators': 2482,
 'learning_rate': 0.05380382705463601,
 'num_leaves': 700,
 'max_depth': 15,
 'reg_alpha': 10,
 'reg_lambda': 96,
 'min_split_gain': 1,
 'bagging_fraction': 0.9693332848337597,
 'bagging_freq': 1,
 'colsample_bytree': 0.5004742141096672,
 'objective': 'mae',
 'random_state': 10}

In [99]:
pred_val = []
    
folds = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)

for fold, (train_index, test_index) in enumerate(folds.split(X_train_ct, y_train_ct)):
    X_train_, X_val = X_train_ct.iloc[train_index], X_train_ct.iloc[test_index]
    y_train_, y_val = y_train_ct.iloc[train_index], y_train_ct.iloc[test_index]
    
    y_val_exp = np.exp(y_val) - 1

    model = LGBMRegressor(**study_lgbm_03.best_params)

    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              eval_metric='mae',
              early_stopping_rounds=100,
              verbose=0)

    y_pred_val = model.predict(X_val)
    y_pred_val_exp = np.exp(y_pred_val) - 1

    print('Fold:', fold + 1,
          'MAE SCORE %.3f' % mean_absolute_error(y_val_exp, y_pred_val_exp))
    print('---')

    pred_val.append(y_pred_val_exp)
    
model.fit(X_ct,
          y_ct,
          eval_set=eval_ct,
          eval_metric='mae',
          early_stopping_rounds=100,
          verbose=0)

meta_X['lgbm_03'] = np.concatenate(pred_val)
meta_X_test['lgbm_03'] = np.exp(model.predict(X_test_ct)) - 1

Fold: 1 MAE SCORE 6.236
---
Fold: 2 MAE SCORE 6.194
---
Fold: 3 MAE SCORE 6.215
---
Fold: 4 MAE SCORE 6.306
---


## CatBoost from tunining

In [100]:
study_cat.best_params

{'n_estimators': 1148,
 'learning_rate': 0.03993043117456255,
 'max_depth': 8,
 'colsample_bylevel': 0.7825642278921418,
 'l2_leaf_reg': 11.435953759453188,
 'random_strength': 33.22518563290565,
 'bootstrap_type': 'No',
 'border_count': 128,
 'grow_policy': 'Lossguide',
 'od_wait': 739,
 'leaf_estimation_iterations': 2,
 'loss_function': 'MAE',
 'use_best_model': True,
 'eval_metric': 'MAE',
 'random_state': 10}

In [101]:
pred_val = []
    
folds = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)

for fold, (train_index, test_index) in enumerate(folds.split(X_train_ct, y_train_ct)):
    X_train_, X_val = X_train_ct.iloc[train_index], X_train_ct.iloc[test_index]
    y_train_, y_val = y_train_ct.iloc[train_index], y_train_ct.iloc[test_index]
    
    y_val_exp = np.exp(y_val) - 1

    model = CatBoostRegressor(**study_cat.best_params)

    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              cat_features=cat_features,
              early_stopping_rounds=100,
              verbose=0)

    y_pred_val = model.predict(X_val)
    y_pred_val_exp = np.exp(y_pred_val) - 1

    print('Fold:', fold + 1,
          'MAE SCORE %.3f' % mean_absolute_error(y_val_exp, y_pred_val_exp))
    print('---')

    pred_val.append(y_pred_val_exp)
    
model.fit(X_ct,
          y_ct,
          cat_features=cat_features,
          eval_set=eval_ct,
          verbose=False,
          early_stopping_rounds=100)

meta_X['cb_tune'] = np.concatenate(pred_val)
meta_X_test['cb_tune'] = np.exp(model.predict(X_test_ct)) - 1

Fold: 1 MAE SCORE 6.277
---
Fold: 2 MAE SCORE 6.248
---
Fold: 3 MAE SCORE 6.266
---
Fold: 4 MAE SCORE 6.349
---


In [102]:
meta_X[:5]

Unnamed: 0,lgbm_01,lgbm_02,lgbm_03,cb_tune
0,24.718809,24.115703,22.475911,22.874065
1,20.907353,19.620355,19.83316,20.026198
2,18.943912,19.535152,18.498824,18.630465
3,20.065007,19.76857,18.409541,17.096548
4,9.417453,9.867613,9.883602,8.729491


In [103]:
meta_X_test[:5]

Unnamed: 0,lgbm_01,lgbm_02,lgbm_03,cb_tune
0,8.245693,7.571461,7.665905,7.064705
1,68.537094,69.968293,67.352959,73.142172
2,9.382781,11.658344,11.257861,12.236397
3,17.425789,17.956439,16.970769,18.792646
4,25.455602,24.24346,24.436538,33.370898


## Final meta model

In [104]:
stack_model = LinearRegression()
stack_model.fit(meta_X, y_train_ct)

In [105]:
y_pred = stack_model.predict(meta_X_test)
y_pred_exp = np.exp(y_pred) - 1

metrics = metrics.append(
    get_metrics_regression(y_test_ct_exp, y_pred_exp, X_test_ct,
                           name='Stacking_hand_tune'))
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted,MPE_%,MAPE_%,WAPE_%
0,LinearRegression_baseline,8.347072,156.85872,12.524325,0.455343,0.451267,-inf,inf,34.324438
0,DecisonTreeRegressor_baseline,10.133774,228.179589,15.105614,0.607437,0.201768,-inf,inf,41.671629
0,RandomForestRegressor_baseline,8.258138,176.019952,13.267251,0.476677,0.384236,-inf,inf,33.958726
0,XGBoost_baseline,7.173608,121.679612,11.030848,0.398242,0.574333,-inf,inf,29.498972
0,LightGBM_baseline,7.120787,122.711309,11.077514,0.393043,0.573203,-inf,inf,29.304653
0,CatBoost_baseline,6.453979,99.260909,9.962977,0.359331,0.654765,-inf,inf,26.560497
0,LightGBM_Optuna,6.44347,102.503548,10.124404,0.364783,0.643487,-inf,inf,26.517247
0,CatBoost_Optuna,6.445009,97.164593,9.85721,0.366083,0.662056,-inf,inf,26.523582
0,Stacking_hand_tune,12.107403,310.438572,17.619267,0.653396,-0.079723,-inf,inf,49.82641


In [106]:
check_overfitting(stack_model,
                  meta_X,
                  y_train_ct_exp,
                  meta_X_test,
                  y_test_ct_exp)

Mean absolute error train: 12.033
Mean absolute error test: 12.107
delta = 0.6 %


- MAE заметно ухудшилась
- однако переобучение минимальное, что особенно важно для градиентных бустингов

In [108]:
# сохраним итоговую таблицу с метриками
metrics.to_csv(r'C:\Users\main6\OneDrive\Документы\jupyter\Pet_pro\data\final_metrics.csv',
               index=False)

In [110]:
# сохраним переменные для анализа важных признаков в конце
with open (r'C:\Users\main6\OneDrive\Документы\jupyter\Pet_pro\models\test_ct.pkl', 
           'wb') as f:
    pickle.dump((X_test_ct, y_test_ct), f)

with open(r'C:\Users\main6\OneDrive\Документы\jupyter\Pet_pro\models\models.pkl',
          'wb') as f:
    pickle.dump((lgbm_opt, cb_opt), f)

#### Общие выводы:

- для поиска наилучших гиперпараметров была использована библиотека Optunf
- в результате настройки были получены очень низкие показатели MAE
- стэкинг позваляет не переобучаться
- лучший результат на LightGBM