In [82]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import os

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import missingno as msno
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
import shap
from typing import List, Dict

import pickle

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from scipy.stats import gmean, rankdata

In [2]:
pd.options.display.max_rows = 222
pd.set_option("display.max_columns", 50)

In [52]:
path_applications_history = "../geekbrains-competitive-data-analysis/applications_history.csv"
path_bki = "../geekbrains-competitive-data-analysis/bki.csv"
path_payments = "../geekbrains-competitive-data-analysis/payments.csv"
path_client_profile = "../geekbrains-competitive-data-analysis/client_profile.csv"
path_train = "../geekbrains-competitive-data-analysis/train.csv"
path_test = "../geekbrains-competitive-data-analysis/test.csv"

TARGET = 'TARGET'

In [4]:
def create_freq_feature(data: pd.DataFrame,
                        feature: str
                       ) -> pd.DataFrame:
    freq = data[feature].value_counts()
    data[feature] = data[feature].map(freq).astype('float')
    data[feature] = data[feature].fillna(0.0).astype('float')
    data[feature] = data[feature] / data.shape[0]
    
    return data

In [5]:
# applications_history

def preprocessing_applications_history(data: pd.DataFrame, 
                                       copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    # ---NAME_CONTRACT_TYPE---
    data['NAME_CONTRACT_TYPE'] = (data['NAME_CONTRACT_TYPE'] == 'Cash').astype(int)
    
    # ---AMOUNT_ANNUITY---
    # ---AMOUNT_GOODS_PAYMENT---
    data = data.fillna(value=
                       {'AMOUNT_ANNUITY': 0, 
                        'AMOUNT_GOODS_PAYMENT': 0})
    
    # ---AMOUNT_PAYMENT---
    data.drop(['AMOUNT_PAYMENT'], axis='columns', inplace=True)
    
    # ---NAME_TYPE_SUITE---
    data = create_freq_feature(data, 'NAME_TYPE_SUITE')
    
    # ---NAME_CONTRACT_STATUS---
    freq = pd.Series(data=[1, 0, 0.5, 0.5], index=['Approved', 'Canceled', 'Refused', 'Unused offer'], dtype='float')
    data['NAME_CONTRACT_STATUS'] = data['NAME_CONTRACT_STATUS'].map(freq).astype('float')
    
    # ---NAME_PAYMENT_TYPE---
    data = create_freq_feature(data, 'NAME_PAYMENT_TYPE')
    
    # ---CODE_REJECT_REASON---
    data = create_freq_feature(data, 'CODE_REJECT_REASON')
    
    # ---NAME_CLIENT_TYPE---
    data = create_freq_feature(data, 'NAME_CLIENT_TYPE')
    
    # ---NAME_GOODS_CATEGORY---
    data = create_freq_feature(data, 'NAME_GOODS_CATEGORY')
    
    # ---NAME_PORTFOLIO---
    # ---NAME_PRODUCT_TYPE---
    # ---NAME_YIELD_GROUP---
    data = create_freq_feature(data, 'NAME_PORTFOLIO')
    data = create_freq_feature(data, 'NAME_PRODUCT_TYPE')
    data = create_freq_feature(data, 'NAME_YIELD_GROUP')

    # ---another---
    data = data.fillna(0)
    
    data = pd.pivot_table(data,  
                          index=['APPLICATION_NUMBER'], 
                          aggfunc=np.max)
    
    return data

In [6]:
# bki

def preprocessing_bki(data: pd.DataFrame, 
                      copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    # ---CREDIT_ACTIVE---
    # ---CREDIT_CURRENCY---
    # ---CREDIT_TYPE---
    data = create_freq_feature(data, 'CREDIT_ACTIVE')
    data = create_freq_feature(data, 'CREDIT_CURRENCY')
    data = create_freq_feature(data, 'CREDIT_TYPE')
    
    # ---DAYS_CREDIT_ENDDATE---
    mean = data.loc[data['DAYS_CREDIT_ENDDATE'].notnull(), ['DAYS_CREDIT_ENDDATE']].mean()
    data['DAYS_CREDIT_ENDDATE'] = data['DAYS_CREDIT_ENDDATE'].fillna(mean[0])
    
    # ---DAYS_ENDDATE_FACT---
    data.loc[data['DAYS_ENDDATE_FACT'].isnull(), ['DAYS_ENDDATE_FACT']] = data['DAYS_CREDIT_ENDDATE']
    
    # ---AMT_CREDIT_MAX_OVERDUE---
    data['AMT_CREDIT_MAX_OVERDUE'] = data['AMT_CREDIT_MAX_OVERDUE'].fillna(0)
    
    # ---another---
    data.drop(['AMT_ANNUITY'], axis='columns', inplace=True)
    data = data.fillna(0)
    
    data = pd.pivot_table(data,  
                          index=['APPLICATION_NUMBER'], 
                          aggfunc=np.max)
    
    return data

In [24]:
# payments

def preprocessing_payments(data: pd.DataFrame, 
                           copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
        
    data.drop('PREV_APPLICATION_NUMBER', axis='columns', inplace=True)
    
    # ---another---
    data = data.fillna(0)    
    
    data = pd.pivot_table(data,  
                          index=['APPLICATION_NUMBER'], 
                          aggfunc=np.median)
    
    return data

In [28]:
# client_profile

def preprocessing_client_profile(data: pd.DataFrame, 
                                 copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    # ---GENDER---
    data['GENDER'] = (data['GENDER'] == 'F').astype(int)
    
    # ---FAMILY_STATUS---
    # ---EDUCATION_LEVEL---
    data = create_freq_feature(data, 'FAMILY_STATUS')
    data = create_freq_feature(data, 'EDUCATION_LEVEL')
    
    
    # ---OWN_CAR_AGE---
    # ---EXTERNAL_SCORING_RATING_1---
    # ---EXTERNAL_SCORING_RATING_3---
    data = data.fillna(value=
                       {'OWN_CAR_AGE': 0, 
                        'EXTERNAL_SCORING_RATING_1': data['EXTERNAL_SCORING_RATING_1'].min(),
                        'EXTERNAL_SCORING_RATING_3': data['EXTERNAL_SCORING_RATING_3'].min()
                       })   

    # ---another---
    data = data.fillna(0)
    
    return data

In [29]:
# train
# test

def preprocessing_train(data: pd.DataFrame, 
                        copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    # ---NAME_CONTRACT_TYPE---
    data['NAME_CONTRACT_TYPE'] = (data['NAME_CONTRACT_TYPE'] == 'Cash').astype(int)
    
    return data

In [31]:
def data_merege(data: pd.DataFrame, 
                copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    data = data.merge(df_client_profile, 
                      left_on='APPLICATION_NUMBER', 
                      right_on='APPLICATION_NUMBER', 
                      how='left')
    
    data = data.merge(df_applications_history, 
                      left_on='APPLICATION_NUMBER', 
                      right_on='APPLICATION_NUMBER', 
                      how='left')
    
    data = data.merge(df_bki, 
                      left_on='APPLICATION_NUMBER', 
                      right_on='APPLICATION_NUMBER', 
                      how='left')
    
    data = data.merge(df_payments, 
                      left_on='APPLICATION_NUMBER', 
                      right_on='APPLICATION_NUMBER', 
                      how='left')
    
    return data

In [34]:
#read data

df_client_profile = pd.read_csv(path_client_profile)
df_applications_history = pd.read_csv(path_applications_history)
df_bki = pd.read_csv(path_bki)
df_payments = pd.read_csv(path_payments)
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

df_client_profile = preprocessing_client_profile(df_client_profile)
df_applications_history = preprocessing_applications_history(df_applications_history)
df_bki = preprocessing_bki(df_bki)
df_payments = preprocessing_payments(df_payments)

df_train = preprocessing_train(df_train)
df_test = preprocessing_train(df_test)

df_train = data_merege(df_train)
df_test = data_merege(df_test)

In [85]:
def selection_best_feature(data: pd.DataFrame,
                           path: str = 'best_feature.str',
                           plot_bar: bool = False,
                           plot_dot: bool = False) -> List:
    
    if os.path.exists(path):
        print('load save feature')
        with open(path, 'rb') as f:
            return pickle.load(f)
        
    best_feature = list()    
    
    FEATURE = data.columns
    FEATURE = FEATURE.drop(TARGET)
    FEATURE = FEATURE.drop('APPLICATION_NUMBER')
    
    df_train, df_valid = train_test_split(data, 
                                      test_size=0.3,  
                                      random_state=42)
    
    dtrain = xgb.DMatrix(data=data[FEATURE], 
                         label=data[TARGET])
    print('*')

    params = {
        "booster": "gbtree",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.01,
        "reg_lambda": 100,
        "max_depth": 10,
        "gamma": 10,
        "nthread": 6,
        "seed": 27
    }

    model_xgb = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=500,
        early_stopping_rounds=50,
        evals=[(dtrain, "train")],
        verbose_eval=200,
        maximize=True,
    )
    
    x_valid_ = df_valid.sample(5000)
    y_valid_ =  x_valid_[TARGET]
    x_valid_ =  x_valid_[FEATURE]
    explainer = shap.TreeExplainer(model_xgb)
    shap_values = explainer.shap_values(x_valid_, y_valid_)
    
    mean = list()

    for j in range(shap_values.shape[1]):
        bufer = 0
        for i in range(shap_values.shape[0]):
            bufer += np.abs(shap_values[i][j])
        bufer /= shap_values.shape[0]
        mean.append(bufer)

    mean = np.array(mean).reshape(-1, shap_values.shape[1])
    
    if plot_bar:
        shap.summary_plot(shap_values, x_valid_, plot_type="bar", max_display=150)
    
    if plot_dot:
        shap.summary_plot(shap_values, x_valid_, plot_type="dot")

    feature_importances = pd.DataFrame(mean, columns=x_valid_.columns, index=['importances'])
    feature_importances = feature_importances.sort_values('importances', axis=1, ascending=0)
    max_num_feature = feature_importances.loc['importances', feature_importances.loc['importances',:]>0].shape[0]
    best_feature = feature_importances.iloc[: ,0:max_num_feature].columns
    
    with open(path, 'wb') as f:
        pickle.dump(best_feature, f)
    
    return best_feature

In [87]:
best_feature = selection_best_feature(df_train)

*
[0]	train-auc:0.54260
Will train until train-auc hasn't improved in 50 rounds.
[200]	train-auc:0.69313
[400]	train-auc:0.71976
[499]	train-auc:0.72493


In [81]:
def search_best_parameters_xgb(data: pd.DataFrame,
                               path: str = 'best_parameters_xgb.prm'
                              ) -> Dict:
    
    if os.path.exists(path):
        print('load save parameters')
        with open(path, 'rb') as f:
            return pickle.load(f)
    
    df_train, df_valid = train_test_split(data, 
                                          test_size=0.3,  
                                          random_state=42)
    df_test, df_valid = train_test_split(df_valid, 
                                         test_size=0.5,  
                                         random_state=42)

    dtrain = xgb.DMatrix(data=df_train[best_feature], 
                         label=df_train[TARGET])
    dvalid = xgb.DMatrix(data=df_valid[best_feature], 
                         label=df_valid[TARGET])
    dtest = xgb.DMatrix(data=df_test[best_feature])

    def hyperopt_xgb_score(params):
        model_xgb = xgb.train(
                              params=params,
                              dtrain=dtrain,
                              num_boost_round=1000,
                              early_stopping_rounds=50,
                              evals=[(dvalid, "valid")],
                              verbose_eval=200,
                              maximize=True,                                     
                             )
        
        current_score = roc_auc_score(df_test[TARGET], model_xgb.predict(dtest))
#         print(current_score, params)
        return -current_score


    parameters_xgb = {
                      #'n_estimators': hp.choice('n_estimators', range(100, 1000, 100)),
                      'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
                      'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
                      'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
                      'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
                      'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
                      'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
                      'eval_metric': 'auc',
                      'objective': 'binary:logistic',
                      'nthread': 4,
                      'booster': 'gbtree',
                      'tree_method': 'exact'
                     }

    best = fmin(fn=hyperopt_xgb_score, space=parameters_xgb, algo=tpe.suggest, max_evals=10)
#     print('best:')
#     print(best)
    
    with open(path, 'wb') as f:
        pickle.dump(best, f)
    
    return best
    
best_parameters_xgb = search_best_parameters_xgb(df_train)
best_parameters_xgb

load save parameters


{'colsample_bytree': 0.7000000000000001,
 'eta': 0.1,
 'gamma': 0.9,
 'max_depth': 7,
 'min_child_weight': 3.0,
 'subsample': 1.0}

In [102]:
def search_best_parameters_lgb(data: pd.DataFrame,
                               path: str = 'best_parameters_lgb.prm'
                              ) -> Dict:
    
    if os.path.exists(path):
        print('load save parameters')
        with open(path, 'rb') as f:
            return pickle.load(f)
    
    df_train, df_valid = train_test_split(data, 
                                          test_size=0.3,  
                                          random_state=42)
    df_test, df_valid = train_test_split(df_valid, 
                                         test_size=0.5,  
                                         random_state=42)

    dtrain = lgb.Dataset(data=df_train[best_feature], 
                     label=df_train[TARGET])
    dvalid = lgb.Dataset(data=df_valid[best_feature], 
                     label=df_valid[TARGET])

    def hyperopt_lgb_score(params):

        model_lgb = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dvalid],
            early_stopping_rounds=50,
            verbose_eval=100
        )
        
        current_score = roc_auc_score(df_test[TARGET], model_lgb.predict(df_test[best_feature]))
#         print(current_score, params)
        return -current_score


    parameters_lgb = {
    'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    'subsample':        hp.uniform('subsample', 0.8, 1),
    'n_estimators':     10000,
    "metric":           "auc",
    }        

    best = fmin(fn=hyperopt_lgb_score, space=parameters_lgb, algo=tpe.suggest, max_evals=10)
#     print('best:')
#     print(best)
    
    with open(path, 'wb') as f:
        pickle.dump(best, f)
    
    return best
    
best_parameters_lgb = search_best_parameters_lgb(df_train)
best_parameters_lgb

load save parameters


{'colsample_bytree': 3,
 'learning_rate': 1,
 'max_depth': 4,
 'min_child_weight': 5,
 'subsample': 0.8997533993881798}

In [108]:
def search_best_parameters_cb(data: pd.DataFrame,
                               path: str = 'best_parameters_cb.prm'
                              ) -> Dict:
    
    if os.path.exists(path):
        print('load save parameters')
        with open(path, 'rb') as f:
            return pickle.load(f)
    
    df_train, df_valid = train_test_split(data, 
                                          test_size=0.3,  
                                          random_state=42)
    df_test, df_valid = train_test_split(df_valid, 
                                         test_size=0.5,  
                                         random_state=42)

    def hyperopt_cb_score(params):   


        model_cb = cb.CatBoostClassifier(**params)
        model_cb.fit(df_train[best_feature], 
                     df_train[TARGET], 
                     eval_set=[(df_train[best_feature], df_train[TARGET]), 
                               (df_valid[best_feature], df_valid[TARGET])]
                     )
        
        
        current_score = roc_auc_score(df_test[TARGET], model_cb.predict_proba(df_test[best_feature])[:, 1])
#         print(current_score, params)
        return -current_score


    parameters_cb = {
    'learning_rate':     hp.choice('learning_rate',     np.arange(0.05, 0.31, 0.05)),
    'max_depth':         hp.choice('max_depth',         np.arange(5, 16, 1, dtype=int)),
    'colsample_bylevel': hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
    'n_estimators':      100,
    'eval_metric':       'AUC',
    'loss_function':     'Logloss',
    'early_stopping_rounds':         50, 
    }   

    best = fmin(fn=hyperopt_cb_score, space=parameters_cb, algo=tpe.suggest, max_evals=10)
#     print('best:')
#     print(best)
    
    with open(path, 'wb') as f:
        pickle.dump(best, f)
    
    return best
    
best_parameters_cb = search_best_parameters_cb(df_train)
best_parameters_cb

0:	test: 0.5882926	test1: 0.5504845	best: 0.5504845 (0)	total: 222ms	remaining: 22s

1:	test: 0.6780218	test1: 0.6524058	best: 0.6524058 (1)	total: 303ms	remaining: 14.9s

2:	test: 0.6856655	test1: 0.6556492	best: 0.6556492 (2)	total: 350ms	remaining: 11.3s

3:	test: 0.6925210	test1: 0.6543494	best: 0.6556492 (2)	total: 425ms	remaining: 10.2s

4:	test: 0.7117320	test1: 0.6800996	best: 0.6800996 (4)	total: 507ms	remaining: 9.63s

5:	test: 0.7139230	test1: 0.6794066	best: 0.6800996 (4)	total: 593ms	remaining: 9.29s

6:	test: 0.7214302	test1: 0.6898702	best: 0.6898702 (6)	total: 673ms	remaining: 8.94s

7:	test: 0.7224150	test1: 0.6930733	best: 0.6930733 (7)	total: 773ms	remaining: 8.89s

8:	test: 0.7298003	test1: 0.6986510	best: 0.6986510 (8)	total: 840ms	remaining: 8.49s

9:	test: 0.7343870	test1: 0.7047407	best: 0.7047407 (9)	total: 921ms	remaining: 8.28s

10:	test: 0.7348852	test1: 0.7053033	best: 0.7053033 (10)	total: 1.01s	remaining: 8.22s

11:	test: 0.7373467	test1: 0.7057935	best: 

21:	test: 0.7448667	test1: 0.7172576	best: 0.7172576 (21)	total: 1.09s	remaining: 3.85s

22:	test: 0.7453430	test1: 0.7166945	best: 0.7172576 (21)	total: 1.13s	remaining: 3.8s

23:	test: 0.7477100	test1: 0.7169205	best: 0.7172576 (21)	total: 1.18s	remaining: 3.73s

24:	test: 0.7485041	test1: 0.7178716	best: 0.7178716 (24)	total: 1.23s	remaining: 3.67s

25:	test: 0.7502692	test1: 0.7182561	best: 0.7182561 (25)	total: 1.27s	remaining: 3.6s

26:	test: 0.7518114	test1: 0.7177420	best: 0.7182561 (25)	total: 1.31s	remaining: 3.54s

27:	test: 0.7533421	test1: 0.7182703	best: 0.7182703 (27)	total: 1.35s	remaining: 3.48s

28:	test: 0.7543393	test1: 0.7173039	best: 0.7182703 (27)	total: 1.4s	remaining: 3.42s

29:	test: 0.7556469	test1: 0.7179700	best: 0.7182703 (27)	total: 1.44s	remaining: 3.35s

30:	test: 0.7567997	test1: 0.7189052	best: 0.7189052 (30)	total: 1.48s	remaining: 3.29s

31:	test: 0.7579840	test1: 0.7187188	best: 0.7189052 (30)	total: 1.52s	remaining: 3.23s

32:	test: 0.7593168	test

22:	test: 0.7338730	test1: 0.7144357	best: 0.7144357 (22)	total: 861ms	remaining: 2.88s

23:	test: 0.7354035	test1: 0.7152461	best: 0.7152461 (23)	total: 905ms	remaining: 2.86s

24:	test: 0.7366409	test1: 0.7153095	best: 0.7153095 (24)	total: 949ms	remaining: 2.85s

25:	test: 0.7376320	test1: 0.7156556	best: 0.7156556 (25)	total: 993ms	remaining: 2.83s

26:	test: 0.7396807	test1: 0.7151220	best: 0.7156556 (25)	total: 1.03s	remaining: 2.79s

27:	test: 0.7409692	test1: 0.7156106	best: 0.7156556 (25)	total: 1.07s	remaining: 2.75s

28:	test: 0.7427111	test1: 0.7153277	best: 0.7156556 (25)	total: 1.1s	remaining: 2.71s

29:	test: 0.7440901	test1: 0.7164997	best: 0.7164997 (29)	total: 1.14s	remaining: 2.67s

30:	test: 0.7455927	test1: 0.7163539	best: 0.7164997 (29)	total: 1.18s	remaining: 2.63s

31:	test: 0.7466210	test1: 0.7167920	best: 0.7167920 (31)	total: 1.22s	remaining: 2.59s

32:	test: 0.7473192	test1: 0.7177726	best: 0.7177726 (32)	total: 1.26s	remaining: 2.56s

33:	test: 0.7483791	te

20:	test: 0.7492267	test1: 0.7055440	best: 0.7055440 (20)	total: 2.77s	remaining: 10.4s

21:	test: 0.7516517	test1: 0.7055676	best: 0.7055676 (21)	total: 2.93s	remaining: 10.4s

22:	test: 0.7535864	test1: 0.7055848	best: 0.7055848 (22)	total: 3.08s	remaining: 10.3s

23:	test: 0.7543898	test1: 0.7069596	best: 0.7069596 (23)	total: 3.22s	remaining: 10.2s

24:	test: 0.7552876	test1: 0.7063313	best: 0.7069596 (23)	total: 3.37s	remaining: 10.1s

25:	test: 0.7540792	test1: 0.7063773	best: 0.7069596 (23)	total: 3.51s	remaining: 9.98s

26:	test: 0.7550097	test1: 0.7071907	best: 0.7071907 (26)	total: 3.64s	remaining: 9.85s

27:	test: 0.7549561	test1: 0.7077978	best: 0.7077978 (27)	total: 3.78s	remaining: 9.73s

28:	test: 0.7553717	test1: 0.7095480	best: 0.7095480 (28)	total: 3.91s	remaining: 9.58s

29:	test: 0.7557186	test1: 0.7106537	best: 0.7106537 (29)	total: 4.05s	remaining: 9.44s

30:	test: 0.7570621	test1: 0.7104515	best: 0.7106537 (29)	total: 4.19s	remaining: 9.32s

31:	test: 0.7581380	t

9:	test: 0.8735177	test1: 0.6832899	best: 0.6832899 (9)	total: 16.9s	remaining: 2m 31s

10:	test: 0.8896700	test1: 0.6871231	best: 0.6871231 (10)	total: 19s	remaining: 2m 34s

11:	test: 0.8978524	test1: 0.6871504	best: 0.6871504 (11)	total: 20.9s	remaining: 2m 32s

12:	test: 0.9096960	test1: 0.6853047	best: 0.6871504 (11)	total: 22.7s	remaining: 2m 32s

13:	test: 0.9220882	test1: 0.6837798	best: 0.6871504 (11)	total: 24.5s	remaining: 2m 30s

14:	test: 0.9333266	test1: 0.6824269	best: 0.6871504 (11)	total: 26.2s	remaining: 2m 28s

15:	test: 0.9379448	test1: 0.6827736	best: 0.6871504 (11)	total: 28.1s	remaining: 2m 27s

16:	test: 0.9439928	test1: 0.6826959	best: 0.6871504 (11)	total: 29.8s	remaining: 2m 25s

17:	test: 0.9476742	test1: 0.6843492	best: 0.6871504 (11)	total: 31.5s	remaining: 2m 23s

18:	test: 0.9512533	test1: 0.6837107	best: 0.6871504 (11)	total: 33.3s	remaining: 2m 21s

19:	test: 0.9563763	test1: 0.6826588	best: 0.6871504 (11)	total: 35.1s	remaining: 2m 20s

20:	test: 0.96

35:	test: 0.9827949	test1: 0.6735516	best: 0.6985743 (9)	total: 34.6s	remaining: 1m 1s

36:	test: 0.9845931	test1: 0.6722863	best: 0.6985743 (9)	total: 35.6s	remaining: 1m

37:	test: 0.9864889	test1: 0.6705872	best: 0.6985743 (9)	total: 36.6s	remaining: 59.7s

38:	test: 0.9876467	test1: 0.6694133	best: 0.6985743 (9)	total: 37.4s	remaining: 58.6s

39:	test: 0.9884920	test1: 0.6691364	best: 0.6985743 (9)	total: 38.4s	remaining: 57.6s

40:	test: 0.9891888	test1: 0.6675249	best: 0.6985743 (9)	total: 39.4s	remaining: 56.7s

41:	test: 0.9898950	test1: 0.6679771	best: 0.6985743 (9)	total: 40.4s	remaining: 55.8s

42:	test: 0.9911766	test1: 0.6663962	best: 0.6985743 (9)	total: 41.5s	remaining: 55s

43:	test: 0.9917383	test1: 0.6664210	best: 0.6985743 (9)	total: 42.4s	remaining: 54s

44:	test: 0.9921883	test1: 0.6670292	best: 0.6985743 (9)	total: 43.3s	remaining: 52.9s

45:	test: 0.9926302	test1: 0.6664883	best: 0.6985743 (9)	total: 44.2s	remaining: 51.9s

46:	test: 0.9930218	test1: 0.6668537	be

63:	test: 0.9966298	test1: 0.6788379	best: 0.6941990 (32)	total: 2m 9s	remaining: 1m 12s

64:	test: 0.9968430	test1: 0.6789481	best: 0.6941990 (32)	total: 2m 11s	remaining: 1m 10s

65:	test: 0.9971924	test1: 0.6785675	best: 0.6941990 (32)	total: 2m 13s	remaining: 1m 8s

66:	test: 0.9973396	test1: 0.6783561	best: 0.6941990 (32)	total: 2m 15s	remaining: 1m 6s

67:	test: 0.9976288	test1: 0.6777123	best: 0.6941990 (32)	total: 2m 17s	remaining: 1m 4s

68:	test: 0.9977040	test1: 0.6774575	best: 0.6941990 (32)	total: 2m 19s	remaining: 1m 2s

69:	test: 0.9977940	test1: 0.6776433	best: 0.6941990 (32)	total: 2m 21s	remaining: 1m

70:	test: 0.9978733	test1: 0.6781398	best: 0.6941990 (32)	total: 2m 24s	remaining: 58.9s

71:	test: 0.9980310	test1: 0.6784341	best: 0.6941990 (32)	total: 2m 26s	remaining: 56.9s

72:	test: 0.9981407	test1: 0.6781933	best: 0.6941990 (32)	total: 2m 28s	remaining: 54.8s

73:	test: 0.9982522	test1: 0.6779455	best: 0.6941990 (32)	total: 2m 30s	remaining: 52.8s

74:	test: 0.

68:	test: 0.7206751	test1: 0.7167345	best: 0.7167345 (68)	total: 2.85s	remaining: 1.28s

69:	test: 0.7208504	test1: 0.7166811	best: 0.7167345 (68)	total: 2.88s	remaining: 1.24s

70:	test: 0.7212428	test1: 0.7170834	best: 0.7170834 (70)	total: 2.92s	remaining: 1.19s

71:	test: 0.7213584	test1: 0.7172290	best: 0.7172290 (71)	total: 2.96s	remaining: 1.15s

72:	test: 0.7217064	test1: 0.7174566	best: 0.7174566 (72)	total: 3s	remaining: 1.11s

73:	test: 0.7221359	test1: 0.7174404	best: 0.7174566 (72)	total: 3.04s	remaining: 1.07s

74:	test: 0.7224146	test1: 0.7175589	best: 0.7175589 (74)	total: 3.08s	remaining: 1.03s

75:	test: 0.7224467	test1: 0.7175216	best: 0.7175589 (74)	total: 3.12s	remaining: 984ms

76:	test: 0.7227492	test1: 0.7175619	best: 0.7175619 (76)	total: 3.15s	remaining: 942ms

77:	test: 0.7231123	test1: 0.7175314	best: 0.7175619 (76)	total: 3.19s	remaining: 899ms

78:	test: 0.7234751	test1: 0.7176608	best: 0.7176608 (78)	total: 3.23s	remaining: 858ms

79:	test: 0.7239094	test

58:	test: 0.9445905	test1: 0.7154329	best: 0.7154329 (58)	total: 1m 38s	remaining: 1m 8s

59:	test: 0.9461067	test1: 0.7156658	best: 0.7156658 (59)	total: 1m 40s	remaining: 1m 6s

60:	test: 0.9477029	test1: 0.7157079	best: 0.7157079 (60)	total: 1m 41s	remaining: 1m 4s

61:	test: 0.9491335	test1: 0.7154735	best: 0.7157079 (60)	total: 1m 43s	remaining: 1m 3s

62:	test: 0.9507058	test1: 0.7156074	best: 0.7157079 (60)	total: 1m 44s	remaining: 1m 1s

63:	test: 0.9522425	test1: 0.7148013	best: 0.7157079 (60)	total: 1m 46s	remaining: 59.8s

64:	test: 0.9536477	test1: 0.7145892	best: 0.7157079 (60)	total: 1m 47s	remaining: 58.1s

65:	test: 0.9543868	test1: 0.7149529	best: 0.7157079 (60)	total: 1m 49s	remaining: 56.3s

66:	test: 0.9557482	test1: 0.7145646	best: 0.7157079 (60)	total: 1m 51s	remaining: 54.7s

67:	test: 0.9561619	test1: 0.7151847	best: 0.7157079 (60)	total: 1m 52s	remaining: 53.1s

68:	test: 0.9571860	test1: 0.7157110	best: 0.7157110 (68)	total: 1m 54s	remaining: 51.4s

69:	test: 

47:	test: 0.9482225	test1: 0.6763503	best: 0.7037942 (7)	total: 9.66s	remaining: 10.5s

48:	test: 0.9500901	test1: 0.6751744	best: 0.7037942 (7)	total: 9.87s	remaining: 10.3s

49:	test: 0.9515916	test1: 0.6755229	best: 0.7037942 (7)	total: 10.1s	remaining: 10.1s

50:	test: 0.9523709	test1: 0.6758862	best: 0.7037942 (7)	total: 10.3s	remaining: 9.85s

51:	test: 0.9530585	test1: 0.6757145	best: 0.7037942 (7)	total: 10.4s	remaining: 9.62s

52:	test: 0.9543880	test1: 0.6749274	best: 0.7037942 (7)	total: 10.6s	remaining: 9.43s

53:	test: 0.9553646	test1: 0.6738368	best: 0.7037942 (7)	total: 10.8s	remaining: 9.21s

54:	test: 0.9562686	test1: 0.6733464	best: 0.7037942 (7)	total: 11s	remaining: 8.98s

55:	test: 0.9578431	test1: 0.6738479	best: 0.7037942 (7)	total: 11.2s	remaining: 8.77s

56:	test: 0.9593635	test1: 0.6746560	best: 0.7037942 (7)	total: 11.4s	remaining: 8.58s

57:	test: 0.9599493	test1: 0.6740300	best: 0.7037942 (7)	total: 11.6s	remaining: 8.36s

Stopped by overfitting detector  (

{'colsample_bylevel': 3, 'learning_rate': 0, 'max_depth': 1}

In [113]:
def train_xgb(data: pd.DataFrame,
              params: Dict):
    df_train, df_valid = train_test_split(data, 
                                          test_size=0.33,  
                                          random_state=42)
    
    dtrain = xgb.DMatrix(data=df_train[best_feature], 
                         label=df_train[TARGET])
    dvalid = xgb.DMatrix(data=df_valid[best_feature], 
                         label=df_valid[TARGET])
    
    model_xgb = xgb.train(
                          params=params,
                          dtrain=dtrain,
                          num_boost_round=1000,
                          early_stopping_rounds=50,
                          evals=[(dvalid, "valid")],
                          verbose_eval=200,
                          maximize=True,                                     
                         )
    
    return model_xgb

model_xgb = train_xgb(df_train, best_parameters_xgb)

[0]	valid-rmse:0.46492
Will train until valid-rmse hasn't improved in 50 rounds.
Stopping. Best iteration:
[0]	valid-rmse:0.46492



In [115]:
def train_lgb(data: pd.DataFrame,
              params: Dict):
    df_train, df_valid = train_test_split(data, 
                                          test_size=0.33,  
                                          random_state=42)
    
    dtrain = lgb.Dataset(data=df_train[best_feature], 
                         label=df_train[TARGET])
    dvalid = lgb.Dataset(data=df_valid[best_feature], 
                         label=df_valid[TARGET])
    
    model_lgb = lgb.train(params=params,
                          train_set=dtrain,
                          num_boost_round=10000,
                          valid_sets=[dvalid],
                          early_stopping_rounds=50,
                          verbose_eval=100
                          )
    
    return model_lgb

model_lgb = train_lgb(df_train, best_parameters_lgb)

LightGBMError: Check failed: (feature_fraction) <= (1.0) at D:\a\1\s\python-package\compile\src\io\config_auto.cpp, line 359 .


In [116]:
def train_cb(data: pd.DataFrame,
              params: Dict):
    df_train, df_valid = train_test_split(data, 
                                          test_size=0.33,  
                                          random_state=42)
    
    dtrain = xgb.DMatrix(data=df_train[best_feature], 
                         label=df_train[TARGET])
    dvalid = xgb.DMatrix(data=df_valid[best_feature], 
                         label=df_valid[TARGET])
    
    model_cb = cb.CatBoostClassifier(**params)
    model_cb.fit(df_train[best_feature], 
                 df_train[TARGET], 
                 eval_set=[(df_train[best_feature], df_train[TARGET]), 
                           (df_valid[best_feature], df_valid[TARGET])]
                )
    
    return model_cb

model_cb = train_cb(df_train, best_parameters_cb)

CatBoostError: c:/program files (x86)/go agent/pipelines/buildmaster/catboost.git/catboost/private/libs/options/boosting_options.cpp:79: Learning rate should be non-zero

In [117]:
best_parameters_cb

{'colsample_bylevel': 3, 'learning_rate': 0, 'max_depth': 1}