In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import os

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import missingno as msno
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold, StratifiedKFold
import shap
from typing import List, Dict

import pickle

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from scipy.stats import gmean, rankdata

In [2]:
pd.options.display.max_rows = 222
pd.set_option("display.max_columns", 50)

In [3]:
path_applications_history = "../geekbrains-competitive-data-analysis/applications_history.csv"
path_bki = "../geekbrains-competitive-data-analysis/bki.csv"
path_payments = "../geekbrains-competitive-data-analysis/payments.csv"
path_client_profile = "../geekbrains-competitive-data-analysis/client_profile.csv"
path_train = "../geekbrains-competitive-data-analysis/train.csv"
path_test = "../geekbrains-competitive-data-analysis/test.csv"

TARGET = 'TARGET'

In [4]:
def create_freq_feature(data: pd.DataFrame,
                        feature: str
                       ) -> pd.DataFrame:
    freq = data[feature].value_counts()
    data[feature] = data[feature].map(freq).astype('float')
    data[feature] = data[feature].fillna(0.0).astype('float')
    data[feature] = data[feature] / data.shape[0]
    
    return data

In [5]:
# applications_history

def preprocessing_applications_history(data: pd.DataFrame, 
                                       copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    # ---NAME_CONTRACT_TYPE---
    data['NAME_CONTRACT_TYPE'] = (data['NAME_CONTRACT_TYPE'] == 'Cash').astype(int)
    
    # ---AMOUNT_ANNUITY---
    # ---AMOUNT_GOODS_PAYMENT---
    data = data.fillna(value=
                       {'AMOUNT_ANNUITY': 0, 
                        'AMOUNT_GOODS_PAYMENT': 0})
    
    # ---AMOUNT_PAYMENT---
    data.drop(['AMOUNT_PAYMENT'], axis='columns', inplace=True)
    
    # ---NAME_TYPE_SUITE---
    data = create_freq_feature(data, 'NAME_TYPE_SUITE')
    
    # ---NAME_CONTRACT_STATUS---
    freq = pd.Series(data=[1, 0, 0.5, 0.5], index=['Approved', 'Canceled', 'Refused', 'Unused offer'], dtype='float')
    data['NAME_CONTRACT_STATUS'] = data['NAME_CONTRACT_STATUS'].map(freq).astype('float')
    
    # ---NAME_PAYMENT_TYPE---
    data = create_freq_feature(data, 'NAME_PAYMENT_TYPE')
    
    # ---CODE_REJECT_REASON---
    data = create_freq_feature(data, 'CODE_REJECT_REASON')
    
    # ---NAME_CLIENT_TYPE---
    data = create_freq_feature(data, 'NAME_CLIENT_TYPE')
    
    # ---NAME_GOODS_CATEGORY---
    data = create_freq_feature(data, 'NAME_GOODS_CATEGORY')
    
    # ---NAME_PORTFOLIO---
    # ---NAME_PRODUCT_TYPE---
    # ---NAME_YIELD_GROUP---
    data = create_freq_feature(data, 'NAME_PORTFOLIO')
    data = create_freq_feature(data, 'NAME_PRODUCT_TYPE')
    data = create_freq_feature(data, 'NAME_YIELD_GROUP')

    # ---another---
    data = data.fillna(0)
    
    data = pd.pivot_table(data,  
                          index=['APPLICATION_NUMBER'], 
                          aggfunc=np.max)
    
    return data

In [6]:
# bki

def preprocessing_bki(data: pd.DataFrame, 
                      copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    # ---CREDIT_ACTIVE---
    # ---CREDIT_CURRENCY---
    # ---CREDIT_TYPE---
    data = create_freq_feature(data, 'CREDIT_ACTIVE')
    data = create_freq_feature(data, 'CREDIT_CURRENCY')
    data = create_freq_feature(data, 'CREDIT_TYPE')
    
    # ---DAYS_CREDIT_ENDDATE---
    mean = data.loc[data['DAYS_CREDIT_ENDDATE'].notnull(), ['DAYS_CREDIT_ENDDATE']].mean()
    data['DAYS_CREDIT_ENDDATE'] = data['DAYS_CREDIT_ENDDATE'].fillna(mean[0])
    
    # ---DAYS_ENDDATE_FACT---
    data.loc[data['DAYS_ENDDATE_FACT'].isnull(), ['DAYS_ENDDATE_FACT']] = data['DAYS_CREDIT_ENDDATE']
    
    # ---AMT_CREDIT_MAX_OVERDUE---
    data['AMT_CREDIT_MAX_OVERDUE'] = data['AMT_CREDIT_MAX_OVERDUE'].fillna(0)
    
    # ---another---
    data.drop(['AMT_ANNUITY'], axis='columns', inplace=True)
    data = data.fillna(0)
    
    data = pd.pivot_table(data,  
                          index=['APPLICATION_NUMBER'], 
                          aggfunc=np.max)
    
    return data

In [7]:
# payments

def preprocessing_payments(data: pd.DataFrame, 
                           copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
        
    data.drop('PREV_APPLICATION_NUMBER', axis='columns', inplace=True)
    
    # ---another---
    data = data.fillna(0)    
    
    data = pd.pivot_table(data,  
                          index=['APPLICATION_NUMBER'], 
                          aggfunc=np.median)
    
    return data

In [8]:
# client_profile

def preprocessing_client_profile(data: pd.DataFrame, 
                                 copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    # ---GENDER---
    data['GENDER'] = (data['GENDER'] == 'F').astype(int)
    
    # ---FAMILY_STATUS---
    # ---EDUCATION_LEVEL---
    data = create_freq_feature(data, 'FAMILY_STATUS')
    data = create_freq_feature(data, 'EDUCATION_LEVEL')
    
    
    # ---OWN_CAR_AGE---
    # ---EXTERNAL_SCORING_RATING_1---
    # ---EXTERNAL_SCORING_RATING_3---
    data = data.fillna(value=
                       {'OWN_CAR_AGE': 0, 
                        'EXTERNAL_SCORING_RATING_1': data['EXTERNAL_SCORING_RATING_1'].min(),
                        'EXTERNAL_SCORING_RATING_3': data['EXTERNAL_SCORING_RATING_3'].min()
                       })   

    # ---another---
    data = data.fillna(0)
    
    data['ratio_credit_to_annuity'] = data['AMOUNT_CREDIT'] / data['AMOUNT_ANNUITY']#
    data["ratio_annuity_to_salary"] = data['AMOUNT_ANNUITY'] / data['TOTAL_SALARY']
    data['ratio_credit_to_salary'] = data['AMOUNT_CREDIT'] / data['TOTAL_SALARY']#
    data["ratio_annuity_to_age"] = data["AMOUNT_ANNUITY"] / data["AGE"]
    data["ratio_credit_to_age"] = data["AMOUNT_CREDIT"] / data["AGE"]
    data["ratio_salary_to_age"] = data["TOTAL_SALARY"] / data["AGE"]
    data["ratio_salary_to_experience"] = data["TOTAL_SALARY"] / data["DAYS_ON_LAST_JOB"]#
    data["ratio_credit_to_experience"] = data["AMOUNT_CREDIT"] / data["DAYS_ON_LAST_JOB"]
    data["ratio_annuity_to_experience"] = data["AMOUNT_ANNUITY"] / data["DAYS_ON_LAST_JOB"]#
    data["ratio_age_to_experience"] = data["AGE"] / data["DAYS_ON_LAST_JOB"]#
    data["ratio_salary_to_region_population"] = data["TOTAL_SALARY"] * data["REGION_POPULATION"]#
    data["ratio_car_to_experience"] = data["OWN_CAR_AGE"] / data["DAYS_ON_LAST_JOB"]
    data["ratio_car_to_age"] = data["OWN_CAR_AGE"] / data["AGE"]
    data["expected_total_loss_1"] = data["EXTERNAL_SCORING_RATING_1"] * data["AMOUNT_CREDIT"]#
    data["expected_total_loss_2"] = data["EXTERNAL_SCORING_RATING_2"] * data["AMOUNT_CREDIT"]
    data["expected_total_loss_3"] = data["EXTERNAL_SCORING_RATING_3"] * data["AMOUNT_CREDIT"]#
    data["expected_monthly_loss_1"] = data["EXTERNAL_SCORING_RATING_1"] * data["AMOUNT_ANNUITY"]#
    data["expected_monthly_loss_2"] = data["EXTERNAL_SCORING_RATING_2"] * data["AMOUNT_ANNUITY"]#
    data["expected_monthly_loss_3"] = data["EXTERNAL_SCORING_RATING_3"] * data["AMOUNT_ANNUITY"]#
    
    return data

In [9]:
# train
# test

def preprocessing_train(data: pd.DataFrame, 
                        copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    # ---NAME_CONTRACT_TYPE---
    data['NAME_CONTRACT_TYPE'] = (data['NAME_CONTRACT_TYPE'] == 'Cash').astype(int)
    
    return data

In [10]:
def data_merege(data: pd.DataFrame, 
                copy: bool = True) -> pd.DataFrame:
    
    if copy:
        data = data.copy()
    
    data = data.merge(df_client_profile, 
                      left_on='APPLICATION_NUMBER', 
                      right_on='APPLICATION_NUMBER', 
                      how='left')
    
    data = data.merge(df_applications_history, 
                      left_on='APPLICATION_NUMBER', 
                      right_on='APPLICATION_NUMBER', 
                      how='left')
    
    data = data.merge(df_bki, 
                      left_on='APPLICATION_NUMBER', 
                      right_on='APPLICATION_NUMBER', 
                      how='left')
    
    data = data.merge(df_payments, 
                      left_on='APPLICATION_NUMBER', 
                      right_on='APPLICATION_NUMBER', 
                      how='left')
    
    return data

In [11]:
#read data

df_client_profile = pd.read_csv(path_client_profile)
df_applications_history = pd.read_csv(path_applications_history)
df_bki = pd.read_csv(path_bki)
df_payments = pd.read_csv(path_payments)
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

df_client_profile = preprocessing_client_profile(df_client_profile)
df_applications_history = preprocessing_applications_history(df_applications_history)
df_bki = preprocessing_bki(df_bki)
df_payments = preprocessing_payments(df_payments)

df_train = preprocessing_train(df_train)
df_test = preprocessing_train(df_test)

df_train = data_merege(df_train)
df_test = data_merege(df_test)

In [12]:
def selection_best_feature(data: pd.DataFrame,
                           path: str = 'best_feature.str',
                           plot_bar: bool = False,
                           plot_dot: bool = False) -> List:
    
    if os.path.exists(path):
        print('load save feature')
        with open(path, 'rb') as f:
            return pickle.load(f)
        
    best_feature = list()    
    
    FEATURE = data.columns
    FEATURE = FEATURE.drop(TARGET)
    FEATURE = FEATURE.drop('APPLICATION_NUMBER')
    
    df_train, df_valid = train_test_split(data, 
                                      test_size=0.3,  
                                      random_state=42)
    
    dtrain = xgb.DMatrix(data=data[FEATURE], 
                         label=data[TARGET])
    print('*')

    params = {
        "booster": "gbtree",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.01,
        "reg_lambda": 100,
        "max_depth": 10,
        "gamma": 10,
        "nthread": 6,
        "seed": 27
    }

    model_xgb = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=1000,
        early_stopping_rounds=50,
        evals=[(dtrain, "train")],
        verbose_eval=200,
        maximize=True,
    )
    
    x_valid_ = df_valid.sample(5000)
    y_valid_ =  x_valid_[TARGET]
    x_valid_ =  x_valid_[FEATURE]
    explainer = shap.TreeExplainer(model_xgb)
    shap_values = explainer.shap_values(x_valid_, y_valid_)
    
    mean = list()

    for j in range(shap_values.shape[1]):
        bufer = 0
        for i in range(shap_values.shape[0]):
            bufer += np.abs(shap_values[i][j])
        bufer /= shap_values.shape[0]
        mean.append(bufer)

    mean = np.array(mean).reshape(-1, shap_values.shape[1])
    
    if plot_bar:
        shap.summary_plot(shap_values, x_valid_, plot_type="bar", max_display=150)
    
    if plot_dot:
        shap.summary_plot(shap_values, x_valid_, plot_type="dot")

    feature_importances = pd.DataFrame(mean, columns=x_valid_.columns, index=['importances'])
    feature_importances = feature_importances.sort_values('importances', axis=1, ascending=0)
    max_num_feature = feature_importances.loc['importances', feature_importances.loc['importances',:]>0].shape[0]
    best_feature = feature_importances.iloc[: ,0:max_num_feature].columns
    
    with open(path, 'wb') as f:
        pickle.dump(best_feature, f)
    
    return best_feature

In [13]:
best_feature = selection_best_feature(df_train)

*
[0]	train-auc:0.542597
Will train until train-auc hasn't improved in 50 rounds.
[200]	train-auc:0.694173
[400]	train-auc:0.725982
[600]	train-auc:0.738633
[800]	train-auc:0.742888
Stopping. Best iteration:
[754]	train-auc:0.742888



In [14]:
def train_xgb(data: pd.DataFrame,
              best_feature: Dict,
              TARGET: str,
#               params: Dict
             ):
    
    cv = KFold(n_splits=5, random_state=42, shuffle=True)
    
    estimators, folds_scores = [], []
    
    for i, (train_id, valid_id) in enumerate(cv.split(data[best_feature], data[TARGET])):
        
        x_train, x_valid = data.loc[train_id][best_feature], data.loc[valid_id][best_feature]
        y_train, y_valid = data.loc[train_id][TARGET], data.loc[valid_id][TARGET]      
        
        
        dtrain = xgb.DMatrix(data=x_train, label=y_train)
        dvalid = xgb.DMatrix(data=x_valid, label=y_valid)
        
        params = {
                     "booster": "gbtree",
                     "objective": "binary:logistic",
                     "eval_metric": "auc",
                     "learning_rate": 0.01,
                     "n_estimators": 5000,
                     "reg_lambda": 100,
                     "max_depth": 4,
                     "gamma": 10,
                     "nthread": 6,
                     "seed": 27
                 }
        
        model_xgb = xgb.train(
                              params=params,
                              dtrain=dtrain,
                              num_boost_round=5000,
                              early_stopping_rounds=100,
                              evals=[(dvalid, "valid")],
                              verbose_eval=200,
                              maximize=True,                                     
                             )
        
        pred = model_xgb.predict(dvalid)
        score = roc_auc_score(y_valid, pred)
        print(f"Fold {i+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model_xgb)
        
    
    return estimators, folds_scores

models_xgb, scores = train_xgb(df_train.fillna(-9999), best_feature, TARGET)

[0]	valid-auc:0.499852
Will train until valid-auc hasn't improved in 100 rounds.
[200]	valid-auc:0.698487
[400]	valid-auc:0.716772
[600]	valid-auc:0.727791
[800]	valid-auc:0.732652
Stopping. Best iteration:
[847]	valid-auc:0.733081

Fold 1, Valid score = 0.73306
[0]	valid-auc:0.587569
Will train until valid-auc hasn't improved in 100 rounds.
[200]	valid-auc:0.684492
[400]	valid-auc:0.704386
[600]	valid-auc:0.713172
[800]	valid-auc:0.71748
Stopping. Best iteration:
[819]	valid-auc:0.717653

Fold 2, Valid score = 0.71745
[0]	valid-auc:0.5
Will train until valid-auc hasn't improved in 100 rounds.
[200]	valid-auc:0.692458
[400]	valid-auc:0.70987
[600]	valid-auc:0.718604
[800]	valid-auc:0.722928
Stopping. Best iteration:
[794]	valid-auc:0.722928

Fold 3, Valid score = 0.72293
[0]	valid-auc:0.5
Will train until valid-auc hasn't improved in 100 rounds.
[200]	valid-auc:0.689351
[400]	valid-auc:0.705693
[600]	valid-auc:0.713636
[800]	valid-auc:0.718241
Stopping. Best iteration:
[767]	valid-auc:

In [15]:
df_result = pd.DataFrame({
    'APPLICATION_NUMBER': df_test['APPLICATION_NUMBER'],
    "xgb_model_1": models_xgb[0].predict(xgb.DMatrix(data=df_test[best_feature].fillna(-9999))),
    "xgb_model_2": models_xgb[1].predict(xgb.DMatrix(data=df_test[best_feature].fillna(-9999))),
    "xgb_model_3": models_xgb[2].predict(xgb.DMatrix(data=df_test[best_feature].fillna(-9999))),
    "xgb_model_4": models_xgb[3].predict(xgb.DMatrix(data=df_test[best_feature].fillna(-9999))),
    "xgb_model_5": models_xgb[4].predict(xgb.DMatrix(data=df_test[best_feature].fillna(-9999)))
})

df_result['xgb_result'] = df_result.loc[:, ['xgb_model_1',
                                            'xgb_model_2',
                                            'xgb_model_3',
                                            'xgb_model_4',
                                            'xgb_model_5']].mean(axis=1)

result = df_result.loc[:, ['APPLICATION_NUMBER', 'xgb_result']]
result.rename(columns={'xgb_result': TARGET}, inplace=True)
result.to_csv('xgb.csv', index=False)

In [16]:
def train_lgb(data: pd.DataFrame,
              best_feature: Dict,
              TARGET: str,
#               params: Dict
             ):
    
    cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    
    estimators, folds_scores = [], []
    
    for i, (train_id, valid_id) in enumerate(cv.split(data[best_feature], data[TARGET])):
        
        x_train, x_valid = data.loc[train_id][best_feature], data.loc[valid_id][best_feature]
        y_train, y_valid = data.loc[train_id][TARGET], data.loc[valid_id][TARGET]      
        
        dtrain = lgb.Dataset(data=x_train, label=y_train)
        dvalid = lgb.Dataset(data=x_valid, label=y_valid)
                
        params = {
                    "boosting_type": "goss",
                    "objective": "binary",
                    "metric": "auc",
                    "learning_rate": 0.001,
                    "n_jobs": 6,
                    "seed": 27
                 }
        
        model_lgb = lgb.train(
                              params=params,
                              train_set=dtrain,
                              num_boost_round=20000,
                              valid_sets=[dvalid],
                              categorical_feature="auto",
                              early_stopping_rounds=200,
                              verbose_eval=1000
                             )
                
        pred = model_lgb.predict(x_valid)
        score = roc_auc_score(y_valid, pred)
        print(f"Fold {i+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model_lgb)
        
    
    return estimators, folds_scores

models_lgb, scores = train_lgb(df_train.fillna(-9999), best_feature, TARGET)

Training until validation scores don't improve for 200 rounds
[1000]	valid_0's auc: 0.701143
[2000]	valid_0's auc: 0.712286
[3000]	valid_0's auc: 0.71758
[4000]	valid_0's auc: 0.719842
[5000]	valid_0's auc: 0.721159
[6000]	valid_0's auc: 0.721987
Early stopping, best iteration is:
[6705]	valid_0's auc: 0.722572
Fold 1, Valid score = 0.72257
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's auc: 0.710304
[2000]	valid_0's auc: 0.721978
[3000]	valid_0's auc: 0.726707
[4000]	valid_0's auc: 0.729224
[5000]	valid_0's auc: 0.730724
[6000]	valid_0's auc: 0.731732
[7000]	valid_0's auc: 0.732252
Early stopping, best iteration is:
[7149]	valid_0's auc: 0.732388
Fold 2, Valid score = 0.73239
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's auc: 0.71096
[2000]	valid_0's auc: 0.719188
[3000]	valid_0's auc: 0.723595
[4000]	valid_0's auc: 0.725846
[5000]	valid_0's auc: 0.727203
[6000]	valid_0's auc: 0.728064
[7000]	valid_0's auc: 0.728466
Earl

In [17]:
for i, model_lgb in enumerate(models_lgb):
    df_result[f'lgb_model_{i+1}'] = model_lgb.predict(df_test[best_feature].fillna(-9999))

df_result['lgb_result'] = df_result.loc[:, ['lgb_model_1',
                                            'lgb_model_2',
                                            'lgb_model_3',
                                            'lgb_model_4',
                                            'lgb_model_5']].mean(axis=1)

result = df_result.loc[:, ['APPLICATION_NUMBER', 'lgb_result']]
result.rename(columns={'lgb_result': TARGET}, inplace=True)
result.to_csv('lgb.csv', index=False)

In [18]:
def train_cb(data: pd.DataFrame,
              best_feature: Dict,
              TARGET: str,
#               params: Dict
             ):
    
    cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    
    estimators, folds_scores = [], []
    
    for i, (train_id, valid_id) in enumerate(cv.split(data[best_feature], data[TARGET])):
        
        x_train, x_valid = data.loc[train_id][best_feature], data.loc[valid_id][best_feature]
        y_train, y_valid = data.loc[train_id][TARGET], data.loc[valid_id][TARGET]      
        
        dtrain = lgb.Dataset(data=x_train, label=y_train)
        dvalid = lgb.Dataset(data=x_valid, label=y_valid)
                
        cb_params = {
                        "n_estimators": 800,
                        "loss_function": "Logloss",
                        "eval_metric": "AUC",
                        "task_type": "CPU",
                        "max_bin": 20,
                        "verbose": 20,
                        "max_depth": 6,
                        "l2_leaf_reg": 100,
                        "early_stopping_rounds": 200,
                        "thread_count": 6,
                        "random_seed": 42
                    }
        
        model_cb = cb.CatBoostClassifier(**cb_params)
        model_cb.fit(x_train, 
                     y_train, 
                     eval_set=[(x_valid, y_valid)]
            )
                
        pred = model_cb.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, pred)
        print(f"Fold {i+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model_cb)
        
    
    return estimators, folds_scores

models_cb, scores = train_cb(df_train.fillna(-9999), best_feature, TARGET)

0:	test: 0.5030399	best: 0.5030399 (0)	total: 104ms	remaining: 1m 22s
20:	test: 0.6867248	best: 0.6867248 (20)	total: 743ms	remaining: 27.6s
40:	test: 0.6974718	best: 0.6977373 (39)	total: 1.33s	remaining: 24.7s
60:	test: 0.7009818	best: 0.7012996 (53)	total: 1.92s	remaining: 23.2s
80:	test: 0.7062333	best: 0.7062333 (80)	total: 2.53s	remaining: 22.4s
100:	test: 0.7093031	best: 0.7094743 (99)	total: 3.32s	remaining: 23s
120:	test: 0.7118616	best: 0.7118616 (120)	total: 4.12s	remaining: 23.1s
140:	test: 0.7140499	best: 0.7140499 (140)	total: 4.72s	remaining: 22.1s
160:	test: 0.7159818	best: 0.7159818 (160)	total: 5.32s	remaining: 21.1s
180:	test: 0.7177635	best: 0.7177635 (180)	total: 5.91s	remaining: 20.2s
200:	test: 0.7190398	best: 0.7190398 (200)	total: 6.52s	remaining: 19.4s
220:	test: 0.7196823	best: 0.7196823 (220)	total: 7.08s	remaining: 18.5s
240:	test: 0.7203690	best: 0.7203690 (240)	total: 7.68s	remaining: 17.8s
260:	test: 0.7210376	best: 0.7210947 (256)	total: 8.25s	remaining

580:	test: 0.7237780	best: 0.7242672 (472)	total: 15.4s	remaining: 5.81s
600:	test: 0.7237609	best: 0.7242672 (472)	total: 15.9s	remaining: 5.28s
620:	test: 0.7239889	best: 0.7242672 (472)	total: 16.5s	remaining: 4.75s
640:	test: 0.7240920	best: 0.7242672 (472)	total: 17s	remaining: 4.22s
660:	test: 0.7242046	best: 0.7243095 (658)	total: 17.5s	remaining: 3.68s
680:	test: 0.7242396	best: 0.7243095 (658)	total: 18s	remaining: 3.15s
700:	test: 0.7239612	best: 0.7243095 (658)	total: 18.5s	remaining: 2.62s
720:	test: 0.7238550	best: 0.7243095 (658)	total: 19s	remaining: 2.09s
740:	test: 0.7240358	best: 0.7243095 (658)	total: 19.6s	remaining: 1.56s
760:	test: 0.7240680	best: 0.7243095 (658)	total: 20.1s	remaining: 1.03s
780:	test: 0.7240822	best: 0.7243095 (658)	total: 20.6s	remaining: 501ms
799:	test: 0.7239944	best: 0.7243095 (658)	total: 21.1s	remaining: 0us

bestTest = 0.7243095431
bestIteration = 658

Shrink model to first 659 iterations.
Fold 3, Valid score = 0.72431
0:	test: 0.5019681

In [19]:
for i, model_cb in enumerate(models_cb):
    df_result[f'cb_model_{i+1}'] = model_cb.predict_proba(df_test[best_feature].fillna(-9999))[:, 1]    

df_result['cb_result'] = df_result.loc[:, ['cb_model_1',
                                           'cb_model_2',
                                           'cb_model_3',
                                           'cb_model_4',
                                           'cb_model_5']].mean(axis=1)

result = df_result.loc[:, ['APPLICATION_NUMBER', 'cb_result']]
result.rename(columns={'cb_result': TARGET}, inplace=True)
result.to_csv('cb.csv', index=False)

In [20]:
df_result['mean'] = df_result.loc[:, ['xgb_result', 'lgb_result', 'cb_result']].mean(axis=1)
df_result['gmean'] = gmean(df_result.loc[:, ['xgb_result', 'lgb_result', 'cb_result']], axis=1)
df_result['rankdata'] = df_result.loc[:, ['xgb_result', 'lgb_result', 'cb_result']].rank().mean(axis=1)


result = pd.DataFrame({
    'APPLICATION_NUMBER': df_result['APPLICATION_NUMBER'],
    'TARGET': df_result['mean']
})
result.to_csv('mean.csv', index=False)

result = pd.DataFrame({
    'APPLICATION_NUMBER': df_result['APPLICATION_NUMBER'],
    'TARGET': df_result['gmean']
})
result.to_csv('gmean.csv', index=False)

result = pd.DataFrame({
    'APPLICATION_NUMBER': df_result['APPLICATION_NUMBER'],
    'TARGET': df_result['rankdata']
})
result.to_csv('rankdata.csv', index=False)

In [21]:
df_result['mean_cb_lgb'] = df_result.loc[:, ['lgb_result', 'cb_result']].mean(axis=1)
df_result['gmean_cb_lgb'] = gmean(df_result.loc[:, ['lgb_result', 'cb_result']], axis=1)
df_result['rankdata_cb_lgb'] = df_result.loc[:, ['lgb_result', 'cb_result']].rank().mean(axis=1)


result = pd.DataFrame({
    'APPLICATION_NUMBER': df_result['APPLICATION_NUMBER'],
    'TARGET': df_result['mean_cb_lgb']
})
result.to_csv('mean_cb_lgb.csv', index=False)

result = pd.DataFrame({
    'APPLICATION_NUMBER': df_result['APPLICATION_NUMBER'],
    'TARGET': df_result['gmean_cb_lgb']
})
result.to_csv('gmean_cb_lgb.csv', index=False)

result = pd.DataFrame({
    'APPLICATION_NUMBER': df_result['APPLICATION_NUMBER'],
    'TARGET': df_result['rankdata_cb_lgb']
})
result.to_csv('rankdata_cb_lgb.csv', index=False)

In [23]:
for i, model in enumerate(models_xgb):
    model.save_model(f'model/model_xgb_{i+1}.json')

for i, model in enumerate(models_lgb):
    model.save_model(f'model/model_lgb_{i+1}.json')
    
for i, model in enumerate(models_cb):
    model.save_model(fname=f'model/model_cb_{i+1}.json')