In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, auc, mean_squared_error, roc_curve, confusion_matrix, precision_score, recall_score, f1_score,\
log_loss, roc_auc_score,make_scorer
import gc
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool, cv
import hyperopt
import pprint


#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from time import time

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
test = pd.read_csv("test_fill.csv", sep=',', index_col=[0])
train = pd.read_csv("train_fill.csv", sep = ',', index_col=[0])
submission = pd.read_csv("submission.csv", sep=',',index_col=[0])


In [3]:
print(test.shape, train.shape)

(24000, 51) (56000, 52)


In [4]:
train.head()


Unnamed: 0,Applicant_ID,Creditworthiness_score,CreditEnquiredRisk,LoanDefaultSeverityAll,LoanDefaultSeverityAuto,LoanDefaultSeverityEdu,MinCreditAvailAfterPayments,MaxCreditAvailActiveAll,MaxCreditAvailAfterPayments,SumOfAvailCreditMissed1Pay,...,FinanceStressIndex,NumberCLinesPaidHighRisk,DebtRatio1Max,NumOfHomeLoansMissed2x,NumOfAutoLoansMissed2x,ProductApplicatn,form_field48,form_field49,DebtRatio1Min,default_status
0,Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,...,0.392854,2.02,0.711632,0.0,0.0,0,-999.0,1.129518,0.044335,0
1,Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,...,0.314281,8.08,0.183584,-999.0,0.0,0,349.80573,1.620483,0.322436,0
2,Apcnt_1000008,3276.0,0.53845,3.151,0.0,6.282,-999.0,956940.0,-999.0,192944.0,...,0.162965,18.18,0.791136,0.0,0.0,0,-999.0,1.51337,0.01164,1
3,Apcnt_1000012,3372.0,0.17005,0.505,0.0,0.0,192166.0,3044703.0,385499.0,3986472.0,...,0.488884,2.02,0.685168,-999.0,0.0,0,89.9401,0.664452,0.082729,0
4,Apcnt_1000016,3370.0,0.7727,1.101,0.0,0.0,1556.0,214728.0,214728.0,1284089.0,...,0.275,12.12,0.438168,0.0,0.0,0,97.887502,1.427891,0.04563,0


In [5]:
test.head()

Unnamed: 0,Applicant_ID,Creditworthiness_score,CreditEnquiredRisk,LoanDefaultSeverityAll,LoanDefaultSeverityAuto,LoanDefaultSeverityEdu,MinCreditAvailAfterPayments,MaxCreditAvailActiveAll,MaxCreditAvailAfterPayments,SumOfAvailCreditMissed1Pay,...,%UseCLinesonAutoLoan,FinanceStressIndex,NumberCLinesPaidHighRisk,DebtRatio1Max,NumOfHomeLoansMissed2x,NumOfAutoLoansMissed2x,ProductApplicatn,form_field48,form_field49,DebtRatio1Min
0,Apcnt_1000032,3236.0,0.34875,10.2006,0.0,0.0,418564.0,418564.0,418564.0,540710.0,...,-999.0,0.825,1.01,0.8,-999.0,0.0,0,-999.0,0.0,0.011221
1,Apcnt_1000048,3284.0,1.2736,2.9606,9.0198,0.0,0.0,9858816.0,49014.0,1510098.0,...,18.8415,0.507694,4.04,0.623248,1.0,0.0,1,-999.0,0.504974,0.043525
2,Apcnt_1000052,-999.0,0.27505,0.06,0.0,0.0,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,0,-999.0,0.0,-999.0
3,Apcnt_1000076,3232.0,0.28505,2.8032,0.0,0.0,0.0,473802.0,473802.0,1724437.0,...,-999.0,0.916663,2.02,0.464224,-999.0,-999.0,0,90.163742,0.788809,0.104029
4,Apcnt_1000080,3466.0,2.09545,0.8318,2.5182,0.0,19839.0,1150662.0,1150662.0,7860523.0,...,-999.0,0.234047,23.23,0.726688,0.0,0.0,1,1303.587148,1.637733,0.163124


In [6]:
test_d = test.drop(['Applicant_ID'], axis=1)

In [7]:
train_d = train.drop(['default_status', 'Applicant_ID','form_field48','form_field49'], axis=1)

In [8]:
print(train_d.shape, test_d.shape)

(56000, 48) (24000, 50)


In [9]:
target = train['default_status']

In [10]:
X = train_d
y = target

In [11]:
#categorical_features_indices = np.where(X.dtypes != np.float)[0]

In [12]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,train_size=.85,random_state=1234)

## CatBoost Baseline

In [13]:
#let us make the catboost model, use_best_model params will make the model prevent overfitting
base_model = CatBoostClassifier(verbose=True)
base_model.fit(xtrain,ytrain,)

Learning rate set to 0.053613
0:	learn: 0.6601483	total: 97.4ms	remaining: 1m 37s
1:	learn: 0.6306148	total: 143ms	remaining: 1m 11s
2:	learn: 0.6060590	total: 185ms	remaining: 1m 1s
3:	learn: 0.5836109	total: 253ms	remaining: 1m 2s
4:	learn: 0.5645730	total: 304ms	remaining: 1m
5:	learn: 0.5479584	total: 344ms	remaining: 57s
6:	learn: 0.5346178	total: 390ms	remaining: 55.3s
7:	learn: 0.5221454	total: 433ms	remaining: 53.6s
8:	learn: 0.5105946	total: 497ms	remaining: 54.7s
9:	learn: 0.5001608	total: 538ms	remaining: 53.3s
10:	learn: 0.4924601	total: 581ms	remaining: 52.2s
11:	learn: 0.4857018	total: 623ms	remaining: 51.3s
12:	learn: 0.4785848	total: 661ms	remaining: 50.2s
13:	learn: 0.4728072	total: 722ms	remaining: 50.9s
14:	learn: 0.4670582	total: 825ms	remaining: 54.2s
15:	learn: 0.4623606	total: 995ms	remaining: 1m 1s
16:	learn: 0.4588180	total: 1.09s	remaining: 1m 3s
17:	learn: 0.4550910	total: 1.18s	remaining: 1m 4s
18:	learn: 0.4512029	total: 1.33s	remaining: 1m 8s
19:	learn: 0.

160:	learn: 0.3945137	total: 13.5s	remaining: 1m 10s
161:	learn: 0.3944237	total: 13.5s	remaining: 1m 10s
162:	learn: 0.3943118	total: 13.6s	remaining: 1m 9s
163:	learn: 0.3942103	total: 13.7s	remaining: 1m 9s
164:	learn: 0.3941041	total: 13.8s	remaining: 1m 9s
165:	learn: 0.3940435	total: 13.8s	remaining: 1m 9s
166:	learn: 0.3939265	total: 13.9s	remaining: 1m 9s
167:	learn: 0.3938628	total: 13.9s	remaining: 1m 8s
168:	learn: 0.3937430	total: 14s	remaining: 1m 8s
169:	learn: 0.3936454	total: 14.1s	remaining: 1m 8s
170:	learn: 0.3935546	total: 14.2s	remaining: 1m 8s
171:	learn: 0.3934432	total: 14.3s	remaining: 1m 8s
172:	learn: 0.3933343	total: 14.4s	remaining: 1m 8s
173:	learn: 0.3932205	total: 14.6s	remaining: 1m 9s
174:	learn: 0.3931502	total: 14.7s	remaining: 1m 9s
175:	learn: 0.3930256	total: 14.7s	remaining: 1m 8s
176:	learn: 0.3929007	total: 14.8s	remaining: 1m 8s
177:	learn: 0.3928520	total: 14.9s	remaining: 1m 8s
178:	learn: 0.3927717	total: 15s	remaining: 1m 8s
179:	learn: 0.

320:	learn: 0.3785484	total: 27.1s	remaining: 57.3s
321:	learn: 0.3784486	total: 27.4s	remaining: 57.7s
322:	learn: 0.3783418	total: 27.9s	remaining: 58.4s
323:	learn: 0.3782510	total: 28.1s	remaining: 58.5s
324:	learn: 0.3781698	total: 28.2s	remaining: 58.6s
325:	learn: 0.3781649	total: 28.3s	remaining: 58.5s
326:	learn: 0.3780677	total: 28.4s	remaining: 58.4s
327:	learn: 0.3779592	total: 28.4s	remaining: 58.3s
328:	learn: 0.3778832	total: 28.5s	remaining: 58.1s
329:	learn: 0.3778220	total: 28.5s	remaining: 57.9s
330:	learn: 0.3777310	total: 28.6s	remaining: 57.9s
331:	learn: 0.3776573	total: 28.7s	remaining: 57.8s
332:	learn: 0.3775351	total: 28.9s	remaining: 57.8s
333:	learn: 0.3774658	total: 29s	remaining: 57.8s
334:	learn: 0.3773593	total: 29.1s	remaining: 57.7s
335:	learn: 0.3773166	total: 29.1s	remaining: 57.6s
336:	learn: 0.3772078	total: 29.2s	remaining: 57.5s
337:	learn: 0.3771150	total: 29.3s	remaining: 57.3s
338:	learn: 0.3769711	total: 29.4s	remaining: 57.3s
339:	learn: 0.

481:	learn: 0.3636166	total: 40.6s	remaining: 43.7s
482:	learn: 0.3635141	total: 40.7s	remaining: 43.6s
483:	learn: 0.3634094	total: 40.8s	remaining: 43.5s
484:	learn: 0.3633213	total: 40.9s	remaining: 43.4s
485:	learn: 0.3632522	total: 41s	remaining: 43.4s
486:	learn: 0.3631833	total: 41.1s	remaining: 43.3s
487:	learn: 0.3631528	total: 41.2s	remaining: 43.2s
488:	learn: 0.3630637	total: 41.3s	remaining: 43.1s
489:	learn: 0.3629863	total: 41.3s	remaining: 43s
490:	learn: 0.3628956	total: 41.4s	remaining: 42.9s
491:	learn: 0.3627692	total: 41.5s	remaining: 42.8s
492:	learn: 0.3627046	total: 41.5s	remaining: 42.7s
493:	learn: 0.3626216	total: 41.6s	remaining: 42.6s
494:	learn: 0.3625015	total: 41.7s	remaining: 42.5s
495:	learn: 0.3624347	total: 41.7s	remaining: 42.4s
496:	learn: 0.3623466	total: 41.7s	remaining: 42.2s
497:	learn: 0.3622584	total: 41.8s	remaining: 42.1s
498:	learn: 0.3621843	total: 41.8s	remaining: 42s
499:	learn: 0.3620943	total: 41.9s	remaining: 41.9s
500:	learn: 0.3620

644:	learn: 0.3502804	total: 51.9s	remaining: 28.6s
645:	learn: 0.3502074	total: 51.9s	remaining: 28.5s
646:	learn: 0.3501210	total: 52s	remaining: 28.4s
647:	learn: 0.3499961	total: 52s	remaining: 28.3s
648:	learn: 0.3499008	total: 52.1s	remaining: 28.2s
649:	learn: 0.3498231	total: 52.2s	remaining: 28.1s
650:	learn: 0.3497480	total: 52.2s	remaining: 28s
651:	learn: 0.3496704	total: 52.3s	remaining: 27.9s
652:	learn: 0.3495405	total: 52.3s	remaining: 27.8s
653:	learn: 0.3494794	total: 52.4s	remaining: 27.7s
654:	learn: 0.3494125	total: 52.4s	remaining: 27.6s
655:	learn: 0.3493566	total: 52.5s	remaining: 27.5s
656:	learn: 0.3492807	total: 52.5s	remaining: 27.4s
657:	learn: 0.3491873	total: 52.5s	remaining: 27.3s
658:	learn: 0.3491036	total: 52.6s	remaining: 27.2s
659:	learn: 0.3490109	total: 52.7s	remaining: 27.1s
660:	learn: 0.3488854	total: 52.7s	remaining: 27s
661:	learn: 0.3488564	total: 52.8s	remaining: 27s
662:	learn: 0.3487816	total: 52.9s	remaining: 26.9s
663:	learn: 0.3487341	

806:	learn: 0.3378308	total: 1m 1s	remaining: 14.6s
807:	learn: 0.3377600	total: 1m 1s	remaining: 14.5s
808:	learn: 0.3376893	total: 1m 1s	remaining: 14.5s
809:	learn: 0.3376123	total: 1m 1s	remaining: 14.4s
810:	learn: 0.3375281	total: 1m 1s	remaining: 14.3s
811:	learn: 0.3374614	total: 1m 1s	remaining: 14.3s
812:	learn: 0.3373850	total: 1m 1s	remaining: 14.2s
813:	learn: 0.3373116	total: 1m 1s	remaining: 14.1s
814:	learn: 0.3372484	total: 1m 1s	remaining: 14s
815:	learn: 0.3371806	total: 1m 1s	remaining: 14s
816:	learn: 0.3371021	total: 1m 2s	remaining: 13.9s
817:	learn: 0.3370103	total: 1m 2s	remaining: 13.9s
818:	learn: 0.3369267	total: 1m 2s	remaining: 13.8s
819:	learn: 0.3368758	total: 1m 2s	remaining: 13.7s
820:	learn: 0.3367652	total: 1m 2s	remaining: 13.6s
821:	learn: 0.3366838	total: 1m 2s	remaining: 13.5s
822:	learn: 0.3366029	total: 1m 2s	remaining: 13.5s
823:	learn: 0.3365410	total: 1m 2s	remaining: 13.4s
824:	learn: 0.3364567	total: 1m 2s	remaining: 13.3s
825:	learn: 0.33

963:	learn: 0.3270131	total: 1m 16s	remaining: 2.86s
964:	learn: 0.3269569	total: 1m 16s	remaining: 2.78s
965:	learn: 0.3269073	total: 1m 16s	remaining: 2.7s
966:	learn: 0.3268173	total: 1m 16s	remaining: 2.62s
967:	learn: 0.3267892	total: 1m 16s	remaining: 2.54s
968:	learn: 0.3266897	total: 1m 16s	remaining: 2.46s
969:	learn: 0.3266129	total: 1m 16s	remaining: 2.38s
970:	learn: 0.3265310	total: 1m 16s	remaining: 2.3s
971:	learn: 0.3264650	total: 1m 17s	remaining: 2.22s
972:	learn: 0.3263923	total: 1m 17s	remaining: 2.14s
973:	learn: 0.3263091	total: 1m 17s	remaining: 2.06s
974:	learn: 0.3262710	total: 1m 17s	remaining: 1.98s
975:	learn: 0.3262212	total: 1m 17s	remaining: 1.9s
976:	learn: 0.3261419	total: 1m 17s	remaining: 1.82s
977:	learn: 0.3260802	total: 1m 17s	remaining: 1.74s
978:	learn: 0.3260035	total: 1m 17s	remaining: 1.66s
979:	learn: 0.3259323	total: 1m 17s	remaining: 1.58s
980:	learn: 0.3258569	total: 1m 17s	remaining: 1.5s
981:	learn: 0.3258197	total: 1m 17s	remaining: 1.4

<catboost.core.CatBoostClassifier at 0x7ff38ec79710>

In [14]:
y_pred = base_model.predict_proba(xtest)[:, 1]
valid_score = roc_auc_score(ytest, y_pred)
print('Validation ROC-AUC score:', valid_score)

Validation ROC-AUC score: 0.8295268168692593


In [15]:
pred_base = base_model.predict_proba(test_d)[:,1]

In [16]:
pred_base

array([0.41504254, 0.37504723, 0.43579919, ..., 0.29037169, 0.5327252 ,
       0.17772394])

In [18]:
submissions = pd.DataFrame()
submissions['Applicant_ID'] = test['Applicant_ID']
submissions['default_status'] = pred_base
submissions.to_csv('submission_13_c.csv', index=False, float_format='%.8f')

## Bayesian Optimization

In [19]:
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    A wrapper for measuring time and performances of different optmizers
    
    optimizer = a sklearn or a skopt optimizer
    X = the training set 
    y = our target
    title = a string label for the experiment
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    d=pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = d.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           +u"\u00B1"+" %.3f") % (time() - start, 
                                  len(optimizer.cv_results_['params']),
                                  best_score,
                                  best_score_std))    
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [22]:
roc_auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [23]:
clf = CatBoostClassifier(thread_count=2,
                         loss_function='Logloss',
                         od_type = 'Iter',
                         verbose= False
                        )

In [24]:
# Defining your search space
search_spaces = {'iterations': (10, 1000),
                 'depth': (1, 8),
                 'learning_rate': (0.01, 1.0, 'log-uniform'),
                 'random_strength': (1e-9, 10, 'log-uniform'),
                 'bagging_temperature':(0.0, 1.0),
                 'border_count':(1, 255),
                 'l2_leaf_reg': (2, 30),
                 'scale_pos_weight':(0.01, 1.0, 'uniform')}

In [25]:
# Setting up BayesSearchCV
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=roc_auc,
                    cv=fold,
                    n_iter=100,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=42)

In [26]:
best_params = report_perf(opt, X, y,'CatBoost', 
                          callbacks=[VerboseCallback(100), 
                                     DeadlineStopper(60*10)])

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 145.9644
Function value obtained: -0.8395
Current minimum: -0.8395
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 227.8067
Function value obtained: -0.8357
Current minimum: -0.8395
Iteration No: 3 started. Searching for the next optimal point.
CatBoost took 397.33 seconds,  candidates checked: 2, best CV score: 0.839 ± 0.006
Best parameters:
OrderedDict([('bagging_temperature', 0.41010395885331385),
             ('border_count', 186),
             ('depth', 8),
             ('iterations', 323),
             ('l2_leaf_reg', 21),
             ('learning_rate', 0.0673344419215237),
             ('random_strength', 4),
             ('scale_pos_weight', 0.7421091918485163)])



In [27]:
best_params['iterations']=1000

In [28]:
tuned_model = CatBoostClassifier(**best_params,od_type='Iter', eval_metric = 'AUC', loss_function = 'Logloss', \
                                  random_state = 42)
tuned_model.fit(xtrain,ytrain)

0:	total: 76.2ms	remaining: 1m 16s
1:	total: 152ms	remaining: 1m 15s
2:	total: 224ms	remaining: 1m 14s
3:	total: 674ms	remaining: 2m 47s
4:	total: 776ms	remaining: 2m 34s
5:	total: 994ms	remaining: 2m 44s
6:	total: 1.13s	remaining: 2m 39s
7:	total: 1.25s	remaining: 2m 35s
8:	total: 1.58s	remaining: 2m 54s
9:	total: 1.68s	remaining: 2m 46s
10:	total: 1.75s	remaining: 2m 37s
11:	total: 1.87s	remaining: 2m 33s
12:	total: 1.94s	remaining: 2m 27s
13:	total: 2s	remaining: 2m 21s
14:	total: 2.1s	remaining: 2m 17s
15:	total: 2.23s	remaining: 2m 17s
16:	total: 2.39s	remaining: 2m 18s
17:	total: 2.54s	remaining: 2m 18s
18:	total: 2.73s	remaining: 2m 20s
19:	total: 3s	remaining: 2m 26s
20:	total: 3.32s	remaining: 2m 34s
21:	total: 3.45s	remaining: 2m 33s
22:	total: 3.72s	remaining: 2m 38s
23:	total: 3.79s	remaining: 2m 34s
24:	total: 3.86s	remaining: 2m 30s
25:	total: 3.99s	remaining: 2m 29s
26:	total: 4.3s	remaining: 2m 34s
27:	total: 4.39s	remaining: 2m 32s
28:	total: 4.47s	remaining: 2m 29s
29

235:	total: 21.2s	remaining: 1m 8s
236:	total: 21.2s	remaining: 1m 8s
237:	total: 21.3s	remaining: 1m 8s
238:	total: 21.4s	remaining: 1m 8s
239:	total: 21.5s	remaining: 1m 8s
240:	total: 21.6s	remaining: 1m 7s
241:	total: 21.6s	remaining: 1m 7s
242:	total: 21.7s	remaining: 1m 7s
243:	total: 21.7s	remaining: 1m 7s
244:	total: 21.8s	remaining: 1m 7s
245:	total: 21.9s	remaining: 1m 7s
246:	total: 21.9s	remaining: 1m 6s
247:	total: 22.1s	remaining: 1m 6s
248:	total: 22.1s	remaining: 1m 6s
249:	total: 22.2s	remaining: 1m 6s
250:	total: 22.3s	remaining: 1m 6s
251:	total: 22.4s	remaining: 1m 6s
252:	total: 22.4s	remaining: 1m 6s
253:	total: 22.5s	remaining: 1m 6s
254:	total: 22.6s	remaining: 1m 5s
255:	total: 22.7s	remaining: 1m 5s
256:	total: 22.7s	remaining: 1m 5s
257:	total: 22.8s	remaining: 1m 5s
258:	total: 22.9s	remaining: 1m 5s
259:	total: 23s	remaining: 1m 5s
260:	total: 23s	remaining: 1m 5s
261:	total: 23.1s	remaining: 1m 5s
262:	total: 23.2s	remaining: 1m 4s
263:	total: 23.3s	remain

474:	total: 38.5s	remaining: 42.5s
475:	total: 38.6s	remaining: 42.4s
476:	total: 38.6s	remaining: 42.3s
477:	total: 38.8s	remaining: 42.4s
478:	total: 38.9s	remaining: 42.4s
479:	total: 39s	remaining: 42.3s
480:	total: 39.1s	remaining: 42.2s
481:	total: 39.1s	remaining: 42.1s
482:	total: 39.3s	remaining: 42.1s
483:	total: 39.3s	remaining: 41.9s
484:	total: 39.4s	remaining: 41.9s
485:	total: 39.5s	remaining: 41.8s
486:	total: 39.6s	remaining: 41.7s
487:	total: 39.6s	remaining: 41.6s
488:	total: 39.7s	remaining: 41.5s
489:	total: 39.8s	remaining: 41.4s
490:	total: 39.8s	remaining: 41.3s
491:	total: 39.9s	remaining: 41.2s
492:	total: 40s	remaining: 41.1s
493:	total: 40s	remaining: 41s
494:	total: 40.1s	remaining: 40.9s
495:	total: 40.2s	remaining: 40.8s
496:	total: 40.3s	remaining: 40.7s
497:	total: 40.3s	remaining: 40.7s
498:	total: 40.4s	remaining: 40.6s
499:	total: 40.5s	remaining: 40.5s
500:	total: 40.5s	remaining: 40.4s
501:	total: 40.6s	remaining: 40.3s
502:	total: 40.7s	remaining:

712:	total: 58.6s	remaining: 23.6s
713:	total: 58.7s	remaining: 23.5s
714:	total: 58.8s	remaining: 23.4s
715:	total: 58.9s	remaining: 23.4s
716:	total: 58.9s	remaining: 23.3s
717:	total: 59s	remaining: 23.2s
718:	total: 59.1s	remaining: 23.1s
719:	total: 59.1s	remaining: 23s
720:	total: 59.2s	remaining: 22.9s
721:	total: 59.3s	remaining: 22.8s
722:	total: 59.3s	remaining: 22.7s
723:	total: 59.4s	remaining: 22.6s
724:	total: 59.5s	remaining: 22.6s
725:	total: 59.5s	remaining: 22.5s
726:	total: 59.6s	remaining: 22.4s
727:	total: 59.7s	remaining: 22.3s
728:	total: 59.8s	remaining: 22.2s
729:	total: 59.9s	remaining: 22.1s
730:	total: 60s	remaining: 22.1s
731:	total: 1m	remaining: 22s
732:	total: 1m	remaining: 21.9s
733:	total: 1m	remaining: 21.8s
734:	total: 1m	remaining: 21.7s
735:	total: 1m	remaining: 21.7s
736:	total: 1m	remaining: 21.6s
737:	total: 1m	remaining: 21.5s
738:	total: 1m	remaining: 21.4s
739:	total: 1m	remaining: 21.3s
740:	total: 1m	remaining: 21.3s
741:	total: 1m	remainin

944:	total: 1m 25s	remaining: 4.95s
945:	total: 1m 25s	remaining: 4.86s
946:	total: 1m 25s	remaining: 4.77s
947:	total: 1m 25s	remaining: 4.68s
948:	total: 1m 25s	remaining: 4.59s
949:	total: 1m 25s	remaining: 4.5s
950:	total: 1m 25s	remaining: 4.41s
951:	total: 1m 25s	remaining: 4.32s
952:	total: 1m 25s	remaining: 4.23s
953:	total: 1m 25s	remaining: 4.13s
954:	total: 1m 25s	remaining: 4.04s
955:	total: 1m 25s	remaining: 3.95s
956:	total: 1m 25s	remaining: 3.86s
957:	total: 1m 26s	remaining: 3.77s
958:	total: 1m 26s	remaining: 3.68s
959:	total: 1m 26s	remaining: 3.59s
960:	total: 1m 26s	remaining: 3.5s
961:	total: 1m 26s	remaining: 3.41s
962:	total: 1m 26s	remaining: 3.32s
963:	total: 1m 26s	remaining: 3.23s
964:	total: 1m 26s	remaining: 3.14s
965:	total: 1m 26s	remaining: 3.05s
966:	total: 1m 26s	remaining: 2.96s
967:	total: 1m 26s	remaining: 2.87s
968:	total: 1m 26s	remaining: 2.78s
969:	total: 1m 27s	remaining: 2.69s
970:	total: 1m 27s	remaining: 2.6s
971:	total: 1m 27s	remaining: 2

<catboost.core.CatBoostClassifier at 0x7ff377de7a50>

In [29]:
pred_b = tuned_model.predict_proba(xtest)[:, 1]
valid_score = roc_auc_score(ytest, y_pred)
print('Validation ROC-AUC score:', valid_score)

Validation ROC-AUC score: 0.8295268168692593


In [30]:
pred_2 = tuned_model.predict_proba(test_d)[:,1]

In [31]:
pred_2

array([0.25350706, 0.35812851, 0.36833628, ..., 0.19122529, 0.51378776,
       0.1215896 ])

In [33]:
submissions = pd.DataFrame()
submissions['Applicant_ID'] = test['Applicant_ID']
submissions['default_status'] = pred_2
submissions.to_csv('submission_13_tunedc.csv', index=False, float_format='%.8f')


## Blending

In [None]:
df_base0 = pd.read_csv('submission_13_5.csv',names=["Applicant_ID","default_status"], skiprows=[0],header=None)
df_base1 = pd.read_csv('submission_15.csv',names=["Applicant_ID","default_status"], skiprows=[0],header=None)
df_base2 = pd.read_csv('submission_11.csv',names=["Applicant_ID","default_status"], skiprows=[0],header=None)
df_base3 = pd.read_csv('submission_13_4.csv',names=["Applicant_ID","default_status"], skiprows=[0],header=None)

In [None]:
df_base = pd.merge(df_base0,df_base1,how='inner',on='Applicant_ID')
df_base = pd.merge(df_base,df_base2,how='inner',on='Applicant_ID')
df_base = pd.merge(df_base,df_base3,how='inner',on='Applicant_ID')

In [None]:
df_base

In [None]:
#CORRELATION MATRIX (Pearson Correlation to measure how similar are 2 solutions)
plt.figure(figsize=(20,20))
sns.heatmap(df_base.iloc[:,1:].corr(),annot=True,fmt=".2f")

In [None]:
#SOLUTION = MEAN OF COLUMNS
df_base['default_status'] = df_base.iloc[:,1:].mean(axis=1)

In [None]:
#GENERATING FINAL SOLUTION
df_base[['Applicant_ID','default_status']].to_csv("blending.csv",index=False)