In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import cross_val_score

from os.path import join as path_join

import warnings
warnings.filterwarnings('ignore')

In [2]:
CSV_DIR = r'../../data/Modulbank'

train = pd.read_csv(path_join(CSV_DIR, 'new_train.csv'))
test = pd.read_csv(path_join(CSV_DIR, 'new_test.csv'))

In [4]:
X_train, Y_train = train.drop(columns=['0']), train['0']
X_test, Y_test = test.drop(columns=['0', 'Unnamed: 0']), test[['Unnamed: 0', '0']]

In [5]:
num_rounds = 10000

lgb_parameters = {
    'objective': 'binary',
    'learning_rate': 0.01,
    'max_depth': 8,
    'num_threads': 4,
    'metric': 'auc',
    'seed': 42,
    
    # regularization
    'colsample_bytree': 0.65,
    'subsample': 0.8,
    'subsample_freq': 2,
    'min_data_in_leaf': 20,
}

In [6]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [7]:
def write_pred(file_name, prediction):
    Y_test['0'] = prediction
    Y_test.to_csv(path_join(CSV_DIR, file_name), index=False)
    
def submit_model(x_train, y_train, x_test, file_name, n_estimators=785):
    lgb_parameters['n_estimators'] = n_estimators
    lgb_model = lgb.LGBMClassifier(**lgb_parameters)
    del lgb_parameters['n_estimators']
    
    lgb_model.fit(x_train, y_train)

    lgb_prediction = lgb_model.predict_proba(x_test)[:, 1]
    
    write_pred(file_name, lgb_prediction)

In [8]:
def built_in_validation(x_train, y_train):
    lgb_train = lgb.Dataset(x_train, label=y_train, free_raw_data=False)
    
    result = lgb.cv(lgb_parameters, 
                    lgb_train, 
                    num_rounds, 
                    folds=skf.split(x_train, y_train), 
                    early_stopping_rounds=10, 
                    verbose_eval=1, 
                   )
    return result['auc-mean'][-1]

def self_validation(x_train, y_train):
    lgb_data = lgb.Dataset(x_train, label=y_train, free_raw_data=False)
    
    roc_auc = []
    
    for train, val in skf.split(x_train, y_train):
        temp_lgb_train = lgb_data.subset(train)
        temp_lgb_val = lgb_data.subset(val)
        
        temp_model = lgb.train(lgb_parameters, 
                               temp_lgb_train, 
                               num_rounds, 
                               verbose_eval=1,
                              )
        
        roc_auc.append(roc_auc_score(y_train.iloc[val], temp_model.predict(x_train.iloc[val])))
        
    return np.mean(roc_auc)

In [9]:
print(len(X_train.columns))
X_train.columns

217


Index(['1', '2', '4', '7', '9', '10', '12', '13', '15', '18',
       ...
       '335', '336', '337', '338', '339', '341', '342', '343', '344', '345'],
      dtype='object', length=217)

## Base validation
- 0.7434418312346683
- LB: 0.75920944

In [9]:
built_in_validation(X_train, Y_train)

[1]	cv_agg's auc: 0.678926 + 0.00429698
[2]	cv_agg's auc: 0.689579 + 0.001455
[3]	cv_agg's auc: 0.696494 + 0.00141874
[4]	cv_agg's auc: 0.697844 + 0.00118953
[5]	cv_agg's auc: 0.701302 + 0.00198635
[6]	cv_agg's auc: 0.70322 + 0.00223348
[7]	cv_agg's auc: 0.703758 + 0.00203186
[8]	cv_agg's auc: 0.705953 + 0.00214424
[9]	cv_agg's auc: 0.707122 + 0.00209879
[10]	cv_agg's auc: 0.708326 + 0.0028066
[11]	cv_agg's auc: 0.709371 + 0.00209698
[12]	cv_agg's auc: 0.709469 + 0.00205117
[13]	cv_agg's auc: 0.710336 + 0.00233921
[14]	cv_agg's auc: 0.710444 + 0.00188568
[15]	cv_agg's auc: 0.710634 + 0.00194912
[16]	cv_agg's auc: 0.710678 + 0.00251907
[17]	cv_agg's auc: 0.710333 + 0.0024917
[18]	cv_agg's auc: 0.710301 + 0.00218919
[19]	cv_agg's auc: 0.710518 + 0.00242687
[20]	cv_agg's auc: 0.71078 + 0.0023683
[21]	cv_agg's auc: 0.711655 + 0.00265831
[22]	cv_agg's auc: 0.711456 + 0.0026224
[23]	cv_agg's auc: 0.7116 + 0.00270704
[24]	cv_agg's auc: 0.71174 + 0.00274453
[25]	cv_agg's auc: 0.711464 + 0.0026

[200]	cv_agg's auc: 0.728999 + 0.00218013
[201]	cv_agg's auc: 0.729045 + 0.00225611
[202]	cv_agg's auc: 0.72915 + 0.00224812
[203]	cv_agg's auc: 0.72928 + 0.0022399
[204]	cv_agg's auc: 0.729357 + 0.00220365
[205]	cv_agg's auc: 0.729448 + 0.00221962
[206]	cv_agg's auc: 0.729533 + 0.00221034
[207]	cv_agg's auc: 0.729626 + 0.00222399
[208]	cv_agg's auc: 0.729693 + 0.00217772
[209]	cv_agg's auc: 0.729745 + 0.00220932
[210]	cv_agg's auc: 0.729826 + 0.00214744
[211]	cv_agg's auc: 0.729929 + 0.00217166
[212]	cv_agg's auc: 0.730025 + 0.0021864
[213]	cv_agg's auc: 0.730099 + 0.00218978
[214]	cv_agg's auc: 0.730199 + 0.00221476
[215]	cv_agg's auc: 0.730273 + 0.00222143
[216]	cv_agg's auc: 0.730315 + 0.00223862
[217]	cv_agg's auc: 0.730425 + 0.0022353
[218]	cv_agg's auc: 0.73057 + 0.00220976
[219]	cv_agg's auc: 0.73065 + 0.0022739
[220]	cv_agg's auc: 0.730754 + 0.00225145
[221]	cv_agg's auc: 0.730841 + 0.00229771
[222]	cv_agg's auc: 0.730941 + 0.00231863
[223]	cv_agg's auc: 0.730974 + 0.00232628


[399]	cv_agg's auc: 0.738723 + 0.00269726
[400]	cv_agg's auc: 0.73871 + 0.00269615
[401]	cv_agg's auc: 0.738686 + 0.00271514
[402]	cv_agg's auc: 0.738658 + 0.00272422
[403]	cv_agg's auc: 0.738688 + 0.0027073
[404]	cv_agg's auc: 0.738676 + 0.00272869
[405]	cv_agg's auc: 0.738716 + 0.00271447
[406]	cv_agg's auc: 0.738752 + 0.00271535
[407]	cv_agg's auc: 0.738784 + 0.00272579
[408]	cv_agg's auc: 0.73881 + 0.00273706
[409]	cv_agg's auc: 0.73881 + 0.00270833
[410]	cv_agg's auc: 0.738824 + 0.00274267
[411]	cv_agg's auc: 0.738818 + 0.00274502
[412]	cv_agg's auc: 0.738836 + 0.00272552
[413]	cv_agg's auc: 0.738906 + 0.00274349
[414]	cv_agg's auc: 0.738883 + 0.00271419
[415]	cv_agg's auc: 0.738913 + 0.00272471
[416]	cv_agg's auc: 0.73891 + 0.00270887
[417]	cv_agg's auc: 0.738914 + 0.00272157
[418]	cv_agg's auc: 0.738916 + 0.00270583
[419]	cv_agg's auc: 0.738941 + 0.00269181
[420]	cv_agg's auc: 0.738964 + 0.00271987
[421]	cv_agg's auc: 0.738957 + 0.00275543
[422]	cv_agg's auc: 0.738974 + 0.002730

[595]	cv_agg's auc: 0.741655 + 0.00303445
[596]	cv_agg's auc: 0.741652 + 0.00305518
[597]	cv_agg's auc: 0.741731 + 0.00302741
[598]	cv_agg's auc: 0.741756 + 0.0030536
[599]	cv_agg's auc: 0.741738 + 0.00308941
[600]	cv_agg's auc: 0.741787 + 0.00310532
[601]	cv_agg's auc: 0.741812 + 0.00314317
[602]	cv_agg's auc: 0.741835 + 0.00313239
[603]	cv_agg's auc: 0.741853 + 0.00314722
[604]	cv_agg's auc: 0.741881 + 0.00314325
[605]	cv_agg's auc: 0.741876 + 0.00311975
[606]	cv_agg's auc: 0.741918 + 0.00311512
[607]	cv_agg's auc: 0.741965 + 0.0031728
[608]	cv_agg's auc: 0.741998 + 0.00317846
[609]	cv_agg's auc: 0.742018 + 0.0031802
[610]	cv_agg's auc: 0.742049 + 0.00313462
[611]	cv_agg's auc: 0.742062 + 0.00315472
[612]	cv_agg's auc: 0.742041 + 0.00318526
[613]	cv_agg's auc: 0.742046 + 0.00319282
[614]	cv_agg's auc: 0.74205 + 0.00316818
[615]	cv_agg's auc: 0.742075 + 0.00316113
[616]	cv_agg's auc: 0.742125 + 0.00317003
[617]	cv_agg's auc: 0.742124 + 0.00317534
[618]	cv_agg's auc: 0.742107 + 0.00315

0.7434418312346683

In [10]:
self_validation(X_train, Y_train)

## Dimensionality Reduction 
- PCA: 0.7359386908763632
- ICA: 0.7369462643917165

In [29]:
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA

In [13]:
len(X_train.columns)

217

In [27]:
%%time
pca = PCA(n_components=210, random_state=42)

pca.fit(X_train)

X_pca = pca.transform(X_train)

In [None]:
%%time
print(built_in_validation(X_pca, Y_train))

In [31]:
%%time
ica = FastICA(n_components=200, random_state=42)

ica.fit(X_train)

X_ica = ica.transform(X_train)

[1]	cv_agg's auc: 0.65546 + 0.00623481
[2]	cv_agg's auc: 0.668331 + 0.00744561
[3]	cv_agg's auc: 0.674477 + 0.007533
[4]	cv_agg's auc: 0.678299 + 0.00503691
[5]	cv_agg's auc: 0.68164 + 0.00612177
[6]	cv_agg's auc: 0.683012 + 0.00457401
[7]	cv_agg's auc: 0.684031 + 0.00340935
[8]	cv_agg's auc: 0.685539 + 0.00413764
[9]	cv_agg's auc: 0.686781 + 0.0034678
[10]	cv_agg's auc: 0.686798 + 0.00309362
[11]	cv_agg's auc: 0.690438 + 0.00266695
[12]	cv_agg's auc: 0.69165 + 0.00174066
[13]	cv_agg's auc: 0.69171 + 0.00123709
[14]	cv_agg's auc: 0.692274 + 0.00174462
[15]	cv_agg's auc: 0.692181 + 0.00129675
[16]	cv_agg's auc: 0.691997 + 0.00111346
[17]	cv_agg's auc: 0.692957 + 0.00160143
[18]	cv_agg's auc: 0.693502 + 0.00159222
[19]	cv_agg's auc: 0.694253 + 0.00236264
[20]	cv_agg's auc: 0.694471 + 0.00278426
[21]	cv_agg's auc: 0.695368 + 0.0036534
[22]	cv_agg's auc: 0.695472 + 0.00368761
[23]	cv_agg's auc: 0.695788 + 0.00399987
[24]	cv_agg's auc: 0.696456 + 0.00361307
[25]	cv_agg's auc: 0.696646 + 0.0

[201]	cv_agg's auc: 0.720354 + 0.000466225
[202]	cv_agg's auc: 0.720421 + 0.000458237
[203]	cv_agg's auc: 0.720454 + 0.000485055
[204]	cv_agg's auc: 0.720536 + 0.000541882
[205]	cv_agg's auc: 0.7207 + 0.000456016
[206]	cv_agg's auc: 0.720716 + 0.000364511
[207]	cv_agg's auc: 0.720847 + 0.000353926
[208]	cv_agg's auc: 0.720985 + 0.000400605
[209]	cv_agg's auc: 0.72114 + 0.000383378
[210]	cv_agg's auc: 0.721224 + 0.000421902
[211]	cv_agg's auc: 0.72127 + 0.000391738
[212]	cv_agg's auc: 0.721336 + 0.000365237
[213]	cv_agg's auc: 0.721446 + 0.000314328
[214]	cv_agg's auc: 0.72156 + 0.000439755
[215]	cv_agg's auc: 0.721654 + 0.000456811
[216]	cv_agg's auc: 0.721801 + 0.000596811
[217]	cv_agg's auc: 0.72185 + 0.000633296
[218]	cv_agg's auc: 0.721929 + 0.00071503
[219]	cv_agg's auc: 0.722171 + 0.00062096
[220]	cv_agg's auc: 0.722225 + 0.000648976
[221]	cv_agg's auc: 0.722341 + 0.000592988
[222]	cv_agg's auc: 0.722429 + 0.000600432
[223]	cv_agg's auc: 0.722593 + 0.000668499
[224]	cv_agg's auc:

[394]	cv_agg's auc: 0.732808 + 0.000544227
[395]	cv_agg's auc: 0.732823 + 0.000524417
[396]	cv_agg's auc: 0.732826 + 0.000543145
[397]	cv_agg's auc: 0.732814 + 0.00057731
[398]	cv_agg's auc: 0.73285 + 0.000614901
[399]	cv_agg's auc: 0.73286 + 0.000577565
[400]	cv_agg's auc: 0.732843 + 0.000580556
[401]	cv_agg's auc: 0.732883 + 0.000605386
[402]	cv_agg's auc: 0.73294 + 0.000682479
[403]	cv_agg's auc: 0.732977 + 0.000711468
[404]	cv_agg's auc: 0.73305 + 0.000707516
[405]	cv_agg's auc: 0.733088 + 0.000725592
[406]	cv_agg's auc: 0.733127 + 0.000735278
[407]	cv_agg's auc: 0.73321 + 0.000799844
[408]	cv_agg's auc: 0.733252 + 0.000788461
[409]	cv_agg's auc: 0.733241 + 0.000821979
[410]	cv_agg's auc: 0.733229 + 0.000852149
[411]	cv_agg's auc: 0.733277 + 0.000842751
[412]	cv_agg's auc: 0.733343 + 0.000820608
[413]	cv_agg's auc: 0.733348 + 0.000863159
[414]	cv_agg's auc: 0.733392 + 0.000836867
[415]	cv_agg's auc: 0.733372 + 0.000848109
[416]	cv_agg's auc: 0.733424 + 0.000901594
[417]	cv_agg's au

In [None]:
%%time
print(built_in_validation(X_ica, Y_train))

## Clustering
- hdbscan: 0.7426232509930895
- kmeans: 0.7426486398702922
- tsne: 0.7424163543937508

#### hdbscan

In [10]:
from hdbscan import HDBSCAN

In [11]:
??HDBSCAN

In [20]:
%%time
hdbscan = HDBSCAN(min_cluster_size=100, 
                  gen_min_span_tree=True,
                  cluster_selection_method='leaf', # 'eom'
                )
hdbscan_clusters = hdbscan.fit_predict(X_train)

CPU times: user 8min 1s, sys: 854 ms, total: 8min 2s
Wall time: 8min 1s


In [21]:
pd.DataFrame(hdbscan_clusters)[0].value_counts()

-1    23050
 0     4392
 1     1459
 2      821
 3      778
Name: 0, dtype: int64

In [22]:
temp_x_train = X_train.iloc[:]
temp_x_train['hdbscan_class'] = hdbscan_clusters

built_in_validation(temp_x_train, Y_train)

[1]	cv_agg's auc: 0.678926 + 0.00429698
[2]	cv_agg's auc: 0.696123 + 0.00197918
[3]	cv_agg's auc: 0.69965 + 0.00155806
[4]	cv_agg's auc: 0.701922 + 0.00269514
[5]	cv_agg's auc: 0.702153 + 0.00255402
[6]	cv_agg's auc: 0.703085 + 0.00203998
[7]	cv_agg's auc: 0.704136 + 0.00155081
[8]	cv_agg's auc: 0.704732 + 0.00201528
[9]	cv_agg's auc: 0.705246 + 0.00106725
[10]	cv_agg's auc: 0.705891 + 0.000925521
[11]	cv_agg's auc: 0.706189 + 0.000554291
[12]	cv_agg's auc: 0.707076 + 0.000665564
[13]	cv_agg's auc: 0.708603 + 0.00129004
[14]	cv_agg's auc: 0.708937 + 0.00145589
[15]	cv_agg's auc: 0.709415 + 0.00157124
[16]	cv_agg's auc: 0.709714 + 0.00196727
[17]	cv_agg's auc: 0.709867 + 0.00182328
[18]	cv_agg's auc: 0.70972 + 0.00158076
[19]	cv_agg's auc: 0.709672 + 0.00148612
[20]	cv_agg's auc: 0.709733 + 0.00195578
[21]	cv_agg's auc: 0.7094 + 0.00183158
[22]	cv_agg's auc: 0.709915 + 0.00147702
[23]	cv_agg's auc: 0.710398 + 0.00144298
[24]	cv_agg's auc: 0.710277 + 0.00155685
[25]	cv_agg's auc: 0.71047

[202]	cv_agg's auc: 0.729089 + 0.00210902
[203]	cv_agg's auc: 0.72916 + 0.00207851
[204]	cv_agg's auc: 0.729214 + 0.00203006
[205]	cv_agg's auc: 0.729312 + 0.0019995
[206]	cv_agg's auc: 0.729346 + 0.00195967
[207]	cv_agg's auc: 0.72944 + 0.0020166
[208]	cv_agg's auc: 0.729515 + 0.00202485
[209]	cv_agg's auc: 0.729564 + 0.00200126
[210]	cv_agg's auc: 0.729674 + 0.0020562
[211]	cv_agg's auc: 0.729702 + 0.00201805
[212]	cv_agg's auc: 0.729807 + 0.00200927
[213]	cv_agg's auc: 0.729863 + 0.00199316
[214]	cv_agg's auc: 0.72995 + 0.00200101
[215]	cv_agg's auc: 0.729994 + 0.00192752
[216]	cv_agg's auc: 0.730029 + 0.0019307
[217]	cv_agg's auc: 0.730028 + 0.00195094
[218]	cv_agg's auc: 0.730111 + 0.00197564
[219]	cv_agg's auc: 0.730215 + 0.0019839
[220]	cv_agg's auc: 0.730338 + 0.00196554
[221]	cv_agg's auc: 0.730362 + 0.00193466
[222]	cv_agg's auc: 0.730401 + 0.0019501
[223]	cv_agg's auc: 0.730361 + 0.00199229
[224]	cv_agg's auc: 0.73039 + 0.00197611
[225]	cv_agg's auc: 0.730492 + 0.00193191
[2

[400]	cv_agg's auc: 0.737947 + 0.0023259
[401]	cv_agg's auc: 0.737962 + 0.00233336
[402]	cv_agg's auc: 0.737977 + 0.0023299
[403]	cv_agg's auc: 0.737999 + 0.00230955
[404]	cv_agg's auc: 0.738014 + 0.00230289
[405]	cv_agg's auc: 0.738049 + 0.00228595
[406]	cv_agg's auc: 0.738093 + 0.00226667
[407]	cv_agg's auc: 0.738129 + 0.00229538
[408]	cv_agg's auc: 0.738152 + 0.00227927
[409]	cv_agg's auc: 0.738209 + 0.00226451
[410]	cv_agg's auc: 0.738241 + 0.00223357
[411]	cv_agg's auc: 0.738255 + 0.00225386
[412]	cv_agg's auc: 0.738275 + 0.00223952
[413]	cv_agg's auc: 0.738294 + 0.00224405
[414]	cv_agg's auc: 0.738317 + 0.00226748
[415]	cv_agg's auc: 0.738287 + 0.00225775
[416]	cv_agg's auc: 0.738305 + 0.00225154
[417]	cv_agg's auc: 0.73835 + 0.00225889
[418]	cv_agg's auc: 0.738387 + 0.00224752
[419]	cv_agg's auc: 0.738434 + 0.00224456
[420]	cv_agg's auc: 0.738429 + 0.00225057
[421]	cv_agg's auc: 0.738442 + 0.00224372
[422]	cv_agg's auc: 0.738502 + 0.00224465
[423]	cv_agg's auc: 0.738488 + 0.0022

0.7405649376384926

#### KMeans

In [35]:
from sklearn.cluster import KMeans

In [44]:
kmeans = KMeans(n_clusters=3,
                random_state=42,
               )
kmeans.fit(X_train)

train_clusters = kmeans.predict(X_train)
# test_clusters = kmeans.predict(X_test)

In [45]:
temp_x_train = X_train.iloc[:]
temp_x_train['kmeans_class'] = train_clusters

built_in_validation(temp_x_train, Y_train)

[1]	cv_agg's auc: 0.678926 + 0.00429698
[2]	cv_agg's auc: 0.696123 + 0.00197918
[3]	cv_agg's auc: 0.69965 + 0.00155806
[4]	cv_agg's auc: 0.701922 + 0.00269514
[5]	cv_agg's auc: 0.702153 + 0.00255402
[6]	cv_agg's auc: 0.703085 + 0.00203998
[7]	cv_agg's auc: 0.704136 + 0.00155081
[8]	cv_agg's auc: 0.704732 + 0.00201528
[9]	cv_agg's auc: 0.705246 + 0.00106725
[10]	cv_agg's auc: 0.705891 + 0.000925521
[11]	cv_agg's auc: 0.706189 + 0.000554291
[12]	cv_agg's auc: 0.707076 + 0.000665564
[13]	cv_agg's auc: 0.708603 + 0.00129004
[14]	cv_agg's auc: 0.708937 + 0.00145589
[15]	cv_agg's auc: 0.709415 + 0.00157124
[16]	cv_agg's auc: 0.709714 + 0.00196727
[17]	cv_agg's auc: 0.709867 + 0.00182328
[18]	cv_agg's auc: 0.70972 + 0.00158076
[19]	cv_agg's auc: 0.709672 + 0.00148612
[20]	cv_agg's auc: 0.709733 + 0.00195578
[21]	cv_agg's auc: 0.7094 + 0.00183158
[22]	cv_agg's auc: 0.709915 + 0.00147702
[23]	cv_agg's auc: 0.710398 + 0.00144298
[24]	cv_agg's auc: 0.710277 + 0.00155685
[25]	cv_agg's auc: 0.71047

[201]	cv_agg's auc: 0.728919 + 0.00197582
[202]	cv_agg's auc: 0.729036 + 0.00200416
[203]	cv_agg's auc: 0.729051 + 0.00197551
[204]	cv_agg's auc: 0.729094 + 0.00193134
[205]	cv_agg's auc: 0.729157 + 0.00192997
[206]	cv_agg's auc: 0.729189 + 0.00189364
[207]	cv_agg's auc: 0.729278 + 0.00193088
[208]	cv_agg's auc: 0.729343 + 0.00192997
[209]	cv_agg's auc: 0.729378 + 0.00188453
[210]	cv_agg's auc: 0.729486 + 0.00193416
[211]	cv_agg's auc: 0.729541 + 0.00192916
[212]	cv_agg's auc: 0.729637 + 0.00193516
[213]	cv_agg's auc: 0.729695 + 0.00192038
[214]	cv_agg's auc: 0.72978 + 0.00192208
[215]	cv_agg's auc: 0.7298 + 0.00187272
[216]	cv_agg's auc: 0.729801 + 0.00189166
[217]	cv_agg's auc: 0.7298 + 0.00190066
[218]	cv_agg's auc: 0.729872 + 0.00191161
[219]	cv_agg's auc: 0.729986 + 0.00195254
[220]	cv_agg's auc: 0.7301 + 0.00194118
[221]	cv_agg's auc: 0.730135 + 0.00191338
[222]	cv_agg's auc: 0.730178 + 0.00193736
[223]	cv_agg's auc: 0.73016 + 0.00196996
[224]	cv_agg's auc: 0.730203 + 0.00194713


[401]	cv_agg's auc: 0.737803 + 0.00229176
[402]	cv_agg's auc: 0.73783 + 0.00230045
[403]	cv_agg's auc: 0.737864 + 0.0022666
[404]	cv_agg's auc: 0.737876 + 0.00226842
[405]	cv_agg's auc: 0.737914 + 0.00226924
[406]	cv_agg's auc: 0.737952 + 0.00223519
[407]	cv_agg's auc: 0.737975 + 0.0022441
[408]	cv_agg's auc: 0.737959 + 0.00226322
[409]	cv_agg's auc: 0.737968 + 0.0022408
[410]	cv_agg's auc: 0.737988 + 0.00221136
[411]	cv_agg's auc: 0.738013 + 0.00223448
[412]	cv_agg's auc: 0.738045 + 0.00221524
[413]	cv_agg's auc: 0.738087 + 0.00220872
[414]	cv_agg's auc: 0.738116 + 0.00223898
[415]	cv_agg's auc: 0.738096 + 0.0022169
[416]	cv_agg's auc: 0.73811 + 0.00217334
[417]	cv_agg's auc: 0.738156 + 0.00218506
[418]	cv_agg's auc: 0.738174 + 0.00214362
[419]	cv_agg's auc: 0.738228 + 0.00215428
[420]	cv_agg's auc: 0.738254 + 0.00216509
[421]	cv_agg's auc: 0.738278 + 0.00215254
[422]	cv_agg's auc: 0.738295 + 0.0021018
[423]	cv_agg's auc: 0.738244 + 0.00209243
[424]	cv_agg's auc: 0.738282 + 0.00209185

[599]	cv_agg's auc: 0.741078 + 0.00167795
[600]	cv_agg's auc: 0.741107 + 0.00168376
[601]	cv_agg's auc: 0.741101 + 0.00168993
[602]	cv_agg's auc: 0.7411 + 0.00171529
[603]	cv_agg's auc: 0.741097 + 0.0016912
[604]	cv_agg's auc: 0.741126 + 0.00170478
[605]	cv_agg's auc: 0.741143 + 0.00172474
[606]	cv_agg's auc: 0.741151 + 0.00174654
[607]	cv_agg's auc: 0.741173 + 0.00174991
[608]	cv_agg's auc: 0.741182 + 0.0017307
[609]	cv_agg's auc: 0.741198 + 0.00172382
[610]	cv_agg's auc: 0.741207 + 0.00170242
[611]	cv_agg's auc: 0.741209 + 0.00173775
[612]	cv_agg's auc: 0.741214 + 0.00176259
[613]	cv_agg's auc: 0.741214 + 0.001747
[614]	cv_agg's auc: 0.741233 + 0.00178681
[615]	cv_agg's auc: 0.741233 + 0.00179552
[616]	cv_agg's auc: 0.741271 + 0.00178903
[617]	cv_agg's auc: 0.741299 + 0.00181393
[618]	cv_agg's auc: 0.741314 + 0.00181789
[619]	cv_agg's auc: 0.741335 + 0.0017927
[620]	cv_agg's auc: 0.741344 + 0.00179679
[621]	cv_agg's auc: 0.741346 + 0.00177834
[622]	cv_agg's auc: 0.741358 + 0.0018146


[800]	cv_agg's auc: 0.742636 + 0.00198244
[801]	cv_agg's auc: 0.742637 + 0.0019934
[802]	cv_agg's auc: 0.742622 + 0.00199154
[803]	cv_agg's auc: 0.742612 + 0.00201549


0.7426486398702922

#### TSNE

In [17]:
%%time
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, 
            verbose=1, 
            perplexity=40, 
            n_iter=300
           )

tsne_clusters = tsne.fit_transform(X_train)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 30500 samples in 3.591s...
[t-SNE] Computed neighbors for 30500 samples in 444.131s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30500
[t-SNE] Computed conditional probabilities for sample 2000 / 30500
[t-SNE] Computed conditional probabilities for sample 3000 / 30500
[t-SNE] Computed conditional probabilities for sample 4000 / 30500
[t-SNE] Computed conditional probabilities for sample 5000 / 30500
[t-SNE] Computed conditional probabilities for sample 6000 / 30500
[t-SNE] Computed conditional probabilities for sample 7000 / 30500
[t-SNE] Computed conditional probabilities for sample 8000 / 30500
[t-SNE] Computed conditional probabilities for sample 9000 / 30500
[t-SNE] Computed conditional probabilities for sample 10000 / 30500
[t-SNE] Computed conditional probabilities for sample 11000 / 30500
[t-SNE] Computed conditional probabilities for sample 12000 / 30500
[t-SNE] Computed conditional probabilities for 

In [19]:
temp_x_train = X_train.iloc[:]
temp_x_train['tsne_class'] = hdbscan_clusters

built_in_validation(temp_x_train, Y_train)

[1]	cv_agg's auc: 0.678926 + 0.00429698
[2]	cv_agg's auc: 0.696123 + 0.00197918
[3]	cv_agg's auc: 0.69965 + 0.00155806
[4]	cv_agg's auc: 0.701922 + 0.00269514
[5]	cv_agg's auc: 0.702196 + 0.0025568
[6]	cv_agg's auc: 0.703135 + 0.00204914
[7]	cv_agg's auc: 0.704174 + 0.00156383
[8]	cv_agg's auc: 0.704771 + 0.00201679
[9]	cv_agg's auc: 0.70528 + 0.00106755
[10]	cv_agg's auc: 0.705926 + 0.000948532
[11]	cv_agg's auc: 0.706234 + 0.000549555
[12]	cv_agg's auc: 0.707101 + 0.000638917
[13]	cv_agg's auc: 0.708633 + 0.00125653
[14]	cv_agg's auc: 0.708975 + 0.00141397
[15]	cv_agg's auc: 0.709461 + 0.00153075
[16]	cv_agg's auc: 0.709749 + 0.00194053
[17]	cv_agg's auc: 0.709912 + 0.00179447
[18]	cv_agg's auc: 0.709771 + 0.00156834
[19]	cv_agg's auc: 0.709728 + 0.00147865
[20]	cv_agg's auc: 0.709765 + 0.00195533
[21]	cv_agg's auc: 0.709432 + 0.00183
[22]	cv_agg's auc: 0.709944 + 0.00146316
[23]	cv_agg's auc: 0.710532 + 0.0015885
[24]	cv_agg's auc: 0.710387 + 0.00166936
[25]	cv_agg's auc: 0.710573 +

[200]	cv_agg's auc: 0.728769 + 0.0020082
[201]	cv_agg's auc: 0.728842 + 0.00203458
[202]	cv_agg's auc: 0.728951 + 0.00205052
[203]	cv_agg's auc: 0.729047 + 0.00200161
[204]	cv_agg's auc: 0.729099 + 0.00194938
[205]	cv_agg's auc: 0.729216 + 0.00192481
[206]	cv_agg's auc: 0.729269 + 0.00188768
[207]	cv_agg's auc: 0.72937 + 0.00195243
[208]	cv_agg's auc: 0.729434 + 0.00195311
[209]	cv_agg's auc: 0.729486 + 0.00193052
[210]	cv_agg's auc: 0.729591 + 0.00198622
[211]	cv_agg's auc: 0.72965 + 0.00200044
[212]	cv_agg's auc: 0.72976 + 0.00200936
[213]	cv_agg's auc: 0.729818 + 0.00199088
[214]	cv_agg's auc: 0.729899 + 0.00200342
[215]	cv_agg's auc: 0.729951 + 0.00191505
[216]	cv_agg's auc: 0.729968 + 0.00191636
[217]	cv_agg's auc: 0.729964 + 0.00193842
[218]	cv_agg's auc: 0.730047 + 0.00195797
[219]	cv_agg's auc: 0.730138 + 0.00198095
[220]	cv_agg's auc: 0.73025 + 0.00197059
[221]	cv_agg's auc: 0.730287 + 0.0019309
[222]	cv_agg's auc: 0.730343 + 0.0019474
[223]	cv_agg's auc: 0.730329 + 0.00197965

[397]	cv_agg's auc: 0.738016 + 0.00236969
[398]	cv_agg's auc: 0.738028 + 0.0023858
[399]	cv_agg's auc: 0.737989 + 0.00232578
[400]	cv_agg's auc: 0.737994 + 0.00229355
[401]	cv_agg's auc: 0.73802 + 0.00229955
[402]	cv_agg's auc: 0.738036 + 0.00229164
[403]	cv_agg's auc: 0.738053 + 0.00226975
[404]	cv_agg's auc: 0.738053 + 0.0022622
[405]	cv_agg's auc: 0.738089 + 0.00226633
[406]	cv_agg's auc: 0.738145 + 0.00223918
[407]	cv_agg's auc: 0.738182 + 0.00223507
[408]	cv_agg's auc: 0.738194 + 0.00222458
[409]	cv_agg's auc: 0.738244 + 0.00221123
[410]	cv_agg's auc: 0.738251 + 0.00219346
[411]	cv_agg's auc: 0.738268 + 0.00222739
[412]	cv_agg's auc: 0.738269 + 0.00223244
[413]	cv_agg's auc: 0.738276 + 0.00224333
[414]	cv_agg's auc: 0.738279 + 0.00226372
[415]	cv_agg's auc: 0.738268 + 0.002253
[416]	cv_agg's auc: 0.738262 + 0.00221814
[417]	cv_agg's auc: 0.738311 + 0.00222958
[418]	cv_agg's auc: 0.738328 + 0.00219924
[419]	cv_agg's auc: 0.738378 + 0.00220617
[420]	cv_agg's auc: 0.73838 + 0.0022251

[597]	cv_agg's auc: 0.741368 + 0.00187403
[598]	cv_agg's auc: 0.741378 + 0.00189012
[599]	cv_agg's auc: 0.74143 + 0.00189501
[600]	cv_agg's auc: 0.741445 + 0.00190571
[601]	cv_agg's auc: 0.741447 + 0.00188884
[602]	cv_agg's auc: 0.741452 + 0.00192465
[603]	cv_agg's auc: 0.741462 + 0.00190981
[604]	cv_agg's auc: 0.741505 + 0.00192061
[605]	cv_agg's auc: 0.741488 + 0.0019258
[606]	cv_agg's auc: 0.741512 + 0.00193126
[607]	cv_agg's auc: 0.74153 + 0.00195807
[608]	cv_agg's auc: 0.741518 + 0.00197
[609]	cv_agg's auc: 0.741516 + 0.00196706
[610]	cv_agg's auc: 0.741531 + 0.00194948
[611]	cv_agg's auc: 0.741528 + 0.00194557
[612]	cv_agg's auc: 0.741511 + 0.00195233
[613]	cv_agg's auc: 0.741547 + 0.00195845
[614]	cv_agg's auc: 0.741575 + 0.00199311
[615]	cv_agg's auc: 0.741584 + 0.00199474
[616]	cv_agg's auc: 0.741621 + 0.00200839
[617]	cv_agg's auc: 0.741646 + 0.00201859
[618]	cv_agg's auc: 0.741669 + 0.00200531
[619]	cv_agg's auc: 0.741674 + 0.0019746
[620]	cv_agg's auc: 0.741683 + 0.00198871

0.7424163543937508

## Feature Selection
- SelectKBest: 0.7404950613133826
- - 0.75200698
- VarianceThreshold: 0.7419278882415092
- - LB: 0.75057846
- SelectFromModel: 0.6998173137620546

In [28]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

#### SelectKBest

In [98]:
skb = SelectKBest(f_classif, k=180)

In [99]:
x_data_kbest = skb.fit_transform(X_train, Y_train)

In [48]:
built_in_validation(x_data_kbest, Y_train)

[1]	cv_agg's auc: 0.682237 + 0.00229805
[2]	cv_agg's auc: 0.689263 + 0.00137437
[3]	cv_agg's auc: 0.696104 + 0.00315881
[4]	cv_agg's auc: 0.700077 + 0.00240652
[5]	cv_agg's auc: 0.700724 + 0.00195799
[6]	cv_agg's auc: 0.702575 + 0.00211721
[7]	cv_agg's auc: 0.703109 + 0.000883239
[8]	cv_agg's auc: 0.704196 + 0.0015095
[9]	cv_agg's auc: 0.705502 + 0.00188894
[10]	cv_agg's auc: 0.707882 + 0.00147308
[11]	cv_agg's auc: 0.708117 + 0.0018124
[12]	cv_agg's auc: 0.707576 + 0.00133753
[13]	cv_agg's auc: 0.708144 + 0.00137898
[14]	cv_agg's auc: 0.708243 + 0.00148515
[15]	cv_agg's auc: 0.708403 + 0.00195696
[16]	cv_agg's auc: 0.708875 + 0.0023925
[17]	cv_agg's auc: 0.709016 + 0.0017339
[18]	cv_agg's auc: 0.708762 + 0.00182991
[19]	cv_agg's auc: 0.709261 + 0.00205313
[20]	cv_agg's auc: 0.709422 + 0.00232701
[21]	cv_agg's auc: 0.709523 + 0.00239522
[22]	cv_agg's auc: 0.709576 + 0.00235374
[23]	cv_agg's auc: 0.709887 + 0.00188196
[24]	cv_agg's auc: 0.710463 + 0.00216878
[25]	cv_agg's auc: 0.710502 

[200]	cv_agg's auc: 0.728596 + 0.0022346
[201]	cv_agg's auc: 0.728681 + 0.00225565
[202]	cv_agg's auc: 0.728755 + 0.00226973
[203]	cv_agg's auc: 0.728815 + 0.00233382
[204]	cv_agg's auc: 0.728897 + 0.00232316
[205]	cv_agg's auc: 0.728933 + 0.00224945
[206]	cv_agg's auc: 0.729012 + 0.00226624
[207]	cv_agg's auc: 0.729105 + 0.00226712
[208]	cv_agg's auc: 0.729106 + 0.00226644
[209]	cv_agg's auc: 0.729162 + 0.00220662
[210]	cv_agg's auc: 0.729147 + 0.00218949
[211]	cv_agg's auc: 0.729251 + 0.00223437
[212]	cv_agg's auc: 0.729343 + 0.00224767
[213]	cv_agg's auc: 0.729506 + 0.00222554
[214]	cv_agg's auc: 0.729503 + 0.0022141
[215]	cv_agg's auc: 0.729537 + 0.0021738
[216]	cv_agg's auc: 0.729627 + 0.00213643
[217]	cv_agg's auc: 0.729734 + 0.00210447
[218]	cv_agg's auc: 0.72985 + 0.00208517
[219]	cv_agg's auc: 0.729908 + 0.00208791
[220]	cv_agg's auc: 0.729931 + 0.00211627
[221]	cv_agg's auc: 0.729982 + 0.00209662
[222]	cv_agg's auc: 0.730027 + 0.00210242
[223]	cv_agg's auc: 0.730055 + 0.00214

[398]	cv_agg's auc: 0.737726 + 0.00194354
[399]	cv_agg's auc: 0.737755 + 0.00196865
[400]	cv_agg's auc: 0.737776 + 0.00195018
[401]	cv_agg's auc: 0.737773 + 0.00195514
[402]	cv_agg's auc: 0.737798 + 0.00197836
[403]	cv_agg's auc: 0.737796 + 0.00193503
[404]	cv_agg's auc: 0.737842 + 0.00189491
[405]	cv_agg's auc: 0.737846 + 0.00191835
[406]	cv_agg's auc: 0.737863 + 0.0019325
[407]	cv_agg's auc: 0.73788 + 0.00194726
[408]	cv_agg's auc: 0.737898 + 0.00196477
[409]	cv_agg's auc: 0.737948 + 0.00200073
[410]	cv_agg's auc: 0.737991 + 0.00198415
[411]	cv_agg's auc: 0.73802 + 0.00198475
[412]	cv_agg's auc: 0.738039 + 0.00197735
[413]	cv_agg's auc: 0.738045 + 0.00194999
[414]	cv_agg's auc: 0.738079 + 0.00196267
[415]	cv_agg's auc: 0.738097 + 0.00191563
[416]	cv_agg's auc: 0.738063 + 0.00190999
[417]	cv_agg's auc: 0.738047 + 0.00193148
[418]	cv_agg's auc: 0.738045 + 0.00192372
[419]	cv_agg's auc: 0.738064 + 0.0019034
[420]	cv_agg's auc: 0.738105 + 0.00188707
[421]	cv_agg's auc: 0.738121 + 0.00188

[595]	cv_agg's auc: 0.741071 + 0.00174009
[596]	cv_agg's auc: 0.741077 + 0.00173987
[597]	cv_agg's auc: 0.741121 + 0.0017417
[598]	cv_agg's auc: 0.741122 + 0.00174767
[599]	cv_agg's auc: 0.741127 + 0.00174434
[600]	cv_agg's auc: 0.741135 + 0.00173653
[601]	cv_agg's auc: 0.741133 + 0.00172619
[602]	cv_agg's auc: 0.741125 + 0.00173584
[603]	cv_agg's auc: 0.741166 + 0.00170257
[604]	cv_agg's auc: 0.74119 + 0.00167799
[605]	cv_agg's auc: 0.741171 + 0.00163877
[606]	cv_agg's auc: 0.741164 + 0.00163608
[607]	cv_agg's auc: 0.741189 + 0.00164998
[608]	cv_agg's auc: 0.741192 + 0.00166217
[609]	cv_agg's auc: 0.741236 + 0.00162676
[610]	cv_agg's auc: 0.741309 + 0.00161005
[611]	cv_agg's auc: 0.741306 + 0.00161331
[612]	cv_agg's auc: 0.741328 + 0.00162327
[613]	cv_agg's auc: 0.741316 + 0.00164862
[614]	cv_agg's auc: 0.741315 + 0.00161895
[615]	cv_agg's auc: 0.741363 + 0.00163849
[616]	cv_agg's auc: 0.741387 + 0.00163059
[617]	cv_agg's auc: 0.741374 + 0.00162128
[618]	cv_agg's auc: 0.741377 + 0.001

0.741992688326674

In [100]:
x_data_kbest_test = skb.transform(X_test)

In [101]:
submit_model(x_data_varth, Y_train, x_data_varth_test, 'sumbission7_feature_selection_selectKBest.csv', 678)

#### VarianceThreshold

In [89]:
vth = VarianceThreshold(0.03)
x_data_varth = vth.fit_transform(X_train, Y_train)

In [90]:
built_in_validation(x_data_varth, Y_train)

[1]	cv_agg's auc: 0.674445 + 0.00165465
[2]	cv_agg's auc: 0.690319 + 0.000769881
[3]	cv_agg's auc: 0.696154 + 0.000699545
[4]	cv_agg's auc: 0.698747 + 0.00155139
[5]	cv_agg's auc: 0.699398 + 0.00143307
[6]	cv_agg's auc: 0.701497 + 0.00130836
[7]	cv_agg's auc: 0.702382 + 0.000892468
[8]	cv_agg's auc: 0.702996 + 0.000169031
[9]	cv_agg's auc: 0.703754 + 0.00149977
[10]	cv_agg's auc: 0.704768 + 0.00155394
[11]	cv_agg's auc: 0.705911 + 0.00155959
[12]	cv_agg's auc: 0.705869 + 0.00110173
[13]	cv_agg's auc: 0.706555 + 0.00193215
[14]	cv_agg's auc: 0.707208 + 0.0025367
[15]	cv_agg's auc: 0.707583 + 0.00266281
[16]	cv_agg's auc: 0.708295 + 0.00268338
[17]	cv_agg's auc: 0.708426 + 0.00274034
[18]	cv_agg's auc: 0.708777 + 0.00235347
[19]	cv_agg's auc: 0.709242 + 0.00247438
[20]	cv_agg's auc: 0.709513 + 0.0026522
[21]	cv_agg's auc: 0.709322 + 0.00205845
[22]	cv_agg's auc: 0.709523 + 0.0019478
[23]	cv_agg's auc: 0.709975 + 0.00227331
[24]	cv_agg's auc: 0.710659 + 0.00199768
[25]	cv_agg's auc: 0.710

[202]	cv_agg's auc: 0.727251 + 0.00172728
[203]	cv_agg's auc: 0.727335 + 0.00170743
[204]	cv_agg's auc: 0.727439 + 0.00167263
[205]	cv_agg's auc: 0.727489 + 0.00160633
[206]	cv_agg's auc: 0.727523 + 0.00161972
[207]	cv_agg's auc: 0.727619 + 0.00165731
[208]	cv_agg's auc: 0.727659 + 0.0016292
[209]	cv_agg's auc: 0.727657 + 0.00157821
[210]	cv_agg's auc: 0.727718 + 0.00157734
[211]	cv_agg's auc: 0.727812 + 0.00155027
[212]	cv_agg's auc: 0.727899 + 0.00155843
[213]	cv_agg's auc: 0.727982 + 0.00162086
[214]	cv_agg's auc: 0.728058 + 0.00166348
[215]	cv_agg's auc: 0.728128 + 0.00167155
[216]	cv_agg's auc: 0.728192 + 0.00164335
[217]	cv_agg's auc: 0.7283 + 0.00167956
[218]	cv_agg's auc: 0.728448 + 0.00167515
[219]	cv_agg's auc: 0.72852 + 0.00168885
[220]	cv_agg's auc: 0.728655 + 0.00168162
[221]	cv_agg's auc: 0.728751 + 0.00174756
[222]	cv_agg's auc: 0.728878 + 0.00181186
[223]	cv_agg's auc: 0.728948 + 0.0018006
[224]	cv_agg's auc: 0.728948 + 0.00178948
[225]	cv_agg's auc: 0.729043 + 0.001793

[398]	cv_agg's auc: 0.736765 + 0.00231256
[399]	cv_agg's auc: 0.736765 + 0.00235965
[400]	cv_agg's auc: 0.736758 + 0.0023277
[401]	cv_agg's auc: 0.736775 + 0.00236591
[402]	cv_agg's auc: 0.736812 + 0.0024082
[403]	cv_agg's auc: 0.736802 + 0.00233431
[404]	cv_agg's auc: 0.736771 + 0.00233797
[405]	cv_agg's auc: 0.736791 + 0.00238696
[406]	cv_agg's auc: 0.736816 + 0.00243466
[407]	cv_agg's auc: 0.736815 + 0.00242992
[408]	cv_agg's auc: 0.736832 + 0.00242427
[409]	cv_agg's auc: 0.736861 + 0.00244054
[410]	cv_agg's auc: 0.736897 + 0.00240952
[411]	cv_agg's auc: 0.7369 + 0.00241896
[412]	cv_agg's auc: 0.736868 + 0.00241711
[413]	cv_agg's auc: 0.736897 + 0.0023954
[414]	cv_agg's auc: 0.736908 + 0.00239883
[415]	cv_agg's auc: 0.736873 + 0.00239975
[416]	cv_agg's auc: 0.736923 + 0.00237899
[417]	cv_agg's auc: 0.736931 + 0.00235896
[418]	cv_agg's auc: 0.736951 + 0.00234485
[419]	cv_agg's auc: 0.73701 + 0.00234575
[420]	cv_agg's auc: 0.737034 + 0.00235456
[421]	cv_agg's auc: 0.737058 + 0.0023110

0.738963146605311

In [91]:
x_data_varth_test = vth.transform(X_test)

In [92]:
submit_model(x_data_varth, Y_train, x_data_varth_test, 'sumbission7_feature_selection.csv', 544)

#### SelectFromModel

In [78]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

In [83]:
pipe = make_pipeline(
    SelectFromModel(estimator=RandomForestClassifier(random_state=42)),
    lgb.LGBMClassifier(**lgb_parameters)
)

print(
    cross_val_score(pipe, X_train, Y_train, 
                    scoring='roc_auc', cv=skf.split(X_train, Y_train)).mean()
)

0.6998173137620546


## Encoding

In [179]:
en_train = pd.read_csv(path_join(CSV_DIR, 'train.csv'), sep='\t')
en_test = pd.read_csv(path_join(CSV_DIR, 'test.csv'), sep='\t')

In [327]:
en_X_train, en_Y_train = en_train.drop(columns=['0', 'Unnamed: 0']), train['0']
en_X_test, en_Y_test = en_test.drop(columns=['0', 'Unnamed: 0']), test[['Unnamed: 0', '0']]

In [328]:
class IterDF:
    def __init__(self, df):
        self.df = df
        self.it = -10
        
    def __iter__(self):
        iter_values = []
        
        while True:
            if self.it + 10 <= self.get_len():
                self.it += 10
            else:
                self.it += self.get_len() - self.it

            yield self.df.iloc[:, self.it:self.it+10]
            
            self.it -= 5
        
          
    def get_len(self):
        return len(self.df.columns)
    
d = iter(IterDF(en_X_train))

In [329]:
a = next(d)

values = [a[column].value_counts() for column in a.columns
          if a[column].dtype == 'int64']
for index in range(len(values)):
    print(f'----{a.columns[index]}:---- \n{values[index]}')

a.head(10)

----1:---- 
1    29826
0      674
Name: 1, dtype: int64
----2:---- 
0    29831
1      669
Name: 2, dtype: int64
----3:---- 
0    30495
1        5
Name: 3, dtype: int64
----4:---- 
0    17151
1    13349
Name: 4, dtype: int64
----5:---- 
0    30481
1       19
Name: 5, dtype: int64
----6:---- 
0    30478
1       22
Name: 6, dtype: int64
----7:---- 
0    30398
1      102
Name: 8, dtype: int64
----8:---- 
0    29831
1      669
Name: 9, dtype: int64
----9:---- 
1    28760
0     1740
Name: 10, dtype: int64


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,1,0,0,0,0,0,0.090909,0,0,1
1,1,0,0,1,0,0,0.090909,0,0,1
2,1,0,0,1,0,0,0.090909,0,0,1
3,1,0,0,1,0,0,0.136364,0,0,1
4,1,0,0,1,0,0,0.136364,0,0,1
5,1,0,0,0,0,0,0.090909,0,0,0
6,1,0,0,0,0,0,0.090909,0,0,1
7,1,0,0,1,0,0,0.090909,0,0,1
8,1,0,0,0,0,0,0.090909,0,0,1
9,1,0,0,0,0,0,0.136364,0,0,1


In [330]:
def check_categorical(df, cols):
    result = sum_cols(df, cols)
    if len(result.value_counts()) == 1:
        return True
    return False

def sum_cols(df, cols):
    if len(cols) == 1:
        return df[cols.pop()]
    col = cols.pop()
    return df[col] + sum_cols(df, cols)

In [331]:
print(check_categorical(en_X_train, ['1', '2', '3']))
print(check_categorical(en_X_test, ['1', '2', '3']))
print(check_categorical(en_X_train, ['8', '9', '10']))
print(check_categorical(en_X_train, list(map(str, range(33, 60)))))

True
True
False
False


In [333]:
def get_categorie(df):
    seria = np.array(list(range(len(df))))
                     
    for index, values in enumerate(df.values):
        if values[0] == 1:
            seria[index] = 1
        if values[1] == 1:
            seria[index] = 2
        if values[2] == 1:
            seria[index] = 3
    return seria

new_feature = get_categorie(en_X_train[['1', '2', '3']])
en_X_train['cat_feat'] = new_feature


new_feature = get_categorie(en_X_test[['1', '2', '3']])
en_X_test['cat_feat'] = new_feature

en_X_train = en_X_train.drop(columns=['1', '2', '3'])
en_X_test = en_X_test.drop(columns=['1', '2', '3'])

In [336]:
en_X_train.head()

Unnamed: 0,4,5,6,7,8,9,10,11,12,13,...,337,338,339,340,341,342,343,344,345,cat_feat
0,0,0,0,0.090909,0,0,1,1,1,0.461538,...,0,1,0,0,0.222222,1,1,1,1,1
1,1,0,0,0.090909,0,0,1,1,1,0.1875,...,0,1,0,0,0.111111,1,1,1,0,1
2,1,0,0,0.090909,0,0,1,1,1,0.2,...,0,1,0,0,0.444444,1,1,1,1,1
3,1,0,0,0.136364,0,0,1,1,1,0.0,...,0,0,1,0,0.222222,1,1,1,0,1
4,1,0,0,0.136364,0,0,1,1,1,0.65,...,0,0,1,0,0.111111,1,1,1,1,1


In [338]:
# by Stas Semenov
import time

class SemenovEncoding:
    
    def __init__(self, C=10):
        self.C = C
        self.cpu_k = 3
        self.global_mean = 0
        self.features = 'all'
        self.cat_columns = []
        self.y = 0
        self.values = dict()

    def fit(self, data, y, features='all'):

        self.y = y
        
        if features == 'all':
            self.cat_columns = sorted([i for i in data.columns if data[i].dtype == 'O'])
            self.features = self.cat_columns
        else:
            self.features = features

        self.global_mean = np.mean(y)

        f = {'y': ['size', 'mean']}

        for col in self.features:
            self.values[col] = dict()
            temp = pd.DataFrame({'y': y, col: data[col]}).groupby([col]).agg(f)

            self.values[col] = (
                (temp['y']['mean'] * temp['y']['size'] + self.global_mean * self.C) / 
                (temp['y']['size'] + self.C)
            ).to_dict()
            
        return self.values

    def fit_transform(self, data, y, features='all', inplace=True):
        
        self.fit(data, y, features)
        return self.transform(data, inplace=inplace)

    def transform(self, data, inplace=True):
        
        if inplace:
            for col in self.values:
                if col in data.columns:
                    temp = pd.DataFrame.from_dict(
                        self.values[col], orient='index').reset_index()
                    temp.columns = [col, 'value']
                    data = pd.merge(data, temp, how='left').fillna(self.global_mean)
                    data[col] = data['value']
                    del data['value']
                    data[col] = data[col].astype('float32')

                else:
                    warnings.warn('Column ' + col + ' is missed in this dataset.')
        else:
            new_data = data.copy()
            for col in self.values:
                if col in new_data.columns:
                    temp = pd.DataFrame.from_dict(
                        self.values[col], orient='index').reset_index()
                    temp.columns = [col, 'value']
                    new_data = pd.merge(
                        new_data, temp, how='left').fillna(self.global_mean)
                    new_data[col] = new_data['value']
                    del new_data['value']
                    new_data[col] = new_data[col].astype('float32')

                else:
                    warnings.warn('Column ' + col + ' is missed in this dataset.')
            return new_data

In [13]:
def create_new_df_with_categorical_encodings(new_train, new_train_y, new_val, cols):
    se = SemenovEncoding()
    new_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    val_dfs = []
    # 2)
    for new_train_split, new_val_split in new_skf.split(new_train, new_train_y):
        # 3)
        se.fit(
            new_train.iloc[new_train_split], 
            new_train_y.iloc[new_train_split], 
            features=cols
        )
        val_dfs.append(
            se.transform(new_train.iloc[new_val_split], inplace=False)
        )
    # 4)
    se.fit(new_train, new_train_y, features=cols)
    main_val = se.transform(new_val, inplace=False)
    return val_dfs, main_val

In [386]:
%%time

cols_to_encode = 'cat_feat'
new_train_dfs = []
new_val_dfs = []
main_train_dfs = []

for train_split, val_split in skf.split(en_X_train, en_Y_train): 
    # 5)
    temp_train_dfs, temp_val_df = create_new_df_with_categorical_encodings(
        en_X_train.iloc[train_split], 
        en_Y_train.iloc[train_split], 
        en_X_train.iloc[val_split], 
        [cols_to_encode]
    )
    # 6)
    new_train_dfs.append(temp_train_dfs)
    new_val_dfs.append(temp_val_df)
    # 7)
    se = SemenovEncoding()
    se.fit(en_X_train.iloc[train_split], en_Y_train.iloc[train_split], [cols_to_encode])
    main_train_dfs.append(
        se.transform(en_X_train.iloc[val_split], inplace=False)
    )
    
se.fit(en_X_train, en_Y_train, features=[cols_to_encode])
main_test = se.transform(en_X_test, inplace=False)

CPU times: user 2.08 s, sys: 172 ms, total: 2.26 s
Wall time: 2.44 s


In [372]:
len(new_val_dfs[0])

10167

In [373]:
len(main_train_dfs[0])

10167

In [387]:
# abc = [new_val_dfs[0], new_val_dfs[1], new_val_dfs[2]]
abc = [main_train_dfs[0], main_train_dfs[1], main_train_dfs[2]]
main_train = pd.concat([i for i in abc], axis=0).reset_index()
main_train.shape

(30500, 344)

In [388]:
built_in_validation(main_train, Y_train)

[1]	cv_agg's auc: 0.504528 + 0.00114628
[2]	cv_agg's auc: 0.50112 + 0.000448236
[3]	cv_agg's auc: 0.501092 + 0.00548751
[4]	cv_agg's auc: 0.500922 + 0.00376059
[5]	cv_agg's auc: 0.500124 + 0.00359239
[6]	cv_agg's auc: 0.501041 + 0.00347441
[7]	cv_agg's auc: 0.497841 + 0.00285686
[8]	cv_agg's auc: 0.49905 + 0.00261365
[9]	cv_agg's auc: 0.499629 + 0.00261052
[10]	cv_agg's auc: 0.500458 + 0.00306823
[11]	cv_agg's auc: 0.497729 + 0.00369039


0.504527724967205

## xgbfir
- Baseline
- - 0.740913
- LB
- - 0.74047620
- xgbfir
- - 0.741162
- LB:
- - 0.73037394

In [93]:
import xgbfir
import xgboost as xgb

In [136]:
def submit_model_xgb(x_train, y_train, x_test, file_name, n_estimators=900):
    params_xgb['n_estimators'] = n_estimators
    xgb_model = xgb.XGBClassifier(**params_xgb)
    del params_xgb['n_estimators']

    
    xgb_model.fit(x_train, y_train)

    xgb_prediction = xgb_model.predict_proba(x_test)[:, 1]
    
    write_pred(file_name, xgb_prediction)

In [97]:
train_cols = [col for col in X_train.columns if X_train[col].dtype != 'O']

params_xgb = {
   
    'objective': 'binary:logistic',
    'eta': 0.01,
    'silent': 1,
    "nthread": 4,
    "random_seed": 17,
    "eval_metric": 'auc',
   
    
    'max_depth':  8,
    'max_leaves': 75,
    'subsample': 0.85, 
    'colsample_bytree': 0.66,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
   
}

xgb_train = xgb.DMatrix(X_train, Y_train, feature_names=X_train.columns)

In [96]:
results = xgb.cv(params_xgb, 
                 xgb_train, 
                 num_boost_round=num_rounds, 
                 folds=skf, 
                 early_stopping_rounds=150, 
                 verbose_eval=1,
                )

[0]	train-auc:0.717196+0.00265508	test-auc:0.676228+0.00324719
[1]	train-auc:0.734875+0.00229185	test-auc:0.696207+0.00408926
[2]	train-auc:0.74205+0.00081801	test-auc:0.701642+0.00269961
[3]	train-auc:0.746545+0.000659963	test-auc:0.704447+0.00391783
[4]	train-auc:0.753499+0.00271026	test-auc:0.708228+0.0024435
[5]	train-auc:0.758079+0.00454949	test-auc:0.711534+0.00376249
[6]	train-auc:0.761214+0.00335533	test-auc:0.711658+0.00337127
[7]	train-auc:0.763094+0.00261457	test-auc:0.713604+0.00294704
[8]	train-auc:0.764674+0.00318112	test-auc:0.714+0.00339636
[9]	train-auc:0.765866+0.00301316	test-auc:0.714074+0.00293739
[10]	train-auc:0.765946+0.00231546	test-auc:0.715002+0.0037759
[11]	train-auc:0.767058+0.00357761	test-auc:0.715453+0.00378781
[12]	train-auc:0.768011+0.00337779	test-auc:0.716137+0.00372909
[13]	train-auc:0.768741+0.00339283	test-auc:0.716622+0.00396137
[14]	train-auc:0.768589+0.00330718	test-auc:0.716902+0.00450149
[15]	train-auc:0.768713+0.00339179	test-auc:0.716762+0.

[129]	train-auc:0.81213+0.00174715	test-auc:0.72754+0.00319221
[130]	train-auc:0.812472+0.00180377	test-auc:0.727609+0.00310932
[131]	train-auc:0.812766+0.00187662	test-auc:0.727621+0.00311786
[132]	train-auc:0.813091+0.00189586	test-auc:0.727737+0.00320765
[133]	train-auc:0.813378+0.00196044	test-auc:0.727755+0.00319429
[134]	train-auc:0.813653+0.00191626	test-auc:0.727812+0.00324975
[135]	train-auc:0.813973+0.00178543	test-auc:0.727849+0.00327374
[136]	train-auc:0.814338+0.00182401	test-auc:0.727954+0.00326268
[137]	train-auc:0.814738+0.00172253	test-auc:0.728039+0.00321522
[138]	train-auc:0.815088+0.00163686	test-auc:0.728119+0.00326328
[139]	train-auc:0.815391+0.00167838	test-auc:0.72824+0.00327919
[140]	train-auc:0.815883+0.00167284	test-auc:0.728345+0.00331567
[141]	train-auc:0.816162+0.00162884	test-auc:0.728379+0.00328178
[142]	train-auc:0.816373+0.00165286	test-auc:0.728393+0.0032819
[143]	train-auc:0.816673+0.00164047	test-auc:0.728512+0.00325016
[144]	train-auc:0.81698+0.001

[256]	train-auc:0.856548+0.00167026	test-auc:0.735038+0.00180342
[257]	train-auc:0.856949+0.00157323	test-auc:0.73509+0.00176314
[258]	train-auc:0.857257+0.00157783	test-auc:0.735106+0.00182572
[259]	train-auc:0.8576+0.00157708	test-auc:0.735181+0.001858
[260]	train-auc:0.857908+0.00161585	test-auc:0.735243+0.00186583
[261]	train-auc:0.858181+0.00160058	test-auc:0.735322+0.0019177
[262]	train-auc:0.858501+0.00155478	test-auc:0.735419+0.00191568
[263]	train-auc:0.858838+0.00150331	test-auc:0.735449+0.00192777
[264]	train-auc:0.859131+0.00150788	test-auc:0.735528+0.00187714
[265]	train-auc:0.85948+0.00151429	test-auc:0.735577+0.0018377
[266]	train-auc:0.859779+0.00143937	test-auc:0.735649+0.00177864
[267]	train-auc:0.860121+0.00145622	test-auc:0.735688+0.00172622
[268]	train-auc:0.860416+0.00147162	test-auc:0.735668+0.00167996
[269]	train-auc:0.860709+0.00144732	test-auc:0.735682+0.00170672
[270]	train-auc:0.860998+0.00148371	test-auc:0.735679+0.00171211
[271]	train-auc:0.861396+0.001455

[383]	train-auc:0.89083+0.00120461	test-auc:0.738936+0.00146481
[384]	train-auc:0.891064+0.00121383	test-auc:0.738981+0.00147903
[385]	train-auc:0.891316+0.00124458	test-auc:0.739007+0.00149098
[386]	train-auc:0.891568+0.00121502	test-auc:0.739012+0.00148607
[387]	train-auc:0.891778+0.00123587	test-auc:0.739052+0.00147805
[388]	train-auc:0.892015+0.00120896	test-auc:0.739082+0.0014672
[389]	train-auc:0.892214+0.00115683	test-auc:0.739139+0.00147229
[390]	train-auc:0.892439+0.00115573	test-auc:0.739158+0.00150325
[391]	train-auc:0.892617+0.00114176	test-auc:0.739148+0.00150335
[392]	train-auc:0.892821+0.00113312	test-auc:0.739193+0.00146945
[393]	train-auc:0.893038+0.0011769	test-auc:0.739191+0.00140651
[394]	train-auc:0.893242+0.00119527	test-auc:0.739228+0.00140166
[395]	train-auc:0.893494+0.00114363	test-auc:0.739255+0.00141934
[396]	train-auc:0.893757+0.00117225	test-auc:0.739286+0.00142911
[397]	train-auc:0.894021+0.00117793	test-auc:0.739288+0.00142364
[398]	train-auc:0.894301+0.0

[510]	train-auc:0.915197+0.00117711	test-auc:0.740825+0.00126838
[511]	train-auc:0.915325+0.00118584	test-auc:0.740846+0.00129076
[512]	train-auc:0.915542+0.00119918	test-auc:0.740847+0.00124182
[513]	train-auc:0.915695+0.00113454	test-auc:0.740865+0.00126175
[514]	train-auc:0.915868+0.00112821	test-auc:0.74083+0.00126517
[515]	train-auc:0.916002+0.00114486	test-auc:0.740837+0.0012626
[516]	train-auc:0.916181+0.00114099	test-auc:0.740848+0.00127936
[517]	train-auc:0.916367+0.0010759	test-auc:0.740801+0.00123892
[518]	train-auc:0.916531+0.000999686	test-auc:0.740836+0.00124014
[519]	train-auc:0.9167+0.00096993	test-auc:0.740837+0.00122899
[520]	train-auc:0.916855+0.000948901	test-auc:0.740832+0.00120809
[521]	train-auc:0.916983+0.000983616	test-auc:0.740837+0.0012118
[522]	train-auc:0.917169+0.000992743	test-auc:0.740878+0.00119797
[523]	train-auc:0.917341+0.000991506	test-auc:0.740884+0.00120023
[524]	train-auc:0.917464+0.00102881	test-auc:0.740885+0.00119849
[525]	train-auc:0.91763+0.

[637]	train-auc:0.933562+0.000700614	test-auc:0.741222+0.00115183
[638]	train-auc:0.933691+0.000697957	test-auc:0.741196+0.0011533
[639]	train-auc:0.93387+0.000706586	test-auc:0.741192+0.00110665
[640]	train-auc:0.934017+0.000690523	test-auc:0.741167+0.00110113
[641]	train-auc:0.934141+0.000670221	test-auc:0.741163+0.00109803
[642]	train-auc:0.934287+0.000666454	test-auc:0.741177+0.00108901
[643]	train-auc:0.934418+0.000666796	test-auc:0.741207+0.0011324
[644]	train-auc:0.934524+0.000660362	test-auc:0.741211+0.0011729
[645]	train-auc:0.934686+0.000658878	test-auc:0.7412+0.00114861
[646]	train-auc:0.934817+0.000668237	test-auc:0.741188+0.00113283
[647]	train-auc:0.934948+0.000697586	test-auc:0.741181+0.00114291
[648]	train-auc:0.935076+0.000696756	test-auc:0.741197+0.0011282
[649]	train-auc:0.935197+0.000639409	test-auc:0.741185+0.00112685
[650]	train-auc:0.935289+0.000656858	test-auc:0.741213+0.00112344
[651]	train-auc:0.935427+0.000672064	test-auc:0.741178+0.00113577
[652]	train-auc:0

[762]	train-auc:0.947397+0.000377439	test-auc:0.741228+0.0014268
[763]	train-auc:0.947515+0.000337527	test-auc:0.741208+0.00141222
[764]	train-auc:0.947628+0.000310304	test-auc:0.741231+0.00141584
[765]	train-auc:0.947684+0.000312634	test-auc:0.741226+0.00141581
[766]	train-auc:0.947773+0.000320901	test-auc:0.741223+0.00139662
[767]	train-auc:0.947892+0.000318574	test-auc:0.741192+0.00139145
[768]	train-auc:0.947973+0.000291017	test-auc:0.741181+0.00139631
[769]	train-auc:0.948112+0.00028986	test-auc:0.741195+0.00138446
[770]	train-auc:0.948231+0.000288405	test-auc:0.74121+0.00139628
[771]	train-auc:0.9483+0.00029033	test-auc:0.741206+0.00139078
[772]	train-auc:0.948404+0.000292595	test-auc:0.741195+0.00139339
[773]	train-auc:0.94847+0.000275152	test-auc:0.741179+0.00140353
[774]	train-auc:0.948598+0.000296999	test-auc:0.741181+0.00140655
[775]	train-auc:0.948701+0.000301083	test-auc:0.741155+0.00140393
[776]	train-auc:0.948778+0.000312726	test-auc:0.74115+0.00141625
[777]	train-auc:0.

[887]	train-auc:0.958674+0.00014207	test-auc:0.740985+0.00188071
[888]	train-auc:0.958712+0.000138027	test-auc:0.741001+0.00187461
[889]	train-auc:0.958759+0.00012454	test-auc:0.740989+0.00188528
[890]	train-auc:0.958862+0.000119123	test-auc:0.740994+0.0018745
[891]	train-auc:0.958959+0.000108718	test-auc:0.740975+0.00188088
[892]	train-auc:0.95901+0.000106697	test-auc:0.740975+0.00186798
[893]	train-auc:0.959076+0.000131305	test-auc:0.740979+0.00184524
[894]	train-auc:0.959163+0.000154933	test-auc:0.740977+0.00181829
[895]	train-auc:0.959246+0.000161248	test-auc:0.740977+0.00185408
[896]	train-auc:0.959332+0.000150471	test-auc:0.740962+0.00186342
[897]	train-auc:0.959415+0.000149495	test-auc:0.740957+0.00182588
[898]	train-auc:0.959498+0.000184859	test-auc:0.740946+0.00181792
[899]	train-auc:0.95957+0.000180013	test-auc:0.740922+0.00181972
[900]	train-auc:0.959632+0.000197864	test-auc:0.740913+0.0018471


In [103]:
xgb_model = xgb.train(params_xgb, xgb_train)

In [131]:
submit_model_xgb(X_train, Y_train, X_test, 'submission7_xgb.csv')

In [104]:
xgbfir.saveXgbFI(
    xgb_model, 
    feature_names=train_cols, 
    OutputXlsxFile='xgbfir_importance.xlsx'
)

In [105]:
temp_x_train = X_train.iloc[:]
temp_x_test = X_test.iloc[:]

In [133]:
temp_x_train['338|339'] = temp_x_train['338'] * 0.6 + temp_x_train['339'] * 0.4
temp_x_train['295|86'] = temp_x_train['295'] * 0.6 + temp_x_train['86'] * 0.4

temp_x_test['338|339'] = temp_x_test['338'] * 0.6 + temp_x_test['339'] * 0.4
temp_x_test['295|86'] = temp_x_test['295'] * 0.6 + temp_x_test['86'] * 0.4

In [132]:
temp_xgb_train = xgb.DMatrix(temp_x_train, Y_train, feature_names=temp_x_train.columns)

results = xgb.cv(params_xgb, 
                 temp_xgb_train, 
                 num_boost_round=num_rounds, 
                 folds=skf, 
                 early_stopping_rounds=150, 
                 verbose_eval=1,
                )

[0]	train-auc:0.71664+0.00044676	test-auc:0.686205+0.0023436
[1]	train-auc:0.735861+0.00368237	test-auc:0.697742+0.00499865
[2]	train-auc:0.744368+0.00324217	test-auc:0.701229+0.00519639
[3]	train-auc:0.750825+0.00234718	test-auc:0.705166+0.00455149
[4]	train-auc:0.753496+0.0034495	test-auc:0.707591+0.00470024
[5]	train-auc:0.756475+0.00435431	test-auc:0.70928+0.00473464
[6]	train-auc:0.758345+0.0039574	test-auc:0.710932+0.00461593
[7]	train-auc:0.759215+0.00406673	test-auc:0.711634+0.00418913
[8]	train-auc:0.760487+0.00450855	test-auc:0.712877+0.00338325
[9]	train-auc:0.761605+0.004439	test-auc:0.71388+0.00272321
[10]	train-auc:0.76309+0.00422979	test-auc:0.714613+0.00327159
[11]	train-auc:0.764292+0.00422463	test-auc:0.71494+0.00293122
[12]	train-auc:0.765263+0.00379541	test-auc:0.715697+0.00267354
[13]	train-auc:0.766471+0.00434057	test-auc:0.716498+0.00222854
[14]	train-auc:0.766901+0.00416841	test-auc:0.716738+0.00254216
[15]	train-auc:0.767595+0.00445224	test-auc:0.716966+0.00224

[129]	train-auc:0.810863+0.00128981	test-auc:0.728173+0.00294976
[130]	train-auc:0.811195+0.00129243	test-auc:0.728152+0.00283164
[131]	train-auc:0.811498+0.00127924	test-auc:0.728158+0.00280174
[132]	train-auc:0.811737+0.00125855	test-auc:0.728175+0.0028305
[133]	train-auc:0.812075+0.0012864	test-auc:0.728208+0.00276857
[134]	train-auc:0.812459+0.00132653	test-auc:0.728269+0.00277647
[135]	train-auc:0.81291+0.00145675	test-auc:0.728363+0.00274651
[136]	train-auc:0.813278+0.00151744	test-auc:0.728431+0.00283539
[137]	train-auc:0.813596+0.00167627	test-auc:0.728492+0.00285104
[138]	train-auc:0.813933+0.00162337	test-auc:0.728526+0.0029045
[139]	train-auc:0.814313+0.0016638	test-auc:0.72865+0.00297612
[140]	train-auc:0.81471+0.00173766	test-auc:0.728704+0.00296639
[141]	train-auc:0.815051+0.00166801	test-auc:0.728756+0.00297416
[142]	train-auc:0.81551+0.0016888	test-auc:0.72882+0.00295187
[143]	train-auc:0.815864+0.00160632	test-auc:0.728929+0.00298528
[144]	train-auc:0.816193+0.00156955

[257]	train-auc:0.856074+0.00149775	test-auc:0.735226+0.00248478
[258]	train-auc:0.856327+0.00146569	test-auc:0.735282+0.00240037
[259]	train-auc:0.85666+0.00148553	test-auc:0.735318+0.00239672
[260]	train-auc:0.856973+0.00152258	test-auc:0.735375+0.0023923
[261]	train-auc:0.857314+0.00154927	test-auc:0.735409+0.00239617
[262]	train-auc:0.857619+0.00158394	test-auc:0.735456+0.00235564
[263]	train-auc:0.857968+0.00153786	test-auc:0.73555+0.00238629
[264]	train-auc:0.858292+0.00156368	test-auc:0.735548+0.00237814
[265]	train-auc:0.858576+0.00156033	test-auc:0.735641+0.00242671
[266]	train-auc:0.858858+0.00155657	test-auc:0.735692+0.0024326
[267]	train-auc:0.859153+0.00157597	test-auc:0.735741+0.00235586
[268]	train-auc:0.859518+0.00158613	test-auc:0.735722+0.00236784
[269]	train-auc:0.859872+0.00157914	test-auc:0.735746+0.002374
[270]	train-auc:0.860206+0.00154599	test-auc:0.735839+0.00235652
[271]	train-auc:0.860487+0.00158113	test-auc:0.735832+0.00233451
[272]	train-auc:0.860776+0.0015

[384]	train-auc:0.890367+0.00110314	test-auc:0.739462+0.00231611
[385]	train-auc:0.890658+0.0011368	test-auc:0.739472+0.00230906
[386]	train-auc:0.890832+0.00116899	test-auc:0.739459+0.00226071
[387]	train-auc:0.89104+0.00119277	test-auc:0.739472+0.00224429
[388]	train-auc:0.891256+0.00119801	test-auc:0.73949+0.00219133
[389]	train-auc:0.891476+0.00121704	test-auc:0.739486+0.0021847
[390]	train-auc:0.891646+0.00123153	test-auc:0.73949+0.0022266
[391]	train-auc:0.891804+0.0012516	test-auc:0.739513+0.00223457
[392]	train-auc:0.892034+0.00130059	test-auc:0.739496+0.00225835
[393]	train-auc:0.892306+0.00134056	test-auc:0.739548+0.00225718
[394]	train-auc:0.892519+0.00132433	test-auc:0.739588+0.00224534
[395]	train-auc:0.892731+0.00134121	test-auc:0.739594+0.00227157
[396]	train-auc:0.892923+0.00136759	test-auc:0.739589+0.0023029
[397]	train-auc:0.893123+0.00138275	test-auc:0.739625+0.00231171
[398]	train-auc:0.893317+0.00143324	test-auc:0.739641+0.00236699
[399]	train-auc:0.893532+0.001441

[511]	train-auc:0.914373+0.00164246	test-auc:0.740846+0.00229774
[512]	train-auc:0.914495+0.00158486	test-auc:0.740855+0.00227459
[513]	train-auc:0.914655+0.0015551	test-auc:0.74086+0.00229221
[514]	train-auc:0.914814+0.00158398	test-auc:0.740861+0.00229271
[515]	train-auc:0.914993+0.00153573	test-auc:0.740879+0.00229929
[516]	train-auc:0.915151+0.00154105	test-auc:0.740871+0.00230453
[517]	train-auc:0.915249+0.0015243	test-auc:0.740874+0.00229516
[518]	train-auc:0.915393+0.00147378	test-auc:0.740919+0.00232
[519]	train-auc:0.915582+0.0014704	test-auc:0.740977+0.00235239
[520]	train-auc:0.91573+0.00148489	test-auc:0.741019+0.00232671
[521]	train-auc:0.915905+0.00147082	test-auc:0.741051+0.00229796
[522]	train-auc:0.916075+0.00154294	test-auc:0.74105+0.00228872
[523]	train-auc:0.916299+0.00149624	test-auc:0.741055+0.00233262
[524]	train-auc:0.916445+0.00146299	test-auc:0.741033+0.00229445
[525]	train-auc:0.916637+0.00148066	test-auc:0.741042+0.00228594
[526]	train-auc:0.916802+0.0014391

[639]	train-auc:0.932876+0.00108599	test-auc:0.741266+0.0023094
[640]	train-auc:0.933003+0.00105711	test-auc:0.741276+0.00232213
[641]	train-auc:0.933159+0.00110034	test-auc:0.741244+0.00230311
[642]	train-auc:0.93325+0.0011206	test-auc:0.741238+0.00231073
[643]	train-auc:0.933366+0.00109251	test-auc:0.741236+0.00233843
[644]	train-auc:0.933489+0.00107929	test-auc:0.741235+0.00236148
[645]	train-auc:0.933589+0.00107495	test-auc:0.741231+0.00234944
[646]	train-auc:0.933756+0.00105822	test-auc:0.741217+0.002365
[647]	train-auc:0.933849+0.00105117	test-auc:0.741187+0.00239019
[648]	train-auc:0.933979+0.00109441	test-auc:0.741207+0.00239479
[649]	train-auc:0.934115+0.00105259	test-auc:0.74119+0.00240485
[650]	train-auc:0.934251+0.00102706	test-auc:0.741196+0.0023983
[651]	train-auc:0.934388+0.00104071	test-auc:0.741262+0.00239111
[652]	train-auc:0.934496+0.00106597	test-auc:0.74129+0.0023674
[653]	train-auc:0.934612+0.00106222	test-auc:0.741286+0.00236378
[654]	train-auc:0.934708+0.0010674

In [137]:
submit_model_xgb(temp_x_train, Y_train, temp_x_test, 'submission7_xgbfir.csv', 720)