In [3]:
import pandas as pd
import numpy as np 

import seaborn as sns 

from os.path import join as path_join

from sklearn.tree import ExtraTreeClassifier
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import auc , roc_curve, precision_recall_curve, roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import warnings
warnings.simplefilter('ignore')

In [4]:
CSV_DIR = r'../../data/Modulbank'

train = pd.read_csv(path_join(CSV_DIR, 'train.csv'), sep='\t')
test = pd.read_csv(path_join(CSV_DIR, 'test.csv'), sep='\t')

In [5]:
const_columns = ['9', '140', '164'] 
const_in_test_1 = ['11']
const_in_test = ['5', '129', '130', '137', '138', '141', '149', '150', '178', '186', '188', '192', '193', '291', '301', '303', '305']
zero_train_features = ['152', '160']


def preprocess(train_df, test_df):
    train = train_df.drop(const_columns, axis=1)
    test = test_df.drop(const_columns, axis=1)
    
    for column in const_in_test:
        train = train[train[column] == 0].drop(column, axis=1)
        test.drop(column, axis=1, inplace=True)
        
    for column in const_in_test_1:
        train = train[train[column] == 1].drop(column, axis=1)
        test.drop(column, axis=1, inplace=True)
        
    X_train = train.drop('0', axis=1).reset_index(drop=True)
    y_train = train['0'].reset_index(drop=True)
    X_test = test.drop('0', axis=1).reset_index(drop=True)
    return X_train, y_train, X_test

xtrain, ytrain, xtest = preprocess(train, test)

In [6]:
skf = StratifiedKFold(n_splits=5, random_state=42)

In [23]:
def to_submission(predictions, filename):
    seria = pd.Series(predictions, name='_VAL_')
    seria.to_csv(path_join(CSV_DIR, filename), header=True, index_label='_ID_')

In [11]:
params_xgb = {
   
    'objective': 'binary:logistic',
    'eta': 0.01,
    'silent': 1,
    "nthread": 4,
    "random_seed": 17,
    "eval_metric": 'auc',
   
    
    'max_depth':  8,
    'max_leaves': 75,
    'subsample': 0.85, 
    'colsample_bytree': 0.66,

    'tree_method': 'hist',
    'grow_policy': 'lossguide'
   
}
nrounds = 10000
xgb_train = xgb.DMatrix(xtrain, ytrain, feature_names=xtrain.columns)

In [13]:
xgb_test= xgb.DMatrix(xtest)
xgb_model = xgb.train(params_xgb, xgb_train, 1007)
xgb_predictions = xgb_model.predict(xgb_test)

## ligthgbm

In [15]:
params_lgb = {
    #default
    'objective': 'binary',
    'learning_rate': 0.01,
    'num_threads': 4,
    "metric": 'auc',
    "seed": 42,
    
    #regularization
    'colsample_bytree': 0.66,
    'subsample': 0.8,
    'subsample_freq': 1,
    
    'max_depth': 8, 
    'num_leaves': 128, 
    'min_data_in_leaf' : 17
    
    }

n_rounds = 10000
lgb_train = lgb.Dataset(xtrain, label=ytrain,  free_raw_data=False)

In [16]:
lgb_model = lgb.train(params_lgb, lgb_train, 820)

lgb_predictions = lgb_model.predict(xtest)

## Catboost

In [17]:
# categorical_features_indices = np.where(xtrain.dtypes != np.float)[0]
params_ctb = {
    
    'eval_metric': 'AUC',
    'iterations': 10000,
    'learning_rate': 0.03,
    'random_seed': 42,
    'od_wait': 100,
    'od_type': 'Iter',
    'thread_count':4,
    
    
    #regularization
    'depth':  10
}

ctb_train = ctb.Pool(xtrain,ytrain)

In [18]:
ctb_train = ctb.Pool(xtrain,ytrain)
ctb_model = ctb.train(ctb_train, 
                      params_ctb, 
                      logging_level='Verbose',
                      num_boost_round=557
                     )
ctb_predictions = ctb_model.predict(ctb.Pool(xtest))

0:	learn: 0.6874301	total: 812ms	remaining: 7m 31s
1:	learn: 0.6976041	total: 1.25s	remaining: 5m 47s
2:	learn: 0.6971049	total: 1.7s	remaining: 5m 13s
3:	learn: 0.6968453	total: 2.5s	remaining: 5m 45s
4:	learn: 0.6977063	total: 3.26s	remaining: 5m 59s
5:	learn: 0.7014916	total: 4.03s	remaining: 6m 9s
6:	learn: 0.7036193	total: 4.88s	remaining: 6m 23s
7:	learn: 0.7067070	total: 5.64s	remaining: 6m 26s
8:	learn: 0.7065807	total: 6.43s	remaining: 6m 31s
9:	learn: 0.7054639	total: 7.25s	remaining: 6m 36s
10:	learn: 0.7091063	total: 8.07s	remaining: 6m 40s
11:	learn: 0.7092459	total: 8.82s	remaining: 6m 40s
12:	learn: 0.7097168	total: 9.87s	remaining: 6m 52s
13:	learn: 0.7107044	total: 11s	remaining: 7m 6s
14:	learn: 0.7120760	total: 11.9s	remaining: 7m 10s
15:	learn: 0.7123159	total: 12.8s	remaining: 7m 14s
16:	learn: 0.7137969	total: 13.6s	remaining: 7m 12s
17:	learn: 0.7147006	total: 14.7s	remaining: 7m 20s
18:	learn: 0.7158167	total: 15.5s	remaining: 7m 18s
19:	learn: 0.7163511	total: 

157:	learn: 0.7699808	total: 2m 7s	remaining: 5m 21s
158:	learn: 0.7703391	total: 2m 7s	remaining: 5m 20s
159:	learn: 0.7705607	total: 2m 8s	remaining: 5m 19s
160:	learn: 0.7707660	total: 2m 9s	remaining: 5m 18s
161:	learn: 0.7709826	total: 2m 10s	remaining: 5m 17s
162:	learn: 0.7714604	total: 2m 11s	remaining: 5m 16s
163:	learn: 0.7717275	total: 2m 11s	remaining: 5m 16s
164:	learn: 0.7721919	total: 2m 12s	remaining: 5m 15s
165:	learn: 0.7724653	total: 2m 13s	remaining: 5m 14s
166:	learn: 0.7726349	total: 2m 14s	remaining: 5m 13s
167:	learn: 0.7727113	total: 2m 15s	remaining: 5m 13s
168:	learn: 0.7728692	total: 2m 16s	remaining: 5m 13s
169:	learn: 0.7730824	total: 2m 17s	remaining: 5m 13s
170:	learn: 0.7732770	total: 2m 18s	remaining: 5m 12s
171:	learn: 0.7735670	total: 2m 19s	remaining: 5m 11s
172:	learn: 0.7737898	total: 2m 20s	remaining: 5m 10s
173:	learn: 0.7741535	total: 2m 20s	remaining: 5m 10s
174:	learn: 0.7745256	total: 2m 21s	remaining: 5m 9s
175:	learn: 0.7746510	total: 2m 2

310:	learn: 0.8118158	total: 4m 14s	remaining: 3m 20s
311:	learn: 0.8122417	total: 4m 14s	remaining: 3m 20s
312:	learn: 0.8124829	total: 4m 15s	remaining: 3m 19s
313:	learn: 0.8127229	total: 4m 16s	remaining: 3m 18s
314:	learn: 0.8129719	total: 4m 17s	remaining: 3m 17s
315:	learn: 0.8131691	total: 4m 17s	remaining: 3m 16s
316:	learn: 0.8133472	total: 4m 18s	remaining: 3m 15s
317:	learn: 0.8135724	total: 4m 19s	remaining: 3m 14s
318:	learn: 0.8143079	total: 4m 20s	remaining: 3m 14s
319:	learn: 0.8146025	total: 4m 21s	remaining: 3m 13s
320:	learn: 0.8148918	total: 4m 21s	remaining: 3m 12s
321:	learn: 0.8152780	total: 4m 22s	remaining: 3m 11s
322:	learn: 0.8155039	total: 4m 23s	remaining: 3m 10s
323:	learn: 0.8157989	total: 4m 24s	remaining: 3m 9s
324:	learn: 0.8161501	total: 4m 24s	remaining: 3m 9s
325:	learn: 0.8162885	total: 4m 25s	remaining: 3m 8s
326:	learn: 0.8165325	total: 4m 26s	remaining: 3m 7s
327:	learn: 0.8167137	total: 4m 27s	remaining: 3m 6s
328:	learn: 0.8169866	total: 4m 2

463:	learn: 0.8550482	total: 6m 17s	remaining: 1m 15s
464:	learn: 0.8551968	total: 6m 17s	remaining: 1m 14s
465:	learn: 0.8553937	total: 6m 18s	remaining: 1m 13s
466:	learn: 0.8554978	total: 6m 19s	remaining: 1m 13s
467:	learn: 0.8557092	total: 6m 20s	remaining: 1m 12s
468:	learn: 0.8562238	total: 6m 20s	remaining: 1m 11s
469:	learn: 0.8564776	total: 6m 21s	remaining: 1m 10s
470:	learn: 0.8567305	total: 6m 22s	remaining: 1m 9s
471:	learn: 0.8569255	total: 6m 22s	remaining: 1m 8s
472:	learn: 0.8570876	total: 6m 23s	remaining: 1m 8s
473:	learn: 0.8572681	total: 6m 24s	remaining: 1m 7s
474:	learn: 0.8574433	total: 6m 25s	remaining: 1m 6s
475:	learn: 0.8577636	total: 6m 25s	remaining: 1m 5s
476:	learn: 0.8579744	total: 6m 26s	remaining: 1m 4s
477:	learn: 0.8582610	total: 6m 27s	remaining: 1m 3s
478:	learn: 0.8584914	total: 6m 27s	remaining: 1m 3s
479:	learn: 0.8588541	total: 6m 28s	remaining: 1m 2s
480:	learn: 0.8590021	total: 6m 29s	remaining: 1m 1s
481:	learn: 0.8592015	total: 6m 30s	rem

In [19]:
weighted_mean_predictions  = xgb_predictions * 0.5 +  lgb_predictions *  0.2 +  ctb_predictions * 0.3

In [20]:
weighted_mean_predictions

array([0.13366732, 0.53832549, 0.26599847, ..., 0.08124965, 0.10442635,
       0.1041543 ])

In [22]:
to_submission(weighted_mean_predictions, 'submission7_additional')

- CV: 0.749662858331205
- LB: 0.76586872