In [1]:
# !pip install pycaret

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# from pycaret.classification import setup, evaluate_model, compare_models, plot_model, add_metric
import optuna

# Load Data

In [2]:
train = pd.read_csv('../input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('../input/icr-identify-age-related-conditions/test.csv')
greeks = pd.read_csv('../input/icr-identify-age-related-conditions/greeks.csv')
sample_submission = pd.read_csv('../input/icr-identify-age-related-conditions/sample_submission.csv')

In [3]:
train_0 = train.drop(['Id', 'Class'], axis=1).copy()
y = train.Class

In [4]:
pd.set_option('display.max_columns', None)

# Explore Data

In [5]:
train_0.shape

(617, 56)

In [6]:
train_0.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,BN,BP,BQ,BR,BZ,CB,CC,CD,CF,CH,CL,CR,CS,CU,CW,DA,DE,DF,DH,DI,DL,DN,DU,DV,DY,EB,EE,EG,EH,EJ,EL,EP,EU,FC,FD,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,22.5984,175.638726,152.707705,823.928241,257.432377,47.223358,0.563481,23.3876,4.851915,0.023482,1.050225,0.069225,13.784111,1.302012,36.205956,69.0834,295.570575,0.23868,0.284232,89.24556,84.31664,29.657104,5.31069,1.74307,23.187704,7.294176,1.987283,1433.16675,0.949104,B,30.87942,78.526968,3.828384,13.39464,10.265073,9028.291921,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,19.4205,155.86803,14.75472,51.216883,257.432377,30.284345,0.48471,50.628208,6.085041,0.031442,1.113875,1.1178,28.310953,1.357182,37.476568,70.79836,178.5531,0.23868,0.363489,110.581815,75.74548,37.532,0.005518,1.74307,17.222328,4.926396,0.858603,1111.28715,0.003042,A,109.125159,95.415086,52.26048,17.175984,0.29685,6785.003474,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,26.4825,128.988531,219.32016,482.141594,257.432377,32.563713,0.495852,85.955376,5.376488,0.036218,1.050225,0.70035,39.364743,1.009611,21.459644,70.8197,321.426625,0.23868,0.210441,120.056438,65.46984,28.053464,1.289739,1.74307,36.861352,7.813674,8.146651,1494.076488,0.377208,B,109.125159,78.526968,5.390628,224.207424,8.745201,8338.906181,11.626917,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,23.6577,237.282264,11.05041,661.51864,257.432377,15.201914,0.717882,88.15936,2.347652,0.029054,1.4003,0.636075,41.11696,0.722727,21.530392,47.27586,196.607985,0.23868,0.292431,139.82457,71.5712,24.354856,2.655345,1.74307,52.003884,7.38606,3.813326,15691.55218,0.614484,B,31.674357,78.526968,31.323372,59.301984,7.884336,10965.76604,14.852022,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,24.0108,324.546318,149.717165,6074.859475,257.432377,82.213495,0.536467,72.644264,30.537722,0.025472,1.050225,0.69315,31.724726,0.82755,34.41536,74.06532,200.17816,0.23868,0.207708,97.92012,52.83888,26.019912,1.144902,1.74307,9.064856,7.35072,3.490846,1403.6563,0.164268,B,109.125159,91.994825,51.141336,29.10264,4.27464,16198.04959,13.666727,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614


In [7]:
train_0.describe()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,BN,BP,BQ,BR,BZ,CB,CC,CD,CF,CH,CL,CR,CS,CU,CW,DA,DE,DF,DH,DI,DL,DN,DU,DV,DY,EB,EE,EG,EH,EL,EP,EU,FC,FD,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
count,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,557.0,617.0,617.0,615.0,614.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,616.0,617.0,617.0,617.0,617.0,617.0,617.0,557.0,617.0,617.0,616.0,617.0,617.0,617.0,616.0,617.0,615.0,617.0,617.0,617.0,617.0,617.0,616.0
mean,0.477149,3502.013221,118.624513,38.968552,10.128242,5.545576,0.06032,10.566447,8.053012,5350.388655,21.419492,231.322223,98.328737,1218.133238,550.632525,77.104151,0.688801,90.251735,11.241064,0.030615,1.403761,0.742262,36.91759,1.383792,27.165653,51.128326,401.901299,0.633884,0.367002,146.972099,94.795377,26.370568,1.8029,1.92483,26.388989,9.0727,3.064778,1731.248215,0.305107,69.582596,105.060712,69.117005,71.341526,6.930086,10306.810737,10.111079,5.433199,3.533905,0.421501,20.724856,131.714987,14679.595398,31.489716,50.584437,8.530961
std,0.468388,2300.322717,127.83895,69.728226,10.518877,2.551696,0.416817,4.350645,65.166943,3021.326641,3.478278,183.992505,96.479371,7575.293707,2076.371275,159.049302,0.263994,51.58513,13.571133,0.014808,1.92221,0.281195,17.266347,0.538717,14.645993,21.210888,317.745623,1.912384,0.112989,86.084419,28.243187,8.038825,9.034721,1.484555,18.116679,6.200281,2.058344,1790.227476,1.847499,38.555707,68.44562,390.187057,165.551545,64.754262,11331.294051,2.934025,11.496257,50.181948,1.305365,9.991907,144.181524,19352.959387,9.864239,36.266251,10.32701
min,0.081187,192.59328,85.200147,3.177522,8.138688,0.699861,0.025578,3.396778,1.2299,1693.62432,9.8868,72.948951,1.331155,51.216883,257.432377,12.49976,0.176874,23.3876,0.510888,0.003184,1.050225,0.069225,13.784111,0.137925,7.03064,6.9064,35.998895,0.23868,0.040995,60.23247,10.3456,6.339496,0.005518,1.74307,0.804068,4.926396,0.286201,185.5941,0.003042,5.394675,78.526968,3.828384,7.534128,0.29685,1563.136688,3.58345,0.173229,0.49706,0.06773,4.102182,72.611063,13.038894,9.432735,0.897628,0.001129
25%,0.252107,2197.34548,85.200147,12.270314,8.138688,4.128294,0.025578,8.12958,1.2299,4155.70287,19.4205,156.847239,27.834425,424.990642,257.432377,23.317567,0.563688,64.724192,5.066306,0.023482,1.050225,0.589575,29.782467,1.070298,7.03064,37.94252,188.81569,0.23868,0.295164,102.703553,78.23224,20.888264,0.005518,1.74307,14.715792,5.965392,1.648679,1111.160625,0.003042,30.927468,78.526968,4.324656,25.815384,0.29685,5164.66626,8.523098,0.173229,0.49706,0.06773,14.036718,72.611063,2798.992584,25.034888,23.011684,0.124392
50%,0.354659,3120.31896,85.200147,20.53311,8.138688,5.031912,0.025578,10.46132,1.2299,4997.96073,21.186,193.908816,61.642115,627.417402,257.432377,42.55433,0.658715,79.819104,9.123,0.02786,1.050225,0.7308,34.83513,1.351665,36.019104,49.18094,307.509595,0.23868,0.358023,130.05063,96.26496,25.2488,0.251741,1.74307,21.642456,8.149404,2.616119,1493.817413,0.085176,71.949306,78.526968,22.641144,36.394008,1.870155,7345.143424,9.945452,3.028141,1.131,0.250601,18.771436,72.611063,7838.27361,30.608946,41.007968,0.337827
75%,0.559763,4361.63739,113.73954,39.139886,8.138688,6.431634,0.036845,12.969516,5.081244,6035.8857,23.6577,247.803462,134.009015,975.649259,257.432377,77.310097,0.772206,99.81352,13.565901,0.034427,1.228445,0.85935,40.529401,1.660617,37.935832,61.40876,507.8962,0.23868,0.426348,165.836955,110.64068,30.544224,1.05869,1.74307,34.058344,10.503048,3.91007,1905.701475,0.237276,109.125159,112.766654,49.085352,56.714448,4.880214,10647.95165,11.516657,6.238814,1.51206,0.535067,25.608406,127.591671,19035.70924,36.863947,67.931664,21.978
max,6.161666,28688.18766,1910.123198,630.51823,178.943634,38.27088,10.315851,38.971568,1463.693448,53060.59924,29.3073,2447.81055,344.644105,179250.2529,50092.4593,2271.436167,4.103032,633.534408,200.967526,0.224074,31.688153,3.039675,267.942823,4.951507,64.521624,210.33092,2103.40519,37.895013,1.060404,1049.168078,326.2362,62.808096,161.355315,25.19293,152.355164,94.95858,18.324926,30243.75878,42.569748,109.125159,1063.594578,6501.26448,3030.655824,1578.654237,143224.6823,35.851039,137.932739,1244.22702,31.365763,135.781294,1497.351958,143790.0712,81.210825,191.194764,21.978


In [8]:
null_cols = train_0.columns[train_0.isnull().sum()>0]
null_cols

Index(['BQ', 'CB', 'CC', 'DU', 'EL', 'FC', 'FL', 'FS', 'GL'], dtype='object')

In [9]:
train_0[null_cols].isnull().sum()

BQ    60
CB     2
CC     3
DU     1
EL    60
FC     1
FL     1
FS     2
GL     1
dtype: int64

# Impute Null Values

In [10]:
train_1 = train_0.fillna(train_0.median())

  train_1 = train_0.fillna(train_0.median())


# Encore categorical column

In [11]:
encoder = LabelEncoder()
train_1.EJ = encoder.fit_transform(train_1.EJ)

# Data Scaling

In [12]:
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(train_1), columns=train_1.columns)

# Target Variable

In [13]:
print(f'Percentage of Class 1: {(y.value_counts()[0]/y.shape[0]).round(3)}')
print(f'Percentage of Class 0: {(y.value_counts()[1]/y.shape[0]).round(3)}')
y.value_counts()

Percentage of Class 1: 0.825
Percentage of Class 0: 0.175


0    509
1    108
Name: Class, dtype: int64

# Base Model

In [14]:
def balance_loglossv2(y_true, y_pred):
    target_mean = y_true.mean()
    w0 = 1/(1-target_mean)
    w1 = 1/target_mean
    sample_weight = [w0 if y == 0 else w1 for y in y_true]
    loss = log_loss(y_true, y_pred, sample_weight=sample_weight)

    return loss

In [15]:
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

lr = LogisticRegression()
lr_scores = []

# lr_scores.append(cross_val_score(lr, X, y, scoring=log_loss))

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    lr.fit(X_train, y_train)
    y_pred = lr.predict_proba(X_test)[:, 1].reshape(-1)
    
    lr_scores.append(balance_loglossv2(y_test, y_pred))
    
print(f'Logistic Regression score: {np.mean(lr_scores).round(3)}')
print(f'Logistic Regression std: {np.std(lr_scores).round(3)}')

Logistic Regression score: 0.672
Logistic Regression std: 0.067


In [16]:
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

rf = RandomForestClassifier()
rf_scores = []

# lr_scores.append(cross_val_score(lr, X, y, scoring=log_loss))

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    rf.fit(X_train, y_train)
    y_pred = rf.predict_proba(X_test)[:, 1].reshape(-1)
    
    rf_scores.append(balance_loglossv2(y_test, y_pred))
    
print(f'Random Forest Regressor score: {np.mean(rf_scores).round(3)}')
print(f'Random Forest Regressor std: {np.std(rf_scores).round(3)}')

Random Forest Regressor score: 0.467
Random Forest Regressor std: 0.112


# Compare Models

In [17]:
_ = setup(data=pd.concat([X, y], axis=1), target='Class')

Unnamed: 0,Description,Value
0,Session id,3438
1,Target,Class
2,Target type,Binary
3,Original data shape,"(617, 57)"
4,Transformed data shape,"(617, 57)"
5,Transformed train set shape,"(431, 57)"
6,Transformed test set shape,"(186, 57)"
7,Numeric features,56
8,Preprocess,True
9,Imputation type,simple


In [18]:
add_metric(id='log_loss', name='LOG_LOSS', score_func=log_loss)

Name                                              LOG_LOSS
Display Name                                      LOG_LOSS
Score Function       <function log_loss at 0x793a2827f910>
Scorer                               make_scorer(log_loss)
Target                                                pred
Args                                                    {}
Greater is Better                                     True
Multiclass                                            True
Custom                                                True
Name: log_loss, dtype: object

In [19]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LOG_LOSS,TT (Sec)
rf,Random Forest Classifier,0.9234,0.957,0.6196,0.9405,0.7137,0.6774,0.7102,2.7604,0.25
gbc,Gradient Boosting Classifier,0.9165,0.945,0.6607,0.8407,0.7248,0.6788,0.6938,3.01,0.277
catboost,CatBoost Classifier,0.9164,0.9704,0.6071,0.9092,0.6904,0.6505,0.6844,3.0138,5.297
lightgbm,Light Gradient Boosting Machine,0.9142,0.9616,0.6607,0.8092,0.7081,0.6627,0.6769,3.0938,0.371
xgboost,Extreme Gradient Boosting,0.9119,0.9593,0.6518,0.8108,0.7078,0.6595,0.6729,3.1757,0.12
ada,Ada Boost Classifier,0.9117,0.9432,0.6929,0.7955,0.7253,0.6741,0.6854,3.1814,0.166
et,Extra Trees Classifier,0.9002,0.9631,0.4857,0.93,0.6024,0.5583,0.6102,3.5967,0.244
ridge,Ridge Classifier,0.891,0.0,0.4589,0.8348,0.5714,0.5217,0.5599,3.9301,0.028
lda,Linear Discriminant Analysis,0.8886,0.9147,0.5732,0.7379,0.6391,0.575,0.5849,4.014,0.04
svm,SVM - Linear Kernel,0.8863,0.0,0.5589,0.7687,0.5989,0.5418,0.575,4.0978,0.027


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

# Params Tuning

# LightGBM

In [20]:
def lightgbm(trial):
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'binary_logloss',
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-9, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-9, 10.0),
        'random_state': 0
    }

    model = LGBMClassifier(**params)

    results = []
    
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1].reshape(-1)
        results.append(balance_loglossv2(y_test, y_pred))
            
    return np.mean(results)

# XGBoost

In [21]:
def xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-10, 1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-10, 1),
        'gamma': trial.suggest_loguniform('gamma', 1e-10, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'eval_metric': 'mlogloss',
        'random_state': 0,
        'n_jobs': -1
    }
    
    model = XGBClassifier(**params)

    results = []
    
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1].reshape(-1)
        results.append(balance_loglossv2(y_test, y_pred))
    
    return np.mean(results)

# Catboost

In [22]:
def catboost(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0),
        'random_seed': 0,
        'loss_function': 'MultiClass',
        'eval_metric': 'MultiClass',
        'verbose': False
    }
    
    results = []
    
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=False)
        y_pred = model.predict_proba(X_test)[:, 1].reshape(-1)
        results.append(balance_loglossv2(y_test, y_pred))
    
    return np.mean(results)

# Random Forest

In [23]:
def rf(trial):
    max_depth = trial.suggest_int('max_depth', 1, 100)
    n_estimators = trial.suggest_int('n_estimators', 10, 500)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)
    random_state = 0
          
    model = RandomForestClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=random_state
    )
    
    results = []
    
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1].reshape(-1)
        results.append(balance_loglossv2(y_test, y_pred))
    
    return np.mean(results)

# Gradient Boosting

In [24]:
def gbc(trial):
    #tol = trial.suggest_loguniform('tol', 1e-8, 10.0)
    max_depth = trial.suggest_int('max_depth', 1, 50)
    learning_rate = trial.suggest_loguniform('learning_rate', .001, 1)
    n_estimators = trial.suggest_int('n_estimators', 10, 500)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 100)
    random_state = 0
          
    model = GradientBoostingClassifier(
        #tol=tol,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        random_state=random_state
    )
    
    results = []
    
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1].reshape(-1)
        results.append(balance_loglossv2(y_test, y_pred))
    
    return np.mean(results)

In [25]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(lightgbm, n_trials=100)
# study.best_params

In [26]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(xgb, n_trials=100)
# study.best_params

In [27]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(catboost, n_trials=100)
# study.best_params

In [28]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(rf, n_trials=100)
# study.best_params

In [29]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(gbc, n_trials=100)
# study.best_params

In [17]:
lightgbm_params = {
    'n_estimators': 381,
    'max_depth': 41,
    'num_leaves': 40,
    'learning_rate': 0.04226293637542078,
    'min_child_samples': 85,
    'subsample': 0.3840208956748839,
    'colsample_bytree': 0.2175935372225782,
    'reg_alpha': 1.441590199129099e-09,
    'reg_lambda': 3.2224838933410242e-09,
    'force_col_wise': True,
    'verbose': -1
}

xgb_params = {
    'n_estimators': 821,
    'max_depth': 10,
    'learning_rate': 0.01677173900768991,
    'subsample': 0.8572143331039203,
    'colsample_bytree': 0.5665388026502962,
    'reg_alpha': 3.409425580142812e-10,
    'reg_lambda': 7.315443625060416e-05,
    'gamma': 0.12097663988470606,
    'min_child_weight': 4
}
catboost_params = {
    'iterations': 407,
    'depth': 3,
    'learning_rate': 0.08469439310905212,
    'l2_leaf_reg': 2.7207891695705144,
    'logging_level': 'Silent'
}

rf_params = {
    'max_depth': 63,
    'n_estimators': 221,
    'min_samples_leaf': 3,
    'min_samples_split': 7
}



gbc_params = {
    'max_depth': 30,
    'learning_rate': 0.1344660323554811,
    'n_estimators': 102,
    'min_samples_leaf': 69,
    'max_leaf_nodes': 9
}

In [18]:
lightgbm_model = LGBMClassifier(**lightgbm_params)
xgb_model = XGBClassifier(**xgb_params)
rf_model = RandomForestClassifier(**rf_params)
cat_model = CatBoostClassifier(**catboost_params)
gbc_model = GradientBoostingClassifier(**gbc_params)

# Models Evaluation

In [19]:
models = {
    'lightgbm': lightgbm_model,
    'xgb': xgb_model,
    'rf': rf_model,
    'cat': cat_model,
    'gbc': gbc_model
}

In [20]:
results_ensemble_models = {}

for name, model in models.items():
    res=[]
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1].reshape(-1)
        res.append(balance_loglossv2(y_test, y_pred))
    results_ensemble_models[name] = res

In [21]:
for name, result in results_ensemble_models.items():
    print("----------\n" + name)
    print(np.mean(result))
    print(np.std(result))

----------
lightgbm
0.4237760457903404
0.10712851794883253
----------
xgb
0.38216076661375675
0.07730683549758163
----------
rf
0.46855853961504146
0.05454695090186046
----------
cat
0.46047316401882765
0.12849492944444155
----------
gbc
0.37380119365573977
0.09936713902656993


In [22]:
final_model = VotingClassifier(estimators=[('xgb', xgb_model),
                                           ('gbc', gbc_model)], 
                               voting='soft')

results_ensemble = []

for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    final_model.fit(X_train, y_train)
    y_pred = final_model.predict_proba(X_test)
    y_pred = final_model.predict_proba(X_test)[:, 1].reshape(-1)
    results_ensemble.append(balance_loglossv2(y_test, y_pred))

print(np.mean(results_ensemble))

0.36668355898661725


# Model Training

In [23]:
final_model.fit(X, y)

# Prediction

In [24]:
test_final = test.drop('Id', axis=1).copy()
test_final.EJ = encoder.transform(test_final.EJ)

test_final = pd.DataFrame(scaler.transform(test_final), columns=test_final.columns)

In [25]:
final_predictions = pd.DataFrame(final_model.predict_proba(test_final), columns=['class_0', 'class_1'])
final_predictions = pd.concat([test.Id, final_predictions], axis=1)
final_predictions

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.662085,0.337915
1,010ebe33f668,0.662085,0.337915
2,02fa521e1838,0.662085,0.337915
3,040e15f562a2,0.662085,0.337915
4,046e85c7cc7f,0.662085,0.337915


# Submission

In [26]:
final_predictions.to_csv('submission.csv', index=False)