In [3]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer

from scipy.stats import mannwhitneyu, kruskal
from scipy.stats import f_oneway

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
data = pd.read_csv("./icr-identify-age-related-conditions/train.csv", index_col = 'Id')

In [4]:
imputer = KNNImputer()
data_to_impute = data.drop(columns = ['EJ', 'Class'], axis = 1)
data_imputed = imputer.fit_transform(data_to_impute)
data_imp = pd.DataFrame(data = data_imputed, index = data_to_impute.index, columns = data_to_impute.columns.to_list())
data1 = pd.concat([data_imp, data.EJ, data.Class], axis = 1)
# data1.isnull().sum()

In [5]:
data_after_drop = data1.drop(columns = ['AH', 'CH', 'CL', 'CS', 'DV', 'EP', 'AR', 'EH', 'FD ', 'BD ', 'BZ'], axis = 1)
data_ad_corr = data_after_drop.drop(columns = ['EJ', 'Class'], axis = 1)

In [6]:
data_to_train = data_after_drop.copy()
data_to_train.isnull().sum().sum()

0

In [7]:
data_to_analyze = data_to_train.drop('EJ', axis = 1) #we need to drop object variable for these tests
data_class_0 = data_to_analyze[data_to_analyze.Class == 0]
data_class_1 = data_to_analyze[data_to_analyze.Class == 1]

features_to_drop_Utest = []
features_to_drop_Htest = []
columns = data_to_analyze.columns.to_list()
columns.remove('Class')

for column in columns:
    U_rank, p_val1 = mannwhitneyu(data_class_0[column], data_class_1[column])
    if p_val1 > 0.05:
        features_to_drop_Utest.append(column)
        print(column, 'U rank:', U_rank, 'P value:', p_val1)
    H_rank, p_val2 = kruskal(data_class_0[column], data_class_1[column])
    if p_val2 > 0.05:
        features_to_drop_Htest.append(column)

#We need to compare the tests performance
features_to_drop_Htest == features_to_drop_Utest #Perfomance checking for the both tests

AX U rank: 24896.5 P value: 0.06193626236565714
AY U rank: 25224.0 P value: 0.05213790869681825
AZ U rank: 26577.5 P value: 0.2945833222186133
BR U rank: 27082.5 P value: 0.4053317110267096
CB U rank: 26218.5 P value: 0.22564101762419575
CF U rank: 27140.0 P value: 0.41865251371788936
DN U rank: 25656.0 P value: 0.13844720667031063
EG U rank: 27423.0 P value: 0.48518450545858305
EU U rank: 25132.0 P value: 0.07933828258328013
FC U rank: 27190.0 P value: 0.43029478011540695
FS U rank: 26596.5 P value: 0.29614626671437755
GB U rank: 26190.0 P value: 0.22066432412325132
GH U rank: 27298.0 P value: 0.4556351542237075


False

In [8]:
features_to_drop_ANOVA = []
columns = data_to_analyze.columns.to_list()
columns.remove('Class')

for column in columns:
    F, p_val = f_oneway(np.log1p(data_class_0[column]), np.log1p(data_class_1[column]))
    if p_val > 0.05:
        features_to_drop_ANOVA.append(column)
        print(column, 'F:', F, 'P value:', p_val)

#We need to compare the tests performance
features_to_drop_Utest == features_to_drop_ANOVA #Perfomance checking for the both tests

AX F: 3.2882719895397545 P value: 0.07026373631825462
AY F: 2.3607184918846453 P value: 0.12493916238381524
AZ F: 0.0007053205100172803 P value: 0.9788209906464335
BR F: 0.1414644475005022 P value: 0.7069591819194481
CB F: 0.41946969660674166 P value: 0.517443469153401
CF F: 1.4361846527152875 P value: 0.2312193467137219
CW  F: 1.7219087512740827 P value: 0.1899370757777987
DN F: 0.9616555298034853 P value: 0.32715572514619407
EG F: 0.0003133294862799697 P value: 0.9858830317986427
EL F: 2.6170012213169622 P value: 0.10623681173590564
EU F: 1.8232246979392086 P value: 0.17742659822745485
FC F: 0.02227917592375351 P value: 0.8813957486296219
FS F: 0.9621431587840991 P value: 0.3270332047916801
GB F: 1.5632519875986108 P value: 0.21166521810832983
GH F: 0.05519165487111272 P value: 0.8143420059794999
GI F: 3.7100316863205522 P value: 0.054546953012873454


False

We'll use intersection of both result arrays to disclude the features.

In [14]:
features_to_drop_final = list(set(features_to_drop_Utest).intersection(set(features_to_drop_ANOVA)))
features_to_drop_final

['FS', 'AX', 'GB', 'EG', 'FC', 'CF', 'AZ', 'CB', 'DN', 'EU', 'GH', 'AY', 'BR']

Let's check the target means for the only object variable. 

In [36]:
data_obj = data_to_train[['EJ', 'Class']]
data_obj.groupby('EJ').mean()

Unnamed: 0_level_0,Class
EJ,Unnamed: 1_level_1
A,0.126126
B,0.202532


We have different means so we retain this variable.

## The final training dataset:

In [37]:
data_to_train.drop(columns = ['FS', 'AX', 'GB', 'EG', 'FC', 'CF', 'AZ', 'CB', 'DN', 'EU', 'GH', 'AY', 'BR'], axis = 1, inplace = True)
data_object = data_to_train.EJ
dummies_df = pd.get_dummies(data_object)
df_train = pd.concat([data_to_train, dummies_df], axis = 1)
df_train.drop('EJ', axis = 1, inplace = True)
df_train

Unnamed: 0_level_0,AB,AF,AM,BC,BN,BP,BQ,CC,CD,CR,CU,DA,DE,DF,DH,DI,DL,DU,DY,EB,EE,FE,FI,FL,FR,GE,GF,GL,Class,A,B
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
000ff2bfdfe9,0.209377,3109.03329,22.394407,5.555634,22.5984,175.638726,152.707705,0.563481,23.387600,0.069225,1.302012,69.08340,295.570575,0.238680,0.284232,89.245560,84.31664,5.310690,23.187704,7.294176,1.987283,9028.291921,3.583450,7.298162,1.73855,72.611063,2003.810319,0.120343,1,0,1
007255e47698,0.145282,978.76416,36.968889,1.229900,19.4205,155.868030,14.754720,0.484710,50.628208,1.117800,1.357182,70.79836,178.553100,0.238680,0.363489,110.581815,75.74548,0.005518,17.222328,4.926396,0.858603,6785.003474,10.358927,0.173229,0.49706,72.611063,27981.562750,21.978000,0,1,0
013f2bd269f5,0.470030,2635.10654,32.360553,1.229900,26.4825,128.988531,219.320160,0.495852,85.955376,0.700350,1.009611,70.81970,321.426625,0.238680,0.210441,120.056438,65.46984,1.289739,36.861352,7.813674,8.146651,8338.906181,11.626917,7.709560,0.97556,88.609437,13676.957810,0.196941,0,0,1
043ac50845d5,0.252107,3819.65177,77.112203,1.229900,23.6577,237.282264,11.050410,0.717882,88.159360,0.636075,0.722727,47.27586,196.607985,0.238680,0.292431,139.824570,71.57120,2.655345,52.003884,7.386060,3.813326,10965.766040,14.852022,6.122162,0.49706,82.416803,2094.262452,0.155829,0,0,1
044fb8a146ec,0.380297,3733.04844,14.103738,102.151980,24.0108,324.546318,149.717165,0.536467,72.644264,0.693150,0.827550,74.06532,200.178160,0.238680,0.207708,97.920120,52.83888,1.144902,9.064856,7.350720,3.490846,16198.049590,13.666727,8.153058,48.50134,146.109943,8524.370502,0.096614,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fd3dafe738fd,0.149555,3130.05946,9.513984,2.804172,21.1860,167.877117,27.287375,0.691257,55.163024,0.698250,1.070298,21.75904,355.930925,0.238680,0.445479,176.977590,90.91832,0.005518,32.508604,8.015112,1.354416,17167.209610,9.879296,0.173229,1.26092,217.148554,8095.932828,21.978000,0,1,0
fd895603f071,0.435846,5462.03438,46.551007,3.777550,27.1887,285.628059,344.644105,0.772304,85.233928,0.761025,2.146113,43.90996,157.393715,0.238680,0.437280,192.598575,123.17624,0.648318,18.197092,8.976360,0.753797,18460.330020,10.910227,10.223150,1.24236,496.994214,3085.308063,0.145340,0,0,1
fd8ef6377f76,0.427300,2459.10720,55.355778,1.229900,20.4798,178.661133,103.988995,0.708616,142.680216,0.879825,1.489590,104.62032,223.209115,0.238680,0.382620,218.915925,326.23620,0.005518,47.552312,9.478188,2.225112,5088.922912,12.029366,0.173229,0.49706,128.896894,6474.652866,21.978000,0,1,0
fe1942975e40,0.363205,1263.53524,23.685856,1.229900,19.0674,119.162529,82.512333,0.602254,122.939496,0.583125,1.428903,51.04140,112.196630,0.532818,0.549333,113.526045,96.97092,0.510378,38.271840,10.078968,1.628524,6464.250832,8.026928,9.256996,0.78764,72.611063,1965.343176,0.184622,0,0,1


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, log_loss, confusion_matrix
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.utils import shuffle

X and y arrays preparation:

In [40]:
X = df_train.drop('Class', axis = 1)
y = df_train.Class

In [41]:
X_shuffle, y_shuffle = shuffle(X, y, random_state = 3247)

## XGBoost algorithm

In [42]:
estimator_XG = XGBClassifier(booster = 'gbtree', 
                             eval_metric = 'logloss',
                             tree_method = 'exact', 
                             random_state = 123, 
                             grow_policy = 'lossguide',
                             eta = 0,
                             learning_rate = 0.2, 
                             gamma = 0
                            )

params = {'n_estimators' : [110], 
          'max_depth' : [5],  
          'subsample' : [1.0],
          'scale_pos_weight' : [9], 
          'base_score' : [0.3],
          'colsample_bynode' : [0.3],
          'colsample_bytree' : [0.6],
          'colsample_bylevel' : [0.8]
         }

search_XG = GridSearchCV(estimator_XG, params, scoring = 'f1', cv = 4)
search_XG.fit(X_shuffle, y_shuffle)

print(search_XG.best_params_, '\n')
print('Best f1 score:', search_XG.best_score_, '\n')

{'base_score': 0.3, 'colsample_bylevel': 0.8, 'colsample_bynode': 0.3, 'colsample_bytree': 0.6, 'max_depth': 5, 'n_estimators': 110, 'scale_pos_weight': 9, 'subsample': 1.0} 

Best f1 score: 0.7693603566245075 



In [43]:
search_XG.best_params_

{'base_score': 0.3,
 'colsample_bylevel': 0.8,
 'colsample_bynode': 0.3,
 'colsample_bytree': 0.6,
 'max_depth': 5,
 'n_estimators': 110,
 'scale_pos_weight': 9,
 'subsample': 1.0}

Cross-validation results:

In [44]:
y_hat = cross_val_predict(search_XG.best_estimator_, X, y, cv = 4)
# print('Accuracy:', round(accuracy_score(y, y_hat), 3), '\n')
# print(classification_report(y, y_hat), '\n')
# print(confusion_matrix(y, y_hat))

Accuracy: 0.948 

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       509
           1       0.85      0.86      0.85       108

    accuracy                           0.95       617
   macro avg       0.91      0.91      0.91       617
weighted avg       0.95      0.95      0.95       617
 

[[492  17]
 [ 15  93]]


In [45]:
# acc_array = cross_val_score(search_XG.best_estimator_, X, y, scoring = 'accuracy', cv = 4)
# print('Accuracies:', acc_array, '\n')
# print('Mean accuracy:', round(acc_array.mean(), 3), 'Std deviation:', round(acc_array.std(), 3))

Accuracies: [0.97419355 0.94155844 0.92857143 0.94805195] 

Mean accuracy: 0.948 Std deviation: 0.017


# Thresholds estimation

Probabilities predictions of the training set will help us to adjust prediction thresholds.

In [46]:
probs = cross_val_predict(search_XG.best_estimator_, X, y, cv = 4, method = 'predict_proba')
prob_0_train = probs[:, 0]
prob_1_train = probs[:, 1]

In [47]:
y_hat[:5]

array([1, 0, 0, 0, 0])

In [48]:
def balanced_log_loss(y_true, y_pred, prob_0, prob_1):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # calculate the weights for each class to balance classes
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    # calculate the predicted probabilities for each class
    p_0 = prob_0
    p_1 = prob_1
    # calculate the summed log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    # calculate the weighted summed logarithmic loss
    balanced_log_loss = (w_0 * log_loss_0 + w_1 * log_loss_1) / 2
    # return the average log loss
    return balanced_log_loss

In [49]:
balanced_log_loss(y, y_hat, prob_0_train, prob_1_train)

0.32308097714509953

The balanced log loss result look pretty good.

In [50]:
final_estimator = search_XG.best_estimator_

In [51]:
train_proba = cross_val_predict(final_estimator, X, y, cv = 4, method = 'predict_proba')

df_check = pd.DataFrame({
    #'Id' : X.index,
    'class_0' : train_proba[:, 0],
    'class_1' : train_proba[:, 1],
    'y_train' : y,
    'y_predicted' : y_hat
})

In [52]:
df_class_0 = df_check[['class_0', 'y_train', 'y_predicted']]
df_class_1 = df_check[['class_1', 'y_train', 'y_predicted']]

Mean values for the grouped features:

In [53]:
df_class_0.groupby(['y_train', 'y_predicted']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,class_0
y_train,y_predicted,Unnamed: 2_level_1
0,0,0.973849
0,1,0.165178
1,0,0.862613
1,1,0.123172


Upper thresholds:

In [54]:
df_class_0.groupby(['y_train', 'y_predicted']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,class_0
y_train,y_predicted,Unnamed: 2_level_1
0,0,0.999971
0,1,0.446308
1,0,0.998378
1,1,0.464713


Lower thresholds:

In [55]:
df_class_0.groupby(['y_train', 'y_predicted']).min()

Unnamed: 0_level_0,Unnamed: 1_level_0,class_0
y_train,y_predicted,Unnamed: 2_level_1
0,0,0.541326
0,1,0.011663
1,0,0.566303
1,1,0.000503


In [56]:
df_class_0[(df_check.y_train == 0) & (df_check.y_predicted == 1)]

Unnamed: 0_level_0,class_0,y_train,y_predicted
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
48485c8c1ef7,0.016827,0,1
49e14acf8ffb,0.404593,0,1
4a3d7459976a,0.118445,0,1
58dcdd9d6e89,0.01442,0,1
5d547eb551a9,0.011663,0,1
7416fea10b6b,0.125323,0,1
80841417ca53,0.164691,0,1
8408bf4d3d2f,0.085789,0,1
8b9d72eec14e,0.226979,0,1
8e5a0d625d98,0.100663,0,1


In [57]:
df_class_0[(df_check.y_train == 1) & (df_check.y_predicted == 0)]

Unnamed: 0_level_0,class_0,y_train,y_predicted
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
044fb8a146ec,0.660751,1,0
0e798fd4e6e7,0.682231,1,0
24b40510d6a2,0.966235,1,0
3924089e4396,0.954135,1,0
46fe2d3b9e7b,0.709996,1,0
49b8f248a42f,0.921228,1,0
68729b79859b,0.975332,1,0
7cb9a95a62fc,0.994825,1,0
96fbcce4f0c0,0.750496,1,0
9ef1886b810c,0.967663,1,0


I estimate the threshholds here using mean missclassified results.

# Predictions

And finally we need to make predictions for the test set.

In [58]:
test_data = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv',  index_col = 'Id')
test_data

Unnamed: 0_level_0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,BN,BP,BQ,BR,BZ,CB,CC,CD,CF,CH,CL,CR,CS,CU,CW,DA,DE,DF,DH,DI,DL,DN,DU,DV,DY,EB,EE,EG,EH,EJ,EL,EP,EU,FC,FD,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1
00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
040e15f562a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
046e85c7cc7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
imputer = KNNImputer()
data_test_to_impute = test_data.drop(columns = 'EJ', axis = 1)
data_test_imputed = imputer.fit_transform(data_test_to_impute)
data_test_imp = pd.DataFrame(data = data_test_imputed, index = data_test_to_impute.index, columns = data_test_to_impute.columns.to_list())
data_test_1 = pd.concat([data_test_imp, test_data.EJ], axis = 1)
data_test_1.isnull().sum().sum()

0

In [60]:
len(df_train.columns.to_list())

31

In [61]:
len(data_test_1.columns.to_list())

56

In [62]:
data_test_1.shape

(5, 56)

The test set should have the same properties as the training:

In [63]:
data_test_1.drop(columns = columns_to_drop_corr, axis = 1, inplace = True)
data_test_1.drop(columns = features_to_drop_final, axis = 1, inplace = True)
print(len(data_test_1.columns.to_list()))

data_test_1_object = data_test_1.EJ
dummies_df = pd.get_dummies(data_test_1_object)
df_test = pd.concat([data_test_1, dummies_df], axis = 1)
df_test.drop('EJ', axis = 1, inplace = True)
df_test

29


Unnamed: 0_level_0,AB,AF,AM,BC,BN,BP,BQ,CC,CD,CR,CU,DA,DE,DF,DH,DI,DL,DU,DY,EB,EE,FE,FI,FL,FR,GE,GF,GL,A
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
040e15f562a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
046e85c7cc7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [64]:
df_test['B'] = np.abs(df_test['A'] - 1)
df_test.shape

(5, 30)

In [65]:
X_t = df_test
prob_predictions = final_estimator.predict_proba(X_t)
prob_predictions

array([[0.88426834, 0.11573168],
       [0.88426834, 0.11573168],
       [0.88426834, 0.11573168],
       [0.88426834, 0.11573168],
       [0.88426834, 0.11573168]], dtype=float32)

In [66]:
X_t.index

Index(['00eed32682bb', '010ebe33f668', '02fa521e1838', '040e15f562a2',
       '046e85c7cc7f'],
      dtype='object', name='Id')

In [67]:
prob_0 = prob_predictions[:, 0]
prob_0

array([0.88426834, 0.88426834, 0.88426834, 0.88426834, 0.88426834],
      dtype=float32)

In [68]:
#thresholds are taken from Thresholds estimation
#there was prob_0 > 0.86 and I'm trying new threshold:
prob_0[prob_0 > 0.72] = 1
#prob_0[prob_0 < 0.12] = 0
prob_1 = 1 - prob_0
prob_1

array([0., 0., 0., 0., 0.], dtype=float32)

In [69]:
submission = pd.DataFrame({
    'Id' : X_t.index,
    'class_0' : prob_0,
    'class_1' : prob_1
})

In [70]:
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,1.0,0.0
1,010ebe33f668,1.0,0.0
2,02fa521e1838,1.0,0.0
3,040e15f562a2,1.0,0.0
4,046e85c7cc7f,1.0,0.0


In [71]:
submission.to_csv('submission.csv',  index = False)