In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

In [2]:
# the data is from
# https://www.kaggle.com/code/randyrose2017/for-beginners-using-keras-to-build-models/data
df = pd.read_csv('./input/cancer.csv')
df.head(1)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [3]:
df.shape

(858, 36)

In [4]:
df.columns = [c.lower() for c in df.columns]

In [8]:
df = df.replace('?', np.nan)

In [11]:
df.iloc[:, :19].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   age                                 858 non-null    int64 
 1   number of sexual partners           832 non-null    object
 2   first sexual intercourse            851 non-null    object
 3   num of pregnancies                  802 non-null    object
 4   smokes                              845 non-null    object
 5   smokes (years)                      845 non-null    object
 6   smokes (packs/year)                 845 non-null    object
 7   hormonal contraceptives             750 non-null    object
 8   hormonal contraceptives (years)     750 non-null    object
 9   iud                                 741 non-null    object
 10  iud (years)                         741 non-null    object
 11  stds                                753 non-null    object

In [12]:
df.iloc[:, 19:].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   stds:genital herpes               753 non-null    object
 1   stds:molluscum contagiosum        753 non-null    object
 2   stds:aids                         753 non-null    object
 3   stds:hiv                          753 non-null    object
 4   stds:hepatitis b                  753 non-null    object
 5   stds:hpv                          753 non-null    object
 6   stds: number of diagnosis         858 non-null    int64 
 7   stds: time since first diagnosis  71 non-null     object
 8   stds: time since last diagnosis   71 non-null     object
 9   dx:cancer                         858 non-null    int64 
 10  dx:cin                            858 non-null    int64 
 11  dx:hpv                            858 non-null    int64 
 12  dx                    

In [7]:
df.describe()

Unnamed: 0,age,stds: number of diagnosis,dx:cancer,dx:cin,dx:hpv,dx,hinselmann,schiller,citology,biopsy
count,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0
mean,26.820513,0.087413,0.020979,0.01049,0.020979,0.027972,0.040793,0.086247,0.051282,0.064103
std,8.497948,0.302545,0.143398,0.101939,0.143398,0.164989,0.197925,0.280892,0.220701,0.245078
min,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,84.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
df = df.apply(pd.to_numeric)

In [29]:
df['biopsy'].value_counts()

0    803
1     55
Name: biopsy, dtype: int64

In [100]:
for col, val in df.apply(lambda x: x.min(skipna=True)).to_dict().items():
    df[col][df[col].isnull()] = val - 1 # min - 1 for a GBDT model

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [103]:
# all non-nan
assert df.notnull().all().all()

# With no treatment

In [104]:
X = df.drop(['biopsy'], axis=1).values
y = df['biopsy'].values

In [105]:
# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]
    
    # prepare datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's binary_logloss: 0.178452
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.156323
[3]	valid_0's binary_logloss: 0.142943
[4]	valid_0's binary_logloss: 0.133174
[5]	valid_0's binary_logloss: 0.12526
[6]	valid_0's binary_logloss: 0.118559
[7]	valid_0's binary_logloss: 0.11357
[8]	valid_0's binary_logloss: 0.109454
[9]	valid_0's binary_logloss: 0.106
[10]	valid_0's binary_logloss: 0.10251
[11]	valid_0's binary_logloss: 0.0997617
[12]	valid_0's binary_logloss: 0.0974625
[13]	valid_0's binary_logloss: 0.0951449
[14]	valid_0's binary_logloss: 0.093838
[15]	valid_0's binary_logloss: 0.0927305
[16]	valid_0's binary_logloss: 0.0912988
[17]	valid_0's binary_logloss: 0.0900782
[18]	valid_0's binary_logloss: 0.0889974
[19]	valid_0's binary_logloss: 0.0881315
[20]	valid_0's binary_logloss: 0.0875261
[21]	valid_0's binary_l

In [106]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.955, 0.648, 0.636, 0.642, 0.806


# set weight

In [107]:
from sklearn.utils.class_weight import compute_sample_weight

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets
    train_weight = compute_sample_weight(class_weight='balanced', y=y_train).astype('float32')
    lgb_train = lgb.Dataset(X_train, y_train, weight=train_weight)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's binary_logloss: 0.611383
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.543006
[3]	valid_0's binary_logloss: 0.49123
[4]	valid_0's binary_logloss: 0.446893
[5]	valid_0's binary_logloss: 0.404934
[6]	valid_0's binary_logloss: 0.373675
[7]	valid_0's binary_logloss: 0.348549
[8]	valid_0's binary_logloss: 0.324952
[9]	valid_0's binary_logloss: 0.303551
[10]	valid_0's binary_logloss: 0.283566
[11]	valid_0's binary_logloss: 0.264261
[12]	valid_0's binary_logloss: 0.249144
[13]	valid_0's binary_logloss: 0.234362
[14]	valid_0's binary_logloss: 0.221796
[15]	valid_0's binary_logloss: 0.212053
[16]	valid_0's binary_logloss: 0.203549
[17]	valid_0's binary_logloss: 0.194748
[18]	valid_0's binary_logloss: 0.18745
[19]	valid_0's binary_logloss: 0.18063
[20]	valid_0's binary_logloss: 0.174962
[21]	valid_0's binary_logloss

In [108]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.962, 0.653, 0.855, 0.74, 0.912


# undersampling

In [109]:
from imblearn.under_sampling import RandomUnderSampler

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets
    rus = RandomUnderSampler(random_state=0, replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's binary_logloss: 0.622577
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.564527
[3]	valid_0's binary_logloss: 0.516079
[4]	valid_0's binary_logloss: 0.474756
[5]	valid_0's binary_logloss: 0.439883
[6]	valid_0's binary_logloss: 0.410073
[7]	valid_0's binary_logloss: 0.383966
[8]	valid_0's binary_logloss: 0.36181
[9]	valid_0's binary_logloss: 0.340891
[10]	valid_0's binary_logloss: 0.322788
[11]	valid_0's binary_logloss: 0.307096
[12]	valid_0's binary_logloss: 0.293478
[13]	valid_0's binary_logloss: 0.281992
[14]	valid_0's binary_logloss: 0.271517
[15]	valid_0's binary_logloss: 0.262432
[16]	valid_0's binary_logloss: 0.254731
[17]	valid_0's binary_logloss: 0.247737
[18]	valid_0's binary_logloss: 0.241699
[19]	valid_0's binary_logloss: 0.237561
[20]	valid_0's binary_logloss: 0.234109
[21]	valid_0's binary_logloss: 0.231247
[22]	valid_0's binary_logloss: 0.228893
[23]	valid_

In [110]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.955, 0.6, 0.873, 0.711, 0.916


# oversampling

In [111]:
from imblearn.over_sampling import RandomOverSampler

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets
    ros = RandomOverSampler(random_state=0)
    X_train, y_train = ros.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's binary_logloss: 0.609241
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.539981
[3]	valid_0's binary_logloss: 0.482402
[4]	valid_0's binary_logloss: 0.436376
[5]	valid_0's binary_logloss: 0.395938
[6]	valid_0's binary_logloss: 0.361919
[7]	valid_0's binary_logloss: 0.332702
[8]	valid_0's binary_logloss: 0.306761
[9]	valid_0's binary_logloss: 0.284887
[10]	valid_0's binary_logloss: 0.264944
[11]	valid_0's binary_logloss: 0.247735
[12]	valid_0's binary_logloss: 0.23265
[13]	valid_0's binary_logloss: 0.220409
[14]	valid_0's binary_logloss: 0.20892
[15]	valid_0's binary_logloss: 0.198928
[16]	valid_0's binary_logloss: 0.191032
[17]	valid_0's binary_logloss: 0.183693
[18]	valid_0's binary_logloss: 0.177689
[19]	valid_0's binary_logloss: 0.172886
[20]	valid_0's binary_logloss: 0.167632
[21]	valid_0's binary_loglos

In [112]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.956, 0.623, 0.782, 0.694, 0.875


# SMOTE

In [113]:
from imblearn.over_sampling import SMOTE 

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    print(X_train.shape)
    print(y_train.shape)


    # prepare datasets
    sm = SMOTE(random_state=0)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

(686, 35)
(686,)
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's binary_logloss: 0.610269
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.542206
[3]	valid_0's binary_logloss: 0.486102
[4]	valid_0's binary_logloss: 0.438518
[5]	valid_0's binary_logloss: 0.398705
[6]	valid_0's binary_logloss: 0.363962
[7]	valid_0's binary_logloss: 0.334631
[8]	valid_0's binary_logloss: 0.309274
[9]	valid_0's binary_logloss: 0.287525
[10]	valid_0's binary_logloss: 0.268888
[11]	valid_0's binary_logloss: 0.251575
[12]	valid_0's binary_logloss: 0.237423
[13]	valid_0's binary_logloss: 0.224776
[14]	valid_0's binary_logloss: 0.213418
[15]	valid_0's binary_logloss: 0.203728
[16]	valid_0's binary_logloss: 0.195517
[17]	valid_0's binary_logloss: 0.187679
[18]	valid_0's binary_logloss: 0.181673
[19]	valid_0's binary_logloss: 0.175203
[20]	valid_0's binary_logloss: 0.170709
[21]	vali

In [114]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.959, 0.661, 0.745, 0.701, 0.86


# ADASYN

In [115]:
from imblearn.over_sampling import ADASYN 

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets
    ad = ADASYN(random_state=0)
    X_train, y_train = ad.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's binary_logloss: 0.616827
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.548621
[3]	valid_0's binary_logloss: 0.492707
[4]	valid_0's binary_logloss: 0.445565
[5]	valid_0's binary_logloss: 0.405134
[6]	valid_0's binary_logloss: 0.371242
[7]	valid_0's binary_logloss: 0.342488
[8]	valid_0's binary_logloss: 0.316918
[9]	valid_0's binary_logloss: 0.295573
[10]	valid_0's binary_logloss: 0.27602
[11]	valid_0's binary_logloss: 0.258428
[12]	valid_0's binary_logloss: 0.241574
[13]	valid_0's binary_logloss: 0.227907
[14]	valid_0's binary_logloss: 0.21465
[15]	valid_0's binary_logloss: 0.203838
[16]	valid_0's binary_logloss: 0.194215
[17]	valid_0's binary_logloss: 0.186291
[18]	valid_0's binary_logloss: 0.17988
[19]	valid_0's binary_logloss: 0.174268
[20]	valid_0's binary_logloss: 0.169862
[21]	valid_0's binary_logloss

In [116]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.957, 0.636, 0.764, 0.694, 0.867


# undersampling + bagging


In [117]:
from imblearn.under_sampling import RandomUnderSampler

def undersample_bagg(X_train, X_eval, y_train, y_eval):
    # prepare datasets
    rus = RandomUnderSampler(replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)

    return y_pred_proba

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    n_bagging = 10
    preds = [undersample_bagg(X_train, X_eval, y_train, y_eval) for i in range(n_bagging)]
    y_pred_proba = sum(preds) / n_bagging

    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's binary_logloss: 0.619027
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.557893
[3]	valid_0's binary_logloss: 0.506747
[4]	valid_0's binary_logloss: 0.463496
[5]	valid_0's binary_logloss: 0.426621
[6]	valid_0's binary_logloss: 0.39666
[7]	valid_0's binary_logloss: 0.369024
[8]	valid_0's binary_logloss: 0.345117
[9]	valid_0's binary_logloss: 0.325844
[10]	valid_0's binary_logloss: 0.307509
[11]	valid_0's binary_logloss: 0.291536
[12]	valid_0's binary_logloss: 0.278899
[13]	valid_0's binary_logloss: 0.266488
[14]	valid_0's binary_logloss: 0.256931
[15]	valid_0's binary_logloss: 0.247228
[16]	valid_0's binary_logloss: 0.240006
[17]	valid_0's binary_logloss: 0.232385
[18]	valid_0's binary_logloss: 0.225734
[19]	valid_0's binary_logloss: 0.219931
[20]	valid_0's binary_logloss: 0.214873
[21]	valid_0's binary_logloss: 0.210674
[22]	valid_0's binary_logloss: 0.207203
[23]	valid_

In [118]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.962, 0.649, 0.873, 0.744, 0.92


# undersampling + bagging + calib


In [119]:
from imblearn.under_sampling import RandomUnderSampler

def calibrate(prob, beta):
    return prob / (prob + (1 - prob) / beta)

def undersample_bagg(X_train, X_eval, y_train, y_eval):
    # prepare datasets
    rus = RandomUnderSampler(replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # undersampling rate
    us_rate = sum(y_train == 1) / sum(y == 0)
    # calibrate probability
    y_pred_proba = calibrate(y_pred_proba, us_rate)

    return y_pred_proba

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    n_bagging = 10
    preds = [undersample_bagg(X_train, X_eval, y_train, y_eval) for i in range(n_bagging)]
    y_pred_proba = sum(preds) / n_bagging

    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's binary_logloss: 0.620768
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.561986
[3]	valid_0's binary_logloss: 0.513535
[4]	valid_0's binary_logloss: 0.473176
[5]	valid_0's binary_logloss: 0.439306
[6]	valid_0's binary_logloss: 0.410732
[7]	valid_0's binary_logloss: 0.386541
[8]	valid_0's binary_logloss: 0.366022
[9]	valid_0's binary_logloss: 0.348604
[10]	valid_0's binary_logloss: 0.333825
[11]	valid_0's binary_logloss: 0.321308
[12]	valid_0's binary_logloss: 0.310735
[13]	valid_0's binary_logloss: 0.302801
[14]	valid_0's binary_logloss: 0.296275
[15]	valid_0's binary_logloss: 0.291844
[16]	valid_0's binary_logloss: 0.28733
[17]	valid_0's binary_logloss: 0.284612
[18]	valid_0's binary_logloss: 0.281588
[19]	valid_0's binary_logloss: 0.279258
[20]	valid_0's binary_logloss: 0.278267
[21]	valid_0's binary_logloss: 0.276858
[22]	valid_0's binary_logloss: 0.275896
[23]	valid_

In [120]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.95, 0.688, 0.4, 0.506, 0.694


# undersampling + calib


In [121]:
from imblearn.under_sampling import RandomUnderSampler

def calibrate(prob, beta):
    return prob / (prob + (1 - prob) / beta)

# stratified kfold split
kf = StratifiedKFold(n_splits=5)
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets
    rus = RandomUnderSampler(replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(lgbm_params, lgb_train,
                        # validation data for the model
                        valid_sets=lgb_eval,
                        # train up to 10000 rounds
                        num_boost_round=10000,
                        # if the score doesn't increase for 10 rounds, stop training
                        early_stopping_rounds=10)

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # undersampling rate
    us_rate = sum(y_train == 1) / sum(y == 0)
    # calibrate probability
    y_pred_proba = calibrate(y_pred_proba, us_rate)

    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's binary_logloss: 0.606292
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.535746
[3]	valid_0's binary_logloss: 0.477467
[4]	valid_0's binary_logloss: 0.428723
[5]	valid_0's binary_logloss: 0.387578
[6]	valid_0's binary_logloss: 0.352608
[7]	valid_0's binary_logloss: 0.322733
[8]	valid_0's binary_logloss: 0.297116
[9]	valid_0's binary_logloss: 0.275095
[10]	valid_0's binary_logloss: 0.256137
[11]	valid_0's binary_logloss: 0.239809
[12]	valid_0's binary_logloss: 0.225752
[13]	valid_0's binary_logloss: 0.213668
[14]	valid_0's binary_logloss: 0.208811
[15]	valid_0's binary_logloss: 0.199275
[16]	valid_0's binary_logloss: 0.196435
[17]	valid_0's binary_logloss: 0.194307
[18]	valid_0's binary_logloss: 0.192788
[19]	valid_0's binary_logloss: 0.191792
[20]	valid_0's binary_logloss: 0.191249
[21]	valid_0's binary_logloss: 0.191098
[22]	valid_0's binary_logloss: 0.191543
[23]	valid

In [122]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scores = [round(f(y, oof) ,3) for f in score_funcs]
print(', '.join(map(str, scores)))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.952, 0.706, 0.436, 0.539, 0.712


In [127]:
def generate_markdown_table(txt):
    table = []
    n_cols = -1
    # for each line of table
    for line in score_txt.split('\n'):
        # number of columns
        n_cols = line.count(',') + 1
        row = '|'
        # iterate through elements in a line and make markdown table expression
        for elem in line.strip().split(','):
            row += elem.strip() + '|'
        table.append(row)

    # insert the separator between the header and the body
    table.insert(1, '|' + ':-|' * n_cols)
    table_txt = '\n'.join(table)
    print(table_txt)

score_txt = """condition, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
no treatment, 0.955, 0.648, 0.636, 0.642, 0.806
set weight, 0.962, 0.653, 0.855, 0.74, 0.912
undersampling, 0.955, 0.6, 0.873, 0.711, 0.916
oversampling, 0.956, 0.623, 0.782, 0.694, 0.875
SMOTE, 0.959, 0.661, 0.745, 0.701, 0.86
ADASYN, 0.957, 0.636, 0.764, 0.694, 0.867
undersampling + bagging, 0.962, 0.649, 0.873, 0.744, 0.92
undersampling + bagging + calib, 0.95, 0.688, 0.4, 0.506, 0.694
undersampling + calib, 0.952, 0.706, 0.436, 0.539, 0.712"""

generate_markdown_table(score_txt)

|condition|accuracy_score|precision_score|recall_score|f1_score|roc_auc_score|
|:-|:-|:-|:-|:-|:-|
|no treatment|0.955|0.648|0.636|0.642|0.806|
|set weight|0.962|0.653|0.855|0.74|0.912|
|undersampling|0.955|0.6|0.873|0.711|0.916|
|oversampling|0.956|0.623|0.782|0.694|0.875|
|SMOTE|0.959|0.661|0.745|0.701|0.86|
|ADASYN|0.957|0.636|0.764|0.694|0.867|
|undersampling + bagging|0.962|0.649|0.873|0.744|0.92|
|undersampling + bagging + calib|0.95|0.688|0.4|0.506|0.694|
|undersampling + calib|0.952|0.706|0.436|0.539|0.712|
