In [1]:
from sklearn.linear_model import LogisticRegression, ElasticNetCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, NearMiss
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from scipy.stats import chi2_contingency
import warnings
import os
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

In [2]:
# Build the four input paths in one pass; every file sits in ../data/input.
train_path, val_path, test_path, y_path = (
    os.path.join("..", "data", "input", file_name)
    for file_name in ("train_treated.csv", "val_treated.csv", "test_treated.csv", "target.csv")
)
# Original note: "this should be the one we use".
# NOTE(review): the disabled line below assigns test_path, yet y_bin.csv looks like a
# target file -- presumably y_path was intended; confirm before enabling.
# test_path = os.path.join("..", "data", "input", "y_bin.csv")

# The first CSV column becomes the row index of every frame.
X_train = pd.read_csv(train_path, index_col=0)
X_val = pd.read_csv(val_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)
y = pd.read_csv(y_path, index_col=0)

In [3]:
# Stack the training rows on top of the validation rows to get one feature frame.
X = pd.concat([X_train, X_val], axis="index")
X.head()

Unnamed: 0_level_0,race,gender,age,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,admission_type,average_pulse_bpm,discharge_disposition,...,med_glimepiride-pioglitazone,med_nateglinide,med_glipizide,med_insulin,med_metformin-pioglitazone,med_metformin-rosiglitazone,med_chlorpropamide,med_miglitol,med_repaglinide,med_tolbutamide
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
672135,Caucasian,Male,[90-100),MC,0,0,0,Emergency,93,Expired,...,0,0,0,1,0,0,0,0,0,0
794587,AfricanAmerican,Female,[70-80),Unknown,0,0,0,Emergency,74,Discharged to home,...,0,0,0,1,0,0,0,0,0,0
694232,Caucasian,Male,[80-90),MC,0,0,0,Emergency,83,Discharged to home,...,0,0,0,0,0,0,0,0,0,0
305869,AfricanAmerican,Male,[70-80),Unknown,0,0,3,Emergency,67,Discharged to home,...,0,0,0,0,0,0,0,0,0,0
181753,Caucasian,Female,[70-80),MC,0,0,0,Emergency,126,Discharged/transferred to another rehab fac in...,...,0,0,1,0,0,0,0,0,0,0


In [4]:
# Reorder the target rows so that they line up with the rows of X.
y = y.reindex(X.index)
# reindex silently inserts NaN for ids that are absent from the target file;
# fail here rather than deep inside the cross-validation.
assert y.notna().all().all(), "some encounter ids in X have no target value"
y.head()

Unnamed: 0_level_0,readmitted_binary
encounter_id,Unnamed: 1_level_1
672135,No
794587,No
694232,No
305869,No
181753,No


## balancing the dataset

After some research, we found that there are three main families of techniques for balancing a dataset: __undersampling__, __oversampling__, and a __combination__ of both.

We will test all of them to decide which method works best.

In [27]:
def evaluate(balancer, features=None, target=None, n_splits=5, random_state=42):
    """Cross-validate a gradient-boosting classifier on data rebalanced by ``balancer``.

    In every stratified fold the training partition is resampled with
    ``balancer.fit_resample`` and the model is fitted on the resampled data.
    The held-out partition is not resampled and is only used for scoring.

    Parameters
    ----------
    balancer : object
        Sampler exposing ``fit_resample(X, y)``.
    features, target : optional
        Data to evaluate on. When omitted, the notebook-level ``X`` and ``y``
        are used, so existing ``evaluate(balancer)`` calls keep working.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    tuple of four str
        Fit time in seconds, accuracy on the resampled training data, accuracy
        on the held-out data and F1 score (positive label ``'Yes'``), each
        formatted as ``"mean+/-std"`` over the folds.
    """
    # Fall back to the notebook-level frames when no data is passed explicitly.
    features = X if features is None else features
    target = y if target is None else target

    # Seeded shuffle: every balancer is scored on the same folds and a rerun
    # reproduces the numbers (the unseeded split produced different folds per call).
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    model = GradientBoostingClassifier(
        verbose=1,
        random_state=42
    )

    fit_times = []
    train_scores = []
    val_scores = []
    f1_scores = []

    for train_index, val_index in skf.split(features, target):
        # positional indexes of the observations assigned to each partition
        X_fold_train, X_fold_val = features.iloc[train_index], features.iloc[val_index]
        y_fold_train, y_fold_val = target.iloc[train_index], target.iloc[val_index]

        # only the training partition is resampled
        X_res, y_res = balancer.fit_resample(X_fold_train, y_fold_train)

        # time the fit only, not the resampling or the scoring
        begin = time.perf_counter()
        model.fit(X_res, y_res)
        end = time.perf_counter()

        # mean accuracy on the resampled training data and on the held-out data
        value_train = model.score(X_res, y_res)
        value_val = model.score(X_fold_val, y_fold_val)
        # F1 on the held-out data
        y_pred = model.predict(X_fold_val)
        value_f1 = f1_score(y_fold_val, y_pred, pos_label='Yes')

        fit_times.append(end - begin)
        train_scores.append(value_train)
        val_scores.append(value_val)
        f1_scores.append(value_f1)

    def summarise(values, mean_digits, std_digits):
        # "mean+/-std" with the requested number of decimals for each part
        return str(round(np.mean(values), mean_digits)) + '+/-' + str(round(np.std(values), std_digits))

    return (
        summarise(fit_times, 3, 2),
        summarise(train_scores, 3, 2),
        summarise(val_scores, 3, 2),
        summarise(f1_scores, 6, 6),
    )

def show_results(df, *args):
    """Fill ``df`` with the cross-validation summary of each balancer.

    Row ``i`` of ``df`` (by position) receives the four summary strings that
    ``evaluate`` returns for the ``i``-th balancer in ``args``.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with four columns and at least ``len(args)`` rows.
        It is modified in place.
    *args
        Balancers, in the same order as the rows of ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, filled in.

    Raises
    ------
    IndexError
        If more balancers than rows are given. This is checked before any
        evaluation starts, so no cross-validation time is wasted.
    """
    if len(args) > len(df):
        raise IndexError(
            f"got {len(args)} balancers but the results table only has {len(df)} rows"
        )
    for row, balancer in enumerate(args):
        # named fit_time so the imported ``time`` module is not shadowed
        fit_time, avg_train, avg_test, f1 = evaluate(balancer)
        df.iloc[row] = fit_time, avg_train, avg_test, f1
    return df

In [8]:
# Empty results table: one row per under-sampling method, one column per reported metric.
results_empty = pd.DataFrame(
    index=['Tomek', 'ENN', 'NearMiss'],
    columns=['Time', 'Train', 'Test', 'f1'],
)

# Balancers are listed in the same order as the row labels above.
results_under = show_results(
    results_empty,
    TomekLinks(n_jobs=4),
    EditedNearestNeighbours(kind_sel='all', n_neighbors=5, n_jobs=4),
    NearMiss(n_jobs=4),
)

results_under

      Iter       Train Loss   Remaining Time 
         1           0.7076           34.58s
         2           0.7015           34.37s
         3           0.6969           34.16s
         4           0.6931           33.77s
         5           0.6901           33.21s
         6           0.6877           33.05s
         7           0.6857           32.74s
         8           0.6840           32.36s
         9           0.6825           32.36s
        10           0.6812           32.09s
        20           0.6733           28.23s
        30           0.6690           24.58s
        40           0.6664           20.90s
        50           0.6642           17.47s
        60           0.6622           13.94s
        70           0.6606           10.48s
        80           0.6594            6.98s
        90           0.6583            3.48s
       100           0.6571            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.7078           34.43s
        

KeyboardInterrupt: 

In [27]:
# Empty results table: one row per over-sampling method, one column per reported metric.
results_empty = pd.DataFrame(
    index=['SMOTE', 'B-SMOTE', 'ADASYN'],
    columns=['Time', 'Train', 'Test', 'f1'],
)

# Balancers are listed in the same order as the row labels above.
results_over = show_results(
    results_empty,
    SMOTE(random_state=69),
    BorderlineSMOTE(random_state=69),
    ADASYN(random_state=69),
)

results_over

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

Unnamed: 0,Time,Train,Test,f1
SMOTE,52.505+/-3.04,0.754+/-0.0,0.729+/-0.01,0.236388+/-0.009103
B-SMOTE,56.468+/-4.54,0.759+/-0.0,0.728+/-0.0,0.243142+/-0.009321
ADASYN,53.848+/-2.01,0.751+/-0.0,0.732+/-0.01,0.232769+/-0.009269


In [31]:
# Empty results table: one row per combined (over + under) sampler.
results_empty = pd.DataFrame(
    index=['SMOTEENN', 'SMOTETomek'],
    columns=['Time', 'Train', 'Test', 'f1'],
)

# Balancers are listed in the same order as the row labels above.
results_comb = show_results(
    results_empty,
    SMOTEENN(n_jobs=4, random_state=69),
    SMOTETomek(n_jobs=4, random_state=69),
)

results_comb

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

Unnamed: 0,Time,Train,Test,f1
SMOTEENN,2.217+/-0.21,0.781+/-0.0,0.508+/-0.0,0.231324+/-0.005761
SMOTETomek,2.741+/-1.44,0.723+/-0.0,0.659+/-0.01,0.220391+/-0.009603


In [32]:
# Empty results table: one row per ENN neighbourhood size (3, 5 and 7).
results_empty = pd.DataFrame(
    index=['ENN03', 'ENN05', 'ENN07'],
    columns=['Time', 'Train', 'Test', 'f1'],
)

# Same sampler three times; only n_neighbors changes between rows.
results_enn = show_results(
    results_empty,
    EditedNearestNeighbours(kind_sel='all', n_neighbors=3, n_jobs=4),
    EditedNearestNeighbours(kind_sel='all', n_neighbors=5, n_jobs=4),
    EditedNearestNeighbours(kind_sel='all', n_neighbors=7, n_jobs=4),
)

results_enn

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

Unnamed: 0,Time,Train,Test,f1
ENN03,0.541+/-0.02,0.856+/-0.0,0.88+/-0.0,0.127599+/-0.015625
ENN05,0.429+/-0.01,0.835+/-0.0,0.867+/-0.0,0.186579+/-0.018787
ENN07,0.34+/-0.01,0.814+/-0.0,0.846+/-0.0,0.224199+/-0.011278


In [42]:
# Empty results table: one row per ENN sampling strategy.
results_empty = pd.DataFrame(
    index=['ENNmaj', 'ENNnotmaj', 'ENN'],
    columns=['Time', 'Train', 'Test', 'f1'],
)

# NOTE(review): the first two rows use n_neighbors=5 while the 'ENN' row uses
# n_neighbors=7 with the default sampling strategy -- confirm this mix is intended.
results_enn = show_results(
    results_empty,
    EditedNearestNeighbours(sampling_strategy='majority', kind_sel='all', n_neighbors=5, n_jobs=4),
    EditedNearestNeighbours(sampling_strategy='not majority', kind_sel='all', n_neighbors=5, n_jobs=4),
    EditedNearestNeighbours(kind_sel='all', n_neighbors=7, n_jobs=4),
)

results_enn

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

Unnamed: 0,Time,Train,Test,f1
ENNmaj,0.435+/-0.02,0.835+/-0.0,0.867+/-0.0,0.187795+/-0.014603
ENNnotmaj,0.56+/-0.08,1.0+/-0.0,0.888+/-0.0,0.0+/-0.0
ENN,0.343+/-0.01,0.814+/-0.0,0.846+/-0.0,0.22323+/-0.01518


ENN with `n_neighbors=7` (ENN07) is the best option.
It would be nice to see how smotenk performs once we reduce the number of features (TODO: confirm which sampler "smotenk" refers to).

In [5]:
# Resample the training split only, with ENN and 7 neighbours.
balancer = EditedNearestNeighbours(
    kind_sel='all',
    n_neighbors=7,
    n_jobs=4,
)
X_balanced, y_balanced = balancer.fit_resample(
    X_train,
    y.loc[X_train.index],  # targets restricted to the training rows
)

X_balanced.shape

(31151, 93)

<span style="color: red">TEST 1:</span> using only the random oversampler (`RandomOverSampler`)

In [5]:
# Resample the training split only, with a seeded random over-sampler.
balancer = RandomOverSampler(random_state=69)
X_balanced, y_balanced = balancer.fit_resample(
    X_train,
    y.loc[X_train.index],  # targets restricted to the training rows
)

X_balanced.shape

(101256, 67)

In [6]:
# Positions (within the frame given to fit_resample) of the rows the sampler kept.
rows = balancer.sample_indices_
# Map those positions back to the index labels of X_train.
og_indices = X_train.index[rows]

# Label the resampled rows with the labels of the rows they were drawn from.
X_balanced.set_index(og_indices, inplace=True)

X_balanced.head()

Unnamed: 0_level_0,race_0,race_1,race_2,race_3,gender,age_0,age_1,age_2,age_3,age_4,...,med_acetohexamide,med_metformin-rosiglitazone,med_metformin,med_troglitazone,med_miglitol,med_glipizide,med_glimepiride-pioglitazone,med_repaglinide,med_glyburide-metformin,age_mean
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
672135,-0.632456,0.534522,-0.316228,0.119523,1.0,-0.495434,0.522233,-0.453425,0.336581,-0.214834,...,0,0,0,0,0,0,0,0,0,1.82337
181753,-0.632456,0.534522,-0.316228,0.119523,0.0,-0.385337,0.174078,0.151142,-0.411377,0.50128,...,0,0,0,0,0,1,0,0,0,0.547373
890706,-0.632456,0.534522,-0.316228,0.119523,0.0,-0.275241,-0.087039,0.377854,-0.317882,-0.035806,...,0,0,0,0,0,0,0,0,0,1.185371
648403,-0.632456,0.534522,-0.316228,0.119523,0.0,-0.055048,-0.348155,0.12955,0.336581,-0.214834,...,0,0,0,0,0,0,0,0,0,-1.366623
947413,-0.316228,-0.267261,0.632456,-0.478091,1.0,-0.055048,-0.348155,0.12955,0.336581,-0.214834,...,0,0,0,0,0,0,0,0,0,-1.366623


In [7]:
# Sanity check: compare the resampled rows with the original rows they were drawn from.

X_balanced.compare(X_train.iloc[rows])

NameError: name 'rows' is not defined

In [8]:
# Give the resampled targets the same restored labels as X_balanced.
y_balanced.set_index(og_indices, inplace=True)
# NOTE(review): y.iloc[rows] is positional -- it assumes the leading rows of y are in
# X_train order (y was reindexed on the train+validation concat); confirm.
y_balanced.compare(y.iloc[rows])

encounter_id


In [33]:
# Balanced training targets followed by the validation targets.
y_new = pd.concat([y_balanced, y.loc[X_val.index]], axis="index")

In [39]:
# Shape of the rows whose index label already appeared earlier in y_new.
y_new.loc[y_new.index.duplicated()].shape

(21, 1)

In [11]:
# (rows, columns) of the combined target frame
y_new.shape

(115504, 1)

In [6]:
# Save the balanced training features next to the other input files; the path is
# built with os.path.join, like the paths used to load the data.
X_balanced.to_csv(os.path.join("..", "data", "input", "train_balanced.csv"))

In [7]:
# Save the balanced training targets next to the other input files; the path is
# built with os.path.join, like the paths used to load the data.
y_balanced.to_csv(os.path.join("..", "data", "input", "y_train.csv"))

In [15]:
# Display the last rows of the validation features.
X_val.tail()

Unnamed: 0_level_0,race,gender,age,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,admission_type,average_pulse_bpm,discharge_disposition,...,med_glimepiride-pioglitazone,med_nateglinide,med_glipizide,med_insulin,med_metformin-pioglitazone,med_metformin-rosiglitazone,med_chlorpropamide,med_miglitol,med_repaglinide,med_tolbutamide
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
181659,Caucasian,Male,[50-60),HM,0,0,0,Elective,89,Discharged to home,...,0,0,0,1,0,0,0,0,0,0
332892,AfricanAmerican,Female,[70-80),Unknown,0,0,1,Emergency,103,Discharged/transferred to home with home healt...,...,0,0,0,0,0,0,0,0,0,0
925848,Caucasian,Male,[60-70),Unknown,0,0,1,Emergency,127,Discharged/transferred to another rehab fac in...,...,0,0,0,1,0,0,0,0,0,0
869407,Caucasian,Female,[70-80),MC,0,0,0,Emergency,72,Discharged to home,...,0,0,1,0,0,0,0,0,0,0
988159,Caucasian,Male,[80-90),SP,0,0,2,Emergency,106,Hospice / medical facility,...,0,0,0,1,0,0,0,0,0,0
