In [1]:
import warnings
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler, MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from category_encoders import MEstimateEncoder, OneHotEncoder, PolynomialEncoder, LeaveOneOutEncoder
import os
import numpy as np
import pandas as pd
import time

warnings.filterwarnings('ignore')

In [2]:
train_path = os.path.join("..", "data", "input", "train_balanced.csv")
val_path = os.path.join("..", "data", "input", "val_treated.csv")
test_path = os.path.join("..", "data", "input", "test_treated.csv")

X = pd.read_csv(train_path, index_col=0)
X_val = pd.read_csv(val_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)

In [3]:
X.shape

(101256, 67)

In [4]:
y = pd.read_csv("../data/input/y_train.csv", index_col=0)
y_val = pd.read_csv("../data/input/y_train.csv", index_col=0)

In [5]:
numerical_features = X.select_dtypes(include='number').columns.tolist()
metric_features = [
    feature for feature in numerical_features if not feature.startswith('med_')]
metric_features

['outpatient_visits_in_previous_year',
 'emergency_visits_in_previous_year',
 'inpatient_visits_in_previous_year',
 'average_pulse_bpm',
 'length_of_stay_in_hospital',
 'number_lab_tests',
 'non_lab_procedures',
 'number_of_medications',
 'number_diagnoses',
 'age_mean',
 'outpatient_visits_in_previous_year_log',
 'emergency_visits_in_previous_year_log',
 'inpatient_visits_in_previous_year_log',
 'length_of_stay_in_hospital_log',
 'non_lab_procedures_log',
 'number_of_medications_log',
 'number_diagnoses_log',
 'age_mean_log',
 'outpatient_visits_in_previous_year_win_log',
 'emergency_visits_in_previous_year_win_log',
 'inpatient_visits_in_previous_year_win_log',
 'length_of_stay_in_hospital_win_log',
 'non_lab_procedures_win_log',
 'number_of_medications_win_log',
 'number_diagnoses_win_log',
 'age_mean_win_log']

In [6]:
X[metric_features].describe()

Unnamed: 0,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,average_pulse_bpm,length_of_stay_in_hospital,number_lab_tests,non_lab_procedures,number_of_medications,number_diagnoses,age_mean,...,number_diagnoses_log,age_mean_log,outpatient_visits_in_previous_year_win_log,emergency_visits_in_previous_year_win_log,inpatient_visits_in_previous_year_win_log,length_of_stay_in_hospital_win_log,non_lab_procedures_win_log,number_of_medications_win_log,number_diagnoses_win_log,age_mean_win_log
count,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,...,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0,101256.0
mean,0.402672,0.267698,0.912588,99.48405,4.556589,43.637098,1.310836,16.416123,7.559977,63.058485,...,42.912626,28.515821,-519.179402,-6152.569422,-21.604492,1.44063,-3.937196,4.345419,89.89079,28.515821
std,1.289343,1.048429,1.657842,23.09767,3.021382,19.476407,1.672679,8.06873,1.865646,21.170647,...,19.300251,8.937346,243.803108,2465.086585,18.414027,0.829283,5.082118,1.166488,43.725838,8.937346
min,0.0,0.0,0.0,60.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1e-05,-1.321563,-633.581196,-7140.181479,-36.948214,1e-05,-9.466973,1.318258,1.992565,-1.321563
25%,0.0,0.0,0.0,79.0,2.0,32.0,0.0,11.0,6.0,55.0,...,23.5761,26.083831,-633.581196,-7140.181479,-36.948214,0.725569,-9.466973,3.617475,43.252203,26.083831
50%,0.0,0.0,0.0,99.0,4.0,45.0,1.0,15.0,9.0,65.0,...,58.6348,29.775903,-633.581196,-7140.181479,-36.948214,1.520038,1e-05,4.323965,127.467979,29.775903
75%,0.0,0.0,1.0,119.0,6.0,57.0,2.0,20.0,9.0,75.0,...,58.6348,33.331658,-633.581196,-7140.181479,1e-05,2.019305,0.701671,5.045497,127.467979,33.331658
max,42.0,76.0,21.0,139.0,14.0,118.0,6.0,75.0,16.0,95.0,...,211.392651,40.117865,1.105174,0.750116,1.538926,3.152212,1.84943,7.325427,127.467979,40.117865


In [7]:
cat_features = [
    feature for feature in X.columns if feature not in numerical_features]
cat_features

['race',
 'gender',
 'age',
 'payer_code',
 'admission_type',
 'discharge_disposition',
 'admission_source',
 'glucose_test_result',
 'a1c_test_result',
 'change_in_meds_during_hospitalization',
 'prescribed_diabetes_meds',
 'is_outpatient_visited',
 'is_emergency_visited',
 'is_inpatient_visited',
 'is_pulse_normal',
 'primary_diagnosis_cat',
 'secondary_diagnosis_cat',
 'additional_diagnosis_cat',
 'discharge_disposition_cat',
 'admission_source_cat']

In [8]:
def avg_score(scaler):
    # apply kfold
    skf = StratifiedKFold(n_splits=10)
    model = DecisionTreeClassifier(
        criterion='gini',
        max_depth=15,
        random_state=42
    )
    # create lists to store the results from the different models
    score_train = []
    score_test = []
    timer = []
    f1_s = []

    for train_index, test_index in skf.split(X, y):
        # get the indexes of the observations assigned for each partition
        X_train, X_val = X[metric_features].iloc[train_index], X[metric_features].iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        scale = scaler.fit(X_train)
        # Transform your train data by applying the scale obtained in the previous command
        scaled_X_train = scale.transform(X_train)
        # Transform your validation data by applying the scale obtained in the first command
        scaled_X_val = scale.transform(X_val)

        oe = OrdinalEncoder()

        X_cat_train = oe.fit_transform(X[cat_features].iloc[train_index])
        X_cat_val = oe.fit_transform(X[cat_features].iloc[test_index])

        scaled_X_train = np.concatenate((scaled_X_train, X_cat_train), axis=1)
        scaled_X_val = np.concatenate((scaled_X_val, X_cat_val), axis=1)

        # start counting time
        begin = time.perf_counter()
        # fit the model to the data
        model.fit(scaled_X_train, y_train)
        # finish counting time
        end = time.perf_counter()
        # check the mean accuracy for the train
        value_train = model.score(scaled_X_train, y_train)
        # check the mean accuracy for the test
        value_test = model.score(scaled_X_val, y_val)
        # check the f1 score
        y_pred = model.predict(scaled_X_val)
        value_f1 = f1_score(y_val, y_pred, pos_label='Yes')
        # append the accuracies, the time and the number of iterations in the corresponding list
        score_train.append(value_train)
        score_test.append(value_test)
        timer.append(end-begin)
        f1_s.append(value_f1)
    # calculate the average and the std for each measure (accuracy, time and number of iterations)
    avg_time = round(np.mean(timer), 3)
    avg_train = round(np.mean(score_train), 3)
    avg_test = round(np.mean(score_test), 3)
    std_time = round(np.std(timer), 2)
    std_train = round(np.std(score_train), 2)
    std_test = round(np.std(score_test), 2)
    avg_f1 = round(np.mean(f1_s * 100), 6)
    std_f1 = round(np.std(f1_s * 100), 6)

    return str(avg_time) + '+/-' + str(std_time), str(avg_train) + '+/-' + str(std_train), \
        str(avg_test) + '+/-' + str(std_test), str(avg_f1) + '+/-' + str(std_f1)


def show_results(df, *args):
    """
    Receive an empty dataframe and the different models and call the function avg_score
    """
    count = 0
    # for each model passed as argument
    for arg in args:
        # obtain the results provided by avg_score
        time, avg_train, avg_test, f1 = avg_score(arg)
        # store the results in the right row
        df.iloc[count] = time, avg_train, avg_test, f1
        count += 1
    return df

In [9]:
results_empty = pd.DataFrame(columns=['Time', 'Train', 'Test', 'f1'], index=[
                             'MinMax[0, 1]', 'MinMax[-1, 1]', 'StandardScaler', 'Robust', 'Abs'])

results = show_results(results_empty,
                       MinMaxScaler(feature_range=(0, 1)),
                       MinMaxScaler(feature_range=(-1, 1)),
                       StandardScaler(),
                       RobustScaler(),
                       MaxAbsScaler())

results

Unnamed: 0,Time,Train,Test,f1
"MinMax[0, 1]",2.559+/-0.39,0.902+/-0.0,0.856+/-0.03,0.05501+/-0.028038
"MinMax[-1, 1]",2.697+/-0.05,0.902+/-0.0,0.856+/-0.03,0.055138+/-0.028032
StandardScaler,2.238+/-0.62,0.902+/-0.0,0.856+/-0.03,0.055137+/-0.028028
Robust,1.575+/-0.63,0.902+/-0.0,0.856+/-0.03,0.054965+/-0.028023
Abs,2.688+/-0.05,0.902+/-0.0,0.856+/-0.03,0.055127+/-0.028027


since the best results where form `MinMaxScaler[-1, 1]`, we will use that scaler


In [62]:
X_train_scaled = X.copy()
X_val_scaled = X_val.copy()
test_scaled = test.copy()

standard_scaler = MinMaxScaler(feature_range=(-1, 1))

X_train_scaled[metric_features] = standard_scaler.fit_transform(
    X[metric_features])
X_val_scaled[metric_features] = standard_scaler.transform(
    X_val[metric_features])
test_scaled[metric_features] = standard_scaler.transform(test[metric_features])

## encoding


### research

after a research found this options the best for the following type of variables:

- **binary**: binary encoder (dah!) <br/>
- **imbalanced binary**: m-estimate encoder (probably test vs binary) <br/>
- **low cardinallity**: one hot encoding vs target encoding (wins if var is corr with target variable), polynominal encoder worth looking <br/>
- **high cardinallity**: base n encoder (potentially leave one out) <br/>


now we will classify each variable


In [63]:
def describe_categorical(features, dataframe):
    # Initialize lists to store data for each column in the report
    feature_list = []
    mode_list = []
    mode_freq_list = []
    mode_prop_list = []
    second_mode_list = []
    second_mode_freq_list = []
    second_mode_prop_list = []
    missing_val_percent_list = []
    cardinality_list = []

    for feature in features:
        # Calculate mode, 2nd mode and their frequencies
        mode = dataframe[feature].mode()[0]
        mode_freq = dataframe[feature].value_counts().iloc[0]
        mode_prop = mode_freq / len(dataframe)
        second_mode = dataframe[feature].value_counts().index[1] if len(
            dataframe[feature].value_counts()) > 1 else 'N/A'
        second_mode_freq = dataframe[feature].value_counts().iloc[1] if len(
            dataframe[feature].value_counts()) > 1 else 0
        second_mode_prop = second_mode_freq / len(dataframe)

        # Calculate missing values percentage and cardinality
        missing_val_percent = dataframe[feature].isna().mean() * 100
        cardinality = dataframe[feature].nunique()

        # Append to lists
        feature_list.append(feature)
        mode_list.append(mode)
        mode_freq_list.append(mode_freq)
        mode_prop_list.append(mode_prop)
        second_mode_list.append(second_mode)
        second_mode_freq_list.append(second_mode_freq)
        second_mode_prop_list.append(second_mode_prop)
        missing_val_percent_list.append(missing_val_percent)
        cardinality_list.append(cardinality)

    # Create the DataFrame
    categorical_data_quality_report = pd.DataFrame({
        'Feature': feature_list,
        'Mode': mode_list,
        'Mode Frequency': mode_freq_list,
        'Mode Proportion': mode_prop_list,
        '2nd Mode': second_mode_list,
        '2nd Mode Frequency': second_mode_freq_list,
        '2nd Mode Proportion': second_mode_prop_list,
        'Missing Values %': missing_val_percent_list,
        'Cardinality': cardinality_list
    })

    return categorical_data_quality_report.sort_values(by=['Mode Proportion', 'Missing Values %'], ascending=False)

In [64]:
cat_info = describe_categorical(cat_features, X).set_index("Feature")
cat_info.sort_values('Cardinality')

Unnamed: 0_level_0,Mode,Mode Frequency,Mode Proportion,2nd Mode,2nd Mode Frequency,2nd Mode Proportion,Missing Values %,Cardinality
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
is_emergency_visited,False,87251,0.861687,True,14005,0.138313,0.0,2
is_outpatient_visited,False,82984,0.819546,True,18272,0.180454,0.0,2
prescribed_diabetes_meds,Yes,79416,0.784309,No,21840,0.215691,0.0,2
is_inpatient_visited,False,59753,0.590118,True,41503,0.409882,0.0,2
is_pulse_normal,True,51860,0.512167,False,49396,0.487833,0.0,2
gender,Female,54841,0.541607,Male,46415,0.458393,0.0,2
change_in_meds_during_hospitalization,No,52971,0.523139,Ch,48285,0.476861,0.0,2
glucose_test_result,none,95748,0.945603,Norm,2651,0.026181,0.0,4
a1c_test_result,none,85225,0.841679,>8,7679,0.075837,0.0,4
race,Caucasian,79571,0.78584,AfricanAmerican,18010,0.177866,0.0,5


In [65]:
bool_features = cat_info[(cat_info['Cardinality'] == 2) & (
    cat_info['Mode Proportion'] <= 0.6)].index.tolist()
imb_bool_features = cat_info[(cat_info['Cardinality'] == 2) & (
    cat_info['Mode Proportion'] > 0.6)].index.tolist()
low_card_features = cat_info[(cat_info['Cardinality'] > 2) & (
    cat_info['Cardinality'] <= 10)].index.tolist()
high_card_features = cat_info[cat_info['Cardinality'] > 10].index.tolist()

print(bool_features)
print(imb_bool_features)
print(low_card_features)
print(high_card_features)

['is_inpatient_visited', 'gender', 'change_in_meds_during_hospitalization', 'is_pulse_normal']
['is_emergency_visited', 'is_outpatient_visited', 'prescribed_diabetes_meds']
['glucose_test_result', 'a1c_test_result', 'race', 'discharge_disposition_cat', 'admission_source_cat', 'admission_type', 'age']
['admission_source', 'discharge_disposition', 'payer_code', 'secondary_diagnosis_cat', 'primary_diagnosis_cat', 'additional_diagnosis_cat']


#### bool encoding


In [66]:
oe = OrdinalEncoder()

X_train_encoded = X_train_scaled.copy()
X_val_encoded = X_val_scaled.copy()
test_encoded = test_scaled.copy()

X_train_encoded[bool_features] = oe.fit_transform(
    X_train_encoded[bool_features])
X_val_encoded[bool_features] = oe.transform(X_val_encoded[bool_features])
test_encoded[bool_features] = oe.transform(test_encoded[bool_features])

In [67]:
X_train_encoded[cat_features].head()

Unnamed: 0,race,gender,age,payer_code,admission_type,discharge_disposition,admission_source,glucose_test_result,a1c_test_result,change_in_meds_during_hospitalization,prescribed_diabetes_meds,is_outpatient_visited,is_emergency_visited,is_inpatient_visited,is_pulse_normal,primary_diagnosis_cat,secondary_diagnosis_cat,additional_diagnosis_cat,discharge_disposition_cat,admission_source_cat
0,Caucasian,1.0,[90-100),MC,Emergency,Expired,Emergency Room,none,none,0.0,Yes,False,False,0.0,1.0,Diseases of the Respiratory System,Diseases of the Circulatory System,Diseases of the Circulatory System,Other,Emergency
1,AfricanAmerican,0.0,[70-80),Unknown,Emergency,Discharged to home,Emergency Room,none,none,0.0,Yes,False,False,0.0,1.0,Diseases of the Respiratory System,"Endocrine, Nutritional and Metabolic Diseases,...",Supplemental Classification of Factors Influen...,Home,Emergency
2,Caucasian,1.0,[80-90),MC,Emergency,Discharged to home,Emergency Room,none,none,1.0,No,False,False,0.0,1.0,Neoplasms,Diseases of the Digestive System,Infectious and Parasitic Diseases,Home,Emergency
3,AfricanAmerican,1.0,[70-80),Unknown,Emergency,Discharged to home,Emergency Room,none,none,1.0,No,False,False,1.0,1.0,Mental Disorders,Diseases of the Circulatory System,Diseases of the Blood and Blood-forming Organs,Home,Emergency
4,Caucasian,0.0,[70-80),MC,Emergency,Discharged/transferred to another rehab fac in...,Emergency Room,none,none,0.0,Yes,False,False,0.0,0.0,Injury and Poisoning,Injury and Poisoning,Diseases of the Respiratory System,Facility,Emergency


In [68]:
# X_encoded = pd.concat([X_train_encoded, X_val_encoded], axis=0)

# y = y.reindex(X_encoded.index)

In [69]:
y["readmitted_binary"] = oe.fit_transform(y)
y.head()

Unnamed: 0,readmitted_binary
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [70]:
X_res = X_train_encoded.copy()

not_imb_features = [
    feature for feature in cat_features if feature not in imb_bool_features]
X_res[not_imb_features] = oe.fit_transform(X_train_encoded[not_imb_features])
X_res[imb_bool_features].head()

Unnamed: 0,is_emergency_visited,is_outpatient_visited,prescribed_diabetes_meds
0,False,False,Yes
1,False,False,Yes
2,False,False,No
3,False,False,No
4,False,False,Yes


In [51]:
for m_value in [0.1, 0.5, 1.0, 2.0]:
    
    m_estimate_encoder = MEstimateEncoder(cols=imb_bool_features, m=m_value)
    X_res = m_estimate_encoder.fit_transform(X_res, y)

    model = GaussianNB()
    scores = cross_val_score(model, X_res, y, cv=10, scoring='f1')

    print(f'm = {m_value}, Mean f1: {scores.mean()}')

m = 0.1, Mean f1: 0.666129696402761
m = 0.5, Mean f1: 0.666129696402761
m = 1.0, Mean f1: 0.666129696402761
m = 2.0, Mean f1: 0.666129696402761


In [71]:
# for imbalanced features
mee = MEstimateEncoder(verbose=1, cols=imb_bool_features, m=1)

X_train_encoded[imb_bool_features] = mee.fit_transform(
    X_train_encoded[imb_bool_features], y.loc[X_train_encoded.index])
X_val_encoded[imb_bool_features] = mee.transform(
    X_val_encoded[imb_bool_features])
test_encoded[imb_bool_features] = mee.transform(
    test_encoded[imb_bool_features])

In [72]:
X_train_encoded[imb_bool_features + bool_features].head()

Unnamed: 0,is_emergency_visited,is_outpatient_visited,prescribed_diabetes_meds,is_inpatient_visited,gender,change_in_meds_during_hospitalization,is_pulse_normal
0,0.481634,0.486817,0.510967,0.0,1.0,0.0,1.0
1,0.481634,0.486817,0.510967,0.0,0.0,0.0,1.0
2,0.481634,0.486817,0.460121,0.0,1.0,1.0,1.0
3,0.481634,0.486817,0.460121,1.0,1.0,1.0,1.0
4,0.481634,0.486817,0.510967,0.0,0.0,0.0,0.0


#### low and high cardinality encoding


In [73]:
X_res_low_car = X_train_encoded.copy()

X_res_low_car[high_card_features] = oe.fit_transform(X_res_low_car[high_card_features])

ohe = OneHotEncoder(cols=low_card_features, verbose=1,
                    use_cat_names=True, handle_unknown='value')
pe = PolynomialEncoder(cols=low_card_features, verbose=1)

for encoder in [ohe, pe]:
    res = encoder.fit_transform(X_res_low_car, y)

    model = GaussianNB()
    scores = cross_val_score(model, res, y, cv=10, scoring='f1')

    print(f'm = {type(encoder).__name__}, Mean F1: {scores.mean()}')

m = OneHotEncoder, Mean F1: 0.6671245756265924
m = PolynomialEncoder, Mean F1: 0.6661120635505406


In [74]:
pe = PolynomialEncoder(verbose=1, cols=low_card_features)

X_train_encoded = pe.fit_transform(
    X_train_encoded, y.loc[X_train_encoded.index])
X_val_encoded = pe.transform(X_val_encoded)
test_encoded = pe.transform(test_encoded)

In [75]:
X_train_encoded.drop(['intercept'], axis=1, inplace=True)
X_val_encoded.drop(['intercept'], axis=1, inplace=True)
test_encoded.drop(['intercept'], axis=1, inplace=True)

In [78]:
X_res_high_car = X_train_encoded.copy()

looe05 = LeaveOneOutEncoder(cols=high_card_features, verbose=1, sigma=0.05)
looe10 = LeaveOneOutEncoder(cols=high_card_features, verbose=1, sigma=0.10)
looe25 = LeaveOneOutEncoder(cols=high_card_features, verbose=1, sigma=0.25)
looe50 = LeaveOneOutEncoder(cols=high_card_features, verbose=1, sigma=0.50)
looe60 = LeaveOneOutEncoder(cols=high_card_features, verbose=1, sigma=0.60)
pe = PolynomialEncoder(cols=high_card_features, verbose=1)
ohe = OneHotEncoder(cols=high_card_features, verbose=1,
                    use_cat_names=True, handle_unknown='value')

for encoder in [looe05, looe10, looe25, looe50, looe60, pe, ohe]:
    res = encoder.fit_transform(X_res_high_car, y)

    model = LogisticRegression()
    scores = cross_val_score(model, res, y, cv=10, scoring='f1')

    print(f'm = {type(encoder).__name__}, Mean Accuracy: {scores.mean()}')

m = LeaveOneOutEncoder, Mean Accuracy: 0.6106362679443051
m = LeaveOneOutEncoder, Mean Accuracy: 0.6065789993555157
m = LeaveOneOutEncoder, Mean Accuracy: 0.6035795578998762
m = LeaveOneOutEncoder, Mean Accuracy: 0.6056759934463837
m = LeaveOneOutEncoder, Mean Accuracy: 0.6071079650727065
m = PolynomialEncoder, Mean Accuracy: 0.6142000124496885
m = OneHotEncoder, Mean Accuracy: 0.6148287711292296


In [79]:
looe = LeaveOneOutEncoder(cols=high_card_features, verbose=1, sigma=0.05)

X_train_encoded = looe.fit_transform(
    X_train_encoded, y.loc[X_train_encoded.index])
X_val_encoded = looe.transform(X_val_encoded)
test_encoded = looe.transform(test_encoded)

In [80]:
X_train_encoded.to_csv("../data/input/train_encoded.csv")
X_val_encoded.to_csv("../data/input/val_encoded.csv")
test_encoded.to_csv("../data/input/test_encoded.csv")
y.to_csv("../data/input/y_bin.csv")