In [2]:
import warnings
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, LassoCV
from sklearn.metrics import f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector, RFE, SelectKBest, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from scipy.stats import boxcox, chi2_contingency
from scipy.stats.mstats import winsorize
from imblearn.over_sampling import BorderlineSMOTE
import os
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

sns.set()
sns.set_palette('cividis')

warnings.filterwarnings('ignore')

In [3]:
train_path = os.path.join("..", "data", "input", "train_final.csv")
val_path = os.path.join("..", "data", "input", "val_final.csv")
test_path = os.path.join("..", "data", "input", "test_final.csv")
y_path = os.path.join("..", "data", "input", "y_bin.csv")

X_train = pd.read_csv(train_path, index_col=0)
X_val = pd.read_csv(val_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)
y = pd.read_csv(y_path, index_col=0)

In [4]:
X_train.shape[0] + X_val.shape[0] == y.shape[0]

True

In [5]:
X_train.head()

Unnamed: 0_level_0,age_mean,non_lab_procedures.1,inpatient_visits_in_previous_year.1,average_pulse_bpm,emergency_visits_in_previous_year.1,number_diagnoses,number_of_medications_log,length_of_stay_in_hospital_log,number_lab_tests.1,race_0,...,is_emergency_visited,is_inpatient_visited,discharge_disposition_cat_0,admission_source_cat_0,admission_source_cat_1,admission_source_cat_3,med_glimepiride,med_insulin,med_repaglinide,med_metformin
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
672135,1.82337,1.558432,-0.54338,-0.287912,-0.316412,0.813604,1.728691,-0.249231,1.022697,-0.632456,...,0.104627,0.083694,-0.6324555,-0.597614,0.545545,0.188982,0,1,0,0
181753,0.547373,-0.198828,-0.54338,1.143652,-0.316412,-0.735636,0.162575,0.156037,-0.154945,-0.632456,...,0.104627,0.083694,-5.663831e-17,-0.597614,0.545545,0.188982,0,0,0,0
890706,1.185371,-0.198828,-0.54338,0.623083,-0.316412,0.813604,-0.23875,0.481067,-1.281386,-0.632456,...,0.104627,0.083694,-0.3162278,-0.358569,-0.109109,-0.566947,0,1,0,0
648403,-1.366623,0.972679,0.328783,-1.372431,-0.316412,0.813604,0.285226,-0.795123,-1.742202,-0.632456,...,0.104627,0.166293,-0.3162278,0.119523,-0.436436,0.377964,0,0,0,0
947413,-1.366623,-0.784582,-0.54338,0.970129,-0.316412,-1.252049,-1.279244,0.156037,-0.00134,-0.316228,...,0.104627,0.083694,-0.3162278,-0.597614,0.545545,0.188982,0,1,0,0


In [6]:
y_train = y.loc[X_train.index]
y_val = y.loc[X_val.index]

In [7]:
X = pd.concat([X_train, X_val], axis=0)
X.shape

(45399, 33)

## Random Forest

In [10]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

y_train_pred_rfc = rfc.predict(X_train)
y_pred_rfc = rfc.predict(X_val)

print(f"Train: Accuracy: {accuracy_score(y_train, y_train_pred_rfc)}, f1: {f1_score(y_train, y_train_pred_rfc)}")
print(f"Test: Accuracy: {accuracy_score(y_val, y_pred_rfc)}, f1: {f1_score(y_val, y_pred_rfc)}")

Train: Accuracy: 1.0, f1: 1.0
Test: Accuracy: 0.839135317237507, f1: 0.2219959266802444


### parameter tuning

In [30]:
def evaluate(model):
    
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    
    score_train = []
    score_test = []
    f1_s = []
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        
        value_train = accuracy_score(y_train, y_train_pred)
        value_test = accuracy_score(y_val, y_val_pred)
        value_f1 = f1_score(y_val, y_val_pred)
        
        score_train.append(value_train)
        score_test.append(value_test)
        f1_s.append(value_f1)
        
    avg_train = round(np.mean(score_train), 3)
    avg_test = round(np.mean(score_test), 3)
    std_train = round(np.std(score_train), 2)
    std_test = round(np.std(score_test), 2)
    avg_f1 = round(np.mean(f1_s), 6)
    std_f1 = round(np.std(f1_s), 6)

    return str(avg_train) + '+/-' + str(std_train), \
        str(avg_test) + '+/-' + str(std_test), str(avg_f1) + '+/-' + str(std_f1)

def show_results(df, *args):
    for i, arg in enumerate(args):
        print(f"{i+1}th call of rfc: {arg}")
        avg_train, avg_test, f1 = evaluate(arg)
        df.iloc[i] = avg_train, avg_test, f1
    return df

In [25]:
## reviewing the number of estimators used
def get_models(values):
    models = []
    for value in values:
        models.append(RandomForestClassifier(
            random_state=69,
            n_estimators=value,
            max_depth=8
        ))
    return models

n_est = [50, 100, 200, 300, 500]

results_empty = pd.DataFrame(columns=['Train', 'Test', 'f1'], index=n_est)

results_est = show_results(results_empty, *n_est)

calling rfe with n_est 50
calling rfe with n_est 100
calling rfe with n_est 200
calling rfe with n_est 300
calling rfe with n_est 500


In [26]:
results_est

Unnamed: 0,Train,Test,f1
50,0.999+/-0.0,0.835+/-0.0,0.206221+/-0.012949
100,1.0+/-0.0,0.835+/-0.0,0.207935+/-0.006996
200,1.0+/-0.0,0.836+/-0.0,0.203993+/-0.0094
300,1.0+/-0.0,0.836+/-0.0,0.20494+/-0.008733
500,1.0+/-0.0,0.836+/-0.0,0.199407+/-0.011285


In [31]:
def get_models(values):
    models = []
    for value in values:
        models.append(RandomForestClassifier(
            random_state=69,
            n_estimators=100,
            max_depth=value
        ))
    return models

depths = [2, 4, 8, 12, 16, None]
results_empty = pd.DataFrame(columns=['Train', 'Test', 'f1'], index=depths)

results_depth = show_results(results_empty, *get_models(depths))

1th call of rfc: RandomForestClassifier(max_depth=2, random_state=69)
2th call of rfc: RandomForestClassifier(max_depth=4, random_state=69)
3th call of rfc: RandomForestClassifier(max_depth=8, random_state=69)
4th call of rfc: RandomForestClassifier(max_depth=12, random_state=69)
5th call of rfc: RandomForestClassifier(max_depth=16, random_state=69)
6th call of rfc: RandomForestClassifier(random_state=69)


In [34]:
results_depth

Unnamed: 0,Train,Test,f1
2.0,0.825+/-0.0,0.825+/-0.0,0.0+/-0.0
4.0,0.826+/-0.0,0.825+/-0.0,0.008006+/-0.002687
8.0,0.838+/-0.0,0.831+/-0.0,0.102809+/-0.015984
12.0,0.863+/-0.0,0.834+/-0.0,0.157024+/-0.010716
16.0,0.911+/-0.0,0.835+/-0.0,0.191782+/-0.005791
,1.0+/-0.0,0.836+/-0.0,0.209879+/-0.007245


In [37]:
def get_models(values):
    models = []
    for value in values:
        models.append(RandomForestClassifier(
            random_state=69,
            min_samples_split=value
        ))
    return models

splits = [2, 4, 8, 16, 32, 64, 128]

results_empty = pd.DataFrame(columns=['Train', 'Test', 'f1'], index=splits)

results_min_split = show_results(results_empty, *get_models(splits))
results_min_split

1th call of rfc: RandomForestClassifier(random_state=69)
2th call of rfc: RandomForestClassifier(min_samples_split=4, random_state=69)
3th call of rfc: RandomForestClassifier(min_samples_split=8, random_state=69)
4th call of rfc: RandomForestClassifier(min_samples_split=16, random_state=69)
5th call of rfc: RandomForestClassifier(min_samples_split=32, random_state=69)
6th call of rfc: RandomForestClassifier(min_samples_split=64, random_state=69)
7th call of rfc: RandomForestClassifier(min_samples_split=128, random_state=69)


Unnamed: 0,Train,Test,f1
2,1.0+/-0.0,0.836+/-0.0,0.205121+/-0.003511
4,0.984+/-0.0,0.835+/-0.0,0.209059+/-0.011962
8,0.923+/-0.0,0.836+/-0.0,0.206717+/-0.007306
16,0.88+/-0.0,0.836+/-0.0,0.195921+/-0.013186
32,0.856+/-0.0,0.835+/-0.0,0.176999+/-0.013797
64,0.843+/-0.0,0.835+/-0.0,0.162672+/-0.009995
128,0.837+/-0.0,0.833+/-0.0,0.138544+/-0.010453


In [38]:
def get_models(values):
    models = []
    for value in values:
        models.append(RandomForestClassifier(
            random_state=69,
            max_samples=value
        ))
    return models

samples = [0.1, 0.2, 0.4, 0.6, 0.8, None]

results_empty = pd.DataFrame(columns=['Train', 'Test', 'f1'], index=samples)

results_samples = show_results(results_empty, *get_models(samples))
results_samples

1th call of rfc: RandomForestClassifier(max_samples=0.1, random_state=69)
2th call of rfc: RandomForestClassifier(max_samples=0.2, random_state=69)
3th call of rfc: RandomForestClassifier(max_samples=0.4, random_state=69)
4th call of rfc: RandomForestClassifier(max_samples=0.6, random_state=69)
5th call of rfc: RandomForestClassifier(max_samples=0.8, random_state=69)
6th call of rfc: RandomForestClassifier(random_state=69)


Unnamed: 0,Train,Test,f1
0.1,0.846+/-0.0,0.832+/-0.0,0.148549+/-0.008243
0.2,0.864+/-0.0,0.834+/-0.0,0.169845+/-0.008981
0.4,0.914+/-0.0,0.835+/-0.0,0.18544+/-0.005221
0.6,0.974+/-0.0,0.835+/-0.0,0.200419+/-0.016275
0.8,0.998+/-0.0,0.836+/-0.0,0.20819+/-0.011787
,1.0+/-0.0,0.836+/-0.0,0.212397+/-0.00629


In [39]:
def get_models(values):
    models = []
    for value in values:
        models.append(RandomForestClassifier(
            random_state=69,
            max_features=value
        ))
    return models

samples = [0.1, 0.2, 0.4, 0.6, 0.8, None]

results_empty = pd.DataFrame(columns=['Train', 'Test', 'f1'], index=samples)

results_feat = show_results(results_empty, *get_models(samples))
results_feat

1th call of rfc: RandomForestClassifier(max_features=0.1, random_state=69)
2th call of rfc: RandomForestClassifier(max_features=0.2, random_state=69)
3th call of rfc: RandomForestClassifier(max_features=0.4, random_state=69)
4th call of rfc: RandomForestClassifier(max_features=0.6, random_state=69)
5th call of rfc: RandomForestClassifier(max_features=0.8, random_state=69)
6th call of rfc: RandomForestClassifier(max_features=None, random_state=69)


Unnamed: 0,Train,Test,f1
0.1,1.0+/-0.0,0.834+/-0.0,0.172304+/-0.018237
0.2,1.0+/-0.0,0.836+/-0.0,0.221058+/-0.007162
0.4,1.0+/-0.0,0.837+/-0.0,0.257206+/-0.008665
0.6,1.0+/-0.0,0.835+/-0.0,0.264981+/-0.009177
0.8,1.0+/-0.0,0.836+/-0.0,0.281189+/-0.009322
,1.0+/-0.0,0.836+/-0.0,0.285535+/-0.010913


In [43]:
# performing a grid search with the best values

rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'max_depth': [12, 16, None],
    'min_samples_split': [4, 8, 16],
    'max_samples': [0.6, 0.8, None],
    'bootstrap': [True, False]
}

# Create GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='f1')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_rf_classifier = grid_search.best_estimator_

# Evaluate the performance on the test set
y_pred = best_rf_classifier.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print(f"Best Parameters: {best_params}")
print(f"Accuracy on Test Set: {accuracy}")
print(f"f1 on Test Set: {f1}")

Best Parameters: {'bootstrap': False, 'max_depth': None, 'max_samples': None, 'min_samples_split': 4}
Accuracy on Test Set: 0.8312745648512072
f1 on Test Set: 0.2397216951296648


In [41]:
ch_rfc = RandomForestClassifier(min_samples_split = 4)

## MLP

### parameter tuning

will start directly with a random search, and then zoom in manually into the data

In [8]:
mlp_classifier = MLPClassifier(random_state=69)

param_dist = {
    'hidden_layer_sizes': [(50,), (100, 50,), (100, 100, 50,)],
    'max_iter': [10, 100, 200, 500],
    'activation': ['relu', 'tanh', 'sigmoid'],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'alpha': [0.0001, 0.001, 0.01],
    'batch_size': [32, 64, 128],
    'early_stopping': [True],
    'validation_fraction': [0.1],
    'n_iter_no_change': [5],
}

random_search = RandomizedSearchCV(estimator=mlp_classifier, param_distributions=param_dist, n_iter=10, cv=5, scoring='f1', random_state=69, verbose=1)

random_search.fit(X_train, y_train)

results = random_search.cv_results_

top_8_indices = results['mean_test_score'].argsort()[-8:][::-1]

for i, idx in enumerate(top_8_indices):
    print(f"\nTop {i + 1} Model:")
    print(f"Parameters: {results['params'][idx]}")
    print(f"Mean Test Score: {results['mean_test_score'][idx]}")
    print(f"Standard Deviation of Test Score: {results['std_test_score'][idx]}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END activation=sigmoid, alpha=0.01, batch_size=32, early_stopping=True, hidden_layer_sizes=(100, 100, 50), learning_rate_init=0.001, max_iter=100, n_iter_no_change=5, solver=adam, validation_fraction=0.1; total time=   0.0s
[CV] END activation=sigmoid, alpha=0.01, batch_size=32, early_stopping=True, hidden_layer_sizes=(100, 100, 50), learning_rate_init=0.001, max_iter=100, n_iter_no_change=5, solver=adam, validation_fraction=0.1; total time=   0.0s
[CV] END activation=sigmoid, alpha=0.01, batch_size=32, early_stopping=True, hidden_layer_sizes=(100, 100, 50), learning_rate_init=0.001, max_iter=100, n_iter_no_change=5, solver=adam, validation_fraction=0.1; total time=   0.0s
[CV] END activation=sigmoid, alpha=0.01, batch_size=32, early_stopping=True, hidden_layer_sizes=(100, 100, 50), learning_rate_init=0.001, max_iter=100, n_iter_no_change=5, solver=adam, validation_fraction=0.1; total time=   0.0s
[CV] END activation=sig

In [10]:
results

{'mean_fit_time': array([3.61919403e-03, 1.82109284e+00, 1.10654855e+00, 3.03378105e-03,
        6.78336620e-01, 1.89102073e+00, 3.79119730e+00, 1.65822263e+00,
        1.79598830e+01, 3.01470757e-03]),
 'std_fit_time': array([5.11930317e-04, 7.59441818e-01, 4.51067515e-01, 8.83858030e-05,
        1.29925489e-02, 7.05474773e-02, 3.95697924e-02, 1.69730612e-01,
        2.14975348e-01, 6.61634807e-04]),
 'mean_score_time': array([0.        , 0.0076519 , 0.00797229, 0.        , 0.00977125,
        0.00924149, 0.00781903, 0.01296096, 0.01214128, 0.        ]),
 'std_score_time': array([0.        , 0.00049389, 0.00104848, 0.        , 0.00078557,
        0.00066491, 0.00042658, 0.00129318, 0.00108396, 0.        ]),
 'param_validation_fraction': masked_array(data=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_solver': ma

In [20]:
chosen_mlp = MLPClassifier(validation_fraction = 0.1, solver = 'lbfgs', n_iter_no_change = 5, max_iter = 200, learning_rate_init = 0.001, hidden_layer_sizes = (100, 50, 20), early_stopping = True, batch_size = 32, alpha = 0.001, activation = 'relu')
chosen_mlp.fit(X_train, y_train)
y_train_pred = chosen_mlp.predict(X_train)
y_val_pred = chosen_mlp.predict(X_val)

value_train = accuracy_score(y_train, y_train_pred)
value_test = accuracy_score(y_val, y_val_pred)
value_f1 = f1_score(y_val, y_val_pred)

print(f"Train: Accuracy: {value_train}")
print(f"Test: Accuracy: {value_test}, f1: {value_f1}")

Train: Accuracy: 0.8763442586112805
Test: Accuracy: 0.7585626052779337, f1: 0.23111309789897186


In [42]:
gbc = GradientBoostingClassifier()
ch_rfc.fit(X_train, y_train)
y_train_pred = ch_rfc.predict(X_train)
y_val_pred = ch_rfc.predict(X_val)

value_train = accuracy_score(y_train, y_train_pred)
value_test = accuracy_score(y_val, y_val_pred)
value_f1 = f1_score(y_val, y_val_pred)

print(f"Train: Accuracy: {value_train}")
print(f"Test: Accuracy: {value_test}, f1: {value_f1}")

Train: Accuracy: 0.9878655580880229
Test: Accuracy: 0.840398652442448, f1: 0.2348586810228802


In [48]:
# gbc.fit(X, y)
test_pred = chosen_mlp.predict(test)

test_pred

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [49]:
to_submit = pd.DataFrame(test_pred, columns=['readmitted_binary'], index=test.index)

to_submit.head()

Unnamed: 0_level_0,readmitted_binary
encounter_id,Unnamed: 1_level_1
499502,0
447319,1
309126,0
181183,0
359339,0


In [50]:
to_submit = to_submit['readmitted_binary'].map({1: 'Yes', 0: 'No'})
to_submit.describe()

count     30530
unique        2
top          No
freq      24338
Name: readmitted_binary, dtype: object

In [53]:
to_submit.to_csv("../data/output/submission_nn_1.csv")

In [54]:
X.columns

Index(['age_mean', 'non_lab_procedures.1',
       'inpatient_visits_in_previous_year.1', 'average_pulse_bpm',
       'emergency_visits_in_previous_year.1', 'number_diagnoses',
       'number_of_medications_log', 'length_of_stay_in_hospital_log',
       'number_lab_tests.1', 'race_0', 'race_3', 'gender', 'age_0', 'age_2',
       'age_4', 'age_5', 'age_6', 'age_8', 'admission_type_4',
       'discharge_disposition', 'a1c_test_result_2',
       'change_in_meds_during_hospitalization', 'is_outpatient_visited',
       'is_emergency_visited', 'is_inpatient_visited',
       'discharge_disposition_cat_0', 'admission_source_cat_0',
       'admission_source_cat_1', 'admission_source_cat_3', 'med_glimepiride',
       'med_insulin', 'med_repaglinide', 'med_metformin'],
      dtype='object')