In [1]:
import warnings
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, LassoCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector, RFE, SelectKBest, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from scipy.stats import boxcox, chi2_contingency
from scipy.stats.mstats import winsorize
from imblearn.over_sampling import BorderlineSMOTE
import os
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

# Apply seaborn's default theme, then use 'cividis' as the default palette.
sns.set()
sns.set_palette('cividis')

# NOTE(review): this silences every warning for the rest of the notebook,
# including any raised by the model fits below; consider a narrower filter.
warnings.filterwarnings('ignore')

In [2]:
# All pre-processed splits are read from ../data/clean.
clean_dir = os.path.join("..", "data", "clean")

train_path = os.path.join(clean_dir, "x_train_encoded.csv")
val_path = os.path.join(clean_dir, "x_val_encoded.csv")
test_path = os.path.join(clean_dir, "test_clean.csv")
y_train_path = os.path.join(clean_dir, "y_train_copy.csv")
y_val_path = os.path.join(clean_dir, "y_val_copy_10.csv")

# The first column of every CSV is used as the row index.
X_train, X_val, test, y_train, y_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_path, val_path, test_path, y_train_path, y_val_path)
)

In [3]:
# X_train.shape[0] + X_val.shape[0] == y.shape[0]

In [4]:
# One boolean per column of `test`: True where that column holds at least one NaN.
(test.isna().sum() > 0).values

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [5]:
# Keep only the training features in `test`, in the training column order.
test = test.loc[:, X_train.columns]

In [6]:
# Position-wise comparison of the two column indexes: lists the columns of
# `test` whose name differs from the column at the same position in `X_train`.
# NOTE(review): once `test` has been re-indexed with `X_train.columns` this is
# expected to be empty.
test.loc[:, test.columns != X_train.columns].columns

Index([], dtype='object')

In [7]:
# Same position-wise check from the `X_train` side: columns of `X_train` whose
# name differs from the column at the same position in `test`.
X_train.loc[:, test.columns != X_train.columns].columns

Index([], dtype='object')

In [8]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,average_pulse_bpm,length_of_stay_in_hospital,number_lab_tests,non_lab_procedures,number_of_medications,number_diagnoses,gender,age,...,admission_source_ Transfer from Ambulatory Surgery Center,admission_source_ Transfer from a Skilled Nursing Facility (SNF),admission_source_ Transfer from another health care facility,admission_source_ Transfer from critial access hospital,admission_source_ Transfer from hospital inpt/same fac reslt in a sep claim,admission_source_Clinic Referral,admission_source_HMO Referral,admission_source_Normal Delivery,admission_source_Transfer from a hospital,admission_source_Unknown
0,-0.376484,-0.742379,-1.41684,-0.893048,1.727976,-0.956069,0.303272,0.316693,0,7,...,False,False,False,False,False,False,False,False,False,False
1,3.018289,1.678655,-1.243196,-0.352243,-1.682598,-0.956069,-0.090359,0.722132,0,9,...,False,False,False,False,False,False,False,False,False,True
2,-0.376484,-0.742379,1.491699,0.409978,0.952846,1.816751,1.257188,-0.650392,0,5,...,False,False,False,False,False,False,False,False,False,True
3,-0.376484,-0.742379,0.927356,-0.893048,-0.184012,-0.956069,-0.572126,0.722132,0,6,...,False,False,False,False,False,False,False,False,False,False
4,-0.376484,-0.742379,0.927356,-0.352243,0.901171,0.744078,-0.239315,-0.136549,1,7,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Disabled alternative: slice both targets out of a single `y` frame.
# y_train = y.loc[X_train.index]
# y_val = y.loc[X_val.index]

# The targets are read from separate files above, so stack train and
# validation row-wise to obtain the full target frame.
y = pd.concat([y_train, y_val], axis=0)

In [10]:
# Stack the training and validation features row-wise and report the shape.
X = pd.concat((X_train, X_val), axis="index")
X.shape

(115504, 130)

## Random Forest

In [10]:
# Baseline: an untuned random forest fitted on the training split and scored
# on both the training and the validation split.
# The seed is fixed (same value as the tuned forests further down) so the
# reported scores can be reproduced on a fresh kernel.
rfc = RandomForestClassifier(random_state=69)
rfc.fit(X_train, y_train)

y_train_pred_rfc = rfc.predict(X_train)
y_pred_rfc = rfc.predict(X_val)

# The "Test" line below is computed on the validation split (X_val / y_val).
print(f"Train: Accuracy: {accuracy_score(y_train, y_train_pred_rfc)}, f1: {f1_score(y_train, y_train_pred_rfc)}")
print(f"Test: Accuracy: {accuracy_score(y_val, y_pred_rfc)}, f1: {f1_score(y_val, y_pred_rfc)}")

Train: Accuracy: 1.0, f1: 1.0
Test: Accuracy: 0.839135317237507, f1: 0.2219959266802444


### parameter tuning

In [8]:
def evaluate(model, X=None, y=None, n_splits=5, random_state=69):
    """Cross-validate ``model`` with stratified K-fold and summarise the scores.

    Each fold gets its own local train/validation names. Assigning to
    ``X_train`` / ``y_train`` inside the function would turn them into local
    variables (``UnboundLocalError`` on the ``split`` call) and would also
    shrink the data from one fold to the next.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Re-fitted on the training part of every fold.
    X, y : pandas objects, optional
        Features and target to split (indexed with ``.iloc``). They default to
        the notebook-level ``X_train`` and ``y_train``.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int or None, default 69
        Seed of the shuffled folds; ``None`` draws a new shuffle on every call.

    Returns
    -------
    tuple of str
        ``(train accuracy, validation accuracy, validation F1)``, each
        formatted as ``'<mean>+/-<std>'`` over the folds.
    """
    # Fall back to the notebook-level training split when no data is passed.
    features = X_train if X is None else X
    target = y_train if y is None else y

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    score_train = []
    score_test = []
    f1_s = []

    for train_index, test_index in skf.split(features, target):
        X_fold_train, X_fold_val = features.iloc[train_index], features.iloc[test_index]
        y_fold_train, y_fold_val = target.iloc[train_index], target.iloc[test_index]

        model.fit(X_fold_train, y_fold_train)
        fold_train_pred = model.predict(X_fold_train)
        fold_val_pred = model.predict(X_fold_val)

        score_train.append(accuracy_score(y_fold_train, fold_train_pred))
        score_test.append(accuracy_score(y_fold_val, fold_val_pred))
        f1_s.append(f1_score(y_fold_val, fold_val_pred))

    avg_train = round(np.mean(score_train), 3)
    avg_test = round(np.mean(score_test), 3)
    std_train = round(np.std(score_train), 2)
    std_test = round(np.std(score_test), 2)
    avg_f1 = round(np.mean(f1_s), 6)
    std_f1 = round(np.std(f1_s), 6)

    return str(avg_train) + '+/-' + str(std_train), \
        str(avg_test) + '+/-' + str(std_test), str(avg_f1) + '+/-' + str(std_f1)

def show_results(df, *args):
    """Evaluate every model in ``args`` and write its scores into ``df``.

    The model at position ``k`` fills row ``k`` of ``df`` (positional, via
    ``.iloc``) with the three strings returned by ``evaluate``. ``df`` is
    modified in place and also returned.
    """
    for position, model in enumerate(args):
        print(f"{position + 1}th call of rfc: {model}")
        train_acc, val_acc, val_f1 = evaluate(model)
        df.iloc[position] = (train_acc, val_acc, val_f1)
    return df

In [10]:
## reviewing the number of estimators used
def get_models(values):
    """Return one depth-8 random forest per entry of ``values`` (used as ``n_estimators``)."""
    return [
        RandomForestClassifier(n_estimators=n_trees, max_depth=8, random_state=69)
        for n_trees in values
    ]

# Forest sizes to compare; one result row per size.
n_est = [50, 100, 200, 300, 500]
results_empty = pd.DataFrame(index=n_est, columns=['Train', 'Test', 'f1'])

# show_results fills the frame row by row and hands the same frame back.
results_est = show_results(results_empty, *get_models(n_est))

1th call of rfc: RandomForestClassifier(max_depth=8, n_estimators=50, random_state=69)
2th call of rfc: RandomForestClassifier(max_depth=8, random_state=69)
3th call of rfc: RandomForestClassifier(max_depth=8, n_estimators=200, random_state=69)
4th call of rfc: RandomForestClassifier(max_depth=8, n_estimators=300, random_state=69)
5th call of rfc: RandomForestClassifier(max_depth=8, n_estimators=500, random_state=69)


In [11]:
# Cross-validated scores per number of estimators.
results_est

Unnamed: 0,Train,Test,f1
50,0.655+/-0.0,0.648+/-0.0,0.645488+/-0.008197
100,0.656+/-0.0,0.649+/-0.0,0.647746+/-0.007273
200,0.658+/-0.0,0.65+/-0.0,0.64778+/-0.002413
300,0.658+/-0.0,0.65+/-0.0,0.650903+/-0.005649
500,0.659+/-0.0,0.65+/-0.0,0.65064+/-0.003322


In [12]:
def get_models(values):
    """Return one 100-tree random forest per entry of ``values`` (used as ``max_depth``)."""
    return [
        RandomForestClassifier(n_estimators=100, max_depth=depth, random_state=69)
        for depth in values
    ]

# Tree depths to compare; None lets the trees grow without a depth limit.
depths = [2, 4, 8, 12, 16, None]
results_empty = pd.DataFrame(index=depths, columns=['Train', 'Test', 'f1'])

# show_results fills the frame row by row and hands the same frame back.
results_depth = show_results(results_empty, *get_models(depths))

results_depth

1th call of rfc: RandomForestClassifier(max_depth=2, random_state=69)
2th call of rfc: RandomForestClassifier(max_depth=4, random_state=69)
3th call of rfc: RandomForestClassifier(max_depth=8, random_state=69)
4th call of rfc: RandomForestClassifier(max_depth=12, random_state=69)
5th call of rfc: RandomForestClassifier(max_depth=16, random_state=69)
6th call of rfc: RandomForestClassifier(random_state=69)


Unnamed: 0,Train,Test,f1
2.0,0.613+/-0.0,0.612+/-0.0,0.58351+/-0.011287
4.0,0.624+/-0.0,0.623+/-0.0,0.618239+/-0.013842
8.0,0.656+/-0.0,0.649+/-0.0,0.647173+/-0.005781
12.0,0.728+/-0.0,0.705+/-0.0,0.705984+/-0.00273
16.0,0.841+/-0.0,0.801+/-0.0,0.80331+/-0.002756
,1.0+/-0.0,0.996+/-0.0,0.996277+/-0.000683


In [13]:
# Cross-validated scores per max_depth.
# NOTE(review): the sweep cell above already ends by displaying this frame.
results_depth

Unnamed: 0,Train,Test,f1
2.0,0.59+/-0.01,0.589+/-0.0,0.263862+/-0.028116
4.0,0.617+/-0.0,0.615+/-0.0,0.407019+/-0.012767
8.0,0.651+/-0.0,0.646+/-0.0,0.499245+/-0.003593
12.0,0.722+/-0.0,0.702+/-0.0,0.609754+/-0.00611
16.0,0.831+/-0.0,0.795+/-0.0,0.748823+/-0.004432
,1.0+/-0.0,0.984+/-0.0,0.981708+/-0.000528


In [37]:
def get_models(values):
    """Return one random forest per entry of ``values`` (used as ``min_samples_split``)."""
    return [
        RandomForestClassifier(min_samples_split=min_split, random_state=69)
        for min_split in values
    ]

# min_samples_split values to compare: the powers of two from 2 to 128.
splits = [2 ** exponent for exponent in range(1, 8)]
results_empty = pd.DataFrame(index=splits, columns=['Train', 'Test', 'f1'])

# show_results fills the frame row by row and hands the same frame back.
results_min_split = show_results(results_empty, *get_models(splits))
results_min_split

1th call of rfc: RandomForestClassifier(random_state=69)
2th call of rfc: RandomForestClassifier(min_samples_split=4, random_state=69)
3th call of rfc: RandomForestClassifier(min_samples_split=8, random_state=69)
4th call of rfc: RandomForestClassifier(min_samples_split=16, random_state=69)
5th call of rfc: RandomForestClassifier(min_samples_split=32, random_state=69)
6th call of rfc: RandomForestClassifier(min_samples_split=64, random_state=69)
7th call of rfc: RandomForestClassifier(min_samples_split=128, random_state=69)


Unnamed: 0,Train,Test,f1
2,1.0+/-0.0,0.836+/-0.0,0.205121+/-0.003511
4,0.984+/-0.0,0.835+/-0.0,0.209059+/-0.011962
8,0.923+/-0.0,0.836+/-0.0,0.206717+/-0.007306
16,0.88+/-0.0,0.836+/-0.0,0.195921+/-0.013186
32,0.856+/-0.0,0.835+/-0.0,0.176999+/-0.013797
64,0.843+/-0.0,0.835+/-0.0,0.162672+/-0.009995
128,0.837+/-0.0,0.833+/-0.0,0.138544+/-0.010453


In [38]:
def get_models(values):
    """Return one random forest per entry of ``values`` (used as ``max_samples``)."""
    return [
        RandomForestClassifier(max_samples=sample_fraction, random_state=69)
        for sample_fraction in values
    ]

# max_samples values to compare; None is the estimator's default setting.
samples = [0.1, 0.2, 0.4, 0.6, 0.8, None]
results_empty = pd.DataFrame(index=samples, columns=['Train', 'Test', 'f1'])

# show_results fills the frame row by row and hands the same frame back.
results_samples = show_results(results_empty, *get_models(samples))
results_samples

1th call of rfc: RandomForestClassifier(max_samples=0.1, random_state=69)
2th call of rfc: RandomForestClassifier(max_samples=0.2, random_state=69)
3th call of rfc: RandomForestClassifier(max_samples=0.4, random_state=69)
4th call of rfc: RandomForestClassifier(max_samples=0.6, random_state=69)
5th call of rfc: RandomForestClassifier(max_samples=0.8, random_state=69)
6th call of rfc: RandomForestClassifier(random_state=69)


Unnamed: 0,Train,Test,f1
0.1,0.846+/-0.0,0.832+/-0.0,0.148549+/-0.008243
0.2,0.864+/-0.0,0.834+/-0.0,0.169845+/-0.008981
0.4,0.914+/-0.0,0.835+/-0.0,0.18544+/-0.005221
0.6,0.974+/-0.0,0.835+/-0.0,0.200419+/-0.016275
0.8,0.998+/-0.0,0.836+/-0.0,0.20819+/-0.011787
,1.0+/-0.0,0.836+/-0.0,0.212397+/-0.00629


In [39]:
def get_models(values):
    """Return one random forest per entry of ``values`` (used as ``max_features``)."""
    return [
        RandomForestClassifier(max_features=feature_fraction, random_state=69)
        for feature_fraction in values
    ]

# max_features values to compare; None makes every split consider all features.
# Kept under its own name so this cell no longer rebinds `samples`, which
# holds the max_samples grid of the previous sweep.
feature_fractions = [0.1, 0.2, 0.4, 0.6, 0.8, None]

results_empty = pd.DataFrame(columns=['Train', 'Test', 'f1'], index=feature_fractions)

results_feat = show_results(results_empty, *get_models(feature_fractions))
results_feat

1th call of rfc: RandomForestClassifier(max_features=0.1, random_state=69)
2th call of rfc: RandomForestClassifier(max_features=0.2, random_state=69)
3th call of rfc: RandomForestClassifier(max_features=0.4, random_state=69)
4th call of rfc: RandomForestClassifier(max_features=0.6, random_state=69)
5th call of rfc: RandomForestClassifier(max_features=0.8, random_state=69)
6th call of rfc: RandomForestClassifier(max_features=None, random_state=69)


Unnamed: 0,Train,Test,f1
0.1,1.0+/-0.0,0.834+/-0.0,0.172304+/-0.018237
0.2,1.0+/-0.0,0.836+/-0.0,0.221058+/-0.007162
0.4,1.0+/-0.0,0.837+/-0.0,0.257206+/-0.008665
0.6,1.0+/-0.0,0.835+/-0.0,0.264981+/-0.009177
0.8,1.0+/-0.0,0.836+/-0.0,0.281189+/-0.009322
,1.0+/-0.0,0.836+/-0.0,0.285535+/-0.010913


In [43]:
# performing a grid search with the best values

# Seeded (same value as the other forests in this notebook) for reproducible fits.
rf_classifier = RandomForestClassifier(random_state=69)

# Define the parameter grid.
# `max_samples` only has an effect when bootstrapping: recent scikit-learn
# releases raise for a non-None `max_samples` combined with `bootstrap=False`,
# so those candidates would only produce failed (NaN) fits. The two bootstrap
# settings therefore get separate sub-grids instead of a full cross product.
param_grid = [
    {
        'bootstrap': [True],
        'max_depth': [12, 16, None],
        'min_samples_split': [4, 8, 16],
        'max_samples': [0.6, 0.8, None],
    },
    {
        'bootstrap': [False],
        'max_depth': [12, 16, None],
        'min_samples_split': [4, 8, 16],
        'max_samples': [None],
    },
]

# Create GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='f1')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_rf_classifier = grid_search.best_estimator_

# Evaluate the performance on the validation split (reported as "Test Set" below)
y_pred = best_rf_classifier.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print(f"Best Parameters: {best_params}")
print(f"Accuracy on Test Set: {accuracy}")
print(f"f1 on Test Set: {f1}")

Best Parameters: {'bootstrap': False, 'max_depth': None, 'max_samples': None, 'min_samples_split': 4}
Accuracy on Test Set: 0.8312745648512072
f1 on Test Set: 0.2397216951296648


In [14]:
# Default-parameter forest with a fixed seed, scored on train and validation.
ch_rfc = RandomForestClassifier(random_state=69)
ch_rfc.fit(X_train, y_train)

y_train_pred, y_val_pred = ch_rfc.predict(X_train), ch_rfc.predict(X_val)

value_train = accuracy_score(y_train, y_train_pred)
value_test, value_f1 = accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred)

# "Test" refers to the validation split here.
print(f"Train: Accuracy: {value_train}")
print(f"Test: Accuracy: {value_test}, f1: {value_f1}")

Train: Accuracy: 1.0
Test: Accuracy: 0.8884755755193712, f1: 0.03755299818291944


## MLP

### parameter tuning

We start directly with a randomized search and then manually zoom in on the most promising parameter values.

In [10]:
# Randomized search over MLP hyper-parameters, scored by cross-validated F1.
# Both the network and the parameter sampling are seeded for reproducibility.
mlp_classifier = MLPClassifier(random_state=69)

param_dist = {
    'hidden_layer_sizes': [(50,), (100, 50,), (100, 100, 50,)],
    'max_iter': [10, 100, 200, 500],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'alpha': [0.0001, 0.001, 0.01],
    'batch_size': [32, 64, 128],
    'early_stopping': [True],
    'validation_fraction': [0.1],
    'n_iter_no_change': [5],
}

random_search = RandomizedSearchCV(estimator=mlp_classifier, param_distributions=param_dist, n_iter=10, cv=5, scoring='f1', random_state=69, verbose=1)

random_search.fit(X_train, y_train)

results = random_search.cv_results_

# Candidates whose fits fail are scored NaN. A plain argsort puts NaN last,
# so they would be listed as the "top" models once the order is reversed.
# Rank NaN as the worst possible score instead.
mean_scores = np.asarray(results['mean_test_score'], dtype=float)
top_8_indices = np.argsort(np.where(np.isnan(mean_scores), -np.inf, mean_scores))[-8:][::-1]

for i, idx in enumerate(top_8_indices):
    print(f"\nTop {i + 1} Model:")
    print(f"Parameters: {results['params'][idx]}")
    print(f"Mean Test Score: {results['mean_test_score'][idx]}")
    print(f"Standard Deviation of Test Score: {results['std_test_score'][idx]}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits

Top 1 Model:
Parameters: {'validation_fraction': 0.1, 'solver': 'adam', 'n_iter_no_change': 5, 'max_iter': 10, 'learning_rate_init': 0.01, 'hidden_layer_sizes': (100, 100, 50), 'early_stopping': True, 'batch_size': 64, 'alpha': 0.0001, 'activation': 'sigmoid'}
Mean Test Score: nan
Standard Deviation of Test Score: nan

Top 2 Model:
Parameters: {'validation_fraction': 0.1, 'solver': 'sgd', 'n_iter_no_change': 5, 'max_iter': 100, 'learning_rate_init': 0.001, 'hidden_layer_sizes': (100, 100, 50), 'early_stopping': True, 'batch_size': 64, 'alpha': 0.001, 'activation': 'sigmoid'}
Mean Test Score: nan
Standard Deviation of Test Score: nan

Top 3 Model:
Parameters: {'validation_fraction': 0.1, 'solver': 'adam', 'n_iter_no_change': 5, 'max_iter': 100, 'learning_rate_init': 0.001, 'hidden_layer_sizes': (100, 100, 50), 'early_stopping': True, 'batch_size': 32, 'alpha': 0.01, 'activation': 'sigmoid'}
Mean Test Score: nan
Standard Devia

In [25]:
# Second randomized search: wider networks plus the SGD momentum.
# Both the network and the parameter sampling are seeded for reproducibility.
mlp_classifier = MLPClassifier(random_state=69)

param_dist = {
    "learning_rate_init": [0.0001, 0.001, 0.01, 0.1],
    "momentum": [0.5, 0.7, 0.9],
    "hidden_layer_sizes": [(256, 128, 64), (128, 64), (64, 32)],
    "activation": ["relu", "logistic", "tanh"],
    "solver": ['adam', 'sgd', 'lbfgs'],
    "batch_size": [64, 128, 256],
    "alpha": [0.0001, 0.001, 0.01],
    'early_stopping': [True],
    'validation_fraction': [0.1],
    'n_iter_no_change': [5],
}

random_search = RandomizedSearchCV(mlp_classifier, param_dist, n_iter=10, cv=5, scoring='f1', random_state=69, verbose=1)

random_search.fit(X_train, y_train)

results = random_search.cv_results_

# Candidates whose fits fail are scored NaN. A plain argsort puts NaN last,
# so they would be listed as the "top" models once the order is reversed.
# Rank NaN as the worst possible score instead.
mean_scores = np.asarray(results['mean_test_score'], dtype=float)
top_8_indices = np.argsort(np.where(np.isnan(mean_scores), -np.inf, mean_scores))[-8:][::-1]

for i, idx in enumerate(top_8_indices):
    print(f"\nTop {i + 1} Model:")
    print(f"Parameters: {results['params'][idx]}")
    print(f"Mean Test Score: {results['mean_test_score'][idx]}")
    print(f"Standard Deviation of Test Score: {results['std_test_score'][idx]}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits

Top 1 Model:
Parameters: {'validation_fraction': 0.1, 'solver': 'sgd', 'n_iter_no_change': 5, 'momentum': 0.5, 'learning_rate_init': 0.1, 'hidden_layer_sizes': (256, 128, 64), 'early_stopping': True, 'batch_size': 128, 'alpha': 0.0001, 'activation': 'relu'}
Mean Test Score: 0.9166217178852113
Standard Deviation of Test Score: 0.023604917372087413

Top 2 Model:
Parameters: {'validation_fraction': 0.1, 'solver': 'sgd', 'n_iter_no_change': 5, 'momentum': 0.9, 'learning_rate_init': 0.01, 'hidden_layer_sizes': (128, 64), 'early_stopping': True, 'batch_size': 256, 'alpha': 0.0001, 'activation': 'tanh'}
Mean Test Score: 0.8267892267393243
Standard Deviation of Test Score: 0.10215802120102768

Top 3 Model:
Parameters: {'validation_fraction': 0.1, 'solver': 'adam', 'n_iter_no_change': 5, 'momentum': 0.9, 'learning_rate_init': 0.001, 'hidden_layer_sizes': (64, 32), 'early_stopping': True, 'batch_size': 128, 'alpha': 0.0001, 'activatio

In [21]:
# Parameter set kept for reference (presumably copied from a search run; it is
# NOT the configuration fitted below):
# validation_fraction': 0.1, 'solver': 'lbfgs', 'n_iter_no_change': 5, 'max_iter': 200, 'learning_rate_init': 0.001, 'hidden_layer_sizes': (100, 100, 50), 'early_stopping': True, 'batch_size': 32, 'alpha': 0.001, 'activation': 'tanh'

# SGD-trained network with early stopping on a 10% internal validation split.
chosen_mlp = MLPClassifier(
    solver='sgd',
    activation='relu',
    hidden_layer_sizes=(32, 64, 128),
    learning_rate_init=0.01,
    momentum=0.9,
    alpha=0.01,
    batch_size=64,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    verbose=2,
)
chosen_mlp.fit(X_train, y_train)

y_train_pred, y_val_pred = chosen_mlp.predict(X_train), chosen_mlp.predict(X_val)

value_train = accuracy_score(y_train, y_train_pred)
value_test, value_f1 = accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred)

# "Test" refers to the validation split here.
print(f"Train: Accuracy: {value_train}")
print(f"Test: Accuracy: {value_test}, f1: {value_f1}")

Iteration 1, loss = 0.67078289
Validation score: 0.619001
Iteration 2, loss = 0.65882099
Validation score: 0.624531
Iteration 3, loss = 0.65220713
Validation score: 0.633320
Iteration 4, loss = 0.64667128
Validation score: 0.636184
Iteration 5, loss = 0.64157564
Validation score: 0.639344
Iteration 6, loss = 0.63612309
Validation score: 0.635789
Iteration 7, loss = 0.62993289
Validation score: 0.647541
Iteration 8, loss = 0.62459503
Validation score: 0.641319
Iteration 9, loss = 0.62077738
Validation score: 0.644183
Iteration 10, loss = 0.61489705
Validation score: 0.656627
Iteration 11, loss = 0.60944851
Validation score: 0.660873
Iteration 12, loss = 0.60422024
Validation score: 0.668872
Iteration 13, loss = 0.60028806
Validation score: 0.666206
Iteration 14, loss = 0.59491330
Validation score: 0.670650
Iteration 15, loss = 0.59134499
Validation score: 0.674896
Iteration 16, loss = 0.58640554
Validation score: 0.673217
Iteration 17, loss = 0.58162256
Validation score: 0.671242
Iterat

In [24]:
# paper suggested mlp
# Seeded so the reported scores can be reproduced on a fresh kernel.
# Note: per the scikit-learn documentation, `learning_rate` and `momentum` are
# only used by the 'sgd' solver; with 'adam' they have no effect and are kept
# only to mirror the configuration being reproduced.
chosen_mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),
    activation='relu',
    solver='adam',
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=200,
    momentum=0.9,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=69,
)

chosen_mlp.fit(X_train, y_train)
y_train_pred = chosen_mlp.predict(X_train)
y_val_pred = chosen_mlp.predict(X_val)

value_train = accuracy_score(y_train, y_train_pred)
value_test = accuracy_score(y_val, y_val_pred)
value_f1 = f1_score(y_val, y_val_pred)

# "Test" refers to the validation split here.
print(f"Train: Accuracy: {value_train}")
print(f"Test: Accuracy: {value_test}, f1: {value_f1}")

Train: Accuracy: 0.9920004740459825
Test: Accuracy: 0.8291690061763054, f1: 0.17714672075726842


## Gradient Boosting Classifier

In [16]:
# Randomized search over gradient-boosting hyper-parameters, scored by F1.
gb_classifier = GradientBoostingClassifier()

# Candidate values the search samples from.
param_grid = dict(
    loss=["log_loss", "exponential"],
    criterion=["friedman_mse", "squared_error"],
    subsample=[0.4, 0.8, 1.0],
    learning_rate=[0.1, 0.05, 0.01],
    n_estimators=[100, 200, 500, 1000],
    max_depth=[3, 5, 8],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    min_impurity_decrease=[0.0, 0.01],
    max_features=["sqrt", "log2", None],
)

random_search = RandomizedSearchCV(
    gb_classifier,
    param_grid,
    n_iter=10,
    cv=5,
    scoring='f1',
    random_state=69,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train, y_train)

results = random_search.cv_results_

# Indices of the eight highest mean scores, best first.
top_8_indices = results['mean_test_score'].argsort()[-8:][::-1]

for rank, idx in enumerate(top_8_indices, start=1):
    print(f"\nTop {rank} Model:")
    print(f"Parameters: {results['params'][idx]}")
    print(f"Mean Test Score: {results['mean_test_score'][idx]}")
    print(f"Standard Deviation of Test Score: {results['std_test_score'][idx]}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyError: 'mean_test_f1'

In [17]:
# Re-print the ranking from the `results` of the last search: the eight
# highest mean scores, best first.
top_8_indices = results['mean_test_score'].argsort()[-8:][::-1]

for rank, idx in enumerate(top_8_indices, start=1):
    print(
        f"\nTop {rank} Model:",
        f"Parameters: {results['params'][idx]}",
        f"Mean Test Score: {results['mean_test_score'][idx]}",
        f"Standard Deviation of Test Score: {results['std_test_score'][idx]}",
        sep="\n",
    )


Top 1 Model:
Parameters: {'subsample': 1.0, 'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'min_impurity_decrease': 0.0, 'max_features': None, 'max_depth': 8, 'loss': 'log_loss', 'learning_rate': 0.1, 'criterion': 'squared_error'}
Mean Test Score: 0.9270886914843464
Standard Deviation of Test Score: 0.0032211162615861286

Top 2 Model:
Parameters: {'subsample': 0.8, 'n_estimators': 1000, 'min_samples_split': 10, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.0, 'max_features': 'sqrt', 'max_depth': 5, 'loss': 'exponential', 'learning_rate': 0.05, 'criterion': 'squared_error'}
Mean Test Score: 0.7216418164139244
Standard Deviation of Test Score: 0.0009307495164233584

Top 3 Model:
Parameters: {'subsample': 0.4, 'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.01, 'max_features': 'sqrt', 'max_depth': 8, 'loss': 'log_loss', 'learning_rate': 0.01, 'criterion': 'friedman_mse'}
Mean Test Score: 0.7206978986619772
Standard

In [14]:
# Gradient boosting with hand-picked parameters, scored on train and validation.
gbc = GradientBoostingClassifier(
    loss='exponential',
    criterion='friedman_mse',
    learning_rate=0.1,
    n_estimators=200,
    subsample=0.8,
    max_depth=5,
    max_features=None,
    min_samples_split=5,
    min_samples_leaf=2,
    min_impurity_decrease=0.0,
    random_state=69,
    verbose=1,
)
gbc.fit(X_train, y_train)

y_train_pred, y_val_pred = gbc.predict(X_train), gbc.predict(X_val)

value_train = accuracy_score(y_train, y_train_pred)
value_test, value_f1 = accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred)

# "Test" refers to the validation split here.
print(f"Train: Accuracy: {value_train}")
print(f"Test: Accuracy: {value_test}, f1: {value_f1}")

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.9919           0.0081           54.43s
         2           0.9850           0.0066           53.29s
         3           0.9788           0.0053           53.13s
         4           0.9742           0.0060           52.83s
         5           0.9698           0.0034           52.65s
         6           0.9664           0.0049           52.84s
         7           0.9633           0.0040           53.53s
         8           0.9602           0.0021           53.98s
         9           0.9577           0.0025           53.49s
        10           0.9553           0.0017           53.57s
        20           0.9414           0.0020           51.66s
        30           0.9336           0.0076           46.79s
        40           0.9269          -0.0007           42.97s
        50           0.9223           0.0015           39.62s
        60           0.9156          -0.0046           36.58s
       

## SVM

### SVClassifier

In [12]:
# Randomized search over SVC hyper-parameters, scored by cross-validated F1.
# NOTE: the scikit-learn documentation warns that SVC fit time grows at least
# quadratically with the number of samples, so these 50 fits can take very
# long on the full training split.
svc = SVC()

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': [0.01, 0.1, 'auto', 'scale']} 

# Seeded (as in the gradient-boosting search) so the sampled candidates are
# the same on every run.
random_search = RandomizedSearchCV(svc, param_grid, n_iter=10, cv=5, scoring='f1', random_state=69, n_jobs=-1, verbose=3)

random_search.fit(X_train, y_train)

results = random_search.cv_results_

top_8_indices = results['mean_test_score'].argsort()[-8:][::-1]

for i, idx in enumerate(top_8_indices):
    print(f"\nTop {i + 1} Model:")
    print(f"Parameters: {results['params'][idx]}")
    print(f"Mean Test Score: {results['mean_test_score'][idx]}")
    print(f"Standard Deviation of Test Score: {results['std_test_score'][idx]}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


## Naive Bayes

In [13]:
# Gaussian naive Bayes with default settings, scored on train and validation.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_train_pred, y_val_pred = gnb.predict(X_train), gnb.predict(X_val)

value_train = accuracy_score(y_train, y_train_pred)
value_test, value_f1 = accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred)

# "Test" refers to the validation split here.
print(f"Train: Accuracy: {value_train}")
print(f"Test: Accuracy: {value_test}, f1: {value_f1}")

Train: Accuracy: 0.5161471912775539
Test: Accuracy: 0.15054744525547445, f1: 0.20505747126436782


## Ensemble

In [23]:
# Stack the MLP, gradient-boosting and naive-Bayes models defined in earlier
# cells; no final estimator is passed, so the library default is used.
base_estimators = [
    ('mlp', chosen_mlp),
    ('gbc', gbc),
    ('gnb', gnb),
]
sc = StackingClassifier(estimators=base_estimators)
sc.fit(X_train, y_train)

y_train_pred, y_val_pred = sc.predict(X_train), sc.predict(X_val)

value_train = accuracy_score(y_train, y_train_pred)
value_test, value_f1 = accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred)

# "Test" refers to the validation split here.
print(f"Train: Accuracy: {value_train}")
print(f"Test: Accuracy: {value_test}, f1: {value_f1}")

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.9919           0.0081           55.72s
         2           0.9850           0.0066           55.22s
         3           0.9788           0.0053           55.25s
         4           0.9742           0.0060           54.88s
         5           0.9698           0.0034           54.48s
         6           0.9664           0.0049           54.36s
         7           0.9633           0.0040           54.06s
         8           0.9602           0.0021           53.94s
         9           0.9577           0.0025           53.75s
        10           0.9553           0.0017           53.49s
        20           0.9414           0.0020           49.72s
        30           0.9336           0.0076           46.17s
        40           0.9269          -0.0007           42.50s
        50           0.9223           0.0015           39.34s
        60           0.9156          -0.0046           36.36s
       

In [24]:
# Disabled: refit the gradient-boosting model on train + validation first.
# gbc.fit(X, y)
# Predict the labels of the aligned test features with the stacked ensemble.
test_pred = sc.predict(test)

test_pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [25]:
# Load the raw test file; its first column becomes the index, which is
# reused below as the index of the submission.
og_test_path = os.path.join("..", "data", "input", "test.csv")
og_test = pd.read_csv(og_test_path, index_col=0)

In [26]:
# Pair each prediction with the index of the raw test file.
# NOTE(review): this assumes the rows of `test` are in the same order as the
# rows of `og_test` — TODO confirm against the cleaning step.
to_submit = pd.DataFrame(test_pred, columns=['readmitted_binary'], index=og_test.index)

to_submit.head()

Unnamed: 0_level_0,readmitted_binary
encounter_id,Unnamed: 1_level_1
499502,0
447319,0
309126,0
181183,0
359339,0


In [27]:
# Convert the 0/1 predictions to the 'No'/'Yes' labels of the submission.
# The series is rebuilt from `test_pred` rather than from `to_submit` itself,
# so re-running this cell gives the same result instead of failing once
# `to_submit` has already been turned into a Series.
to_submit = pd.Series(test_pred, index=og_test.index, name='readmitted_binary').map({1: 'Yes', 0: 'No'})
to_submit.describe()

count     30530
unique        2
top          No
freq      20938
Name: readmitted_binary, dtype: object

In [29]:
# Build the path with os.path.join like the other paths in this notebook and
# make sure the output directory exists before writing the submission.
submission_path = os.path.join("..", "data", "output", "submission_sc_1.csv")
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
to_submit.to_csv(submission_path)

In [54]:
# Column names of the combined train + validation feature frame.
X.columns

Index(['age_mean', 'non_lab_procedures.1',
       'inpatient_visits_in_previous_year.1', 'average_pulse_bpm',
       'emergency_visits_in_previous_year.1', 'number_diagnoses',
       'number_of_medications_log', 'length_of_stay_in_hospital_log',
       'number_lab_tests.1', 'race_0', 'race_3', 'gender', 'age_0', 'age_2',
       'age_4', 'age_5', 'age_6', 'age_8', 'admission_type_4',
       'discharge_disposition', 'a1c_test_result_2',
       'change_in_meds_during_hospitalization', 'is_outpatient_visited',
       'is_emergency_visited', 'is_inpatient_visited',
       'discharge_disposition_cat_0', 'admission_source_cat_0',
       'admission_source_cat_1', 'admission_source_cat_3', 'med_glimepiride',
       'med_insulin', 'med_repaglinide', 'med_metformin'],
      dtype='object')