# Traditional Machine Learning Approach  
## Hyperparameter Tuning with GridSearchCV

Author - Samuel Holt (23087175)

Models tested:

- Logistic Regression
- Random Forest
- XGBoost


In [None]:
# Mount Google Drive inside the Colab runtime and switch the working directory
# to the project folder, so the relative path used by pd.read_csv('admitted.csv')
# later in the notebook resolves against it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Capstone

# Packages
import numpy as np
import pandas as pd
import time

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

Mounted at /content/drive
/content/drive/MyDrive/Capstone


In [None]:
# Load the dataset from the current working directory.
data = pd.read_csv('admitted.csv')

# Columns dropped before modelling: the target ('admitted'), its copy ('Y'),
# and the other columns that are not used as features.
cols_to_rmv = [
    'Y',
    'subject_id',
    'stay_id',
    'chiefcomplaint',
    'in_date',
    'admitted',
    'disposition',
]

In [None]:
# Build the model inputs: one-hot encode the features, label-encode the target,
# make a stratified 75/25 train/test split and standardise the features.
# Later cells grid-search each model with GridSearchCV and evaluate the result.

# Copy the outcome into 'Y'; cols_to_rmv lists both 'Y' and 'admitted', so
# neither of them ends up in the feature matrix.
data['Y'] = data['admitted']

# Drop the non-feature columns, then one-hot encode every object-dtype column.
data_sub = data.drop(columns=cols_to_rmv)
cat_cols = data_sub.select_dtypes('object').columns
data_sub = pd.get_dummies(data_sub, columns=cat_cols)
print(data_sub.columns)

# Encode the target labels as integers.
le = LabelEncoder()
y = le.fit_transform(data['Y'])

# Stratify on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    data_sub,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Cast everything (including the dummy columns) to float.
X_train, X_test = X_train.astype('float64'), X_test.astype('float64')

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Index(['nights', 'hour', 'work_hours', 'temperature', 'heartrate', 'resprate',
       'o2sat', 'sbp', 'dbp', 'pain', 'acuity', 'critical', 'cardiac_arrest',
       'last_temperature', 'last_heartrate', 'last_resprate', 'last_o2sat',
       'last_sbp', 'last_dbp', 'last_pain', 'gender_F', 'gender_M',
       'arrival_transport_AMBULANCE', 'arrival_transport_HELICOPTER',
       'arrival_transport_OTHER', 'arrival_transport_UNKNOWN',
       'arrival_transport_WALK IN', 'race_class_ASIAN', 'race_class_BLACK',
       'race_class_LATINO', 'race_class_OTHER', 'race_class_WHITE'],
      dtype='object')


In [None]:
# Base estimators handed to the grid search.
lr_model = LogisticRegression()
# Random forests are stochastic (bootstrap samples, feature sub-sampling), so
# seed the estimator to make the grid-search results reproducible. The seed
# matches the one used for train_test_split.
rf_model = RandomForestClassifier(random_state=42)
xgb_model = xgb.XGBClassifier()

# Logistic regression: L2 penalty with C swept over 1e-3 ... 1e3 (7 values).
lr_grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l2"]}

# Random forest: 2 x 2 x 2 x 2 = 16 candidate configurations.
rf_grid = {'bootstrap': [True],
           'max_depth': [10, 20],
           'min_samples_leaf': [2, 4],
           'min_samples_split': [2, 5],
           'n_estimators': [100, 200]}

# XGBoost: 2 x 2 x 2 = 8 candidate configurations with a binary logistic objective.
xgb_grid = {'objective': ['binary:logistic'],
            'learning_rate': [0.05, 0.1],
            'max_depth': [6, 7],
            'n_estimators': [100, 200]}

def get_best_params__and_performance(X_train, y_train, model, grid, cv=4, n_jobs=None):
  """Run an exhaustive grid search and report the best configuration found.

  Args:
    X_train: Training features passed to ``GridSearchCV.fit``.
    y_train: Training labels passed to ``GridSearchCV.fit``.
    model: Estimator to tune.
    grid: Parameter grid to search, forwarded to ``GridSearchCV``.
    cv: Cross-validation setting forwarded to ``GridSearchCV`` (default 4).
    n_jobs: Number of parallel jobs forwarded to ``GridSearchCV``. The default
      ``None`` keeps GridSearchCV's own default; pass ``-1`` to use all cores.

  Returns:
    A tuple ``(best_score, best_params, total_time)``: the best mean
    cross-validated score, the parameter dict that produced it, and the
    wall-clock duration of the search in seconds.
  """
  start_time = time.time()
  cv_model = GridSearchCV(model,
                          grid,
                          cv=cv,
                          n_jobs=n_jobs)
  cv_model.fit(X_train, y_train)
  # Stop the clock right after fitting so the reported time covers the search only.
  total_time = time.time() - start_time
  print("Best Params:", cv_model.best_params_)
  print("Best Accuracy:", cv_model.best_score_)
  print('Total time (sec)', total_time)
  return cv_model.best_score_, cv_model.best_params_, total_time

# Tune all three models so that best_lr_params, best_rf_params and
# best_xgb_params are defined for the evaluation cells that follow; with any of
# these searches commented out the notebook fails on a fresh kernel.
# Logistic regression is tuned on the standardised features; the tree-based
# models are tuned on the unscaled features.
# NOTE: these searches are slow (the recorded XGBoost run took ~3058 s).
print('LOGISTIC REGRESSION:')
best_lr_score, best_lr_params, lr_total_time = get_best_params__and_performance(X_train_scaled, y_train, lr_model, lr_grid)
print()
print('RANDOM FOREST:')
best_rf_score, best_rf_params, rf_total_time = get_best_params__and_performance(X_train, y_train, rf_model, rf_grid)
print()
print("XGBOOST:")
best_xgb_score, best_xgb_params, xgb_total_time = get_best_params__and_performance(X_train, y_train, xgb_model, xgb_grid)

XGBOOST:
Best Params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'objective': 'binary:logistic'}
Best Accuracy: 0.7529707754298531
Total time (sec) 3057.754029273987


In [None]:
# Refit logistic regression on the full training split with the tuned
# hyperparameters, then report accuracy, F1 and the confusion matrix on the
# held-out test split.
# TODO: feature importances / SHAP values are not computed in this cell.

# best_lr_params is a dict of keyword arguments, so it has to be unpacked with
# **; passed positionally it would be taken as the `penalty` argument.
best_lr = LogisticRegression(**best_lr_params)
best_lr.fit(X_train_scaled, y_train)
# The model was fitted on standardised features, so it must also predict on
# the standardised test features.
lr_preds = best_lr.predict(X_test_scaled)
print('LOGISTIC REGRESSION:',
      '\nAccuracy:', accuracy_score(y_test, lr_preds),
      '\nF1:', f1_score(y_test, lr_preds))
# NOTE(review): assumes encoded class 0 is HOME and class 1 is ADMITTED --
# confirm against le.classes_.
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, lr_preds),
                              display_labels=['HOME', 'ADMITTED'])
disp.plot()

In [None]:
# Refit the random forest on the full training split with the tuned
# hyperparameters and evaluate it on the held-out test split.
# best_rf_params is a dict of keyword arguments, so it has to be unpacked with
# **; passed positionally it would be taken as `n_estimators`.
# random_state is fixed (same seed as the train/test split) so the reported
# metrics are reproducible; it is not one of the tuned parameters.
best_rf = RandomForestClassifier(**best_rf_params, random_state=42)
best_rf.fit(X_train, y_train)
rf_preds = best_rf.predict(X_test)
print('RANDOM FOREST:',
      '\nAccuracy:', accuracy_score(y_test, rf_preds),
      '\nF1:', f1_score(y_test, rf_preds))
# NOTE(review): assumes encoded class 0 is HOME and class 1 is ADMITTED --
# confirm against le.classes_.
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, rf_preds),
                              display_labels=['HOME', 'ADMITTED'])
disp.plot()

In [None]:
# Refit XGBoost on the full training split with the tuned hyperparameters and
# evaluate it on the held-out test split.
# best_xgb_params is a dict of keyword arguments, so it has to be unpacked
# with ** rather than passed as a single positional argument.
best_xgb = xgb.XGBClassifier(**best_xgb_params)
best_xgb.fit(X_train, y_train)
xgb_preds = best_xgb.predict(X_test)
print('XGBOOST:',
      '\nAccuracy:', accuracy_score(y_test, xgb_preds),
      '\nF1:', f1_score(y_test, xgb_preds))
# NOTE(review): assumes encoded class 0 is HOME and class 1 is ADMITTED --
# confirm against le.classes_.
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, xgb_preds),
                              display_labels=['HOME', 'ADMITTED'])
disp.plot()