# Import packages and load data

In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

# import logistic regression libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, roc_auc_score, RocCurveDisplay

# import random forest libraries
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler


In [65]:
# load data
df = pd.read_csv('hospital_readmissions.csv')
df.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


# One-hotting independent variables


In [66]:
# one-hotting diabetes med variable
df2 = pd.get_dummies(df, columns = ['diabetes_med'], drop_first = True, dtype = int)
df2.head()

# one-hotting age variable
df3 = pd.get_dummies(df2, columns = ['age'], drop_first = True, dtype = int)
df3.head()

# one-hotting medical-specialty variable
df4 = pd.get_dummies(df3, columns = ['medical_specialty'], drop_first = True, dtype = int)
df4

# one-hotting diag_1 variable
df5 = pd.get_dummies(df4, columns = ['diag_1'], drop_first = True, dtype = int)
df5

# one-hotting diag_2 variable
df6 = pd.get_dummies(df5, columns = ['diag_2'], drop_first = True, dtype = int)
df6

# one-hotting diag_3 variable
df7 = pd.get_dummies(df6, columns = ['diag_3'], drop_first = True, dtype = int)
df7

# one-hotting glucose_test variable
df8 = pd.get_dummies(df7, columns = ['glucose_test'], drop_first = True, dtype = int)
df8

# one-hotting A1Ctest variable
df9 = pd.get_dummies(df8, columns = ['A1Ctest'], drop_first = True, dtype = int)
df9

# one-hotting change variable
df10 = pd.get_dummies(df9, columns = ['change'], drop_first = True, dtype = int)
df10


Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,readmitted,diabetes_med_yes,age_[50-60),...,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory,glucose_test_no,glucose_test_normal,A1Ctest_no,A1Ctest_normal,change_yes
0,8,72,1,18,2,0,0,no,1,0,...,0,0,0,1,0,1,0,1,0,0
1,3,34,2,13,0,0,0,no,1,0,...,0,0,0,1,0,1,0,1,0,0
2,5,45,0,18,0,0,0,yes,1,1,...,0,0,0,0,0,1,0,1,0,1
3,2,36,0,12,1,0,0,yes,1,0,...,0,0,0,0,0,1,0,1,0,1
4,1,42,0,7,0,0,0,no,1,0,...,0,0,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,14,77,1,30,0,0,0,yes,0,0,...,0,0,0,0,0,1,0,0,1,0
24996,2,66,0,24,0,0,0,yes,1,0,...,0,0,0,1,0,1,0,0,0,1
24997,5,12,0,6,0,1,0,yes,0,0,...,0,0,0,1,0,0,1,1,0,0
24998,2,61,3,15,0,0,0,no,1,0,...,0,0,0,1,0,1,0,1,0,1


# One-hot readmission variable

In [67]:
df11 = pd.get_dummies(df10, columns = ['readmitted'], drop_first = True, dtype = int)
df11.head()


Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,diabetes_med_yes,age_[50-60),age_[60-70),...,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory,glucose_test_no,glucose_test_normal,A1Ctest_no,A1Ctest_normal,change_yes,readmitted_yes
0,8,72,1,18,2,0,0,1,0,0,...,0,0,1,0,1,0,1,0,0,0
1,3,34,2,13,0,0,0,1,0,0,...,0,0,1,0,1,0,1,0,0,0
2,5,45,0,18,0,0,0,1,1,0,...,0,0,0,0,1,0,1,0,1,1
3,2,36,0,12,1,0,0,1,0,0,...,0,0,0,0,1,0,1,0,1,1
4,1,42,0,7,0,0,0,1,0,1,...,0,0,0,1,1,0,1,0,0,0


# Split columns into dependent variable, and independent variables

In [68]:
# Define features (X) and target (y)
X = df11.drop(['readmitted_yes'], axis=1)
y = df11['readmitted_yes']

# Split data points (rows) into training and testing sets

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [70]:
# initialize models
randomforest = RandomForestClassifier(max_depth = 6, random_state = 3)

# define hyperparameters to tune
params = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required to be at a leaf node
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=randomforest, param_grid=params, cv=5, scoring='roc_auc', verbose=4,
                          return_train_score=True)

# Fit the model on the training data
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=(train=1.000, test=0.619) total time=   2.2s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=(train=1.000, test=0.625) total time=   2.2s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=(train=1.000, test=0.628) total time=   2.1s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=(train=1.000, test=0.627) total time=   1.9s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=(train=1.000, test=0.621) total time=   1.9s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=(train=1.000, test=0.628) total time=   3.8s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=(train=

In [71]:
# Check the best parameters and the corresponding score
best_parameters = grid_search.best_params_
best_auc_score = grid_search.best_score_
print(f"Best Parameters: {best_parameters}")
print(f"Best AUC Score: {best_auc_score}")

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_auc = roc_auc_score(y_test, y_pred)
print(f"Test AUC: {test_auc}")

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best AUC Score: 0.6549319380665459
Test AUC: 0.6090253935880853
