# Testing out models (Version Virginia)

In [11]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


## Get data

In [12]:
# Load the raw hospital readmissions table; the path is resolved against the kernel's working directory.
raw_data_path = '../raw_data/hospital_readmissions.csv'
df = pd.read_csv(raw_data_path)

In [13]:
# Clean data: keep only the rows in which none of the three diagnosis
# columns holds the 'Missing' placeholder.
diagnosis_columns = ['diag_1', 'diag_2', 'diag_3']
has_all_diagnoses = (df[diagnosis_columns] != 'Missing').all(axis=1)
df = df[has_all_diagnoses]

In [14]:
# Separate the features from the target, encoding the target as 1 ('yes') / 0 ('no')
target_column = 'readmitted'
X = df.drop(columns=target_column)
y = df[target_column].map({'yes': 1, 'no': 0})

# First hold out 20% of the rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Then carve the validation set out of the remainder:
# 0.25 of the remaining 80% is 20% of the full data
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

## Preprocessing

In [15]:
# Clean data function
def make_clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with binned counts and unused columns removed.

    The input frame is left untouched: all changes are made on a copy, so the
    function can be called repeatedly on the same raw split.

    Steps:
      * ``n_lab_procedures`` is floor-divided by 10 into ``n_lab_procedures_grouped``.
      * ``n_medications`` is floor-divided by 5 into ``n_medications_grouped``.
      * ``n_outpatient``, ``n_inpatient`` and ``n_emergency`` are collapsed to
        three levels: 0 stays 0, 1 stays 1, and every other value (including
        missing values) becomes 2.
      * ``n_lab_procedures``, ``medical_specialty``, ``glucose_test`` and
        ``n_medications`` are dropped.

    Args:
        df: Frame containing at least the columns named above.

    Returns:
        A new DataFrame with the transformations applied.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    # Work on a copy so the caller's DataFrame (e.g. X_train) is never modified.
    df = df.copy()

    df['n_lab_procedures_grouped'] = (df['n_lab_procedures'] // 10).astype(int)
    df['n_medications_grouped'] = (df['n_medications'] // 5).astype(int)

    # Values other than 0 or 1 are not in the mapping, become NaN, and are
    # then filled with the catch-all level 2.
    for visit_column in ('n_outpatient', 'n_inpatient', 'n_emergency'):
        df[visit_column] = df[visit_column].map({0: 0, 1: 1}).fillna(2).astype(int)

    df = df.drop(columns=['n_lab_procedures',
                          'medical_specialty',
                          'glucose_test',
                          'n_medications'],
                )
    return df

In [16]:
# Custom transformer for Label Encoding 'age' column
class AgeLabelEncoder:
    """Label-encode the ``age`` column of a DataFrame, leaving other columns as they are.

    The fitted ``LabelEncoder`` is stored on the instance as ``encoder``.
    """

    def fit(self, X, y=None):
        """Fit a ``LabelEncoder`` on ``X['age']`` and return ``self``; ``y`` is ignored."""
        self.encoder = LabelEncoder().fit(X['age'])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``age`` column is replaced by its encoded values."""
        encoded = X.copy()
        encoded['age'] = self.encoder.transform(encoded['age'])
        return encoded

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed copy of that same ``X``."""
        self.fit(X, y)
        return self.transform(X)

In [17]:
# Stateless cleaning step: bins counts and drops unused columns.
data_cleaner = FunctionTransformer(make_clean_data)

# Encode 'age' with a stateful encoder that learns its category -> code
# mapping once, during fit on the training data, and reuses that mapping for
# the validation and test splits. The previous
# FunctionTransformer(lambda X: AgeLabelEncoder().fit_transform(X)) fitted a
# fresh encoder on every call, so each split was encoded from its own values,
# and the lambda also prevented the pipeline from being pickled.
# An age value not seen during fit is encoded as -1 instead of raising.
age_label_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Numeric preprocessing pipeline
num_preproc = Pipeline([
    ('scaler', MinMaxScaler()),
])

# Categorical preprocessing pipeline (excluding 'age')
# The column list is taken from the cleaned training frame so that it reflects
# the columns that actually reach the ColumnTransformer.
categorical_columns = [col for col in data_cleaner.transform(X_train).select_dtypes(include=['object']).columns if col != 'age']
cat_preproc = Pipeline([
    ('ohe', OneHotEncoder(sparse_output=False, drop="if_binary")),
])

# 'age' is an object column, so the numeric selector does not pick it up and
# it is handled only by the dedicated encoder.
preproc = ColumnTransformer([
    ('age_label_encoder', age_label_encoder, ['age']),
    ('num_transf', num_preproc, make_column_selector(dtype_include='number')),
    ('cat_transf', cat_preproc, categorical_columns),
], verbose_feature_names_out=False).set_output(transform='pandas')

pipe_preproc = Pipeline([
    ('data_cleaner', data_cleaner),
    ('preprocessor', preproc),
])

pipe_preproc

In [18]:
# Fit the preprocessing pipeline on the training split only, then reuse the
# fitted pipeline to transform the validation and test splits.
X_train_preprocessed = pipe_preproc.fit_transform(X_train)
X_val_preprocessed, X_test_preprocessed = (
    pipe_preproc.transform(split) for split in (X_val, X_test)
)

### Scores

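***Precision*** = Of all the patients the model predicts will be readmitted, the share that actually are. High precision means that when the model predicts a readmission, it is usually correct. <br>
***Recall*** = Of all the patients who are actually readmitted, the share the model flags. High recall means the model correctly identifies a high percentage of actual readmissions. <br>
***F1-Score*** = The harmonic mean of precision and recall. A higher F1-score indicates a better balance between the two. <br>
***AUC-ROC*** = The area under the ROC curve. A higher value indicates that the model is better at ranking readmitted patients above non-readmitted ones across all decision thresholds.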

## Base model (Logistic Regression)

In [24]:
# Baseline Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_preprocessed, y_train)

# Predict on the validation split and compute the accuracy once,
# reusing it for the printed summary
y_val_pred = log_reg.predict(X_val_preprocessed)
logreg_score = accuracy_score(y_val, y_val_pred)

print("Logistic Regression Validation Performance")
print(classification_report(y_val, y_val_pred))
print(f"Validation Accuracy: {logreg_score:.4f}")

Logistic Regression Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.76      0.67      2580
           1       0.62      0.42      0.51      2376

    accuracy                           0.60      4956
   macro avg       0.61      0.59      0.59      4956
weighted avg       0.61      0.60      0.59      4956

Validation Accuracy: 0.6009


## Other models

### SVC 

In [25]:
# Support vector classifier with default hyperparameters
svc = SVC(random_state=42).fit(X_train_preprocessed, y_train)

# Validation predictions; the accuracy is computed once and reused in the printout
y_val_pred_svc = svc.predict(X_val_preprocessed)
svc_score = accuracy_score(y_val, y_val_pred_svc)

print("Support Vector Classifier Validation Performance")
print(classification_report(y_val, y_val_pred_svc))
print(f"Validation Accuracy: {svc_score:.4f}")

Support Vector Classifier Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.77      0.67      2580
           1       0.63      0.43      0.51      2376

    accuracy                           0.61      4956
   macro avg       0.61      0.60      0.59      4956
weighted avg       0.61      0.61      0.59      4956

Validation Accuracy: 0.6051


### Random Forest

In [26]:
# Random forest with default hyperparameters
rf = RandomForestClassifier(random_state=42).fit(X_train_preprocessed, y_train)

# Validation predictions; the accuracy is computed once and reused in the printout
y_val_pred_rf = rf.predict(X_val_preprocessed)
rf_score = accuracy_score(y_val, y_val_pred_rf)

print("Random Forest Validation Performance")
print(classification_report(y_val, y_val_pred_rf))
print(f"Validation Accuracy: {rf_score:.4f}")

Random Forest Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.68      0.63      2580
           1       0.59      0.49      0.54      2376

    accuracy                           0.59      4956
   macro avg       0.59      0.59      0.59      4956
weighted avg       0.59      0.59      0.59      4956

Validation Accuracy: 0.5914


### Gradient Boosting

In [27]:
# Gradient boosting with default hyperparameters
gb = GradientBoostingClassifier(random_state=42).fit(X_train_preprocessed, y_train)

# Validation predictions; the accuracy is computed once and reused in the printout
y_val_pred_gb = gb.predict(X_val_preprocessed)
gb_score = accuracy_score(y_val, y_val_pred_gb)

print("Gradient Boosting Validation Performance")
print(classification_report(y_val, y_val_pred_gb))
print(f"Validation Accuracy: {gb_score:.4f}")

Gradient Boosting Validation Performance
              precision    recall  f1-score   support

           0       0.60      0.74      0.66      2580
           1       0.62      0.46      0.52      2376

    accuracy                           0.60      4956
   macro avg       0.61      0.60      0.59      4956
weighted avg       0.61      0.60      0.60      4956

Validation Accuracy: 0.6043


### XGBoost

In [28]:
# XGBoost classifier evaluated with log-loss
# NOTE(review): use_label_encoder is reported as deprecated in recent xgboost
# releases; confirm against the installed version before removing it.
xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
).fit(X_train_preprocessed, y_train)

# Validation predictions; the accuracy is computed once and reused in the printout
y_val_pred_xgb = xgb.predict(X_val_preprocessed)
xgb_score = accuracy_score(y_val, y_val_pred_xgb)

print("XGBoost Validation Performance")
print(classification_report(y_val, y_val_pred_xgb))
print(f"Validation Accuracy: {xgb_score:.4f}")

XGBoost Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.68      0.63      2580
           1       0.58      0.50      0.54      2376

    accuracy                           0.59      4956
   macro avg       0.59      0.59      0.58      4956
weighted avg       0.59      0.59      0.59      4956

Validation Accuracy: 0.5896


### DecisionTreeClassifier

In [29]:
# Single decision tree with default hyperparameters
dt = DecisionTreeClassifier(random_state=42).fit(X_train_preprocessed, y_train)

# Validation predictions; the accuracy is computed once and reused in the printout
y_val_pred_dt = dt.predict(X_val_preprocessed)
dt_score = accuracy_score(y_val, y_val_pred_dt)

print("Decision Tree Validation Performance")
print(classification_report(y_val, y_val_pred_dt))
print(f"Validation Accuracy: {dt_score:.4f}")

Decision Tree Validation Performance
              precision    recall  f1-score   support

           0       0.56      0.58      0.57      2580
           1       0.53      0.52      0.52      2376

    accuracy                           0.55      4956
   macro avg       0.55      0.55      0.55      4956
weighted avg       0.55      0.55      0.55      4956

Validation Accuracy: 0.5468


### AdaBoostClassifier

In [30]:
# AdaBoost with default hyperparameters
ada = AdaBoostClassifier(random_state=42).fit(X_train_preprocessed, y_train)

# Validation predictions; the accuracy is computed once and reused in the printout
y_val_pred_ada = ada.predict(X_val_preprocessed)
ada_score = accuracy_score(y_val, y_val_pred_ada)

print("AdaBoost Validation Performance")
print(classification_report(y_val, y_val_pred_ada))
print(f"Validation Accuracy: {ada_score:.4f}")



AdaBoost Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.75      0.67      2580
           1       0.62      0.44      0.52      2376

    accuracy                           0.60      4956
   macro avg       0.61      0.60      0.59      4956
weighted avg       0.61      0.60      0.59      4956

Validation Accuracy: 0.6045


### Test set evaluation with the best model

In [32]:
# Collect every model's validation accuracy into one table, best model first
model_names = ['Logistic Regression', 'SVC', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'Decision Tree', 'AdaBoost']
validation_scores = [logreg_score, svc_score, rf_score, gb_score, xgb_score, dt_score, ada_score]

accuracy = {
    'Model': model_names,
    'Accuracy Score': validation_scores,
}

df_accuracy = (
    pd.DataFrame(accuracy)
    .sort_values(by='Accuracy Score', ascending=False)
    .reset_index(drop=True)
)
df_accuracy

Unnamed: 0,Model,Accuracy Score
0,SVC,0.605125
1,AdaBoost,0.60452
2,Gradient Boosting,0.604318
3,Logistic Regression,0.600888
4,Random Forest,0.591404
5,XGBoost,0.589588
6,Decision Tree,0.546812


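SVC performed best on the validation set (accuracy 0.6051), although only by a very small margin over AdaBoost (0.6045) and Gradient Boosting (0.6043).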

In [33]:
# Evaluate the model chosen on the validation set against the held-out test split
best_model = svc
y_test_pred = best_model.predict(X_test_preprocessed)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Best Model Test Performance")
print(classification_report(y_test, y_test_pred))
print(f"Test Accuracy: {test_accuracy:.4f}")

Best Model Test Performance
              precision    recall  f1-score   support

           0       0.60      0.76      0.67      2559
           1       0.64      0.45      0.53      2397

    accuracy                           0.61      4956
   macro avg       0.62      0.60      0.60      4956
weighted avg       0.62      0.61      0.60      4956

Test Accuracy: 0.6098
