## README

Please run this Jupyter Notebook in Python >= 3.12.

## Step 1. Read the Hospital Readmission Data

A readmission occurs when a patient is admitted to an in-patient hospital again within 30 days (the Medicare definition) of being discharged from a previous in-patient hospital stay. Hospital readmissions present a significant challenge for both health insurance companies and patients.

In [None]:
import os
import pandas as pd
# Dataset location, given relative to the notebook's working directory.
data_dir, data_file = 'data', 'hospital_readmissions_new.csv'
data_path = os.path.join(data_dir, data_file)

# Load the CSV into a DataFrame; the bare expression on the last line makes the
# notebook display the first rows as a preview.
data_df = pd.read_csv(data_path)
data_df.head()

## Step 2. Let's Split the Data into Training and Testing Sets

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Features are every column except the first and the last one; the last column is the target.
x = data_df.iloc[:, 1:-1]
y = data_df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
test_portion = 0.2
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_portion, random_state=98053)

# Renumber the shuffled rows 0..n-1. drop=True discards the old row labels directly,
# so a feature column that happens to be called 'index' is never touched (the previous
# reset_index + drop('index') pair would have removed such a column by mistake).
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)



## Step 3. One-hot-encoding to Handle Categorical Variables in X

In [None]:
def one_hot_encoding(x_input, cat_cols=None, encoder=None):
    """
    One-hot encode the categorical columns of a feature data frame.

    The input data frame is never modified; a new data frame is returned.

    params: x_input: Pandas Dataframe. Dataframe of independent variables for a machine learning task.
            cat_cols: list of categorical variable names. Default None. When None or empty,
                      the list is detected automatically: the object-dtype columns of x_input
                      when a new encoder is fitted, or the columns the supplied encoder
                      was fitted on (its feature_names_in_) otherwise.
            encoder: one hot encoder. Default None. When None, a new OneHotEncoder is
                     fitted on x_input[cat_cols]. Otherwise the given encoder is only
                     applied (transform) and is not refitted.
    returns: Pandas dataframe with a fresh 0..n-1 index that holds the non-categorical
             columns of x_input in their original order, followed by the one-hot columns.
             list of categorical columns that were encoded.
             one-hot-encoder that was used.
    raises: ValueError when an encoder is supplied without cat_cols and the encoder
            does not expose feature_names_in_.
    """
    # Work on a private list so neither a shared default nor the caller's list is returned.
    cat_cols = list(cat_cols) if cat_cols is not None else []

    if encoder is None:
        # Imported lazily: scikit-learn is only needed when a new encoder has to be fitted.
        from sklearn.preprocessing import OneHotEncoder

        if not cat_cols:
            cat_cols = x_input.select_dtypes(include=['object']).columns.tolist()
        encoder = OneHotEncoder(sparse_output=False)
        one_hot_encoded = encoder.fit_transform(x_input[cat_cols])
    else:
        if not cat_cols:
            # A fitted encoder must see the columns it was trained on, so re-detecting
            # them from x_input's dtypes could silently pick a different set.
            fitted_cols = getattr(encoder, 'feature_names_in_', None)
            if fitted_cols is None:
                raise ValueError(
                    "cat_cols must be given when the supplied encoder does not "
                    "expose feature_names_in_"
                )
            cat_cols = pd.Index(fitted_cols).tolist()
        one_hot_encoded = encoder.transform(x_input[cat_cols])

    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(cat_cols))

    # drop(columns=...) and reset_index(drop=True) both return new frames: the caller's
    # x_input stays untouched, and both pieces share a 0..n-1 index so the column-wise
    # concat lines the rows up by position.
    remaining = x_input.drop(columns=cat_cols).reset_index(drop=True)
    x_encoded = pd.concat([remaining, one_hot_df], axis=1)
    return x_encoded, cat_cols, encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training features only, then reuse that fitted encoder and the
# same categorical column list on the test features.
x_train_encoded, categorical_columns, encoder = one_hot_encoding(x_train, cat_cols=[], encoder=None)

x_test_encoded, _, _ = one_hot_encoding(x_test, cat_cols=categorical_columns, encoder=encoder)

# The label names what is actually printed: the encoded training features.
print(f"Encoded training data : \n{x_train_encoded}")

In [None]:
# Show (rows, columns) for the encoded training frame first, then the encoded test frame.
for encoded_frame in (x_train_encoded, x_test_encoded):
    print(encoded_frame.shape)

## Train a Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
# A fixed random_state makes the trained forest reproducible from run to run, in line
# with the seeded train/test split; without it every run yields a different model.
rf = RandomForestClassifier(n_estimators=20, min_samples_leaf=3, random_state=98053)
rf.fit(x_train_encoded, y_train)

## Test the RF Model on Testing Data

In [None]:
# Predict on the held-out features, then score the predictions against the true labels.
y_pred = rf.predict(x_test_encoded)
accuracy, cm = accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

# Report overall accuracy first, then the raw confusion matrix.
print("Accuracy:", accuracy)
print(cm)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 

# Both axes of the confusion matrix use the same two class labels.
class_labels = ['Not Readmitted', 'Readmitted']
sns.heatmap(cm, annot=True, fmt='g', xticklabels=class_labels, yticklabels=class_labels)

# Grab the current Axes once instead of calling plt.gca() for every tweak.
ax = plt.gca()
ax.set_ylabel('Actual', fontsize=13)
ax.set_title('Confusion Matrix', fontsize=17, pad=20)

# Move the x-axis label and its ticks to the top of the heatmap.
ax.xaxis.set_label_position('top')
ax.set_xlabel('Prediction', fontsize=13)
ax.xaxis.tick_top()

# Leave room under the plot and write a second 'Prediction' caption there.
fig = ax.figure
fig.subplots_adjust(bottom=0.2)
fig.text(0.5, 0.05, 'Prediction', ha='center', fontsize=13)
plt.show()

## Now, Let's Take a Closer Look at the Data

What did you see?

In [None]:
# Work on a copy so the original data frame keeps its missing values.
data_df_new = data_df.copy()

# Give rows without a readmission location an explicit 'Unknown' category so that
# they are counted in the cross-tabulation.
location_filled = data_df_new['Readmission_Location'].replace(np.nan, 'Unknown')
data_df_new['Readmission_Location'] = location_filled

# Count rows for every (readmission location, readmitted) combination, without totals.
data_crosstab = pd.crosstab(location_filled, data_df_new['Readmitted'], margins=False)
print(data_crosstab)

## Discussion

What do you see in the cross-tabulation above?