In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [3]:

df = pd.read_csv("diabetic_data.csv")

df.replace("?", np.nan, inplace=True)
df.dropna(thresh=int(0.8*len(df)), axis=1, inplace=True)
df.dropna(inplace=True)


In [4]:
df["readmitted"] = df["readmitted"].apply(lambda x: 1 if x == "<30" else 0)


In [5]:

for col in df.select_dtypes(include="object"):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])


In [6]:
X = df.drop("readmitted", axis=1)
y = df["readmitted"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [7]:

rf = RandomForestClassifier(
    n_estimators=300,
    max_leaf_nodes=100,
    class_weight="balanced",
    random_state=42
)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred))
roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])


              precision    recall  f1-score   support

           0       0.92      0.67      0.77     17398
           1       0.18      0.56      0.27      2213

    accuracy                           0.66     19611
   macro avg       0.55      0.61      0.52     19611
weighted avg       0.84      0.66      0.72     19611



0.6560935088341644

In [8]:

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
    eval_metric="logloss"
)

xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.83      0.86     17398
           1       0.20      0.34      0.25      2213

    accuracy                           0.77     19611
   macro avg       0.55      0.58      0.56     19611
weighted avg       0.83      0.77      0.80     19611



In [9]:
importances = pd.Series(
    rf.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

important_features = importances.head(10)
important_features


number_inpatient            0.313046
discharge_disposition_id    0.127380
encounter_id                0.053920
number_emergency            0.053909
diag_1                      0.049910
patient_nbr                 0.049487
time_in_hospital            0.042026
num_medications             0.040511
diag_3                      0.037541
number_diagnoses            0.035953
dtype: float64

In [10]:
X_imp = X[important_features.index]

X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_imp, y, test_size=0.2, stratify=y, random_state=42
)

rf.fit(X_train_i, y_train_i)
y_pred_i = rf.predict(X_test_i)

print(classification_report(y_test_i, y_pred_i))


              precision    recall  f1-score   support

           0       0.92      0.68      0.78     17398
           1       0.18      0.55      0.27      2213

    accuracy                           0.66     19611
   macro avg       0.55      0.61      0.53     19611
weighted avg       0.84      0.66      0.72     19611

