This notebook trains an XGBoost multiclass classifier that predicts a patient's diagnosis (ICD-10 code) from demographic and administrative features. The model is restricted to the five diagnoses with the highest historical support and was developed to help the hospital identify patient profiles at high risk for those specific diagnoses.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, auc
from itertools import cycle

Data Preparation

In [None]:
# Build the modelling frame for the classifier.
# Focus on the 5 diagnoses with the highest historical support; rows with any
# other ICD-10 code are excluded.
# NOTE(review): df_rawatinap is not created in the cells shown here; it must
# already exist in the kernel before this cell runs.
top5_icd10 = ['A09.9', 'S09.8', 'Z03.8', 'I64', 'E11.2']

# Feature engineering from the admission date.
# 'bulan_admisi' stores the full parsed timestamp (not only the month); the
# month number, year number and weekend flag are all read from it.
df_inap_classif = (
    df_rawatinap
    .loc[df_rawatinap['icd_10'].isin(top5_icd10)]
    .copy()
    .assign(bulan_admisi=lambda frame: pd.to_datetime(frame['tanggal_admisi']))
    .assign(
        bulan_num=lambda frame: frame['bulan_admisi'].dt.month,
        tahun_num=lambda frame: frame['bulan_admisi'].dt.year,
        # pandas weekday is 0=Monday ... 6=Sunday, so 5 and 6 mark the weekend
        is_weekend=lambda frame: frame['bulan_admisi'].dt.weekday.isin([5, 6]).astype(int),
    )
)

# Feature matrix (X) and target (y)
feature_columns = [
    "umur", "lama_rawat", "jenis_kelamin", "cara_bayar",
    "status_disposisi", "tindakan", "bulan_num", "tahun_num", "is_weekend",
]
X = df_inap_classif[feature_columns]
y = df_inap_classif["icd_10"]

# Map the ICD-10 strings to integer class ids; le_icd.classes_ keeps the
# id -> code mapping for later reporting.
le_icd = LabelEncoder().fit(y)
y_encoded = le_icd.transform(y)

Data Splitting

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# Stratifying on the encoded target keeps the class distribution the same in
# both partitions; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "stratify": y_encoded, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

Model Training & Tuning

In [None]:
# Column groups consumed by the preprocessing step
numeric_features = ["umur", "lama_rawat", "bulan_num", "tahun_num", "is_weekend"]
categorical_features = ["jenis_kelamin", "cara_bayar", "status_disposisi", "tindakan"]

# Preprocessor: one-hot encode the categorical columns and forward the numeric
# columns unchanged. Both groups are listed explicitly (instead of relying on
# remainder='passthrough') so that only the declared features reach the model
# and a column added to X by accident is dropped rather than silently used.
# Categories unseen during fit are encoded as all-zero rows at predict time.
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", "passthrough", numeric_features),
])

# Use a Pipeline so preprocessing is fitted on the training data only and is
# re-applied automatically at prediction time
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

# --- Assumption: Hyperparameter tuning was performed to find the best parameters ---
# The values below are hard-coded; the full tuning code is available in the
# technical documentation (README) and is not re-run in this notebook.
# Keys use the '<step>__<param>' form so they can be passed to the pipeline.
best_params = {
    'classifier__learning_rate': 0.05,
    'classifier__max_depth': 6,
    'classifier__n_estimators': 500,
    'classifier__subsample': 0.8,
    'classifier__colsample_bytree': 0.8
}

# Train the model using the best parameters
# NOTE(review): no sample weights are passed to fit, so class imbalance is not
# compensated for here - confirm this is intended.
pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

Model Evaluation

In [None]:
# Predict the encoded class id for every row of the held-out test set
y_pred = pipeline.predict(X_test)

# Headline metrics. Precision, recall and F1 are averaged over classes weighted
# by class support; zero_division=0 scores a class with no predictions as 0
# instead of emitting a warning.
weighted_kwargs = {"average": "weighted", "zero_division": 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)

# Print the summary table; labels are padded so the colons line up
print("=== XGBoost Classification Model Evaluation ===")
summary_rows = (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1-score ", f1),
)
for metric_label, metric_value in summary_rows:
    print(f"{metric_label}: {metric_value:.3f}")

# Per-class breakdown, labelled with the original ICD-10 codes
print("\nClassification Report:\n")
per_class_report = classification_report(
    y_test, y_pred, target_names=le_icd.classes_, zero_division=0
)
print(per_class_report)

Visualization (ROC Curve)

In [None]:
# Class-membership probabilities for the test rows, one column per class
y_score = pipeline.predict_proba(X_test)
n_classes = y_score.shape[1]

# One-vs-rest indicator matrix for the true labels.
# It is built against the full range of class ids so that column i always
# lines up with y_score[:, i] and le_icd.classes_[i]. pd.get_dummies(y_test)
# only creates columns for classes that actually occur in y_test, so a class
# missing from the test split would shift every later column and pair curves
# with the wrong probabilities and labels.
# Assumes the encoded labels are 0..n_classes-1 (LabelEncoder output) and that
# every class was present in the training data - TODO confirm.
class_ids = np.arange(n_classes)
y_test_binarized = pd.DataFrame(
    np.asarray(y_test)[:, None] == class_ids, columns=class_ids
)

# Calculate the ROC curve and AUC for each class (one-vs-rest)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized.iloc[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Visualize the ROC curves on a single set of axes
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
colors = cycle(['#FF5733', '#33FF57', '#3357FF', '#FF33A1', '#A133FF'])
for i, color in zip(range(n_classes), colors):
    ax.plot(fpr[i], tpr[i], color=color, lw=2,
            label=f'ROC curve of class {le_icd.classes_[i]} (area = {roc_auc[i]:.2f})')
# Diagonal reference line: performance of a random classifier
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Guessing (AUC = 0.50)')

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()