In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Build the modelling frame: discard the three identifier columns, then
# one-hot encode the categorical fields in a single chained expression.
df = dfData38.drop(columns=["subject_id", "hadm_id", "stay_id"]).pipe(
    pd.get_dummies, columns=["dka_type", "gender", "race", "liver_disease"]
)

# Impute remaining gaps with per-column means.
# TODO: switch to K-nearest-neighbour imputation.
# NOTE(review): the means are taken over every row before the train/test
# split below, so held-out rows contribute to the imputed values.
df = df.fillna(df.mean())

# Target vector ("akd") and feature matrix (everything else).
y = df["akd"]
X = df.drop(columns=["akd"])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# List the dtype of every feature column without pandas truncating the output.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(X.dtypes)

In [None]:
import xgboost as xgb

# Wrap the train/test splits in XGBoost's DMatrix container; only the
# training matrix carries labels.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Booster hyper-parameters.
# NOTE(review): no "objective" entry is passed (a 'multi:softmax' entry was
# previously commented out) while "num_class" and the multiclass "merror"
# metric are set, so the effective objective -- and therefore whether
# model.predict returns labels or scores -- is left to XGBoost's defaults.
# Confirm this is intended.
params = dict(
    max_depth=100,  # maximum depth of each tree
    learning_rate=0.01,  # shrinkage applied per boosting round
    num_class=len(y.unique()),  # number of distinct target values
    eval_metric="merror",  # multiclass classification error rate
)

# Run 1000 boosting rounds on the training matrix.
num_rounds = 1000
model = xgb.train(params, dtrain, num_boost_round=num_rounds)

In [None]:
# Raw booster output for the held-out rows. Kept unmodified under the name
# `y_pred` because the following cells reuse it.
y_pred = model.predict(dtest)

# accuracy_score needs discrete class labels and raises on continuous
# predictions. Applying the same 0.5 cut-off that the metrics cell uses
# leaves hard 0/1 predictions unchanged and converts probability-like
# output into labels, so this cell works for either kind of output.
y_pred_labels = (y_pred >= 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_labels)
print("Accuracy:", accuracy)

In [None]:
import sklearn.metrics as metrics

# Hard 0/1 labels derived from the raw model output with a 0.5 cut-off.
y_pred_binary = [1 if p >= 0.5 else 0 for p in y_pred]

# Evaluate the model. Every label-based metric uses the thresholded
# predictions; only the AUC is computed from the raw output.
# (A previous extra accuracy_score call on the raw `y_pred` was removed: its
# result was overwritten on the very next line, and accuracy_score raises
# when given continuous predictions.)
# NOTE(review): the AUC is only informative if `y_pred` holds continuous
# scores rather than hard labels -- confirm what model.predict returns.
accuracy = metrics.accuracy_score(y_test, y_pred_binary)
auc = metrics.roc_auc_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred_binary)
recall = metrics.recall_score(y_test, y_pred_binary)
f1 = metrics.f1_score(y_test, y_pred_binary)


# Print evaluation metrics
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Counts of true/predicted label combinations for the thresholded predictions.
cm = metrics.confusion_matrix(y_test, y_pred_binary)

# Draw the matrix as an annotated heat map; seaborn returns the Axes it drew
# on, which is then labelled directly instead of going through pyplot state.
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Predicted Negative", "Predicted Positive"],
    yticklabels=["Actual Negative", "Actual Positive"],
)
cm_ax.set_xlabel("Predicted label")
cm_ax.set_ylabel("True label")
cm_ax.set_title("Confusion Matrix")
plt.show()

## LassoCV

In [None]:
from sklearn.linear_model import LassoCV

# Fit a 5-fold cross-validated Lasso on the full feature matrix and keep the
# columns whose coefficient is non-zero.
# NOTE(review): the selection uses every row of X, including rows that later
# cells hold out for validation.
lasso_cv = LassoCV(cv=5, random_state=7).fit(X, y)
nonzero_mask = lasso_cv.coef_ != 0
selected_features = X.columns[nonzero_mask]

# Display the fitted coefficient vector as the cell output.
lasso_cv.coef_

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    StratifiedKFold,
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    GradientBoostingClassifier,
    AdaBoostClassifier,
    HistGradientBoostingClassifier,
)
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

# One seed shared by the split, the CV folds and every stochastic estimator so
# that re-running the cell reproduces the same comparison.
MODEL_SELECTION_SEED = 42

# Splitting data into training and validation cohorts. The split is stratified
# on the target so both cohorts keep the overall class balance, matching the
# stratified hold-out split used earlier in the notebook.
X_train, X_valid, y_train, y_valid = train_test_split(
    X[selected_features],
    y,
    test_size=0.15,
    random_state=MODEL_SELECTION_SEED,
    stratify=y,
)

# Candidate models. Each label now maps to a distinct estimator: previously
# the "XGBoost" and "Light BGM" entries were both sklearn's
# GradientBoostingClassifier, so the same model was evaluated twice under
# misleading names.
models = {
    "XGBoost": xgb.XGBClassifier(random_state=MODEL_SELECTION_SEED),
    # max_iter raised from the default 100 to give the solver room to converge
    # on unscaled features.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # lightgbm is not a dependency of this notebook; sklearn's histogram-based
    # gradient boosting is its LightGBM-inspired counterpart.
    "LightGBM-style (HistGradientBoosting)": HistGradientBoostingClassifier(
        random_state=MODEL_SELECTION_SEED
    ),
    "Ada Boost": AdaBoostClassifier(random_state=MODEL_SELECTION_SEED),
    "Gaussian Naive Bayes": GaussianNB(),
    "Multi-layer Perceptron": MLPClassifier(random_state=MODEL_SELECTION_SEED),
    "Complement Naive Bayes": ComplementNB(),
    "Support Vector Machine": SVC(
        probability=True, random_state=MODEL_SELECTION_SEED
    ),
}

# Stratified folds keep both classes present in every fold, which ROC AUC
# needs in order to be defined.
cv_folds = StratifiedKFold(
    n_splits=10, shuffle=True, random_state=MODEL_SELECTION_SEED
)

best_name = None
best_model = None
best_auc = 0
cv_auc_by_model = {}

# The loop variable is deliberately not called `model`, so the booster trained
# earlier in the notebook under that name is not overwritten.
for name, candidate in models.items():
    cv_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        cv=cv_folds,
        scoring="roc_auc",
    )
    avg_auc = np.mean(cv_scores)
    cv_auc_by_model[name] = avg_auc
    # A candidate whose fit fails in any fold (for example ComplementNB
    # rejects negative feature values) gets NaN scores from cross_val_score;
    # the NaN mean never compares greater, so that candidate is skipped.
    if avg_auc > best_auc:
        best_auc = avg_auc
        best_name = name
        best_model = candidate

# Report the whole comparison, not just the winner.
print(
    pd.Series(cv_auc_by_model, name="mean CV AUC")
    .sort_values(ascending=False)
    .to_string()
)

if best_model is None:
    raise RuntimeError("No candidate model produced a valid cross-validated AUC.")

print(f"Best model: {best_name} (mean CV AUC: {best_auc})")

# Refit the winner on the full training cohort and score the validation cohort.
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_valid)
y_pred_proba = best_model.predict_proba(X_valid)[:, 1]

# Model evaluation
auc = roc_auc_score(y_valid, y_pred_proba)
accuracy = accuracy_score(y_valid, y_pred)
tn, fp, fn, tp = confusion_matrix(y_valid, y_pred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

print(f"AUC: {auc}")
print(f"Accuracy: {accuracy}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")

# Feature importance (only available for estimators that expose it)
if hasattr(best_model, "feature_importances_"):
    feature_importance = pd.DataFrame(
        {"Feature": selected_features, "Importance": best_model.feature_importances_}
    )
    print(feature_importance)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import TargetEncoder
import xgboost as xgb

# Two named stages: a target encoder followed by an XGBoost classifier.
# NOTE(review): the feature matrix built earlier in this notebook is already
# one-hot encoded with pd.get_dummies; confirm that additionally
# target-encoding the columns passed through this pipeline is intended.
encoder_step = ("encoder", TargetEncoder())
classifier_step = ("clf", xgb.XGBClassifier(random_state=8))
estimators = [encoder_step, classifier_step]

# verbose=True makes the pipeline log each step as it is fitted.
pipe = Pipeline(estimators, verbose=True)
pipe

In [None]:
# `BayesSearchCV` is the class used below; the previous `from skopt import Gr`
# named something that does not exist and left BayesSearchCV undefined.
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


# Search space for the "clf" step of the pipeline (double-underscore syntax
# routes each entry to that step's parameter).
searchSpace = {
    "clf__max_depth": Integer(5, 11),
    "clf__learning_rate": Real(0.001, 1.0, prior="log-uniform"),
    "clf__subsample": Real(0.5, 1.0),
    "clf__colsample_bytree": Real(0.0, 0.5),
    "clf__colsample_bylevel": Real(0.5, 1.0),
    "clf__colsample_bynode": Real(0.0, 0.5),
    "clf__reg_alpha": Real(0.0, 10.0),
    "clf__reg_lambda": Real(0.0, 10.0),
    "clf__gamma": Real(0.0, 10.0),
}

# Bayesian hyper-parameter search: 50 candidates, 3-fold CV, ranked by ROC AUC.
opt = BayesSearchCV(
    pipe, search_spaces=searchSpace, cv=3, n_iter=50, scoring="roc_auc", random_state=7
)

# Fit on every row except the hold-out rows in X_test, so that scoring `opt`
# on (X_test, y_test) afterwards measures performance on data the search never
# saw. Fitting on all of X would include those rows and inflate that score.
# Rows are excluded by index label, which assumes X_test was split off from X.
held_out_index = X_test.index
X_search = X.drop(index=held_out_index)
y_search = y.drop(index=held_out_index)

opt.fit(X_search, y_search)

In [None]:
# Display the best estimator exposed by the fitted search object.
opt.best_estimator_

In [None]:
# Display the best cross-validated score recorded by the search.
opt.best_score_

In [None]:
# Score the tuned search object on the X_test / y_test split.
# NOTE(review): this is only an honest hold-out score if `opt` was fitted
# without the X_test rows -- verify against the cell that calls opt.fit.
opt.score(X_test, y_test)

In [None]:
# List the steps of the best estimator found by the search.
opt.best_estimator_.steps