# ML for RBC Transfusion during Cardiac Surgery with CPB

In [1]:
import pandas as pd
import numpy as np 
from numpy import mean
from numpy import std
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline


In [4]:
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay

from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score, 
                             f1_score,
                             roc_auc_score, 
                             precision_recall_curve,
                             balanced_accuracy_score,
                             auc)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Loading Dataset

In [7]:
# Read the "FW_lassodataset" worksheet of the Excel workbook into a DataFrame.
dataset = pd.read_excel(
    io=r'Datasets.xlsx',
    sheet_name="FW_lassodataset",
)

In [8]:
# Keep every row but drop the first column of the sheet
# (presumably an identifier/index column -- confirm against the workbook).
data_ = dataset.iloc[:, 1:]

In [9]:
# Print column names, non-null counts and dtypes to spot columns with missing values.
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420 entries, 0 to 2419
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   2420 non-null   float64
 1   BMI                   2420 non-null   float64
 2   temp                  2420 non-null   float64
 3   HR                    2420 non-null   int64  
 4   SD                    2415 non-null   float64
 5   WBC                   2420 non-null   float64
 6   Hgb                   2420 non-null   int64  
 7   PLT                   2420 non-null   int64  
 8   ALT                   2418 non-null   float64
 9   AST                   2418 non-null   float64
 10  ALP                   2410 non-null   float64
 11  TBil                  2419 non-null   float64
 12  SCr                   2420 non-null   float64
 13  BUN                   2419 non-null   float64
 14  ALB                   2419 non-null   float64
 15  gender               

In [10]:
# The last column of data_ is the prediction target; all preceding columns are predictors.
features, target = data_.iloc[:, :-1], data_.iloc[:, -1]

# Statistic Split

In [11]:
# Hold out 20% of the rows as the test split; the seed makes the split repeatable.
(
    features_trainval,
    features_test,
    target_trainval,
    target_test,
) = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [12]:

# Column groups are selected by position in data_: the first 16 columns are
# handled as numerical, everything after them (except the final target column)
# as categorical.
# NOTE(review): the :16 slice includes column index 15, which the info() listing
# shows as 'gender' -- confirm it is meant to sit with the numerical columns.
numerical_cols = data_.columns[:16].to_list()
categorical_cols = data_.columns[16:-1].to_list()

# Numerical columns first, then categorical ones.
my_cols = [*numerical_cols, *categorical_cols]

# Independent copies of both splits, restricted to the selected columns.
X_train = features_trainval.loc[:, my_cols].copy()
X_test = features_test.loc[:, my_cols].copy()

# Statistic Imputation

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Missing numerical values are replaced by the column mean,
# missing categorical values by the most frequent value.
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')

# Route each column group to its imputer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
)

In [14]:
# Single-step pipeline wrapping the imputing ColumnTransformer.
steps = [('preprocessor', preprocessor)]
my_pipeline = Pipeline(steps=steps)

# Learn the imputation statistics (means / most frequent values) from the
# training split ONLY. Fitting again on X_test would overwrite them with
# test-set statistics and leak test information into the preprocessing.
my_pipeline.fit(X_train)

# Apply the training-derived imputation to both splits.
X_ttrain = my_pipeline.transform(X_train)
X_ttest = my_pipeline.transform(X_test)

In [15]:
# Wrap the imputed arrays in DataFrames again, restoring the column labels.
X_Train, X_Test = (
    pd.DataFrame(imputed, columns=my_cols) for imputed in (X_ttrain, X_ttest)
)

In [16]:
# From here on the feature splits refer to the imputed data.
features_trainval, features_test = X_Train, X_Test

## Data Normalization 

In [17]:
# Standardize features: the scaler is fitted on the training split and the
# same fitted scaler is then applied to the test split.
scaler = StandardScaler()
scaler.fit(features_trainval)
rescaledtrainX_raw, rescaledtestX_raw = (
    scaler.transform(split) for split in (features_trainval, features_test)
)

## Feature Selection 
## see the feature selection part

## Keep the results of feature selection

In [18]:
# Scaled matrices consumed by all models below. No columns are dropped here,
# so these are computed from the same inputs as the *_raw matrices
# (presumably the selected-feature subset is already what the sheet contains -- confirm).
scaler = StandardScaler()
scaler.fit(features_trainval)
rescaledtrainX, rescaledtestX = (
    scaler.transform(split) for split in (features_trainval, features_test)
)

# Model Hyperparameters

## see the model hyperparameter part

# Final Models

## LR

In [19]:
# L2-penalized logistic regression (C=0.1), fitted on the scaled training split.
model_lr = LogisticRegression(
    penalty='l2',
    C=0.1,
)
model_lr.fit(rescaledtrainX, target_trainval)

# predictions_lr = model_lr.predict(rescaledtestX)
# print(accuracy_score(target_test, predictions_lr))
# print(confusion_matrix(target_test, predictions_lr))
# print(classification_report(target_test, predictions_lr))

## SVM 

In [20]:
# Linear-kernel SVM. probability=True enables predict_proba, which the
# evaluation cells rely on; its internal probability calibration shuffles the
# data, so the seed is fixed to keep the fitted model reproducible
# (same seed as the train/test split).
model_svm = SVC(C=0.01, kernel="linear", probability=True, random_state=42)
model_svm.fit(rescaledtrainX, target_trainval)
# predictions_svm = model_svm.predict(rescaledtestX)

# print(accuracy_score(target_test, predictions_svm))
# print(confusion_matrix(target_test, predictions_svm))
# print(classification_report(target_test, predictions_svm))

## KNN

In [21]:

# k-nearest-neighbours classifier voting over the 25 closest training samples.
N_NEIGHBORS = 25
model_knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
model_knn.fit(rescaledtrainX, target_trainval)

# predictions_knn = model_knn.predict(rescaledtestX)
# print(accuracy_score(target_test, predictions_knn))
# print(confusion_matrix(target_test, predictions_knn))
# print(classification_report(target_test, predictions_knn))

## BAY

In [22]:
# Gaussian naive Bayes with fixed class priors instead of priors estimated
# from the training labels.
model_bay = GaussianNB(
    var_smoothing=1e-08,
    priors=[0.7, 0.3],
)
model_bay.fit(rescaledtrainX, target_trainval)
# predictions_bay = model_bay.predict(rescaledtestX)
# print(accuracy_score(target_test, predictions_bay))
# print(confusion_matrix(target_test, predictions_bay))
# print(classification_report(target_test, predictions_bay))

## DT

In [23]:
# Decision tree (Gini impurity, depth capped at 20). The seed fixes the
# feature permutation used when evaluating splits so the fitted tree is
# reproducible (same seed as the train/test split).
model_dt = DecisionTreeClassifier(criterion="gini", max_depth=20, random_state=42)
model_dt.fit(rescaledtrainX, target_trainval)


## RF

In [24]:
# Random forest of 200 trees, each limited to depth 9. Bootstrapping and
# feature sub-sampling are random, so the seed is fixed to make the fitted
# (and later pickled) model reproducible.
model_rf = RandomForestClassifier(max_depth=9, n_estimators=200, random_state=42)
model_rf.fit(rescaledtrainX, target_trainval)

# predictions_rf = model_rf.predict(rescaledtestX)
# print(accuracy_score(target_test, predictions_rf))
# print(confusion_matrix(target_test, predictions_rf))
# print(classification_report(target_test, predictions_rf))

## gb

In [25]:
# Gradient boosting: 200 stages of depth-6 trees with a small learning rate.
# The seed is fixed so repeated runs produce the same fitted model.
model_gb = GradientBoostingClassifier(
    learning_rate=0.02,
    n_estimators=200,
    max_depth=6,
    random_state=42,
)
model_gb.fit(rescaledtrainX, target_trainval)


## xgb 

In [26]:
# XGBoost classifier: 150 boosting rounds of depth-5 trees, learning rate 0.02.
model_xgb = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.02,
)
model_xgb.fit(rescaledtrainX, target_trainval)

## LGB

In [27]:
# LightGBM classifier: 300 boosting rounds of depth-6 trees, learning rate 0.01.
model_lgb = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.01,
)
model_lgb.fit(rescaledtrainX, target_trainval)


## CAT

In [28]:
# CatBoost classifier: 500 iterations of depth-8 trees, learning rate 0.01;
# verbose=False silences the per-iteration training log.
model_cat = cb.CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    max_depth=8,
    verbose=False,
)
model_cat.fit(rescaledtrainX, target_trainval)

<catboost.core.CatBoostClassifier at 0x207f2dc81c0>

## AB

In [29]:
# AdaBoost with 100 weak learners and learning rate 0.5. The seed is fixed so
# the fitted model does not change between runs.
model_ab = AdaBoostClassifier(learning_rate=0.5, n_estimators=100, random_state=42)
model_ab.fit(rescaledtrainX, target_trainval)

## ex

In [30]:
# Extremely randomized trees: 400 trees, Gini impurity, depth capped at 9.
# Split thresholds are drawn at random, so the seed is fixed to keep the
# fitted model reproducible.
model_ex = ExtraTreesClassifier(
    criterion="gini",
    n_estimators=400,
    max_depth=9,
    random_state=42,
)
model_ex.fit(rescaledtrainX, target_trainval)

# Result

## Plot ROC Curve

In [34]:
# ROC curves of all fitted models on the held-out test split, on shared axes.

# figure.figsize is only read when a figure is created, so it has to be set
# BEFORE the figure exists for the 8x6 size to apply to this plot.
plt.rcParams['figure.figsize'] = (8, 6)
plt.figure()
roc_ax = plt.gca()
roc_ax.set_title("ROC  curve in Testing Dataset")

# One curve per model, drawn in the same order as before.
for fitted_model in (
    model_lr, model_svm, model_knn, model_dt, model_rf, model_bay,
    model_gb, model_xgb, model_lgb, model_cat,
    model_ab, model_ex,
):
    RocCurveDisplay.from_estimator(fitted_model, rescaledtestX, target_test, ax=roc_ax)

roc_ax.legend()
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')

plt.show()

## Plot Precision Recall Curve

In [35]:
from sklearn.metrics import (precision_recall_curve,PrecisionRecallDisplay)

# Precision-recall curves of all fitted models on the held-out test split.

# figure.figsize is only read when a figure is created, so set it before
# creating the figure for the 12x10 size to apply to this plot.
plt.rcParams['figure.figsize'] = (12, 10)
plt.figure()
pr_ax = plt.gca()
pr_ax.set_title("Precision_Recall curve in Testing Dataset")

# One curve per model, drawn in the same order as before.
for fitted_model in (
    model_lr, model_svm, model_knn, model_dt, model_rf, model_bay,
    model_gb, model_xgb, model_lgb, model_cat, model_ab, model_ex,
):
    PrecisionRecallDisplay.from_estimator(
        fitted_model, rescaledtestX, target_test, ax=pr_ax
    )

pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
# A single legend call is enough once every curve has been drawn.
pr_ax.legend()

plt.show()


## Plot Calibration Curve

In [36]:
from sklearn.calibration import CalibrationDisplay

# Calibration (reliability) curves of all fitted models on the test split.

# figure.figsize is only read when a figure is created, so set it before
# creating the figure for the 12x10 size to apply to this plot.
plt.rcParams['figure.figsize'] = (12, 10)
plt.figure()
calibration_ax = plt.gca()
calibration_ax.set_title("Calibaration curve in Testing Dataset")

# One curve per model, drawn in the same order as before.
for fitted_model in (
    model_lr, model_svm, model_knn, model_dt, model_rf, model_bay,
    model_gb, model_xgb, model_lgb, model_cat, model_ab, model_ex,
):
    CalibrationDisplay.from_estimator(
        fitted_model, rescaledtestX, target_test, ax=calibration_ax
    )

# A single legend call is enough once every curve has been drawn.
calibration_ax.legend()
plt.show()


## Other Evaluation Index

In [37]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


# (fitted estimator, display name) pairs, in the order used for the score table.
clf_list = [
    (estimator, label)
    for label, estimator in {
        "Logistic Regression": model_lr,
        "Support Vector Machine": model_svm,
        "KNeighborsClassifierbor": model_knn,
        "DecisionTreeClassifier": model_dt,
        "RandomForestClassifier": model_rf,
        "GaussianNB": model_bay,
        "GradientBoostingClassifier": model_gb,
        "XGBClassifier": model_xgb,
        "LGBMClassifier": model_lgb,
        "CatboostClassifier": model_cat,
        "AdaBoostClassifier": model_ab,
        "ExtraTreesClassifier": model_ex,
    }.items()
]

In [38]:
from collections import defaultdict

import pandas as pd

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    brier_score_loss,
    log_loss,
    roc_auc_score,
    accuracy_score
)

# Collect test-set metrics for every classifier in clf_list into one table.
# `scores` maps a column name to a list holding one value per classifier.
scores = defaultdict(list)
for clf, name in clf_list:
    y_prob = clf.predict_proba(rescaledtestX)
    y_pred = clf.predict(rescaledtestX)

    scores["Classifier"].append(name)

    # Probability-based metrics are given column 1 of predict_proba,
    # i.e. the predicted probability of the second class.
    for metric in [roc_auc_score, brier_score_loss, log_loss]:
        score_name = metric.__name__.replace("_", " ").replace("score", "").capitalize()
        scores[score_name].append(metric(target_test, y_prob[:, 1]))

    # Label-based metrics are given the hard class predictions.
    for metric in [accuracy_score, precision_score, recall_score, f1_score]:
        score_name = metric.__name__.replace("_", " ").replace("score", "").capitalize()
        scores[score_name].append(metric(target_test, y_pred))

# Build the table once, after all classifiers have been scored, instead of
# rebuilding it on every loop iteration.
score_df_nofs = pd.DataFrame(scores).set_index("Classifier")

# Last expression of the cell: a notebook displays the table rounded to three
# decimals, while score_df_nofs itself keeps full precision.
score_df_nofs.round(decimals=3)


In [39]:
# score_df_nofs

# Save Models

In [196]:
import pickle
# Persist every fitted model to its own pickle file in the working directory.
# NOTE: pickle files can execute arbitrary code when loaded -- only load these
# files from a trusted location.
models_by_filename = {
    "final_model_LR.sav": model_lr,
    "final_model_SVM.sav": model_svm,
    "final_model_KNN.sav": model_knn,
    "final_model_AB.sav": model_ab,
    "final_model_BAY.sav": model_bay,
    "final_model_DT.sav": model_dt,
    "final_model_RF.sav": model_rf,
    "final_model_GB.sav": model_gb,
    "final_model_LGB.sav": model_lgb,
    "final_model_XGB.sav": model_xgb,
    "final_model_CAT.sav": model_cat,
    "final_model_EX.sav": model_ex,
}

for filename, fitted_model in models_by_filename.items():
    # The context manager flushes and closes each file handle, even if
    # pickling raises.
    with open(filename, "wb") as model_file:
        pickle.dump(fitted_model, model_file)

# Model Explanation — SHAP

## The code is in the model explanation part

# Decision Curve

## The code is in the decision curve part