In [160]:
import numpy as np
import pandas as pd
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

df = pd.read_csv("heart.csv")
string_col = df.select_dtypes(include="object").columns
df[string_col] = df[string_col].astype("string")

string_col = df.select_dtypes("string").columns.to_list()
num_col = df.columns.to_list()
for col in string_col:
    num_col.remove(col)
num_col.remove("HeartDisease")

encoders =  {col: LabelEncoder().fit(df[col]) for col in string_col}
df_tree = df.copy()

for col, encoder in encoders.items():
    df_tree[col] = encoder.transform(df[col])

# f_tree = df.apply(LabelEncoder().fit_transform)

target = "HeartDisease"
y = df_tree[target].values

feature_col_tree = df_tree.columns.to_list()
feature_col_tree.remove(target)


kf = model_selection.StratifiedKFold(n_splits=5)
acc_XGB = []

for fold, (trn_, val_) in enumerate(kf.split(X=df_tree, y=y)):
    X_train = df_tree.iloc[trn_, df_tree.columns.get_indexer(feature_col_tree)]
    y_train = df_tree.iloc[trn_, df_tree.columns.get_loc(target)]

    X_valid = df_tree.iloc[val_, df_tree.columns.get_indexer(feature_col_tree)]
    y_valid = df_tree.iloc[val_, df_tree.columns.get_loc(target)]

    clf = XGBClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    print(f"The fold is : {fold + 1} : ")
    print(classification_report(y_valid, y_pred))
    acc = roc_auc_score(y_valid, y_pred)
    acc_XGB.append(acc)
    print(f"The accuracy for {fold + 1} : {acc}")
    
pickle.dump(clf, open("xgboostModel.pkl", "wb"))
pickle.dump(encoders, open("label_encoders.pkl", "wb"))

The fold is : 1 : 
              precision    recall  f1-score   support

           0       0.78      0.93      0.84        82
           1       0.93      0.78      0.85       102

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184

The accuracy for 1 : 0.8555714968914395
The fold is : 2 : 
              precision    recall  f1-score   support

           0       0.77      0.91      0.84        82
           1       0.92      0.78      0.85       102

    accuracy                           0.84       184
   macro avg       0.85      0.85      0.84       184
weighted avg       0.85      0.84      0.84       184

The accuracy for 2 : 0.8494739359158296
The fold is : 3 : 
              precision    recall  f1-score   support

           0       0.96      0.61      0.75        82
           1       0.76      0.98      0.85       102

    accuracy                           0.82  

In [None]:
input = {
    # Input Parameters
    "Age" : 39,              # Age (years)

    "Sex" : 'F',             # 'M' (Male) or 'F' (Female)

    "ChestPainType" : 'ASY', # 'TA' (Typical Angina), 'ATA' (Atypical Angina), 
                             # 'NAP' (Non-Anginal Pain), 'ASY' (Asymptomatic)

    "RestingBP" : 80,       # Resting Blood Pressure (mm Hg)

    "Cholesterol" : 50,     # Serum Cholesterol (mm/dl)

    "FastingBS" : 0,         # Fasting Blood Sugar (mg/dl)
                             # 1 if FastingBS > 120, otherwise 0

    "RestingECG" : 'Normal', # Resting Electrocardiogram Results
                             # 'Normal': Normal, 
                             # 'ST':     having ST-T wave abnormality 
                             # 'LVH':    showing probable or definite left 
                             #           ventricular hypertrophy by Estes' 
                             #           criteria

    "MaxHR" : 120,           # Max heart rate achieved
                             # (should be between 60 to 202)
                              
    "ExerciseAngina" : 'N',  # Exercise induced angina: 'Y' (Yes) / 'N' (No) 

    "Oldpeak" : 1.0,         # ST (Numeric values measured in depression)
    
    "ST_Slope" : 'Flat'      # Slope of the peak exercise ST segment
                             # 'Up':   upsloping
                             # 'Flat': flat
                             # 'Down': downsloping
}

# Output should be 1 (Heart Disease) by the following model for this input

df_test = pd.DataFrame([input])

encoders = pickle.load(open("label_encoders.pkl", "rb"))
for col, encoder in encoders.items():
    df_test[col] = encoder.transform(df_test[col])

df_test = df_test[feature_col_tree]

rfModel = pickle.load(open("xgboostModel.pkl", "rb"))
print(f"Prediction: {rfModel.predict(df_test)}")
print(f"Probability: {rfModel.predict_proba(df_test)}")
# 0 - Normal
# 1 - HeartDisease

Prediction: [1]
Probability: [[0.00113344 0.99886656]]
