In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from scipy.stats.mstats import winsorize

In [11]:
# Load the heart-failure clinical records into a DataFrame.
# The path uses a forward slash: it resolves on Windows, Linux and macOS alike,
# whereas a backslash separator only works on Windows and puts an unrecognised
# escape sequence ("\h") into the string literal.
df = pd.read_csv('data/heart_failure_clinical_records_dataset.csv')
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [12]:
# Upper-tail winsorization: for each listed column the top 2% of values are
# replaced by the largest value below that cut; the lower tail is left alone
# (lower limit 0).
# NOTE(review): this runs before the train/test split performed further down,
# so the cap is derived from all rows, including those that later land in the
# test set.
for column in ('serum_creatinine', 'creatinine_phosphokinase'):
    df[column] = winsorize(df[column], limits=[0, 0.02])

In [13]:
# Target vector: the DEATH_EVENT column.
y = df['DEATH_EVENT']
# Feature matrix: every remaining column of df.
x = df.drop(columns='DEATH_EVENT')
#

In [14]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded, so the same rows end up in each part on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [15]:

# Columns handed to each scaler.
standard_cols = ['age', 'ejection_fraction', 'serum_sodium', 'time']
robust_cols = ['creatinine_phosphokinase', 'serum_creatinine', 'platelets']

# RobustScaler for robust_cols, StandardScaler for standard_cols; every column
# not named in either list is passed through unscaled.
preprocessor = ColumnTransformer(
    [
        ('robust', RobustScaler(), robust_cols),
        ('standard', StandardScaler(), standard_cols),
    ],
    remainder='passthrough',
)

In [16]:
# Fit the scalers on the training split only, then apply the fitted
# transformer to both splits so the test rows never influence the statistics.
preprocessor.fit(x_train)
x_train_scaled = preprocessor.transform(x_train)
x_test_scaled = preprocessor.transform(x_test)

# **Model Training**

In [17]:
# Seed passed to every estimator below that accepts random_state, so the
# printed metrics are the same on every Restart & Run All.
MODEL_SEED = 0

# Candidate classifiers, keyed by the label printed in the report.
# Each label appears exactly once: a repeated key in a dict literal silently
# replaces the earlier entry.
models = {
    "logistic regression": LogisticRegression(class_weight='balanced'),
    "decision tree": DecisionTreeClassifier(class_weight='balanced', random_state=MODEL_SEED),
    "random forest": RandomForestClassifier(class_weight='balanced', random_state=MODEL_SEED),
    "naive bayes": GaussianNB(),
    "SVC": SVC(),
    "KNN": KNeighborsClassifier(),
    "Ridge Classifier": RidgeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED),
    "AdaBoost": AdaBoostClassifier(random_state=MODEL_SEED),
    "XGBoost": XGBClassifier(random_state=MODEL_SEED),
}

# Fit each classifier on the scaled training split and report accuracy,
# confusion matrix and per-class precision/recall/F1 on the scaled test split.
for name, model in models.items():
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(name)
    print(name, accuracy)
    print(cm)
    print(report)
    print('---------------------------------------------------------------')
#

logistic regression
logistic regression 0.8666666666666667
[[37  4]
 [ 4 15]]
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        41
           1       0.79      0.79      0.79        19

    accuracy                           0.87        60
   macro avg       0.85      0.85      0.85        60
weighted avg       0.87      0.87      0.87        60

---------------------------------------------------------------
decision tree
decision tree 0.8
[[32  9]
 [ 3 16]]
              precision    recall  f1-score   support

           0       0.91      0.78      0.84        41
           1       0.64      0.84      0.73        19

    accuracy                           0.80        60
   macro avg       0.78      0.81      0.78        60
weighted avg       0.83      0.80      0.81        60

---------------------------------------------------------------
random forest
random forest 0.85
[[36  5]
 [ 4 15]]
              precision    recall  f1

# **Hyperparameter Tuning for Gradient Boosting**

In [18]:

# Step 1 - single-step pipeline. The step name 'gbm' is the prefix used by the
# grid keys below.
pipeline_gbm = Pipeline(
    steps=[
        ('gbm', HistGradientBoostingClassifier(random_state=42, class_weight='balanced')),
    ]
)

# Step 2 - candidate values, addressed as '<step name>__<parameter>'.
param_grid_gbm = dict(
    gbm__learning_rate=[0.01, 0.05, 0.1],
    gbm__max_depth=[3, 4, 5],
    gbm__max_iter=[50, 100, 200],  # HistGBM's counterpart of n_estimators
    gbm__l2_regularization=[0.0, 0.1, 1.0],
)

# Step 3 - exhaustive search over the grid, 5-fold cross-validation, ranked by
# F1, using all available cores.
grid_gbm = GridSearchCV(
    pipeline_gbm,
    param_grid_gbm,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)

# Step 4 - run the search on the already scaled training features.
grid_gbm.fit(x_train_scaled, y_train)

print("Best params:", grid_gbm.best_params_)
print("Best score:", grid_gbm.best_score_)

# Step 5 - second column of predict_proba for every test row, taken from the
# best estimator found by the search.
probabilities = grid_gbm.predict_proba(x_test_scaled)[:, 1]

# Label a row 1 when its probability reaches the cut-off, otherwise 0.
# NOTE(review): the value 0.25 is not derived anywhere in the visible cells -
# confirm how it was chosen.
threshold = 0.25
y_pred_custom = (probabilities >= threshold).astype(int)

print("Gradient Boosting After Tuning:")
print(classification_report(y_test, y_pred_custom))
#

Best params: {'gbm__l2_regularization': 0.1, 'gbm__learning_rate': 0.05, 'gbm__max_depth': 4, 'gbm__max_iter': 100}
Best score: 0.7440018780536022
Gradient Boosting After Tuning:
              precision    recall  f1-score   support

           0       0.94      0.78      0.85        41
           1       0.65      0.89      0.76        19

    accuracy                           0.82        60
   macro avg       0.80      0.84      0.80        60
weighted avg       0.85      0.82      0.82        60

