# Heart Disease Prediction - Group 14

In [20]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline

import os
print(os.listdir())

import warnings
warnings.filterwarnings('ignore')

['.git', '.vscode', 'dataset-numerical.csv', 'heartprediction.ipynb', 'README.md', 'venv', '__pycache__']


In [21]:
dataset = pd.read_csv("dataset-numerical.csv")

In [22]:
#dataset eda
print(dataset.shape)
dataset.describe()

(1025, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [23]:
#total columns and count
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [24]:
#check correlation betweem columns
print(dataset.corr()["target"].abs().sort_values(ascending=False))

target      1.000000
oldpeak     0.438441
exang       0.438029
cp          0.434854
thalach     0.422895
ca          0.382085
slope       0.345512
thal        0.337838
sex         0.279501
age         0.229324
trestbps    0.138772
restecg     0.134468
chol        0.099966
fbs         0.041164
Name: target, dtype: float64


In [25]:
# train test split - 70% train 15% test 15% validation
predictors = dataset.drop("target", axis=1)
target = dataset["target"]

# first - 70% train 30% test
X_train, X_temp, Y_train, Y_temp = train_test_split(
    predictors, target,
    test_size=0.30,
    random_state=42,
    stratify=target
)

# second: val and test
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp,
    test_size=0.50,      # half of 30% = 15%
    random_state=42,
    stratify=Y_temp
)

#for confirmation
print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))


Train size: 717
Validation size: 154
Test size: 154


## Training/testing models without GridSearchCV - parent paper results

In [26]:
#Logistic Regression
lr = LogisticRegression(max_iter=1000)

# train and predict
lr.fit(X_train, Y_train)
Y_pred_lr = lr.predict(X_test)

# eval metrics - rounding for percentage format
accuracy_lr = round(accuracy_score(Y_test, Y_pred_lr) * 100, 2)
precision_lr = round(precision_score(Y_test, Y_pred_lr) * 100, 2)
recall_lr = round(recall_score(Y_test, Y_pred_lr) * 100, 2)
f1_lr = round(f1_score(Y_test, Y_pred_lr) * 100, 2)

print("Logistic Regression Metrics:")
print("Accuracy:  ", accuracy_lr, "%")
print("Precision: ", precision_lr, "%")
print("Recall:    ", recall_lr, "%")
print("F1 Score:  ", f1_lr, "%")


Logistic Regression Metrics:
Accuracy:   80.52 %
Precision:  76.34 %
Recall:     89.87 %
F1 Score:   82.56 %


In [27]:
#SVM
sv = svm.SVC(kernel='linear')

# train and test
sv.fit(X_train, Y_train)
Y_pred_svm = sv.predict(X_test)

#eval metrics - rounding to format in percentage
accuracy_svm = round(accuracy_score(Y_test, Y_pred_svm) * 100, 2)
precision_svm = round(precision_score(Y_test, Y_pred_svm) * 100, 2)
recall_svm = round(recall_score(Y_test, Y_pred_svm) * 100, 2)
f1_svm = round(f1_score(Y_test, Y_pred_svm) * 100, 2)

print("SVM metrics")
print("Accuracy: ", accuracy_svm, "%")
print("Precision: ", precision_svm, "%")
print("Recall: ", recall_svm, "%")
print("F1 Score: ", f1_svm, "%")


SVM metrics
Accuracy:  85.06 %
Precision:  80.43 %
Recall:  93.67 %
F1 Score:  86.55 %


In [28]:
#KNN
knn = KNeighborsClassifier(n_neighbors=7)

#train and test
knn.fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)

#metrics - rounding for percentage format
accuracy_knn = round(accuracy_score(Y_test, Y_pred_knn) * 100, 2)
precision_knn = round(precision_score(Y_test, Y_pred_knn) * 100, 2)
recall_knn = round(recall_score(Y_test, Y_pred_knn) * 100, 2)
f1_knn = round(f1_score(Y_test, Y_pred_knn) * 100, 2)

print("KNN eval")
print("Accuracy: ", accuracy_knn, "%")
print("Precision: ", precision_knn, "%")
print("Recall: ", recall_knn, "%")
print("F1 Score: ", f1_knn, "%")


KNN eval
Accuracy:  75.97 %
Precision:  76.25 %
Recall:  77.22 %
F1 Score:  76.73 %


In [29]:
#XGBoost
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

#train and test
xgb_model.fit(X_train, Y_train)
Y_pred_xgb = xgb_model.predict(X_test)

# eval metrics - rounding for percentage format
accuracy_xgb = round(accuracy_score(Y_test, Y_pred_xgb) * 100, 2)
precision_xgb = round(precision_score(Y_test, Y_pred_xgb) * 100, 2)
recall_xgb = round(recall_score(Y_test, Y_pred_xgb) * 100, 2)
f1_xgb = round(f1_score(Y_test, Y_pred_xgb) * 100, 2)

print("XGBoost metrics:")
print("Accuracy: ", accuracy_xgb, "%")
print("Precision: ", precision_xgb, "%")
print("Recall: ", recall_xgb, "%")
print("F1 Score: ", f1_xgb, "%")


XGBoost metrics:
Accuracy:  98.05 %
Precision:  100.0 %
Recall:  96.2 %
F1 Score:  98.06 %


In [30]:
# Random Forest
rf = RandomForestClassifier(random_state=42)

# Train and test
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)

# eval metrics - rounding for formatting 
accuracy_rf = round(accuracy_score(Y_test, Y_pred_rf) * 100, 2)
precision_rf = round(precision_score(Y_test, Y_pred_rf) * 100, 2)
recall_rf = round(recall_score(Y_test, Y_pred_rf) * 100, 2)
f1_rf = round(f1_score(Y_test, Y_pred_rf) * 100, 2)

print("Random Forest eval:")
print("Accuracy: ", accuracy_rf, "%")
print("Precision: ", precision_rf, "%")
print("Recall: ", recall_rf, "%")
print("F1 Score: ", f1_rf, "%")


Random Forest eval:
Accuracy:  96.75 %
Precision:  100.0 %
Recall:  93.67 %
F1 Score:  96.73 %


In [31]:
# Decision Tree
dt = DecisionTreeClassifier(random_state=42)

#train and test
dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)

# Metrics - rounding for percentage format
accuracy_dt = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
precision_dt = round(precision_score(Y_test, Y_pred_dt) * 100, 2)
recall_dt = round(recall_score(Y_test, Y_pred_dt) * 100, 2)
f1_dt = round(f1_score(Y_test, Y_pred_dt) * 100, 2)

print("Decision Tree eval:")
print("Accuracy: ", accuracy_dt, "%")
print("Precision: ", precision_dt, "%")
print("Recall: ", recall_dt, "%")
print("F1 Score: ", f1_dt, "%")


Decision Tree eval:
Accuracy:  98.05 %
Precision:  100.0 %
Recall:  96.2 %
F1 Score:  98.06 %


In [32]:
#Phase 1 results summary

# dictionary for easy formatting into table
results = {
    "Model": [
        "Logistic Regression",
        "Support Vector Machine",
        "K-Nearest Neighbors",
        "XGBoost",
        "Random Forest",
        "Decision Tree"
    ],
    "Accuracy (%)": [accuracy_lr, accuracy_svm, accuracy_knn, accuracy_xgb, accuracy_rf, accuracy_dt],
    "Precision":    [precision_lr, precision_svm, precision_knn, precision_xgb, precision_rf, precision_dt],
    "Recall":       [recall_lr, recall_svm, recall_knn, recall_xgb, recall_rf, recall_dt],
    "F1 Score":     [f1_lr, f1_svm, f1_knn, f1_xgb, f1_rf, f1_dt]
}

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))


                 Model  Accuracy (%)  Precision  Recall  F1 Score
   Logistic Regression         80.52      76.34   89.87     82.56
Support Vector Machine         85.06      80.43   93.67     86.55
   K-Nearest Neighbors         75.97      76.25   77.22     76.73
               XGBoost         98.05     100.00   96.20     98.06
         Random Forest         96.75     100.00   93.67     96.73
         Decision Tree         98.05     100.00   96.20     98.06


## Training/testing models with GridSearch CV - parent paper

In [33]:
# logistic regresson - grid search
lr = LogisticRegression(max_iter=1000) # large number for value convergence

params_lr = {
    'penalty': ['l1', 'l2', 'elasticnet'],  # type of regularization
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # inverse regularization strength
    'solver': ['liblinear', 'saga'], # optimization algorithm
    'l1_ratio': [0, 0.25, 0.5, 0.75, 1],  # L1/L2 values 
    'class_weight': [None, 'balanced']   #handle class imbalance
}

grid_search_lr = GridSearchCV(
    estimator=lr,
    param_grid=params_lr,
    cv=5,
    scoring='accuracy', #optimization metric
    n_jobs=-1,
    verbose=2 # explains progress in gridsearch
)

#train and test
grid_search_lr.fit(X_train, Y_train)
best_lr = grid_search_lr.best_estimator_ # tests on best param combination
Y_pred_lr = best_lr.predict(X_test)

#eval metrics
accuracy_lr2  = round(accuracy_score(Y_test, Y_pred_lr) * 100, 2)
precision_lr2 = round(precision_score(Y_test, Y_pred_lr)* 100, 2)
recall_lr2 = round(recall_score(Y_test, Y_pred_lr) * 100, 2)
f1_lr2 = round(f1_score(Y_test, Y_pred_lr) * 100, 2)

print("Logistic Regression eval:")
print("Accuracy: ", accuracy_lr2)
print("Precision: ", precision_lr2)
print("Recall: ", recall_lr2)
print("F1 Score: ", f1_lr2)
print("Best Params:", grid_search_lr.best_params_)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Logistic Regression eval:
Accuracy:  80.52
Precision:  76.34
Recall:  89.87
F1 Score:  82.56
Best Params: {'C': 10, 'class_weight': None, 'l1_ratio': 0, 'penalty': 'l1', 'solver': 'liblinear'}


In [34]:
#svm - grid search
svmm = svm.SVC(kernel='linear')

# smaller grid for faster search
params_svm = {
    'C': [0.1, 1, 10, 100],  # regularization strength, larger = stricter margin
    'class_weight': [None, 'balanced']  #handle for class imbalance
}

grid_search_svm = GridSearchCV(
    estimator=svmm,
    param_grid=params_svm,
    cv=5,
    scoring='accuracy', #uses accuracy for optimization
    n_jobs=-1,
    verbose=2 # explains progress
)

# train and test
grid_search_svm.fit(X_train, Y_train)
best_svm = grid_search_svm.best_estimator_ # tests on best param combination
Y_pred_svm = best_svm.predict(X_test)

# eval metrics
accuracy_svm2  = round(accuracy_score(Y_test, Y_pred_svm) * 100, 2)
precision_svm2 = round(precision_score(Y_test, Y_pred_svm) * 100, 2)
recall_svm2 = round(recall_score(Y_test, Y_pred_svm) * 100, 2)
f1_svm2 = round(f1_score(Y_test, Y_pred_svm) * 100, 2)

print("Linear SVM eval results:")
print("Accuracy: ", accuracy_svm2)
print("Precision: ", precision_svm2)
print("Recall: ", recall_svm2)
print("F1 Score: ", f1_svm2)
print("Best Params:", grid_search_svm.best_params_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Linear SVM eval results:
Accuracy:  85.06
Precision:  80.43
Recall:  93.67
F1 Score:  86.55
Best Params: {'C': 1, 'class_weight': None}


In [35]:
# knn - grid search
knn = KNeighborsClassifier()

params_knn = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 21], # number of nearest neighbors
    'weights': ['uniform', 'distance'], # type of weight being used
    'metric': ['euclidean', 'manhattan', 'minkowski'], # type of measurment for distance
    'p': [1, 2] 
}

grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=params_knn,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_knn.fit(X_train, Y_train)
best_knn = grid_search_knn.best_estimator_ # test on best param combinaton
Y_pred_knn = best_knn.predict(X_test)

# eval metrics
accuracy_knn2 = round(accuracy_score(Y_test, Y_pred_knn) * 100, 2)
precision_knn2 = round(precision_score(Y_test, Y_pred_knn, average='binary') * 100, 2)
recall_knn2 = round(recall_score(Y_test, Y_pred_knn, average='binary') * 100, 2)
f1_knn2 = round(f1_score(Y_test, Y_pred_knn, average='binary') * 100, 2)

# results
print(f"Best KNN Parameters: {grid_search_knn.best_params_}")
print(f"Accuracy: {accuracy_knn2} %")
print(f"Precision: {precision_knn2} %")
print(f"Recall: {recall_knn2} %")
print(f"F1-Score: {f1_knn2} %")


Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best KNN Parameters: {'metric': 'euclidean', 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
Accuracy: 96.75 %
Precision: 100.0 %
Recall: 93.67 %
F1-Score: 96.73 %


In [36]:
#xgboost - grid search
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42, # for consistency
    n_jobs=-1
)

params_xgb = {
    'n_estimators': [100, 200], # number of boosting trees
    'max_depth': [3, 5],  # depth of each tree
    'learning_rate': [0.05, 0.1],  # step size shrinkage
    'subsample': [0.8, 1.0],  # % of samples per tree
    'colsample_bytree': [0.8, 1.0], # % of features per tree
    'gamma': [0, 0.1],    # min loss reduction for a split
    'min_child_weight': [1, 3]   # min sum of instance weights in a leaf
}


grid_search_xgb = GridSearchCV(
    estimator=xgb_clf,
    param_grid=params_xgb,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

# train and test
grid_search_xgb.fit(X_train, Y_train)
best_xgb = grid_search_xgb.best_estimator_ # uses best param combination
Y_pred_xgb = best_xgb.predict(X_test)

# eval metrics
accuracy_xgb2 = round(accuracy_score(Y_test, Y_pred_xgb) * 100, 2)
precision_xgb2 = round(precision_score(Y_test, Y_pred_xgb, average='binary') * 100, 2)
recall_xgb2 = round(recall_score(Y_test, Y_pred_xgb, average='binary') * 100, 2)
f1_xgb2 = round(f1_score(Y_test, Y_pred_xgb, average='binary') * 100, 2)

#  results
print(f"Best XGBoost Parameters: {grid_search_xgb.best_params_}")
print(f"Accuracy: {accuracy_xgb2} %")
print(f"Precision: {precision_xgb2} %")
print(f"Recall: {recall_xgb2} %")
print(f"F1-Score: {f1_xgb2} %")


Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best XGBoost Parameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
Accuracy: 96.75 %
Precision: 97.44 %
Recall: 96.2 %
F1-Score: 96.82 %


In [37]:
#random forest - grid search
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

param_grid_rf = {
    'n_estimators': [100, 200, 300, 500], # number of trees
    'max_depth': [None, 5, 10, 15],  # max depth of each tree
    'min_samples_split': [2, 5, 10],  # min samples to split a node
    'min_samples_leaf': [1, 2, 4], # min samples in a leaf
    'max_features': ['sqrt', 'log2', None], # features considered per split
    'bootstrap': [True, False] # use bootstrapped samples
}

grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_rf.fit(X_train, Y_train)
best_rf = grid_search_rf.best_estimator_ # best param combination
Y_pred_rf = best_rf.predict(X_test)

# eval metrics
accuracy_rf2 = round(accuracy_score(Y_test, Y_pred_rf) * 100, 2)
precision_rf2 = round(precision_score(Y_test, Y_pred_rf) * 100, 2)
recall_rf2 = round(recall_score(Y_test, Y_pred_rf) * 100, 2)
f1_rf2 = round(f1_score(Y_test, Y_pred_rf) * 100, 2)

# results
print(f"Best Random Forest Parameters: {grid_search_rf.best_params_}")
print(f"Accuracy: {accuracy_rf2} %")
print(f"Precision: {precision_rf2} %")
print(f"Recall: {recall_rf2} %")
print(f"F1-Score: {f1_rf2} %")


Fitting 5 folds for each of 864 candidates, totalling 4320 fits
Best Random Forest Parameters: {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 96.75 %
Precision: 100.0 %
Recall: 93.67 %
F1-Score: 96.73 %


In [38]:
# decision tree -grid search
dt_clf = DecisionTreeClassifier(random_state=42)

param_grid_dt = {
    'max_depth': [None, 3, 5, 7, 10], # tree depth
    'min_samples_split': [2, 5, 10], # min samples to split a node
    'min_samples_leaf': [1, 2, 4],  # min samples in a leaf
    'max_features': [None, 'sqrt', 'log2'],# features considered per split
    'criterion': ['gini', 'entropy']  # impurity measure
}


grid_search_dt = GridSearchCV(
    estimator=dt_clf,
    param_grid=param_grid_dt,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_dt.fit(X_train, Y_train)
best_dt = grid_search_dt.best_estimator_ # best param combination
Y_pred_dt = best_dt.predict(X_test)

# eval metrics
accuracy_dt2 = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
precision_dt2 = round(precision_score(Y_test, Y_pred_dt) * 100, 2)
recall_dt2 = round(recall_score(Y_test, Y_pred_dt) * 100, 2)
f1_dt2 = round(f1_score(Y_test, Y_pred_dt) * 100, 2)

# Print results
print(f"Best Decision Tree Parameters: {grid_search_dt.best_params_}")
print(f"Accuracy: {accuracy_dt2} %")
print(f"Precision: {precision_dt2} %")
print(f"Recall: {recall_dt2} %")
print(f"F1-Score: {f1_dt2} %")


Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 94.81 %
Precision: 96.1 %
Recall: 93.67 %
F1-Score: 94.87 %


In [39]:
#Phase 2 results summary

# dictionary for easy formatting into table
results = {
    "Model": [
        "Logistic Regression",
        "Support Vector Machine",
        "K-Nearest Neighbors",
        "XGBoost",
        "Random Forest",
        "Decision Tree"
    ],
    "Accuracy (%)": [accuracy_lr2, accuracy_svm2, accuracy_knn2, accuracy_xgb2, accuracy_rf2, accuracy_dt2],
    "Precision":    [precision_lr2, precision_svm2, precision_knn2, precision_xgb2, precision_rf2, precision_dt2],
    "Recall":       [recall_lr2, recall_svm2, recall_knn2, recall_xgb2, recall_rf2, recall_dt2],
    "F1 Score":     [f1_lr2, f1_svm2, f1_knn2, f1_xgb2, f1_rf2, f1_dt2]
}

results2_df = pd.DataFrame(results)
print(results2_df.to_string(index=False))

                 Model  Accuracy (%)  Precision  Recall  F1 Score
   Logistic Regression         80.52      76.34   89.87     82.56
Support Vector Machine         85.06      80.43   93.67     86.55
   K-Nearest Neighbors         96.75     100.00   93.67     96.73
               XGBoost         96.75      97.44   96.20     96.82
         Random Forest         96.75     100.00   93.67     96.73
         Decision Tree         94.81      96.10   93.67     94.87


## Training models with grid search and pchf feature selection

In [40]:
# function for implementing PCHF
from sklearn.preprocessing import StandardScaler

def pchf(X, n_features=8, n_components=4):
    # standardize features to determine variance on same scale - full dataset
    X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
    
    # calculate variances of each feature and sort to find the top
    feature_variances = X_scaled.var().sort_values(ascending=False)
    selected_features = feature_variances.head(n_features).index.tolist()
    X_top = X_scaled[selected_features].values 
    
    # find covariance matrix and eigen decomposition to find how features relate to each other
    cov_matrix = np.cov(X_top, rowvar=False)  # shape: (n_features, n_features)
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    
    # selects most informative and influential components
    sorted_idx = np.argsort(eigenvalues)[::-1]
    top_eigenvectors = eigenvectors[:, sorted_idx[:n_components]] 
    
    # completes final linear transformation
    X_transformed = np.dot(X_top, top_eigenvectors) # value mainly for visualization purpose (understand what will happen later)
    
    return X_transformed, selected_features, top_eigenvectors # X_transformed is discarded later, focus on remainng outputs


In [41]:
#apply pchf to data - recieve matrix and feautures for proper scaling
X_train_transformed, selected_features, transform_matrix = pchf( X_train, n_features=8, n_components=4
)

#scale again - for selected feaures
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_train_transformed = np.dot(X_train_scaled, transform_matrix) # transformation completed with variance assessment and selected features

X_test_scaled = scaler.transform(X_test[selected_features])
X_test_transformed = np.dot(X_test_scaled, transform_matrix) # same transformation on test data

In [42]:
#logistic regression - grid search and pchf

#lr
logreg = LogisticRegression(
    random_state=42, # for repetition
    max_iter=1000, # convergence
    class_weight='balanced' # handles imbalance (replaces SMOTE)
)

params_logreg = {
    'C': [0.1, 1, 10, 100], # inverse regularization strength
    'penalty': ['l1', 'l2'], # type of regularization
    'solver': ['liblinear']  # optimization algorithm for l1 and l2
}

grid_search_logreg = GridSearchCV(
    estimator=logreg,
    param_grid=params_logreg,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_logreg.fit(X_train_transformed, Y_train) # train on transformed
best_logreg = grid_search_logreg.best_estimator_ # best params
Y_pred_logreg = best_logreg.predict(X_test_transformed) # test on transformed

accuracy_lr3 = round(accuracy_score(Y_test, Y_pred_logreg) * 100, 2)
precision_lr3 = round(precision_score(Y_test, Y_pred_logreg) * 100, 2)
recall_lr3 = round(recall_score(Y_test, Y_pred_logreg) * 100, 2)
f1_lr3 = round(f1_score(Y_test, Y_pred_logreg) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best Logistic Regression Parameters: {grid_search_logreg.best_params_}")
print(f"Accuracy: {accuracy_lr3} %")
print(f"Precision: {precision_lr3} %")
print(f"Recall: {recall_lr3} %")
print(f"F1-Score: {f1_lr3} %")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Selected Features (PCHF): ['cp', 'slope', 'exang', 'restecg', 'fbs', 'ca', 'trestbps', 'sex']
Transformation Matrix:
[[-0.47311377  0.42565129 -0.14181724  0.23857273]
 [-0.38417389 -0.09103366 -0.40786762 -0.4491913 ]
 [ 0.56321783 -0.24161543  0.09620066  0.12826466]
 [-0.24010265 -0.36585443  0.08991927 -0.31469551]
 [ 0.18753933  0.51875556 -0.32100403 -0.00580006]
 [ 0.3740242   0.06888011 -0.33747621 -0.64119103]
 [ 0.22623174  0.55164845  0.22368756 -0.16847317]
 [ 0.16572348 -0.20034611 -0.72747641  0.4316044 ]]
Best Logistic Regression Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 77.27 %
Precision: 76.19 %
Recall: 81.01 %
F1-Score: 78.53 %


In [43]:
#svm - grid search and pchf
svm_model = svm.SVC(random_state=42, class_weight='balanced', probability=True)

params_svm = {
    'C': [0.1, 1, 10, 100], # regularization strength (larger = stricter margin)
    'kernel': ['linear', 'rbf', 'poly'], # type of kernel function
    'gamma': ['scale', 'auto'] # kernel coefficients 
}

grid_search_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=params_svm,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_svm.fit(X_train_transformed, Y_train) # train on transformed 
best_svm = grid_search_svm.best_estimator_
Y_pred_svm = best_svm.predict(X_test_transformed) # test on transformed

accuracy_svm3 = round(accuracy_score(Y_test, Y_pred_svm) * 100, 2)
precision_svm3 = round(precision_score(Y_test, Y_pred_svm) * 100, 2)
recall_svm3 = round(recall_score(Y_test, Y_pred_svm) * 100, 2)
f1_svm3 = round(f1_score(Y_test, Y_pred_svm) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best SVM Parameters: {grid_search_svm.best_params_}")
print(f"Accuracy: {accuracy_svm3} %")
print(f"Precision: {precision_svm3} %")
print(f"Recall: {recall_svm3} %")
print(f"F1-Score: {f1_svm3} %")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Selected Features (PCHF): ['cp', 'slope', 'exang', 'restecg', 'fbs', 'ca', 'trestbps', 'sex']
Transformation Matrix:
[[-0.47311377  0.42565129 -0.14181724  0.23857273]
 [-0.38417389 -0.09103366 -0.40786762 -0.4491913 ]
 [ 0.56321783 -0.24161543  0.09620066  0.12826466]
 [-0.24010265 -0.36585443  0.08991927 -0.31469551]
 [ 0.18753933  0.51875556 -0.32100403 -0.00580006]
 [ 0.3740242   0.06888011 -0.33747621 -0.64119103]
 [ 0.22623174  0.55164845  0.22368756 -0.16847317]
 [ 0.16572348 -0.20034611 -0.72747641  0.4316044 ]]
Best SVM Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Accuracy: 90.91 %
Precision: 90.12 %
Recall: 92.41 %
F1-Score: 91.25 %


In [44]:
#knn - grid search and pchf
knn_clf = KNeighborsClassifier()

params_knn = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 21], # number of nearest neighbors
    'weights': ['uniform', 'distance'], # weighted calc
    'metric': ['euclidean', 'manhattan', 'minkowski'], # distance calculation
    'p': [1, 2]  # p=1 - Manhattan, p=2 - Euclidean
}

grid_search_knn = GridSearchCV(
    estimator=knn_clf,
    param_grid=params_knn,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

# train and test
grid_search_knn.fit(X_train_transformed, Y_train) # train on tranformed
best_knn = grid_search_knn.best_estimator_ 
Y_pred_knn = best_knn.predict(X_test_transformed) # test on transformed

accuracy_knn3 = round(accuracy_score(Y_test, Y_pred_knn) * 100, 2)
precision_knn3 = round(precision_score(Y_test, Y_pred_knn) * 100, 2)
recall_knn3 = round(recall_score(Y_test, Y_pred_knn) * 100, 2)
f1_knn3 = round(f1_score(Y_test, Y_pred_knn) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best KNN Parameters: {grid_search_knn.best_params_}")
print(f"Accuracy: {accuracy_knn3} %")
print(f"Precision: {precision_knn3} %")
print(f"Recall: {recall_knn3} %")
print(f"F1-Score: {f1_knn3} %")


Fitting 5 folds for each of 96 candidates, totalling 480 fits
Selected Features (PCHF): ['cp', 'slope', 'exang', 'restecg', 'fbs', 'ca', 'trestbps', 'sex']
Transformation Matrix:
[[-0.47311377  0.42565129 -0.14181724  0.23857273]
 [-0.38417389 -0.09103366 -0.40786762 -0.4491913 ]
 [ 0.56321783 -0.24161543  0.09620066  0.12826466]
 [-0.24010265 -0.36585443  0.08991927 -0.31469551]
 [ 0.18753933  0.51875556 -0.32100403 -0.00580006]
 [ 0.3740242   0.06888011 -0.33747621 -0.64119103]
 [ 0.22623174  0.55164845  0.22368756 -0.16847317]
 [ 0.16572348 -0.20034611 -0.72747641  0.4316044 ]]
Best KNN Parameters: {'metric': 'euclidean', 'n_neighbors': 21, 'p': 1, 'weights': 'distance'}
Accuracy: 98.05 %
Precision: 100.0 %
Recall: 96.2 %
F1-Score: 98.06 %


In [45]:
#xgboost - grid search and pchf
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

params_xgb = {
    'n_estimators': [100, 200], # number of boosting trees
    'max_depth': [3, 5],   # max depth of each tree
    'learning_rate': [0.05, 0.1], # step size shrinkage for weight updates
    'subsample': [0.8, 1.0],   # fraction of samples used per tree
    'colsample_bytree': [0.8, 1.0],  # fraction of features used per tree
    'gamma': [0, 0.1], # min loss reduction required to split
    'min_child_weight': [1, 3]  # min sum of instance weights in a child
}

grid_search_xgb = GridSearchCV(
    estimator=xgb_clf,
    param_grid=params_xgb,
    cv=5,
    scoring='accuracy', # optimized metrix
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_xgb.fit(X_train_transformed, Y_train) # train on transformed data
best_xgb = grid_search_xgb.best_estimator_
Y_pred_xgb = best_xgb.predict(X_test_transformed) # test on transformed data

accuracy_xgb3 = round(accuracy_score(Y_test, Y_pred_xgb) * 100, 2)
precision_xgb3 = round(precision_score(Y_test, Y_pred_xgb) * 100, 2)
recall_xgb3 = round(recall_score(Y_test, Y_pred_xgb) * 100, 2)
f1_xgb3 = round(f1_score(Y_test, Y_pred_xgb) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best XGBoost Parameters: {grid_search_xgb.best_params_}")
print(f"Accuracy: {accuracy_xgb3} %")
print(f"Precision: {precision_xgb3} %")
print(f"Recall: {recall_xgb3} %")
print(f"F1-Score: {f1_xgb3} %")



Fitting 5 folds for each of 128 candidates, totalling 640 fits
Selected Features (PCHF): ['cp', 'slope', 'exang', 'restecg', 'fbs', 'ca', 'trestbps', 'sex']
Transformation Matrix:
[[-0.47311377  0.42565129 -0.14181724  0.23857273]
 [-0.38417389 -0.09103366 -0.40786762 -0.4491913 ]
 [ 0.56321783 -0.24161543  0.09620066  0.12826466]
 [-0.24010265 -0.36585443  0.08991927 -0.31469551]
 [ 0.18753933  0.51875556 -0.32100403 -0.00580006]
 [ 0.3740242   0.06888011 -0.33747621 -0.64119103]
 [ 0.22623174  0.55164845  0.22368756 -0.16847317]
 [ 0.16572348 -0.20034611 -0.72747641  0.4316044 ]]
Best XGBoost Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
Accuracy: 98.05 %
Precision: 100.0 %
Recall: 96.2 %
F1-Score: 98.06 %


In [46]:
#random forest - grid search and pchf
rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')

params_rf = {
    'n_estimators': [100, 200, 300], # num trees in forest
    'max_depth': [None, 5, 10, 15], # max depth of tree
    'min_samples_split': [2, 5, 10], # min samples required to split a node
    'min_samples_leaf': [1, 2, 4], # min samples required at a leaf node
    'max_features': ['sqrt', 'log2', None]  # num features considered for each split
}

grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=params_rf,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_rf.fit(X_train_transformed, Y_train) # training on transformed data
best_rf = grid_search_rf.best_estimator_
Y_pred_rf = best_rf.predict(X_test_transformed) # test on transformed 

accuracy_rf3 = round(accuracy_score(Y_test, Y_pred_rf) * 100, 2)
precision_rf3 = round(precision_score(Y_test, Y_pred_rf) * 100, 2)
recall_rf3 = round(recall_score(Y_test, Y_pred_rf) * 100, 2)
f1_rf3 = round(f1_score(Y_test, Y_pred_rf) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best Random Forest Parameters: {grid_search_rf.best_params_}")
print(f"Accuracy: {accuracy_rf3} %")
print(f"Precision: {precision_rf3} %")
print(f"Recall: {recall_rf3} %")
print(f"F1-Score: {f1_rf3} %")


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Selected Features (PCHF): ['cp', 'slope', 'exang', 'restecg', 'fbs', 'ca', 'trestbps', 'sex']
Transformation Matrix:
[[-0.47311377  0.42565129 -0.14181724  0.23857273]
 [-0.38417389 -0.09103366 -0.40786762 -0.4491913 ]
 [ 0.56321783 -0.24161543  0.09620066  0.12826466]
 [-0.24010265 -0.36585443  0.08991927 -0.31469551]
 [ 0.18753933  0.51875556 -0.32100403 -0.00580006]
 [ 0.3740242   0.06888011 -0.33747621 -0.64119103]
 [ 0.22623174  0.55164845  0.22368756 -0.16847317]
 [ 0.16572348 -0.20034611 -0.72747641  0.4316044 ]]
Best Random Forest Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Accuracy: 98.05 %
Precision: 100.0 %
Recall: 96.2 %
F1-Score: 98.06 %


In [47]:
#decision tree - grid search and pchf
dt_clf = DecisionTreeClassifier(random_state=42, class_weight='balanced')

params_dt = {
    'max_depth': [None, 5, 10, 15], # max depth of tree
    'min_samples_split': [2, 5, 10],  # min samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # min samples required at a leaf node
    'max_features': [None, 'sqrt', 'log2'], # num features considered for each split
    'criterion': ['gini', 'entropy']  # function to measure split quality
}


grid_search_dt = GridSearchCV(
    estimator=dt_clf,
    param_grid=params_dt,
    cv=5,
    scoring='accuracy', # optimization metric
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_dt.fit(X_train_transformed, Y_train) # train on transformed
best_dt = grid_search_dt.best_estimator_
Y_pred_dt = best_dt.predict(X_test_transformed) # test on transformed

accuracy_dt3 = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
precision_dt3 = round(precision_score(Y_test, Y_pred_dt) * 100, 2)
recall_dt3 = round(recall_score(Y_test, Y_pred_dt) * 100, 2)
f1_dt3 = round(f1_score(Y_test, Y_pred_dt) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best Decision Tree Parameters: {grid_search_dt.best_params_}")
print(f"Accuracy: {accuracy_dt3} %")
print(f"Precision: {precision_dt3} %")
print(f"Recall: {recall_dt3} %")
print(f"F1-Score: {f1_dt3} %")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Selected Features (PCHF): ['cp', 'slope', 'exang', 'restecg', 'fbs', 'ca', 'trestbps', 'sex']
Transformation Matrix:
[[-0.47311377  0.42565129 -0.14181724  0.23857273]
 [-0.38417389 -0.09103366 -0.40786762 -0.4491913 ]
 [ 0.56321783 -0.24161543  0.09620066  0.12826466]
 [-0.24010265 -0.36585443  0.08991927 -0.31469551]
 [ 0.18753933  0.51875556 -0.32100403 -0.00580006]
 [ 0.3740242   0.06888011 -0.33747621 -0.64119103]
 [ 0.22623174  0.55164845  0.22368756 -0.16847317]
 [ 0.16572348 -0.20034611 -0.72747641  0.4316044 ]]
Best Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 98.05 %
Precision: 100.0 %
Recall: 96.2 %
F1-Score: 98.06 %


In [48]:
#Phase 3 results summary

# dictionary for easy formatting into table
results3 = {
    "Model": [
        "Logistic Regression",
        "Support Vector Machine",
        "K-Nearest Neighbors",
        "XGBoost",
        "Random Forest",
        "Decision Tree"
    ],
    "Accuracy (%)": [accuracy_lr3, accuracy_svm3, accuracy_knn3, accuracy_xgb3, accuracy_rf3, accuracy_dt3],
    "Precision":    [precision_lr3, precision_svm3, precision_knn3, precision_xgb3, precision_rf3, precision_dt3],
    "Recall":       [recall_lr3, recall_svm3, recall_knn3, recall_xgb3, recall_rf3, recall_dt3],
    "F1 Score":     [f1_lr3, f1_svm3, f1_knn3, f1_xgb3, f1_rf3, f1_dt3]
}

results3_df = pd.DataFrame(results3)
print(results3_df.to_string(index=False))

                 Model  Accuracy (%)  Precision  Recall  F1 Score
   Logistic Regression         77.27      76.19   81.01     78.53
Support Vector Machine         90.91      90.12   92.41     91.25
   K-Nearest Neighbors         98.05     100.00   96.20     98.06
               XGBoost         98.05     100.00   96.20     98.06
         Random Forest         98.05     100.00   96.20     98.06
         Decision Tree         98.05     100.00   96.20     98.06
