# Heart Disease Prediction - Group 14

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline

import os
print(os.listdir())

import warnings
warnings.filterwarnings('ignore')

['.git', '.vscode', 'dataset-numerical.csv', 'heartprediction.ipynb', 'README.md', 'venv', '__pycache__']


In [3]:
dataset = pd.read_csv("dataset-numerical.csv")

In [4]:
#dataset eda
print(dataset.shape)
dataset.describe()

(1025, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [5]:
#total columns and count
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [6]:
#check correlation betweem columns
print(dataset.corr()["target"].abs().sort_values(ascending=False))

target      1.000000
oldpeak     0.438441
exang       0.438029
cp          0.434854
thalach     0.422895
ca          0.382085
slope       0.345512
thal        0.337838
sex         0.279501
age         0.229324
trestbps    0.138772
restecg     0.134468
chol        0.099966
fbs         0.041164
Name: target, dtype: float64


In [8]:
# train test split - 70% train 15% test 15% validation
predictors = dataset.drop("target", axis=1)
target = dataset["target"]

# first - 70% train 30% test
X_train, X_temp, Y_train, Y_temp = train_test_split(
    predictors, target,
    test_size=0.30,
    random_state=42,
    stratify=target
)

# second: val and test
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp,
    test_size=0.50,      # half of 30% = 15%
    random_state=42,
    stratify=Y_temp
)

#for confirmation
print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))


Train size: 717
Validation size: 154
Test size: 154


## Training/testing models without GridSearchCV - parent paper results

In [12]:
#Logistic Regression
lr = LogisticRegression(max_iter=1000)

# train and predict
lr.fit(X_train, Y_train)
Y_pred_lr = lr.predict(X_test)

# eval metrics
accuracy_lr = round(accuracy_score(Y_test, Y_pred_lr) * 100, 2)
precision_lr = round(precision_score(Y_test, Y_pred_lr) * 100, 2)
recall_lr = round(recall_score(Y_test, Y_pred_lr) * 100, 2)
f1_lr = round(f1_score(Y_test, Y_pred_lr) * 100, 2)

print("Logistic Regression Metrics:")
print("Accuracy:  ", accuracy_lr, "%")
print("Precision: ", precision_lr, "%")
print("Recall:    ", recall_lr, "%")
print("F1 Score:  ", f1_lr, "%")


Logistic Regression Metrics:
Accuracy:   80.52 %
Precision:  76.34 %
Recall:     89.87 %
F1 Score:   82.56 %


In [13]:
#SVM
sv = svm.SVC(kernel='linear')

# train and test
sv.fit(X_train, Y_train)
Y_pred_svm = sv.predict(X_test)

#eval metrics
accuracy_svm = round(accuracy_score(Y_test, Y_pred_svm) * 100, 2)
precision_svm = round(precision_score(Y_test, Y_pred_svm) * 100, 2)
recall_svm = round(recall_score(Y_test, Y_pred_svm) * 100, 2)
f1_svm = round(f1_score(Y_test, Y_pred_svm) * 100, 2)

print("SVM metrics")
print("Accuracy:  ", accuracy_svm, "%")
print("Precision: ", precision_svm, "%")
print("Recall:    ", recall_svm, "%")
print("F1 Score:  ", f1_svm, "%")


SVM metrics
Accuracy:   85.06 %
Precision:  80.43 %
Recall:     93.67 %
F1 Score:   86.55 %


In [15]:
#KNN
knn = KNeighborsClassifier(n_neighbors=7)

#train and test
knn.fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)

#metrics
accuracy_knn = round(accuracy_score(Y_test, Y_pred_knn) * 100, 2)
precision_knn = round(precision_score(Y_test, Y_pred_knn) * 100, 2)
recall_knn = round(recall_score(Y_test, Y_pred_knn) * 100, 2)
f1_knn = round(f1_score(Y_test, Y_pred_knn) * 100, 2)

print("KNN eval")
print("Accuracy:  ", accuracy_knn, "%")
print("Precision: ", precision_knn, "%")
print("Recall:    ", recall_knn, "%")
print("F1 Score:  ", f1_knn, "%")


KNN eval
Accuracy:   75.97 %
Precision:  76.25 %
Recall:     77.22 %
F1 Score:   76.73 %


In [16]:
#XGBoost
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

#train and test
xgb_model.fit(X_train, Y_train)
Y_pred_xgb = xgb_model.predict(X_test)

# eval metrics
accuracy_xgb = round(accuracy_score(Y_test, Y_pred_xgb) * 100, 2)
precision_xgb = round(precision_score(Y_test, Y_pred_xgb) * 100, 2)
recall_xgb = round(recall_score(Y_test, Y_pred_xgb) * 100, 2)
f1_xgb = round(f1_score(Y_test, Y_pred_xgb) * 100, 2)

print("XGBoost metrics:")
print("Accuracy:  ", accuracy_xgb, "%")
print("Precision: ", precision_xgb, "%")
print("Recall:    ", recall_xgb, "%")
print("F1 Score:  ", f1_xgb, "%")


XGBoost metrics:
Accuracy:   98.05 %
Precision:  100.0 %
Recall:     96.2 %
F1 Score:   98.06 %


In [17]:
# Random Forest
rf = RandomForestClassifier(random_state=42)

# Train and test
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)

# eval metrics
accuracy_rf = round(accuracy_score(Y_test, Y_pred_rf) * 100, 2)
precision_rf = round(precision_score(Y_test, Y_pred_rf) * 100, 2)
recall_rf = round(recall_score(Y_test, Y_pred_rf) * 100, 2)
f1_rf = round(f1_score(Y_test, Y_pred_rf) * 100, 2)

print("Random Forest eval:")
print("Accuracy:  ", accuracy_rf, "%")
print("Precision: ", precision_rf, "%")
print("Recall:    ", recall_rf, "%")
print("F1 Score:  ", f1_rf, "%")


Random Forest eval:
Accuracy:   96.75 %
Precision:  100.0 %
Recall:     93.67 %
F1 Score:   96.73 %


In [18]:
# Decision Tree
dt = DecisionTreeClassifier(random_state=42)

#train and test
dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)

# Metrics
accuracy_dt = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
precision_dt = round(precision_score(Y_test, Y_pred_dt) * 100, 2)
recall_dt = round(recall_score(Y_test, Y_pred_dt) * 100, 2)
f1_dt = round(f1_score(Y_test, Y_pred_dt) * 100, 2)

print("Decision Tree eval:")
print("Accuracy:  ", accuracy_dt, "%")
print("Precision: ", precision_dt, "%")
print("Recall:    ", recall_dt, "%")
print("F1 Score:  ", f1_dt, "%")


Decision Tree eval:
Accuracy:   98.05 %
Precision:  100.0 %
Recall:     96.2 %
F1 Score:   98.06 %


In [19]:
#Phase 1 results summary
results = {
    "Model": [
        "Logistic Regression",
        "Support Vector Machine",
        "K-Nearest Neighbors",
        "XGBoost",
        "Random Forest",
        "Decision Tree"
    ],
    "Accuracy (%)": [accuracy_lr, accuracy_svm, accuracy_knn, accuracy_xgb, accuracy_rf, accuracy_dt],
    "Precision":    [precision_lr, precision_svm, precision_knn, precision_xgb, precision_rf, precision_dt],
    "Recall":       [recall_lr, recall_svm, recall_knn, recall_xgb, recall_rf, recall_dt],
    "F1 Score":     [f1_lr, f1_svm, f1_knn, f1_xgb, f1_rf, f1_dt]
}

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))


                 Model  Accuracy (%)  Precision  Recall  F1 Score
   Logistic Regression         80.52      76.34   89.87     82.56
Support Vector Machine         85.06      80.43   93.67     86.55
   K-Nearest Neighbors         75.97      76.25   77.22     76.73
               XGBoost         98.05     100.00   96.20     98.06
         Random Forest         96.75     100.00   93.67     96.73
         Decision Tree         98.05     100.00   96.20     98.06


## Training/testing models with GridSearch CV - parent paper

In [22]:
# logistic regresson - grid search
lr = LogisticRegression(max_iter=1000)

params_lr = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
    'class_weight': [None, 'balanced']
}

grid_search_lr = GridSearchCV(
    estimator=lr,
    param_grid=params_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_lr.fit(X_train, Y_train)
best_lr = grid_search_lr.best_estimator_
Y_pred_lr = best_lr.predict(X_test)

#eval metrics
accuracy_lr  = round(accuracy_score(Y_test, Y_pred_lr) * 100, 2)
precision_lr = round(precision_score(Y_test, Y_pred_lr)* 100, 2)
recall_lr = round(recall_score(Y_test, Y_pred_lr) * 100, 2)
f1_lr = round(f1_score(Y_test, Y_pred_lr) * 100, 2)

print("Logistic Regression eval:")
print("Accuracy: ", accuracy_lr)
print("Precision: ", precision_lr)
print("Recall: ", recall_lr)
print("F1 Score: ", f1_lr)
print("Best Params:", grid_search_lr.best_params_)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Logistic Regression eval:
Accuracy:  80.52
Precision:  76.34
Recall:  89.87
F1 Score:  82.56
Best Params: {'C': 10, 'class_weight': None, 'l1_ratio': 0, 'penalty': 'l1', 'solver': 'liblinear'}


In [None]:
#svm - grid search
svm = svm.SVC(kernel='linear')

# smaller grid for faster search
params_svm = {
    'C': [0.1, 1, 10, 100],
    'class_weight': [None, 'balanced']
}

grid_search_svm = GridSearchCV(
    estimator=sv,
    param_grid=params_svm,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# train and test
grid_search_svm.fit(X_train, Y_train)
best_svm = grid_search_svm.best_estimator_
Y_pred_svm = best_svm.predict(X_test)

# eval metrics
accuracy_svm  = round(accuracy_score(Y_test, Y_pred_svm) * 100, 2)
precision_svm = precision_score(Y_test, Y_pred_svm)
recall_svm = recall_score(Y_test, Y_pred_svm)
f1_svm = f1_score(Y_test, Y_pred_svm)

print("Linear SVM eval results:")
print("Accuracy: ", accuracy_svm)
print("Precision: ", precision_svm)
print("Recall: ", recall_svm)
print("F1 Score: ", f1_svm)
print("Best Params:", grid_search_svm.best_params_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Linear SVM Performance (fast GridSearch):
 Accuracy:   85.71
 Precision:  0.9014084507042254
 Recall:     0.810126582278481
 F1 Score:   0.8533333333333334
Best Params: {'C': 10, 'class_weight': 'balanced'}


In [None]:
# knn - grid search
knn = KNeighborsClassifier()

params_knn = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2]
}

grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=params_knn,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_knn.fit(X_train, Y_train)
best_knn = grid_search_knn.best_estimator_
Y_pred_knn = best_knn.predict(X_test)

# eval metrics
accuracy = round(accuracy_score(Y_test, Y_pred_knn) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_knn, average='binary') * 100, 2)
recall = round(recall_score(Y_test, Y_pred_knn, average='binary') * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_knn, average='binary') * 100, 2)

# results
print(f"Best KNN Parameters: {grid_search_knn.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")


Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best KNN Parameters: {'metric': 'euclidean', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
Accuracy: 99.35 %
Precision: 100.0 %
Recall: 98.73 %
F1-Score: 99.36 %


In [None]:
#xgboost - grid search
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

param_grid_fast = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1],
    'min_child_weight': [1, 3]
}

grid_search_xgb = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid_fast,
    cv=5,               # 3-fold CV for speed
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# train and test
grid_search_xgb.fit(X_train, Y_train)
best_xgb = grid_search_xgb.best_estimator_
Y_pred_xgb = best_xgb.predict(X_test)

# eval metrics
accuracy = round(accuracy_score(Y_test, Y_pred_xgb) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_xgb, average='binary') * 100, 2)
recall = round(recall_score(Y_test, Y_pred_xgb, average='binary') * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_xgb, average='binary') * 100, 2)

#  results
print(f"Best XGBoost Parameters: {grid_search_xgb.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")


Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best XGBoost Parameters: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1.0}
Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1-Score: 100.0 %


In [None]:
#random forest - grid search
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

param_grid_rf = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_rf.fit(X_train, Y_train)
best_rf = grid_search_rf.best_estimator_
Y_pred_rf = best_rf.predict(X_test)

# eval metrics
accuracy = round(accuracy_score(Y_test, Y_pred_rf) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_rf) * 100, 2)
recall = round(recall_score(Y_test, Y_pred_rf) * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_rf) * 100, 2)

# results
print(f"Best Random Forest Parameters: {grid_search_rf.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")


Fitting 5 folds for each of 864 candidates, totalling 4320 fits
Best Random Forest Parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1-Score: 100.0 %


In [23]:
# decision tree -grid search
dt_clf = DecisionTreeClassifier(random_state=42)

param_grid_dt = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4], 
    'max_features': [None, 'sqrt', 'log2'], 
    'criterion': ['gini', 'entropy']   
}

grid_search_dt = GridSearchCV(
    estimator=dt_clf,
    param_grid=param_grid_dt,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_dt.fit(X_train, Y_train)
best_dt = grid_search_dt.best_estimator_
Y_pred_dt = best_dt.predict(X_test)

# eval metrics
accuracy = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_dt) * 100, 2)
recall = round(recall_score(Y_test, Y_pred_dt) * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_dt) * 100, 2)

# Print results
print(f"Best Decision Tree Parameters: {grid_search_dt.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")


Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 94.81 %
Precision: 96.1 %
Recall: 93.67 %
F1-Score: 94.87 %


In [None]:
from sklearn.preprocessing import StandardScaler

def pchf(X, n_features=8, n_components=4):
    # standardize features to determine variance on same scale
    X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
    
    # calculate variances of each feature and sort to find the top
    feature_variances = X_scaled.var().sort_values(ascending=False)
    selected_features = feature_variances.head(n_features).index.tolist()
    X_top = X_scaled[selected_features].values 
    
    # find covariance matrix and eigen decomposition to find how features relate to each other
    cov_matrix = np.cov(X_top, rowvar=False)  # shape: (n_features, n_features)
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    
    # selects most informative and influential components
    sorted_idx = np.argsort(eigenvalues)[::-1]
    top_eigenvectors = eigenvectors[:, sorted_idx[:n_components]] 
    
    # completes final linear transformation
    X_transformed = np.dot(X_top, top_eigenvectors)
    
    return X_transformed, selected_features, top_eigenvectors


## Training models with grid search and pchf feature selection

In [None]:
#logistic regression - grid search and pchf

#apply pchf to data
X_train_transformed, selected_features, transform_matrix = pchf(
    X_train, n_features=8, n_components=4
)

#scale again
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_train_transformed = np.dot(X_train_scaled, transform_matrix)

X_test_scaled = scaler.transform(X_test[selected_features])
X_test_transformed = np.dot(X_test_scaled, transform_matrix)

#lr
logreg = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced'   # handles imbalance (replaces SMOTE)
)

params_logreg = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']   # supports both l1/l2
}

grid_search_logreg = GridSearchCV(
    estimator=logreg,
    param_grid=params_logreg,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_logreg.fit(X_train_transformed, Y_train)
best_logreg = grid_search_logreg.best_estimator_
Y_pred_logreg = best_logreg.predict(X_test_transformed)

accuracy = round(accuracy_score(Y_test, Y_pred_logreg) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_logreg) * 100, 2)
recall = round(recall_score(Y_test, Y_pred_logreg) * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_logreg) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best Logistic Regression Parameters: {grid_search_logreg.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Selected Features (PCHF): ['age', 'sex', 'cp', 'exang', 'trestbps', 'chol', 'fbs', 'restecg']
Transformation Matrix:
[[-0.52551453 -0.10975553  0.08932326  0.1176242 ]
 [ 0.13309368  0.21390126 -0.69532703 -0.19115739]
 [ 0.24581068 -0.63826144 -0.08447766 -0.04216228]
 [-0.30362653  0.58295259 -0.2008337   0.19145052]
 [-0.44851092 -0.24520462 -0.14946319  0.22930878]
 [-0.45741475 -0.04924953  0.35090545 -0.24018757]
 [-0.19617993 -0.32821366 -0.46657595  0.48354396]
 [ 0.32346826  0.15720548  0.3128962   0.75306381]]
Best Logistic Regression Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 77.27 %
Precision: 75.58 %
Recall: 82.28 %
F1-Score: 78.79 %
Confusion Matrix:
 [[54 21]
 [14 65]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.72      0.76        75
           1       0.76      0.82      0.79        79

    accuracy                

In [None]:
#svm - grid search and pchf

X_train_transformed, selected_features, transform_matrix = pchf(X_train, n_features=8, n_components=4)
X_test_transformed = np.dot(StandardScaler().fit_transform(X_test[selected_features]), transform_matrix)

# svm
svm_model = svm.SVC(random_state=42, class_weight='balanced', probability=True)

params_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

grid_search_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=params_svm,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_svm.fit(X_train_transformed, Y_train)
best_svm = grid_search_svm.best_estimator_
Y_pred_svm = best_svm.predict(X_test_transformed)

accuracy = round(accuracy_score(Y_test, Y_pred_svm) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_svm) * 100, 2)
recall = round(recall_score(Y_test, Y_pred_svm) * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_svm) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best SVM Parameters: {grid_search_svm.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Selected Features (PCHF): ['age', 'sex', 'cp', 'exang', 'trestbps', 'chol', 'fbs', 'restecg']
Transformation Matrix:
[[-0.52551453 -0.10975553  0.08932326  0.1176242 ]
 [ 0.13309368  0.21390126 -0.69532703 -0.19115739]
 [ 0.24581068 -0.63826144 -0.08447766 -0.04216228]
 [-0.30362653  0.58295259 -0.2008337   0.19145052]
 [-0.44851092 -0.24520462 -0.14946319  0.22930878]
 [-0.45741475 -0.04924953  0.35090545 -0.24018757]
 [-0.19617993 -0.32821366 -0.46657595  0.48354396]
 [ 0.32346826  0.15720548  0.3128962   0.75306381]]
Best SVM Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Accuracy: 87.66 %
Precision: 88.46 %
Recall: 87.34 %
F1-Score: 87.9 %
Confusion Matrix:
 [[66  9]
 [10 69]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.87        75
           1       0.88      0.87      0.88        79

    accuracy                           0.88      

In [None]:
#knn - grid search and pchf
X_train_transformed, selected_features, transform_matrix = pchf(X_train, n_features=8, n_components=4)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_train_transformed = np.dot(X_train_scaled, transform_matrix)

X_test_scaled = scaler.transform(X_test[selected_features])
X_test_transformed = np.dot(X_test_scaled, transform_matrix)

knn_clf = KNeighborsClassifier()

params_knn = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2]  # p=1 (Manhattan), p=2 (Euclidean)
}

grid_search_knn = GridSearchCV(
    estimator=knn_clf,
    param_grid=params_knn,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# train and test
grid_search_knn.fit(X_train_transformed, Y_train)
best_knn = grid_search_knn.best_estimator_
Y_pred_knn = best_knn.predict(X_test_transformed)

accuracy = round(accuracy_score(Y_test, Y_pred_knn) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_knn) * 100, 2)
recall = round(recall_score(Y_test, Y_pred_knn) * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_knn) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best KNN Parameters: {grid_search_knn.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")


Fitting 5 folds for each of 96 candidates, totalling 480 fits
Selected Features (PCHF): ['age', 'sex', 'cp', 'exang', 'trestbps', 'chol', 'fbs', 'restecg']
Transformation Matrix:
[[-0.52551453 -0.10975553  0.08932326  0.1176242 ]
 [ 0.13309368  0.21390126 -0.69532703 -0.19115739]
 [ 0.24581068 -0.63826144 -0.08447766 -0.04216228]
 [-0.30362653  0.58295259 -0.2008337   0.19145052]
 [-0.44851092 -0.24520462 -0.14946319  0.22930878]
 [-0.45741475 -0.04924953  0.35090545 -0.24018757]
 [-0.19617993 -0.32821366 -0.46657595  0.48354396]
 [ 0.32346826  0.15720548  0.3128962   0.75306381]]
Best KNN Parameters: {'metric': 'euclidean', 'n_neighbors': 21, 'p': 1, 'weights': 'distance'}
Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1-Score: 100.0 %
Confusion Matrix:
 [[75  0]
 [ 0 79]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        75
           1       1.00      1.00      1.00        79

    accuracy       

In [None]:
#xgboost - grid search and pchf
X_train_transformed, selected_features, transform_matrix = pchf(X_train, n_features=8, n_components=4)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_train_transformed = np.dot(X_train_scaled, transform_matrix)

X_test_scaled = scaler.transform(X_test[selected_features])
X_test_transformed = np.dot(X_test_scaled, transform_matrix)

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

params_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1],
    'min_child_weight': [1, 3]
}

grid_search_xgb = GridSearchCV(
    estimator=xgb_clf,
    param_grid=params_xgb,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_xgb.fit(X_train_transformed, Y_train)
best_xgb = grid_search_xgb.best_estimator_
Y_pred_xgb = best_xgb.predict(X_test_transformed)

accuracy = round(accuracy_score(Y_test, Y_pred_xgb) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_xgb) * 100, 2)
recall = round(recall_score(Y_test, Y_pred_xgb) * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_xgb) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best XGBoost Parameters: {grid_search_xgb.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")



Fitting 5 folds for each of 128 candidates, totalling 640 fits
Selected Features (PCHF): ['age', 'sex', 'cp', 'exang', 'trestbps', 'chol', 'fbs', 'restecg']
Transformation Matrix:
[[-0.52551453 -0.10975553  0.08932326  0.1176242 ]
 [ 0.13309368  0.21390126 -0.69532703 -0.19115739]
 [ 0.24581068 -0.63826144 -0.08447766 -0.04216228]
 [-0.30362653  0.58295259 -0.2008337   0.19145052]
 [-0.44851092 -0.24520462 -0.14946319  0.22930878]
 [-0.45741475 -0.04924953  0.35090545 -0.24018757]
 [-0.19617993 -0.32821366 -0.46657595  0.48354396]
 [ 0.32346826  0.15720548  0.3128962   0.75306381]]
Best XGBoost Parameters: {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
Accuracy: 98.05 %
Precision: 100.0 %
Recall: 96.2 %
F1-Score: 98.06 %
Confusion Matrix:
 [[75  0]
 [ 3 76]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        75
    

In [None]:
#random forest - grid search and pchf
X_train_transformed, selected_features, transform_matrix = pchf(X_train, n_features=8, n_components=4)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_train_transformed = np.dot(X_train_scaled, transform_matrix)

X_test_scaled = scaler.transform(X_test[selected_features])
X_test_transformed = np.dot(X_test_scaled, transform_matrix)

rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')

params_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=params_rf,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_rf.fit(X_train_transformed, Y_train)
best_rf = grid_search_rf.best_estimator_
Y_pred_rf = best_rf.predict(X_test_transformed)

accuracy = round(accuracy_score(Y_test, Y_pred_rf) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_rf) * 100, 2)
recall = round(recall_score(Y_test, Y_pred_rf) * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_rf) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best Random Forest Parameters: {grid_search_rf.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Selected Features (PCHF): ['age', 'sex', 'cp', 'exang', 'trestbps', 'chol', 'fbs', 'restecg']
Transformation Matrix:
[[-0.52551453 -0.10975553  0.08932326  0.1176242 ]
 [ 0.13309368  0.21390126 -0.69532703 -0.19115739]
 [ 0.24581068 -0.63826144 -0.08447766 -0.04216228]
 [-0.30362653  0.58295259 -0.2008337   0.19145052]
 [-0.44851092 -0.24520462 -0.14946319  0.22930878]
 [-0.45741475 -0.04924953  0.35090545 -0.24018757]
 [-0.19617993 -0.32821366 -0.46657595  0.48354396]
 [ 0.32346826  0.15720548  0.3128962   0.75306381]]
Best Random Forest Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1-Score: 100.0 %
Confusion Matrix:
 [[75  0]
 [ 0 79]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        75
           1       1.00 

In [None]:
#decision tree - grid search and pchf
X_train_transformed, selected_features, transform_matrix = pchf(X_train, n_features=8, n_components=4)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_train_transformed = np.dot(X_train_scaled, transform_matrix)

X_test_scaled = scaler.transform(X_test[selected_features])
X_test_transformed = np.dot(X_test_scaled, transform_matrix)

dt_clf = DecisionTreeClassifier(random_state=42, class_weight='balanced')

params_dt = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

grid_search_dt = GridSearchCV(
    estimator=dt_clf,
    param_grid=params_dt,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

#train and test
grid_search_dt.fit(X_train_transformed, Y_train)
best_dt = grid_search_dt.best_estimator_
Y_pred_dt = best_dt.predict(X_test_transformed)

accuracy = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
precision = round(precision_score(Y_test, Y_pred_dt) * 100, 2)
recall = round(recall_score(Y_test, Y_pred_dt) * 100, 2)
f1 = round(f1_score(Y_test, Y_pred_dt) * 100, 2)

print(f"Selected Features (PCHF): {selected_features}")
print(f"Transformation Matrix:\n{transform_matrix}")
print(f"Best Decision Tree Parameters: {grid_search_dt.best_params_}")
print(f"Accuracy: {accuracy} %")
print(f"Precision: {precision} %")
print(f"Recall: {recall} %")
print(f"F1-Score: {f1} %")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Selected Features (PCHF): ['age', 'sex', 'cp', 'exang', 'trestbps', 'chol', 'fbs', 'restecg']
Transformation Matrix:
[[-0.52551453 -0.10975553  0.08932326  0.1176242 ]
 [ 0.13309368  0.21390126 -0.69532703 -0.19115739]
 [ 0.24581068 -0.63826144 -0.08447766 -0.04216228]
 [-0.30362653  0.58295259 -0.2008337   0.19145052]
 [-0.44851092 -0.24520462 -0.14946319  0.22930878]
 [-0.45741475 -0.04924953  0.35090545 -0.24018757]
 [-0.19617993 -0.32821366 -0.46657595  0.48354396]
 [ 0.32346826  0.15720548  0.3128962   0.75306381]]
Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1-Score: 100.0 %
Confusion Matrix:
 [[75  0]
 [ 0 79]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        75
           1       1.00   