In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
# Load data from https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset?resource=download&select=heart.csv
df = pd.read_csv('heart.csv')
df.describe()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [3]:
# One hot encoding (for linear classifier)
df = pd.get_dummies(df, columns=['caa', 'cp', 'restecg'])

# Get targets
y_all = df['output'].to_numpy()

# Standardize the rest
df = (df - df.mean()) / df.std()

df.describe()

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,slp,thall,...,caa_2,caa_3,caa_4,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,...,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,7.035077000000001e-17,-4.690051e-17,-6.683323e-16,-8.207589000000001e-17,8.207589000000001e-17,-5.862564e-16,-2.3450260000000003e-17,-4.690051e-17,-7.035077000000001e-17,-1.87602e-16,...,0.0,9.380102e-17,7.035077000000001e-17,-7.035077000000001e-17,-5.862564e-17,-2.3450260000000003e-17,-5.276307e-17,4.690051e-17,1.055261e-16,-2.3450260000000003e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.793003,-1.465992,-2.145254,-2.320322,-0.4169448,-3.433587,-0.69548,-0.8953805,-2.270822,-3.778573,...,-0.378052,-0.2654018,-0.129318,-0.9438222,-0.44382,-0.6335996,-0.2861324,-0.9691222,-1.001649,-0.115472
25%,-0.7560295,-1.465992,-0.6627704,-0.6803688,-0.4169448,-0.7049444,-0.69548,-0.8953805,-0.6480412,-0.5120748,...,-0.378052,-0.2654018,-0.129318,-0.9438222,-0.44382,-0.6335996,-0.2861324,-0.9691222,-1.001649,-0.115472
50%,0.06977057,0.6798805,-0.09258463,-0.1208554,-0.4169448,0.1463921,-0.69548,-0.2063639,-0.6480412,-0.5120748,...,-0.378052,-0.2654018,-0.129318,-0.9438222,-0.44382,-0.6335996,-0.2861324,-0.9691222,0.995059,-0.115472
75%,0.7304107,0.6798805,0.4776012,0.5447726,-0.4169448,0.7139498,1.43311,0.4826527,0.9747397,1.121174,...,-0.378052,-0.2654018,-0.129318,1.056025,-0.44382,1.573075,-0.2861324,1.028456,0.995059,-0.115472
max,2.492118,0.6798805,3.898716,6.13026,2.390484,2.285648,1.43311,4.444498,0.9747397,1.121174,...,2.636412,3.755436,7.707354,1.056025,2.245729,1.573075,3.483351,1.028456,0.995059,8.631529


In [4]:
x_all = df.drop(columns=['output']).to_numpy()

# Split data to train and test
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=42)

In [5]:
# Train basic models with default parameters
models = [LogisticRegression(max_iter=5000), RandomForestClassifier(), SVC(), GaussianProcessClassifier(), KNeighborsClassifier()]

for model in models:
    model.fit(x_train, y_train)
    print(model.__class__.__name__, model.score(x_test, y_test))

LogisticRegression 0.8524590163934426
RandomForestClassifier 0.8688524590163934
SVC 0.819672131147541
GaussianProcessClassifier 0.819672131147541
KNeighborsClassifier 0.8360655737704918
