In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session: no category or module
# filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides warnings raised by pandas / scikit-learn that
# may point at real problems — consider narrowing it to specific categories.
warnings.filterwarnings(action = 'ignore')

In [2]:
# Project root and the location of the heart-attack CSV beneath it.
ROOT_PATH = '/home/jovyan/TIL'
DATA_PATH = ROOT_PATH + '/dataset/heart_attack/heart.csv'

In [3]:
# Read the raw dataset into a DataFrame and preview its last five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.tail(n=5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [4]:
# Number of distinct (non-null) values in every column, shown as a one-column
# table indexed by column name. `nunique()` replaces the hand-written loop and
# avoids binding the name `dict`, which shadowed the builtin for the rest of
# the kernel session.
unique_counts = df.nunique().to_frame(name='unique count')
unique_counts

Unnamed: 0,unique count
age,41
sex,2
cp,4
trtbps,49
chol,152
fbs,2
restecg,3
thalachh,91
exng,2
oldpeak,40


In [5]:
# Column groups used by the later cells.
# cat_cols   : columns that get one-hot encoded (order is kept as-is because
#              it determines the order of the generated dummy columns).
# con_cols   : numeric columns that get summarised and scaled.
# target_col : the label column.
cat_cols = [
    'sex', 'exng', 'caa', 'cp',
    'fbs', 'restecg', 'slp', 'thall',
]
con_cols = [
    'age', 'trtbps', 'chol', 'thalachh', 'oldpeak',
]
target_col = ['output']

In [6]:
# Summary statistics of the continuous columns, one row per column.
df[con_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.366337,9.082101,29.0,47.5,55.0,61.0,77.0
trtbps,303.0,131.623762,17.538143,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.264026,51.830751,126.0,211.0,240.0,274.5,564.0
thalachh,303.0,149.646865,22.905161,71.0,133.5,153.0,166.0,202.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


In [7]:
from sklearn.metrics import accuracy_score, classification_report, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import RobustScaler

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import torch.nn as nn
import torch

In [8]:
# One-hot encode the categorical columns (dropping the first level of each),
# leaving the original `df` untouched.
df_cp = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Separate the label from the features; `y` is kept as a one-column DataFrame.
y = df_cp[['output']]
x = df_cp.drop(columns=['output'])

# Rescale only the continuous columns; the dummy columns are left as they are.
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# the next cell, so test-set statistics leak into the features — consider
# fitting on the training rows only (e.g. inside a Pipeline).
scaler = RobustScaler()
x[con_cols] = scaler.fit_transform(x[con_cols])
x.head()

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex_1,exng_1,caa_1,caa_2,caa_3,...,cp_2,cp_3,fbs_1,restecg_1,restecg_2,slp_1,slp_2,thall_1,thall_2,thall_3
0,0.592593,0.75,-0.110236,-0.092308,0.9375,1,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0
1,-1.333333,0.0,0.15748,1.046154,1.6875,1,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
2,-1.037037,0.0,-0.566929,0.584615,0.375,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,0.074074,-0.5,-0.062992,0.769231,0.0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
4,0.148148,-0.5,1.795276,0.307692,-0.125,0,1,0,0,0,...,0,0,0,1,0,0,1,0,1,0


In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes.
print(f'train shape : x | {train_x.shape} / y | {train_y.shape}')
print(f'test  shape : x | {test_x.shape}  / y | {test_y.shape}')

train shape : x | (242, 22) / y | (242, 1)
test  shape : x | (61, 22)  / y | (61, 1)


In [10]:
# Baseline: linear-kernel SVM with C = 1.
# `train_y` is a one-column DataFrame; scikit-learn expects a 1-D target, so it
# is flattened here instead of relying on the implicit conversion (which raises
# a DataConversionWarning that the global warning filter was hiding).
clf    = SVC(kernel = 'linear', C = 1, random_state=42).fit(train_x, train_y.values.ravel())
pred_y = clf.predict(test_x)

f'the test accuracy score of SVM is {accuracy_score(test_y, pred_y):.2f}'

'the test accuracy score of SVM is 0.87'

In [11]:
# Estimator and hyper-parameter grid for the SVM grid search.
svm = SVC()
params = {
    # C: integers 1 through 9.
    'C': np.arange(1, 10),
    # NOTE(review): the trailing 1.5e-1 breaks the otherwise ascending 1/5
    # pattern — confirm it is intentional and not a typo.
    'gamma': [
        1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3,
        1e-2, 5e-2, 1e-1, 5e-1, 1.5e-1,
    ],
}

In [12]:
# Cross-validated grid search over `params`; after fitting, `searcher` predicts
# with the best estimator found.
# The target is flattened to 1-D because `train_y` is a one-column DataFrame
# and scikit-learn expects shape (n_samples,) for `y`.
searcher = GridSearchCV(svm, params)
searcher.fit(train_x, train_y.values.ravel())

# Evaluate the tuned model on the held-out test set.
pred_y   = searcher.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)

print(f'The best params are : {searcher.best_params_}')
print(f'The best score  is  : {searcher.best_score_:.2f}')
print(f'The test accuracy score of SVM after hyper-parameter tuning is : {accuracy:.2f}')

The best params are : {'C': 3, 'gamma': 0.1}
The best score  is  : 0.84
The test accuracy score of SVM after hyper-parameter tuning is : 0.90


In [16]:
# Logistic-regression baseline with default settings.
# The target is flattened to 1-D because `train_y` is a one-column DataFrame
# and scikit-learn expects shape (n_samples,) for `y`.
logreg = LogisticRegression()
logreg.fit(train_x, train_y.values.ravel())

# Class probabilities for the test rows, one column per entry of `classes_`.
pred_y_proba = logreg.predict_proba(test_x)
# argmax yields a column index, not a label; map it through `classes_` so the
# prediction stays correct even if the labels are not exactly 0..k-1.
pred_y       = logreg.classes_[np.argmax(pred_y_proba, axis = 1)]

f'the test accuracy score of logistic regression is {accuracy_score(test_y, pred_y):.2f}'

'the test accuracy score of logistic regression is 0.90'