In [15]:
import pandas as pd
import numpy as np

# Import warnings
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Upload heart.csv to dataFrame
df_heart = pd.read_csv('heart_disease.csv')

# Show first five rows
df_heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


- 'target' is binary, with 1 indicating that the patient has heart disease and 0 indicating that they do not. 
<br>
- age: Age in years <br>
- sex: Sex (1 = male; 0 = female) 
- cp: Chest pain type (1 = typical angina, 2 = atypical angina, 3 = non-anginal pain, 4 = asymptomatic) 
- trestbps: Resting blood pressure (in mm Hg on admission to the hospital) 
- chol: Serum cholesterol in mg/dl 6 
- fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg: Resting electrocardiographic results (0 = normal, 1 = having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria) 
- thalach: Maximum heart rate achieved 
- exang: Exercise induced angina (1 = yes; 0 = no) 
- oldpeak: ST depression induced by exercise relative to rest 
- slope: The slope of the peak exercise ST segment (1 = upsloping, 2 = flat, 3 = downsloping) 
- ca: Number of major vessels (0-3) colored by fluoroscopy 
- thal: 3 = normal; 6 = fixed defect; 7 = reversible defect<br><br>
<p>
refer to "https://archive.ics.uci.edu/ml/datasets/Heart+Disease"</p>

In [17]:
# split data into X and y
X = df_heart.iloc[:,:-1]
y = df_heart.iloc[:,-1]

In [18]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [19]:
# Import Decision Tree Regressor
from sklearn.tree import DecisionTreeClassifier

# Import cross_val_score
from sklearn.model_selection import cross_val_score
# Decision Tree는 보통 Grid search를 쓰면 k-fold cross validation을 해줌.
# 그래서 cross_val_score를 굳이 쓰지는 않아도 됨.

In [20]:
# Initialize Decision Tree Classifier
model = DecisionTreeClassifier()

# Obtain scores of cross-validation
scores = cross_val_score(model, X, y, cv=5)

# Display accuracy
print('Accuracy:', np.round(scores, 2))

# Display mean accuracy
print('Accuracy mean: %0.2f' % (scores.mean()))

Accuracy: [0.75 0.84 0.74 0.68 0.7 ]
Accuracy mean: 0.74


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def grid_search_clf(params, runs=20, clf=DecisionTreeClassifier()):

    # Instantiate GridSearchCV as grid_reg
    # 학습 데이터를 5개의 fold를 찾음.
    # grid search로 구해진 값을 사용하여 다시한번 재 학습 시키는것이 best
    # 더 많은 학습데이터를 사용하는것이 더욱 안정적인 모델을 만드는 방법(데이터가 많기 때문)
    grid_clf = GridSearchCV(clf, params, cv=5, n_jobs=-1)
    
    # Fit grid_reg on X_train and y_train
    grid_clf.fit(X_train, y_train)
    
    print(grid_clf.best_params_)

    # Extract best estimator
    best_model = grid_clf.best_estimator_
    
    # Extract best score
    best_score = grid_clf.best_score_

    # Print best score
    print("Training score: {:.3f}".format(best_score))

    # Predict test set labels
    y_pred = best_model.predict(X_test)

    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Print accuracy
    print('Test score: {:.3f}'.format(accuracy))
        
    # Return best model
    return best_model

In [22]:
grid_search_clf(params={'criterion':['entropy', 'gini'],
                          'min_samples_split':[2, 3, 4, 5, 6, 8, 10],
                          'min_samples_leaf':[0.01, 0.02, 0.03, 0.04, 0.05],
                          'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50],
                          'max_depth':[2,4,6,8],
                         })

{'criterion': 'entropy', 'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_leaf': 0.01, 'min_samples_split': 4}
Training score: 0.777
Test score: 0.770


DecisionTreeClassifier(criterion='entropy', max_depth=6, max_leaf_nodes=20,
                       min_samples_leaf=0.01, min_samples_split=4)

In [26]:
best_clf = DecisionTreeClassifier(criterion='entropy', max_depth=6,
                       max_leaf_nodes=20,
                       min_samples_leaf=0.01, min_samples_split=4)
best_clf.fit(X, y)

DecisionTreeClassifier(criterion='entropy', max_depth=6, max_leaf_nodes=20,
                       min_samples_leaf=0.01, min_samples_split=4)

# Feature importance
각각의 값이 종속변수를 설명하는데 얼마나 중요한 역할을 하는가?를 확인

In [27]:
best_clf.feature_importances_

array([0.12486434, 0.05933141, 0.29385999, 0.        , 0.03135179,
       0.        , 0.02499657, 0.06197747, 0.02929404, 0.09984918,
       0.        , 0.16182631, 0.11264889])

In [28]:
# Zip columns and feature_importances_ into dict
# 어떤 것이 가장 큰 영향을 끼치는 값인지 순서대로 정렬(바 그래프로 표현하기도 함)
feature_dict = dict(zip(X.columns, best_clf.feature_importances_))

[(k,v) for k, v in sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)]

[('cp', 0.29385999078492864),
 ('ca', 0.16182631123574934),
 ('age', 0.12486434443189202),
 ('thal', 0.11264888654392642),
 ('oldpeak', 0.09984918283812874),
 ('thalach', 0.061977474008622344),
 ('sex', 0.05933140950171351),
 ('chol', 0.03135179245349033),
 ('exang', 0.02929403714956004),
 ('restecg', 0.024996571051988607),
 ('trestbps', 0.0),
 ('fbs', 0.0),
 ('slope', 0.0)]