# Kaggle-heart disease dataset

## 2019-02-20

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
original=pd.read_csv('../input/heart.csv')
print(original.head())
print(original.shape)

   age  sex  cp  trestbps  chol   ...    oldpeak  slope  ca  thal  target
0   63    1   3       145   233   ...        2.3      0   0     1       1
1   37    1   2       130   250   ...        3.5      0   0     2       1
2   41    0   1       130   204   ...        1.4      2   0     2       1
3   56    1   1       120   236   ...        0.8      2   0     2       1
4   57    0   0       120   354   ...        0.6      2   0     2       1

[5 rows x 14 columns]
(303, 14)


In [3]:
np.isnan(original.any())

age         False
sex         False
cp          False
trestbps    False
chol        False
fbs         False
restecg     False
thalach     False
exang       False
oldpeak     False
slope       False
ca          False
thal        False
target      False
dtype: bool

## Principal Component Analysis

In [4]:
print(original.columns.shape[0])
print(original.columns)

14
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


In [5]:
data=original.iloc[:,0:13]
target=original.loc[:,'target']
print(data.head())

   age  sex  cp  trestbps  chol  ...   exang  oldpeak  slope  ca  thal
0   63    1   3       145   233  ...       0      2.3      0   0     1
1   37    1   2       130   250  ...       0      3.5      0   0     2
2   41    0   1       130   204  ...       0      1.4      2   0     2
3   56    1   1       120   236  ...       0      0.8      2   0     2
4   57    0   0       120   354  ...       1      0.6      2   0     2

[5 rows x 13 columns]


In [6]:
pca=PCA(n_components=13).fit(data)

In [7]:
pvr=pd.DataFrame(pca.explained_variance_ratio_)
x=np.array(data.columns)
pvr['columns']=x
print(pvr)

           0   columns
0   0.747564       age
1   0.150370       sex
2   0.084597        cp
3   0.016216  trestbps
4   0.000384      chol
5   0.000281       fbs
6   0.000229   restecg
7   0.000100   thalach
8   0.000077     exang
9   0.000059   oldpeak
10  0.000050     slope
11  0.000041        ca
12  0.000031      thal


## PCA

With pvr,if I want to reduce data's dimension

I'll set n_components between two~three

but data size is small,so try not to reduce data's dimension

Then try to analysis correlation

## Correlation Analysis-Pearson coefficient

In [8]:
pearsonMatrix=pd.DataFrame(np.round(original.corr(method='pearson'),2))
pearsonMatrix.sort_values(by='target',ascending=False)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
target,-0.23,-0.28,0.43,-0.14,-0.09,-0.03,0.14,0.42,-0.44,-0.43,0.35,-0.39,-0.34,1.0
cp,-0.07,-0.05,1.0,0.05,-0.08,0.09,0.04,0.3,-0.39,-0.15,0.12,-0.18,-0.16,0.43
thalach,-0.4,-0.04,0.3,-0.05,-0.01,-0.01,0.04,1.0,-0.38,-0.34,0.39,-0.21,-0.1,0.42
slope,-0.17,-0.03,0.12,-0.12,-0.0,-0.06,0.09,0.39,-0.26,-0.58,1.0,-0.08,-0.1,0.35
restecg,-0.12,-0.06,0.04,-0.11,-0.15,-0.08,1.0,0.04,-0.07,-0.06,0.09,-0.07,-0.01,0.14
fbs,0.12,0.05,0.09,0.18,0.01,1.0,-0.08,-0.01,0.03,0.01,-0.06,0.14,-0.03,-0.03
chol,0.21,-0.2,-0.08,0.12,1.0,0.01,-0.15,-0.01,0.07,0.05,-0.0,0.07,0.1,-0.09
trestbps,0.28,-0.06,0.05,1.0,0.12,0.18,-0.11,-0.05,0.07,0.19,-0.12,0.1,0.06,-0.14
age,1.0,-0.1,-0.07,0.28,0.21,0.12,-0.12,-0.4,0.1,0.21,-0.17,0.28,0.07,-0.23
sex,-0.1,1.0,-0.05,-0.06,-0.2,0.05,-0.06,-0.04,0.14,0.1,-0.03,0.12,0.21,-0.28


## pearson coefficient

With this matrix,features:cp.thalach.slope and restecg have positive correlation

other features have negative correlation

So,Use four features to predict

## predict-using SVC

In [9]:
features=original.loc[:,['cp','thalach','slope','restecg']]
print(features.head())

   cp  thalach  slope  restecg
0   3      150      0        0
1   2      187      0        1
2   1      172      2        0
3   1      178      2        1
4   0      163      2        1


In [10]:
dataTrain,dataTest, \
targetTrain,targetTest = \
train_test_split(features,target,train_size=0.8)



In [11]:
heartSVC=SVC().fit(dataTrain,targetTrain)



In [12]:
heartSVC

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
pre=heartSVC.predict(dataTrain)

In [14]:
print(classification_report(targetTrain,pre))

              precision    recall  f1-score   support

           0       0.87      0.78      0.82       110
           1       0.83      0.90      0.87       132

   micro avg       0.85      0.85      0.85       242
   macro avg       0.85      0.84      0.84       242
weighted avg       0.85      0.85      0.85       242



In [15]:
predict=heartSVC.predict(dataTest)
print(classification_report(targetTest,predict))

              precision    recall  f1-score   support

           0       0.88      0.50      0.64        28
           1       0.69      0.94      0.79        33

   micro avg       0.74      0.74      0.74        61
   macro avg       0.78      0.72      0.72        61
weighted avg       0.77      0.74      0.72        61



## predict-using GBC

In [16]:
heartGBC=GBC(max_depth=2)

In [17]:
heartGBC.fit(dataTrain,targetTrain)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [18]:
Gpre=heartGBC.predict(dataTrain)
print(classification_report(targetTrain,Gpre))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       110
           1       0.86      0.89      0.88       132

   micro avg       0.86      0.86      0.86       242
   macro avg       0.86      0.86      0.86       242
weighted avg       0.86      0.86      0.86       242



In [19]:
Gpredict=heartGBC.predict(dataTest)
print(classification_report(targetTest,Gpredict))

              precision    recall  f1-score   support

           0       0.81      0.46      0.59        28
           1       0.67      0.91      0.77        33

   micro avg       0.70      0.70      0.70        61
   macro avg       0.74      0.69      0.68        61
weighted avg       0.73      0.70      0.69        61



## predict

With using GradientBoostingClassifier and SVC,

GradientBoostingClassifier predict test data's f1-score is better than SVC

So use GradientBoostingClassifier may be better than SVC

