In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split #훈련셋과 테스트셋을 분리
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt

In [2]:
# 1. 데이터 셋 생성
raw_data = pd.read_excel('./data/heart-disease.xlsx')
raw_data.head()

Unnamed: 0,age,sex,cp,treshtbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,hsl,heartDisease
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,0,108,1,1.5,2,3,3,1
2,67,1,4,120,?,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [3]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           303 non-null    int64  
 1   sex           303 non-null    int64  
 2   cp            303 non-null    int64  
 3   treshtbps     303 non-null    int64  
 4   chol          303 non-null    object 
 5   fbs           303 non-null    int64  
 6   restecg       303 non-null    int64  
 7   thalach       303 non-null    int64  
 8   exang         303 non-null    int64  
 9   oldpeak       303 non-null    float64
 10  slope         303 non-null    int64  
 11  ca            303 non-null    object 
 12  hsl           303 non-null    object 
 13  heartDisease  303 non-null    int64  
dtypes: float64(1), int64(10), object(3)
memory usage: 33.3+ KB


In [10]:
raw_data[(raw_data['chol']=='?')|(raw_data['ca']=='?')|(raw_data['hsl']=='?')]

Unnamed: 0,age,sex,cp,treshtbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,hsl,heartDisease
2,67,1,4,120,?,0,2,129,1,2.6,2,2,7,1
87,53,0,3,128,216,0,2,115,0,0.0,1,0,?,0
166,52,1,3,138,223,0,0,169,0,0.0,1,?,3,0
192,43,1,4,132,247,1,2,143,1,0.1,2,?,7,1
266,52,1,4,128,204,1,0,156,1,1.0,2,0,?,1
287,58,1,2,125,220,0,0,144,0,0.4,2,?,7,0
302,38,1,3,138,175,0,0,173,0,0.0,1,?,3,0


In [11]:
clean_data = raw_data.replace('?', np.nan)
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           303 non-null    int64  
 1   sex           303 non-null    int64  
 2   cp            303 non-null    int64  
 3   treshtbps     303 non-null    int64  
 4   chol          302 non-null    float64
 5   fbs           303 non-null    int64  
 6   restecg       303 non-null    int64  
 7   thalach       303 non-null    int64  
 8   exang         303 non-null    int64  
 9   oldpeak       303 non-null    float64
 10  slope         303 non-null    int64  
 11  ca            299 non-null    float64
 12  hsl           301 non-null    float64
 13  heartDisease  303 non-null    int64  
dtypes: float64(4), int64(10)
memory usage: 33.3 KB


In [12]:
clean_data.dropna(how='any', inplace=True)
clean_data.isna().sum()

age             0
sex             0
cp              0
treshtbps       0
chol            0
fbs             0
restecg         0
thalach         0
exang           0
oldpeak         0
slope           0
ca              0
hsl             0
heartDisease    0
dtype: int64

In [16]:
# 독립변수(입력변수), 타겟변수(종속변수)
Input = clean_data.iloc[:,:-1]
Target = clean_data.iloc[:,[-1]] # 2차원 데이터
Target

Unnamed: 0,heartDisease
0,0
1,1
3,0
4,0
5,0
...,...
297,1
298,1
299,1
300,1


In [14]:
clean_data.shape

(296, 14)

In [17]:
Target['heartDisease'].sum()

136

In [18]:
Target['heartDisease'].mean()

0.4594594594594595

In [19]:
Target['heartDisease'].value_counts()

0    160
1    136
Name: heartDisease, dtype: int64

In [32]:
# Scale 조정
scaler = MinMaxScaler()
scaler.fit(Input)
scaled_input = scaler.transform(Input)
scaled_input

array([[0.70833333, 1.        , 0.        , ..., 1.        , 0.        ,
        0.75      ],
       [0.79166667, 1.        , 1.        , ..., 0.5       , 1.        ,
        0.        ],
       [0.16666667, 1.        , 0.66666667, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.8125    , 1.        , 1.        , ..., 0.5       , 0.66666667,
        1.        ],
       [0.58333333, 1.        , 1.        , ..., 0.5       , 0.33333333,
        1.        ],
       [0.58333333, 0.        , 0.33333333, ..., 0.5       , 0.33333333,
        0.        ]])

In [33]:
# scaled input, target -> 학습셋, 테스트셋 = 7:3
X_train, X_test, Y_train, Y_test = train_test_split(scaled_input, Target, test_size=0.3, random_state=5)
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((207, 13), (207, 1), (89, 13), (89, 1))

In [34]:
# 2. 모델구성
model = Sequential()
model.add(Dense(units=500, input_dim=13, activation='tanh'))
model.add(Dropout(0.1))
model.add(Dense(units=200, activation='tanh'))
model.add(Dropout(0.1))
model.add(Dense(units=100, activation='tanh'))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 500)               7000      
_________________________________________________________________
dropout (Dropout)            (None, 500)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               100200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1

In [35]:
# 3. 학습과정 설정
# accuracy(정확도) : 예측결과와 실제값이 동일한 건수 / 전체건수
# recall(재현율) : 실제True인 것 중에 예측도 True인 건수
# precision(정밀도,민감도) : 예측 True인 것 중에 실제도 True인 건수
from tensorflow.keras import metrics
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', metrics.Recall(), metrics.Precision()])

In [36]:
# 4. 학습시키기
hist = model.fit(X_train, Y_train, epochs=50, batch_size=50, verbose=2, validation_split=0.2)

Epoch 1/50
4/4 - 1s - loss: 0.6890 - accuracy: 0.5273 - recall: 0.3382 - precision: 0.4107 - val_loss: 0.5035 - val_accuracy: 0.8333 - val_recall: 0.8571 - val_precision: 0.8182
Epoch 2/50
4/4 - 0s - loss: 0.4952 - accuracy: 0.7879 - recall: 0.8529 - precision: 0.6988 - val_loss: 0.4670 - val_accuracy: 0.8571 - val_recall: 0.7143 - val_precision: 1.0000
Epoch 3/50
4/4 - 0s - loss: 0.4122 - accuracy: 0.8121 - recall: 0.6912 - precision: 0.8246 - val_loss: 0.4803 - val_accuracy: 0.8571 - val_recall: 0.7143 - val_precision: 1.0000
Epoch 4/50
4/4 - 0s - loss: 0.3955 - accuracy: 0.8485 - recall: 0.7794 - precision: 0.8413 - val_loss: 0.3774 - val_accuracy: 0.8810 - val_recall: 0.7619 - val_precision: 1.0000
Epoch 5/50
4/4 - 0s - loss: 0.3720 - accuracy: 0.8485 - recall: 0.8235 - precision: 0.8116 - val_loss: 0.3505 - val_accuracy: 0.8810 - val_recall: 0.7619 - val_precision: 1.0000
Epoch 6/50
4/4 - 0s - loss: 0.3694 - accuracy: 0.8485 - recall: 0.7647 - precision: 0.8525 - val_loss: 0.3423 

Epoch 47/50
4/4 - 0s - loss: 0.3243 - accuracy: 0.8485 - recall: 0.8676 - precision: 0.7867 - val_loss: 0.3782 - val_accuracy: 0.8571 - val_recall: 0.8095 - val_precision: 0.8947
Epoch 48/50
4/4 - 0s - loss: 0.3092 - accuracy: 0.8485 - recall: 0.8088 - precision: 0.8209 - val_loss: 0.4347 - val_accuracy: 0.7857 - val_recall: 0.6667 - val_precision: 0.8750
Epoch 49/50
4/4 - 0s - loss: 0.3483 - accuracy: 0.8182 - recall: 0.7059 - precision: 0.8276 - val_loss: 0.4148 - val_accuracy: 0.7857 - val_recall: 0.6667 - val_precision: 0.8750
Epoch 50/50
4/4 - 0s - loss: 0.3039 - accuracy: 0.8545 - recall: 0.8382 - precision: 0.8143 - val_loss: 0.3678 - val_accuracy: 0.8333 - val_recall: 0.8095 - val_precision: 0.8500


In [37]:
hist.history.keys()

dict_keys(['loss', 'accuracy', 'recall', 'precision', 'val_loss', 'val_accuracy', 'val_recall', 'val_precision'])

In [38]:
#5. 평가
score = model.evaluate(X_test,Y_test)



In [44]:
# 6. confusion matrix(성능 평가 지표)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
pred = model.predict(X_test)
pred = (pred>0.5)
print('f1_score : ', f1_score(Y_test, pred))
pd.DataFrame(confusion_matrix(pred, Y_test), index=['실제 발생','실제 발생X'], columns=['발생 예측','발생 예측X'])

f1_score :  0.8695652173913044


Unnamed: 0,발생 예측,발생 예측X
실제 발생,37,7
실제 발생X,5,40
