## 피마 인디언 당뇨병 예측 사례
### 북아메리카 피마 지역 원주민의 Type-2 당뇨병 결과 데이터  
 캐글 사이트(https://www.kaggle.com/uciml/pima-indians-diabetes-database)  
 Feature 설명  
• Pregnancies: 임신 횟수  
• Glucose: 포도당 부하 검사 수치  
• BloodPressure: 혈압  
• SkinThickness: 팔 삼두근 뒤쪽의 피하지방 측정값  
• Insulin: 혈청 인슐린  
• BMI: 체질량 지수(체중/키)  
• DiabetesPedigreeFunction: 당뇨 내력 가중치 값  
• Age: 나이  
• Outcome: 클래스 결정 값 

https://blog.naver.com/mo223772/222032328170  
keras모듈 https://wikidocs.net/50137  
이건뭔가... https://blog.naver.com/thdgmltjd123/222159424025  



In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

diabates_data = pd.read_csv('../00.data/pima-indians/diabetes.csv')
print(diabates_data['Outcome'].value_counts())
diabates_data.head()

0    500
1    268
Name: Outcome, dtype: int64


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [22]:
diabates_data.info() #Nan값은없다

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [26]:
diabates_data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [27]:
features = diabates_data.columns.difference(['Outcome'])
label = 'Outcome'

X = diabates_data[features]
y = diabates_data[label]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=121)
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

In [29]:
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
      dtype=int64)

In [36]:
pred_proba = lr_clf.predict_proba(X_test)[:,1]
pred_proba

array([0.18896236, 0.10636506, 0.2648182 , 0.18950529, 0.18649355,
       0.15250767, 0.30406002, 0.05591429, 0.3704629 , 0.34256567,
       0.85857179, 0.48903222, 0.07685414, 0.45773551, 0.34345204,
       0.16184385, 0.12099299, 0.13505228, 0.79620274, 0.1645629 ,
       0.14487432, 0.40992476, 0.97457654, 0.31389307, 0.12872308,
       0.56501402, 0.22109064, 0.114249  , 0.38437666, 0.18780912,
       0.67458142, 0.75694037, 0.5040456 , 0.13921277, 0.923118  ,
       0.05108994, 0.66976588, 0.15503958, 0.16357045, 0.71929744,
       0.96928898, 0.31022597, 0.52421618, 0.77614334, 0.11770105,
       0.82125499, 0.2018167 , 0.17808993, 0.1752301 , 0.08541268,
       0.56350925, 0.29074877, 0.33047468, 0.26907004, 0.21871199,
       0.12876731, 0.06663562, 0.61964848, 0.26454644, 0.1039207 ,
       0.10320541, 0.1074115 , 0.09560443, 0.4796795 , 0.03380149,
       0.7757294 , 0.62727547, 0.89854325, 0.87275523, 0.19209472,
       0.74842963, 0.21471106, 0.14865419, 0.64758664, 0.21106

In [37]:
pred[:12]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print('오차행렬')
    print(confusion)
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}, F1스코어: {f1:.4f}')

In [41]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)

오차행렬
[[93  7]
 [17 37]]
정확도: 0.8442, 정밀도: 0.8409, 재현율: 0.6852, F1스코어: 0.7551


In [44]:
pred_proba = lr_clf.predict_proba(X_test)
pred_proba[:12,:]

array([[0.81103764, 0.18896236],
       [0.89363494, 0.10636506],
       [0.7351818 , 0.2648182 ],
       [0.81049471, 0.18950529],
       [0.81350645, 0.18649355],
       [0.84749233, 0.15250767],
       [0.69593998, 0.30406002],
       [0.94408571, 0.05591429],
       [0.6295371 , 0.3704629 ],
       [0.65743433, 0.34256567],
       [0.14142821, 0.85857179],
       [0.51096778, 0.48903222]])

In [45]:
#Binarizer의 threshold 설정값. 분류 결정 임곗값임.  
custom_threshold = 0.5

# predict_proba( ) 반환값의 두번째 컬럼,
# 즉 Positive 클래스 컬럼 하나만 추출하여 Binarizer를 적용
pred_proba_1 = pred_proba[:,1].reshape(-1,1)

binarizer = Binarizer(threshold=custom_threshold)
custom_predict = binarizer.fit_transform(pred_proba_1)

get_clf_eval(y_test, custom_predict)

오차행렬
[[93  7]
 [17 37]]
정확도: 0.8442, 정밀도: 0.8409, 재현율: 0.6852, F1스코어: 0.7551


In [46]:
# Binarizer의 threshold 설정값을 0.4로 설정.
custom_threshold = 0.4
pred_proba_1 = pred_proba[:,1].reshape(-1,1)

binarizer = Binarizer(threshold=custom_threshold)
custom_predict = binarizer.fit_transform(pred_proba_1)

get_clf_eval(y_test, custom_predict)

오차행렬
[[88 12]
 [15 39]]
정확도: 0.8247, 정밀도: 0.7647, 재현율: 0.7222, F1스코어: 0.7429


In [47]:
# Binarizer의 threshold 설정값을 0.7으로 설정.
custom_threshold = 0.7
pred_proba_1 = pred_proba[:,1].reshape(-1,1)

binarizer = Binarizer(threshold=custom_threshold)
custom_predict = binarizer.fit_transform(pred_proba_1)

get_clf_eval(y_test, custom_predict)

오차행렬
[[97  3]
 [34 20]]
정확도: 0.7597, 정밀도: 0.8696, 재현율: 0.3704, F1스코어: 0.5195


### F1 스코어

In [48]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test, pred)
print(f'F1 스코어: {f1:.4f}')

F1 스코어: 0.7551


### ROC AUC 스코어

In [49]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, pred)
print(f'ROC AUC 스코어: {roc_auc:.4f}')

ROC AUC 스코어: 0.8076
