## KNN 연습문제

### 인도의 간질환자 예측 모델  
- 사용 데이터 세트 : 인도의 간질환자 데이터 (indian_liver_patient.csv)
- 데이터 세트 분리 : 테스트용(평가용) 데이터 세트 
- 사용하는 ML 알고리즘 : KNN 알고리즘  
- 모델 평가 : 예측 성능 평가   
    - 오차행렬, 정확도, 정밀도, 재현율, F1 스코어, ROC AUC 평가 지표 

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

In [2]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# 한글 문제
# matplotlib의 기본 폰트에서 한글이 지원되지 않기 때문에
# matplotlib의 폰트 변경 필요
import platform

from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':  # 맥OS 
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':  # 윈도우
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system...  sorry~~~')

In [28]:
liver_df = pd.read_csv('../../data/indian_liver_patient.csv')
liver_df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [29]:
# 수행해야 될 작업 알아서 처리
# Dataset: 클래스 결정 값 (1또는 2)

In [30]:
liver_df['Dataset'].value_counts()

Dataset
1    416
2    167
Name: count, dtype: int64

In [31]:
liver_df.info()
# Albumin_and_Globulin_Ratio  4개의 null 값 존재

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [32]:
mean_ratio = liver_df['Albumin_and_Globulin_Ratio'].mean()
liver_df['Albumin_and_Globulin_Ratio'].fillna(mean_ratio, inplace=True)

In [33]:
liver_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  583 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [34]:
# 피처 데이터
feature_data = liver_df.drop('Dataset', axis=1)
feature_data.head(2)

# 레이블 데이터
labels = liver_df['Dataset']
labels.head(5)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74


0    1
1    1
2    1
3    1
4    1
Name: Dataset, dtype: int64

In [35]:
# 데이터 준비 / 분리

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(feature_data,
                                                    labels,
                                                    random_state=11,
                                                    stratify=labels)

In [37]:
pd.Series(labels).value_counts()
pd.Series(labels).value_counts(normalize=True)

pd.Series(y_train).value_counts()
pd.Series(y_train).value_counts(normalize=True)

Dataset
1    416
2    167
Name: count, dtype: int64

Dataset
1    0.713551
2    0.286449
Name: proportion, dtype: float64

Dataset
1    312
2    125
Name: count, dtype: int64

Dataset
1    0.713959
2    0.286041
Name: proportion, dtype: float64

In [38]:
# 'Gender' 열을 이진 값으로 변환, 남자는 0, 여자는 1
X_train['Gender'] = X_train['Gender'].map({'Male': 0, 'Female': 1})

In [39]:
# 표준 스케일링 확인
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled[:10]

array([[ 1.30671525, -0.57294631, -0.40371351, -0.45107194, -0.09901035,
        -0.27487083, -0.26436913, -1.1526018 , -1.72660456, -1.40660394],
       [-0.11407607, -0.57294631, -0.07766878, -0.04517475, -0.29320551,
        -0.12101733,  0.21210503,  0.47540426,  0.0508423 , -0.50593755],
       [ 0.38011222, -0.57294631, -0.40371351, -0.45107194, -0.42407615,
        -0.36718293, -0.3082549 ,  0.47540426,  0.30476328, -0.18427098],
       [ 0.31833868, -0.57294631, -0.3548068 , -0.4141722 , -0.24676753,
        -0.18255873, -0.17032818,  1.7416312 ,  1.32044721,  0.13739558],
       [-1.16422618, -0.57294631, -0.42001575, -0.48797169, -0.44518432,
         0.03796461, -0.18600167, -0.88126745, -1.09180211, -0.82760412],
       [-0.11407607,  1.74536423, -0.38741127, -0.45107194, -0.44940596,
        -0.31589843, -0.28944671,  0.65629382,  0.93956573,  0.45906215],
       [-0.36117022, -0.57294631,  0.08535359,  0.28692296, -0.30587041,
         0.4892682 ,  0.37510934,  1.19896251

In [41]:
# 'Gender' 열을 이진 값으로 변환, 남자는 0, 여자는 1
X_test['Gender'] = X_test['Gender'].map({'Male': 0, 'Female': 1})

In [42]:
X_test_scaled = scaler.fit_transform(X_test)

In [None]:
# 모델링

In [43]:
from sklearn.neighbors import KNeighborsClassifier

kn_clf = KNeighborsClassifier() # n_neighbors=5 기본값
kn_clf.fit(X_train_scaled, y_train)

In [44]:
y_pred = kn_clf.predict(X_test_scaled)
y_pred

array([1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1,
       1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1,
       1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1], dtype=int64)

In [45]:
# 예측 확률
kn_clf.predict_proba(X_test_scaled)

array([[0.8, 0.2],
       [0.4, 0.6],
       [0.6, 0.4],
       [1. , 0. ],
       [1. , 0. ],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.8, 0.2],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.6, 0.4],
       [0.6, 0.4],
       [0. , 1. ],
       [1. , 0. ],
       [0.2, 0.8],
       [0.8, 0.2],
       [0.6, 0.4],
       [0.6, 0.4],
       [1. , 0. ],
       [0.6, 0.4],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.6, 0.4],
       [0.4, 0.6],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.6, 0.4],
       [1. , 0. ],
       [0.6, 0.4],
       [0.6, 0.4],
       [0.8, 0.2],
       [1. , 0. ],
       [0.8, 0.2],
       [1. , 0. ],
       [0.6, 0.4],
       [1. , 0. ],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.2, 0.8],
       [1. , 0. ],
       [0.6, 0.4],
       [0.6, 0.4],
       [0.8, 0.2],
       [0.6, 0.4],
       [1. , 0. ],
       [0.8, 0.2],
       [0.8, 0.2],
       [0.8,

In [None]:
# 성능평가지표 출력

In [46]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

def get_clf_eval(y_test, y_pred, pred_proba) :
    cm = confusion_matrix(y_test, y_pred) # 혼동행렬(오차행렬)
    accuracy = accuracy_score(y_test, y_pred) # 정확도
    precision = precision_score(y_test, y_pred) # 정밀도
    recall = recall_score(y_test, y_pred)# 재현율
    f1 = f1_score(y_test, y_pred) # F1 스코어
    roc_auc = roc_auc_score(y_test, pred_proba) # ROC_AUC 스코어
    
    # 출력 
    # 오차행렬
    print("오차행렬 : ", cm)
    # 정확도, 정밀도, 재현율, F1
    print(f"정확도: {accuracy:.4f}, 정밀도: {precision:.4f},  재현율: {recall:.4f}, F1 Score: {f1:.4f}, ROC_AUC : {roc_auc:.4f}")

In [47]:
# 레이블 값이 1일 때의 예측 확률
pred_proba = kn_clf.predict_proba(X_test_scaled)[:, 1]

# 예측 모델 평가 수행 및 평가지표 출력하는 함수 호출
get_clf_eval(y_test, y_pred, pred_proba)

오차행렬 :  [[88 16]
 [26 16]]
정확도: 0.7123, 정밀도: 0.7719,  재현율: 0.8462, F1 Score: 0.8073, ROC_AUC : 0.6603
