In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_digits
from sklearn.datasets import load_wine
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Move to the parent directory first — presumably so that the `modules`
# package below resolves from there (TODO confirm project layout).
os.chdir('..')

# Holds the functions that train the five models used in this experiment in sequence.
# NOTE: `pd` and `classification_report` are imported explicitly above so the
# notebook no longer depends on this wildcard import leaking them into scope.
from modules.ml_big5_model import *

# Suppress the warning messages printed when LogisticRegression is run.
warnings.filterwarnings('ignore')

## 3. Breast_Cancer

### data 뜯어보기
어떠한 정보들을 담고 있는지 열어보자

In [2]:
# Load the breast cancer dataset and list the fields the returned
# dict-like object exposes.
breast_cancer = load_breast_cancer()
dataset_fields = breast_cancer.keys()
print(dataset_fields)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [3]:
# Feature matrix: one row per sample, one column per feature.
breast_cancer_data = breast_cancer.data
n_samples, n_features = breast_cancer_data.shape
print("데이터의 갯수: ", n_samples)
print("feature 수: ", n_features)

데이터의 갯수:  569
feature 수:  30


**학습에 사용될 데이터의 특징들**

In [4]:
# Display the names of the features (columns of `breast_cancer.data`).
breast_cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

**569개 데이터가 악성과 양성 두 가지로 분류되어 있다.**

In [5]:
# Target vector: one class label per sample; `target_names` maps the
# label indices to their human-readable class names.
breast_cancer_label = breast_cancer.target
label_count = len(breast_cancer_label)
print("label 수: ", label_count)
print("label 이름: ", breast_cancer.target_names)

label 수:  569
label 이름:  ['malignant' 'benign']


### data 그려보기
실제 데이터들이 feature별로 어떤 값을 가지는지 열어보자

In [6]:
# Wrap the raw feature matrix in a DataFrame so each column is labelled
# with its feature name, then display it.
cancer_df = pd.DataFrame(
    breast_cancer.data,
    columns=breast_cancer.feature_names,
)
cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


**데이터를 Train과 Test set으로 나눠주자**  
```test_size``` = 0.1 : 데이터 셋의 10%를 test set으로 나눈다.

In [7]:
# Hold out 10% of the samples as the test set; the fixed random_state keeps
# the split reproducible. `input_data` is ordered as
# [X_train, X_test, y_train, y_test].
input_data = train_test_split(
    breast_cancer_data,
    breast_cancer_label,
    test_size=0.1,
    random_state=1,
)
X_train, X_test = input_data[0], input_data[1]
print("# of train_data: ", len(X_train), "# of test_data: ", len(X_test))

# of train_data:  512 # of test_data:  57


In [8]:
# NOTE(review): `call_big5_model` is not defined in this notebook; it presumably
# comes from the `modules.ml_big5_model` wildcard import and returns the five
# models to be trained — confirm against that module.
model = call_big5_model()

In [9]:
# Train/evaluate the models on the split. `ml_model` is defined outside this
# notebook; it returns three values, unpacked here as the per-model
# predictions, the accuracies, and a summary frame of scores.
training_results = ml_model(input_data, model)
y_preds, accuracies, scores_frame = training_results
scores_frame

5가지 model 학습완료


Unnamed: 0,accuracy
decision,0.912281
randomforest,0.982456
svm,0.929825
sgd,0.877193
logistic,1.0


In [10]:
# Print detailed reports for two models, picked by their position in
# `y_preds` (index 4 and index 3 respectively).
y_test = input_data[3]
predictions = list(y_preds.values())
for heading, position in (("[1st logistic report]", 4), ("[5th sgd report]", 3)):
    print(heading)
    print(classification_report(y_test, predictions[position]))

[1st logistic report]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        34

    accuracy                           1.00        57
   macro avg       1.00      1.00      1.00        57
weighted avg       1.00      1.00      1.00        57

[5th sgd report]
              precision    recall  f1-score   support

           0       0.79      0.96      0.86        23
           1       0.97      0.82      0.89        34

    accuracy                           0.88        57
   macro avg       0.88      0.89      0.88        57
weighted avg       0.89      0.88      0.88        57



In [49]:
# Build one confusion matrix per model, print it under the model's key, and
# collect the matrices in `cm` (same order as `y_preds`) for later cells.
# Each matrix is computed once and reused for both printing and storage.
cm = []
for model_name, y_pred in y_preds.items():
    matrix = confusion_matrix(input_data[3], y_pred)  # input_data[3] is y_test
    print(model_name)
    print(matrix)
    print("\n")
    cm.append(matrix)

y_pred_DecisionTreeClassifier()
[[21  2]
 [ 1 33]]


y_pred_RandomForestClassifier()
[[21  2]
 [ 0 34]]


y_pred_SVC()
[[19  4]
 [ 0 34]]


y_pred_SGDClassifier()
[[22  1]
 [ 6 28]]


y_pred_LogisticRegression()
[[22  1]
 [ 0 34]]




### Breast Cancer 문제는 FN이 중요해!
유방암 검사 결과가 실제로는 양성(positive, 즉 악성 종양)인데 음성(negative)으로 판단하는 경우,  
즉 FN error가 빈번한 경우  
이러한 모델은 사용하기 위험하다.

### recall을 구해보자  
recall이 높을수록 FN이 적다는 의미이며  
유방암을 예측하는 문제에서 정확도와 함께 고려해야 하는 요소이다.  
여기서는 악성(malignant, label 0)을 positive class로 보고 recall을 계산한다.

$$\frac{TP}{FN+TP}$$

In [51]:
# Recall of class 0 ('malignant', the first entry of target_names) per model.
# In scikit-learn's confusion matrix, row 0 holds the samples whose true label
# is 0: column 0 are the ones predicted as 0 (TP), column 1 the ones predicted
# as 1 (FN).
for matrix in cm:
    true_positive, false_negative = matrix[0][0], matrix[0][1]
    print(true_positive / (false_negative + true_positive))

0.9130434782608695
0.9130434782608695
0.8260869565217391
0.9565217391304348
0.9565217391304348
