In [1]:
import sklearn
print(sklearn.__version__)

1.0


In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# call and read digits dataset

breast_cancer = load_breast_cancer()

print(dir(breast_cancer))
print(breast_cancer.target_names)
print(breast_cancer.DESCR)

# assign data for later use

breast_cancer_feature = breast_cancer.data
breast_cancer_label = breast_cancer.target

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']
['malignant' 'benign']
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest value

In [3]:
from sklearn.model_selection import train_test_split

# split data for model training and model testing

data_train, data_test, label_train, label_test = train_test_split(breast_cancer_feature, breast_cancer_label, test_size = 0.2, random_state=7)
                                                     
print('data_train 개수: ', len(data_train),', data_test 개수: ', len(data_test))
print(data_train.shape, label_train.shape)
print(data_test.shape, label_test.shape)

data_train 개수:  455 , data_test 개수:  114
(455, 30) (455,)
(114, 30) (114,)


In [4]:
# import models

from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier  
from sklearn import svm  
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression 

# import metrics

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

---
## Trying Decision Tree model


In [5]:
decision_tree = DecisionTreeClassifier(random_state=32) 
decision_tree.fit(data_train, label_train)
label_prediction = decision_tree.predict(data_test)

print(confusion_matrix(label_test, label_prediction))
print(classification_report(label_test, label_prediction))

[[33  7]
 [ 3 71]]
              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114



---
## Trying Randome Forest model

In [6]:
# trying Random Forest model

random_forest = RandomForestClassifier(random_state=32) 
random_forest.fit(data_train, label_train)
label_prediction = random_forest.predict(data_test)

print(confusion_matrix(label_test, label_prediction))
print(classification_report(label_test, label_prediction))

[[40  0]
 [ 0 74]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        74

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114



---
## Trying Support Vector Machine

In [7]:
# trying Support Vector Machine

svm_model = svm.SVC()
svm_model.fit(data_train, label_train)
label_prediction = svm_model.predict(data_test)

print(confusion_matrix(label_test, label_prediction))
print(classification_report(label_test, label_prediction))

[[29 11]
 [ 0 74]]
              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114



---
## Trying Stochastic Gradient Descent model

In [8]:
sgd_model = SGDClassifier()
sgd_model.fit(data_train, label_train)
label_prediction = sgd_model.predict(data_test)

print(confusion_matrix(label_test, label_prediction))
print(classification_report(label_test, label_prediction))

[[29 11]
 [ 2 72]]
              precision    recall  f1-score   support

           0       0.94      0.72      0.82        40
           1       0.87      0.97      0.92        74

    accuracy                           0.89       114
   macro avg       0.90      0.85      0.87       114
weighted avg       0.89      0.89      0.88       114



---
## Trying Logistic Regression

In [9]:
logistic_model = LogisticRegression(max_iter=2000) # 최대 반복횟수 증가
logistic_model.fit(data_train, label_train)
label_prediction = logistic_model.predict(data_test)

print(confusion_matrix(label_test, label_prediction))
print(classification_report(label_test, label_prediction))

[[34  6]
 [ 0 74]]
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       0.93      1.00      0.96        74

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



## 모델을 평가해 보기
유방암 판정의 경우, 암이 있는 환자를 판정하는 정밀도가 매우 중요하다. 다른 요소들 역시 높을 수록 좋은 것은 당연하나, 해당 정밀도에 비하면 부차적인 요소로 고려할 수 있다. 해당 평가기준에 따라 5가지 모델을 비교하면 다음과 같다.

|모델명|Precision of Positive|순위|
|---|---|---|
|의사결정나무|0.91|3|
|랜덤포레스트|1.0|1|
|SVM|0.87|4|
|SGD|0.87|5|
|로지스틱|0.95|2|

SVM과 SGD의 경우 주요 기준인 암환자의 정밀도 점수가 동일하나, 다른 결과치를 부차적으로 활용하여 순위를 정할 수 있다.