## Breast_cancer

## 1. 모듈 임포트

In [1]:
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

## 2. 데이터 준비, 3. 데이터 이해하기

In [2]:
# Load the scikit-learn breast cancer dataset and keep the feature matrix
# and the target vector under their own names for the later cells.
breast_cancer = load_breast_cancer()
breast_cancer_data, breast_cancer_label = breast_cancer.data, breast_cancer.target

# Show the feature/target names and both array shapes (header line, then value).
print("< Feature names >", breast_cancer.feature_names, sep="\n")
print("\n< Target names >", breast_cancer.target_names, sep="\n")
print("\n< Data shape >", breast_cancer_data.shape, sep="\n")
print("\n< Label shape >", breast_cancer_label.shape, sep="\n")

# Print the description text bundled with the dataset.
print(breast_cancer.DESCR)

< Feature names >
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

< Target names >
['malignant' 'benign']

< Data shape >
(569, 30)

< Label shape >
(569,)
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to 

## 4. train, test 데이터 분리

In [3]:
# Split features and labels into train/test partitions with a fixed seed.
# test_size is the value to change when experimenting with different split ratios.
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data,
    breast_cancer_label,
    test_size=0.5,
    random_state=5,
)


## 5. 모델별 학습

In [4]:
### Decision tree
from sklearn.tree import DecisionTreeClassifier   ## import the model

decision_tree = DecisionTreeClassifier(random_state = 32)   ## instantiate the model with a fixed seed
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# The report compares predictions against the held-out test labels (y_test),
# so the banner is labelled "Test result" rather than "Train result".
print("------------------------------Test result------------------------------")
print(classification_report(y_test, y_pred))

------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       0.87      0.92      0.89        99
           1       0.96      0.92      0.94       186

    accuracy                           0.92       285
   macro avg       0.91      0.92      0.92       285
weighted avg       0.92      0.92      0.92       285



In [5]:
### Random Forest
from sklearn.ensemble import RandomForestClassifier   ## import the model

# Named after the estimator it holds; the original reused the name `decision_tree` here.
random_forest = RandomForestClassifier(random_state = 32)   ## instantiate the model with a fixed seed
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# The report compares predictions against the held-out test labels (y_test),
# so the banner is labelled "Test result" rather than "Train result".
print("------------------------------Test result------------------------------")
print(classification_report(y_test, y_pred))

------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        99
           1       0.98      0.98      0.98       186

    accuracy                           0.97       285
   macro avg       0.97      0.97      0.97       285
weighted avg       0.97      0.97      0.97       285



In [6]:
### Support Vector Machine
from sklearn import svm   ## import the model

# Named after the estimator it holds; the original reused the name `decision_tree` here.
svm_classifier = svm.SVC(random_state = 32)   ## instantiate the model
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# The report compares predictions against the held-out test labels (y_test),
# so the banner is labelled "Test result" rather than "Train result".
print("------------------------------Test result------------------------------")
print(classification_report(y_test, y_pred))


------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       0.98      0.90      0.94        99
           1       0.95      0.99      0.97       186

    accuracy                           0.96       285
   macro avg       0.96      0.94      0.95       285
weighted avg       0.96      0.96      0.96       285



In [7]:
### Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier   ## import the model

# Named after the estimator it holds; the original reused the name `decision_tree` here.
# NOTE(review): the features are passed in unscaled. SGD-based linear models are
# sensitive to feature scale, which may explain the unstable accuracy across
# test sizes noted in section 6 (e.g. 61% at test size 0.2) - consider
# standardizing the features first; TODO confirm.
sgd_classifier = SGDClassifier(random_state = 32)   ## instantiate the model with a fixed seed
sgd_classifier.fit(X_train, y_train)
y_pred = sgd_classifier.predict(X_test)

# The report compares predictions against the held-out test labels (y_test),
# so the banner is labelled "Test result" rather than "Train result".
print("------------------------------Test result------------------------------")
print(classification_report(y_test, y_pred))


------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       0.99      0.84      0.91        99
           1       0.92      0.99      0.96       186

    accuracy                           0.94       285
   macro avg       0.95      0.92      0.93       285
weighted avg       0.94      0.94      0.94       285



In [8]:
### Logistic Regression
from sklearn.linear_model import LogisticRegression   ## import the model

# Named after the estimator it holds; the original reused the name `decision_tree` here.
# max_iter is set explicitly to 5000 - presumably so the solver converges on
# the unscaled features; TODO confirm.
logistic_regression = LogisticRegression(random_state = 32, max_iter = 5000)   ## instantiate the model
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

# The report compares predictions against the held-out test labels (y_test),
# so the banner is labelled "Test result" rather than "Train result".
print("------------------------------Test result------------------------------")
print(classification_report(y_test, y_pred))

------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        99
           1       0.98      0.97      0.98       186

    accuracy                           0.97       285
   macro avg       0.97      0.97      0.97       285
weighted avg       0.97      0.97      0.97       285



## 6. 평가 및 회고

#### DT: Decision Tree, RF: Random Forest, SVM: Support Vector Machine, SGD: Stochastic Gradient Descent, LR: Logistic Regression
### 모델별 예측정확도 결과는 아래와 같다.

#### Test size: 0.1일때
#### DT: 95%, RF: 96%, SVM: 96%, SGD: 89%, LR: 96%

#### Test size: 0.2일때
#### DT: 94%, RF: 97%, SVM: 94%, SGD: 61%, LR: 96%

#### Test size: 0.3일때
#### DT: 93%, RF: 98%, SVM: 96%, SGD: 90%, LR: 98%

#### Test size: 0.4일때
#### DT: 94%, RF: 97%, SVM: 96%, SGD: 95%, LR: 99%

#### Test size: 0.5일때
#### DT: 92%, RF: 97%, SVM: 96%, SGD: 94%, LR: 97%

### RF와 LR이 가장 높았고 Test size에 따라 1~2%씩 차이는 있었으나 무시할 만한 수준으로 본다. 둘 다 베스트 모델.
### Test size 0.4에서 LR이 99%이긴 하나, 유의미한 차이로 보기에는 부족하지 않나 싶다. 인스턴스 수가 569인데, 이 수치가 유의미한지는 통계적 지식이 있어야 판별할 수 있을 것으로 생각된다.
### 이 데이터셋은 Digits와 같이 Test size에 영향을 받는 폭이 미미했다. 

### 3개 데이터셋을 종합하면, Label의 개수가 많을 경우 DT는 정확도가 떨어지고 있다. 예외가 나올 확률이 높은 알고리즘의 특성이 잘 드러나고 있다.