<a href="https://colab.research.google.com/github/DavidJeonKr/lab_python/blob/master/ml09_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 이진 분류(Binary Class Classification)

* scikit-learn 패키지의 breast cancer 예제 데이터 셋을 로딩
* 악성종양(malignant)/양성종양(benign): 진짜 암/암이 아닌 종양을 분류하는 이진 분류 문제.
* 3가지 머신 러닝 분류 모델을 비교
    * KNN
    * Logistic Regression
    * SGD Classifier

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
# Load the breast-cancer example dataset that ships with scikit-learn.
breast_cancer = load_breast_cancer()

In [12]:
# Alternative quick looks at the raw data, kept disabled:
#print(breast_cancer.data[:5])
#print(breast_cancer.target_names)
#print(breast_cancer.feature_names)
# Print the dataset's description text, then the class label names.
print(breast_cancer['DESCR'])
print(breast_cancer.target_names)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [15]:
# Feature matrix: one row per sample, one column per feature.
X = breast_cancer.data
X.shape

(569, 30)

In [18]:
# Target vector: one class label per sample.
y = breast_cancer.target
y.shape

(569,)

In [20]:
# Split into training and test sets: 25% is held out for testing,
# class proportions are preserved via stratify, and the seed is fixed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# 모델 생성 - KNN
* KNN

In [19]:
# KNN pipeline: standardize the features first, then classify with
# k-nearest neighbours (default hyperparameters).
std_scaler = StandardScaler()
clf = KNeighborsClassifier()
knn_steps = [('std_scaler', std_scaler), ('clf', clf)]
knn_model = Pipeline(steps=knn_steps)

In [21]:
# Train the pipeline on the training set (this cell fits the model;
# the evaluation happens in the cells that follow).
knn_model.fit(X_train,y_train)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('clf', KNeighborsClassifier())])

In [26]:
# Training-set evaluation: predict on the data the model was fitted on
# and show the first five predictions.
knn_train_pred = knn_model.predict(X_train)
knn_train_pred[:5]

array([0, 1, 0, 0, 0])

In [28]:
# True labels of the first five training samples, to compare with the predictions.
y_train[:5]

array([0, 1, 0, 0, 0])

In [29]:
# Confusion matrix on the training set (rows: true class, columns: predicted class).
confusion_matrix(y_train, knn_train_pred)

array([[149,  10],
       [  3, 264]])

In [None]:
# Fraction of training samples the KNN pipeline classifies correctly.
accuracy_score(y_train, knn_train_pred)

In [31]:
# Test-set evaluation: predict on the held-out data and show the
# first five predictions.
knn_test_pred = knn_model.predict(X_test)
knn_test_pred[:5]

array([1, 0, 1, 1, 0])

In [32]:
# True labels of the first five test samples, to compare with the predictions.
y_test[:5]

array([1, 0, 1, 1, 0])

In [33]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
confusion_matrix(y_test,knn_test_pred)

array([[50,  3],
       [ 0, 90]])

In [34]:
# Per-class precision / recall / F1 for the KNN pipeline.
# NOTE(review): the arguments are the TRAINING labels and predictions, so
# this report describes the training set, not the test set evaluated just
# above; pass y_test / knn_test_pred to report held-out performance.
print(classification_report(y_train, knn_train_pred))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       159
           1       0.96      0.99      0.98       267

    accuracy                           0.97       426
   macro avg       0.97      0.96      0.97       426
weighted avg       0.97      0.97      0.97       426



# 모델 생성 - Logistic Regression
* Logistic Regression

In [35]:
# Logistic-regression pipeline: standardize the features first, then
# fit a LogisticRegression classifier (default hyperparameters).
std_scaler = StandardScaler()
clf = LogisticRegression()
log_steps = [('std_scaler', std_scaler), ('clf', clf)]
log_model = Pipeline(steps=log_steps)

In [36]:
# Train the logistic-regression pipeline on the training set.
log_model.fit(X_train, y_train)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('clf', LogisticRegression())])

In [37]:
# Training-set evaluation: predictions on the data the model was fitted on.
log_train_pred = log_model.predict(X_train)

In [38]:
# Confusion matrix on the training set (rows: true class, columns: predicted class).
confusion_matrix(y_train, log_train_pred)

array([[155,   4],
       [  1, 266]])

In [39]:
# Test-set evaluation: predictions on the held-out data.
log_test_pred = log_model.predict(X_test)

In [40]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
confusion_matrix(y_test, log_test_pred)

array([[52,  1],
       [ 1, 89]])

In [41]:
# Per-class precision / recall / F1 for the logistic-regression pipeline.
# NOTE(review): computed from the TRAINING labels and predictions; pass
# y_test / log_test_pred to report held-out performance.
print(classification_report(y_train, log_train_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       159
           1       0.99      1.00      0.99       267

    accuracy                           0.99       426
   macro avg       0.99      0.99      0.99       426
weighted avg       0.99      0.99      0.99       426



# 모델 생성 - SGD Classifier
* SGD Classifier

In [42]:
# SGD pipeline: standardize the features first, then fit a linear
# classifier trained with stochastic gradient descent.
# random_state is pinned (same seed as train_test_split) because
# SGDClassifier shuffles the training data; without a seed the fitted
# model and its metrics change from run to run.
std_scaler = StandardScaler()
clf = SGDClassifier(random_state=42)
sgd_model = Pipeline(steps=[('std_scaler', std_scaler), ('clf', clf)])

In [43]:
# Train the SGD pipeline on the training set.
sgd_model.fit(X_train,y_train)

Pipeline(steps=[('std_scaler', StandardScaler()), ('clf', SGDClassifier())])

In [46]:
# Training-set evaluation: predictions on the data the model was fitted on.
sgd_train_pred = sgd_model.predict(X_train)

In [47]:
# Confusion matrix on the training set (rows: true class, columns: predicted class).
confusion_matrix(y_train, sgd_train_pred)

array([[153,   6],
       [  2, 265]])

In [48]:
# Test-set evaluation: predictions on the held-out data.
sgd_test_pred = sgd_model.predict(X_test)

In [49]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
confusion_matrix(y_test,sgd_test_pred)

array([[51,  2],
       [ 1, 89]])

In [50]:
# Per-class precision / recall / F1 for the SGD pipeline.
# NOTE(review): computed from the TRAINING labels and predictions; pass
# y_test / sgd_test_pred to report held-out performance.
print(classification_report(y_train, sgd_train_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       159
           1       0.98      0.99      0.99       267

    accuracy                           0.98       426
   macro avg       0.98      0.98      0.98       426
weighted avg       0.98      0.98      0.98       426



# 다중 클래스 분류(Multi-class Classification)

* scikit-learn 패키지의 iris 예제 데이터 셋을 로딩.
* setosa/versicolor/virginica: 3개 클래스를 갖는 분류 문제.
* 3가지 머신 러닝 분류 모델 비교

In [54]:
from sklearn.datasets import load_iris

In [60]:
# Load the iris example dataset that ships with scikit-learn.
iris = load_iris()

In [61]:
# Inspect what kind of container load_iris() returned.
type(iris)

sklearn.utils.Bunch

In [68]:
# First five samples. Author's note: standardization is skipped here
# because the feature values are small.
# NOTE(review): scale-sensitive models (e.g. SGDClassifier) may still
# benefit from a StandardScaler step - confirm.
iris.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [62]:
# Rebind X / y to the iris features and labels (they previously held
# the breast-cancer data), then display both shapes.
X, y = iris['data'], iris['target']
(X.shape, y.shape)

((150, 4), (150,))

In [64]:
# Split into training and test sets: 25% is held out for testing,
# class proportions are preserved via stratify, and the seed is fixed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [65]:
# Sanity-check the size of the training split.
X_train.shape, y_train.shape

((112, 4), (112,))

In [67]:
# Sanity-check the size of the test split.
X_test.shape, y_test.shape

((38, 4), (38,))

# KNN

In [70]:
# KNN classifier with default hyperparameters; no scaling step is used here.
model = KNeighborsClassifier()

In [71]:
# Train the KNN classifier on the iris training set.
model.fit(X_train,y_train)

KNeighborsClassifier()

In [72]:
# Training-set evaluation: predictions on the data the model was fitted on.
train_pred = model.predict(X_train)

In [73]:
# Accuracy on the training set.
accuracy_score(y_train, train_pred)

0.9732142857142857

In [74]:
# 3x3 confusion matrix on the training set (rows: true class, columns: predicted class).
confusion_matrix(y_train, train_pred)

array([[38,  0,  0],
       [ 0, 35,  2],
       [ 0,  1, 36]])

In [75]:
# Test-set evaluation: predictions on the held-out data.
test_pred = model.predict(X_test)

In [76]:
# Accuracy on the test set.
accuracy_score(y_test, test_pred)

0.9736842105263158

In [77]:
# 3x3 confusion matrix on the test set (rows: true class, columns: predicted class).
confusion_matrix(y_test, test_pred)

array([[12,  0,  0],
       [ 0, 13,  0],
       [ 0,  1, 12]])

# LogisticRegression

In [79]:
# Plain LogisticRegression with default hyperparameters (no scaling step).
# This rebinds log_model, which earlier held the breast-cancer pipeline.
log_model = LogisticRegression()

In [80]:
# Train the logistic-regression classifier on the iris training set.
log_model.fit(X_train, y_train)

LogisticRegression()

In [82]:
# Predictions on the training set; train_pred is overwritten (it
# previously held the KNN predictions) and then displayed.
train_pred = log_model.predict(X_train)
train_pred

array([2, 2, 1, 1, 1, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 1, 2, 0, 0, 1, 1, 1,
       0, 1, 2, 0, 2, 1, 2, 0, 0, 1, 0, 2, 0, 0, 1, 0, 1, 0, 0, 1, 2, 2,
       0, 2, 1, 0, 2, 0, 2, 2, 0, 1, 2, 2, 1, 1, 0, 2, 1, 2, 1, 2, 0, 1,
       0, 2, 1, 2, 1, 2, 2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 0, 2, 2, 0, 0, 2,
       2, 1, 2, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 1, 0, 1, 2, 1, 0, 0, 0, 0,
       1, 0])

In [84]:
# Predictions on the held-out test set; test_pred is overwritten and displayed.
test_pred = log_model.predict(X_test)
test_pred

array([0, 1, 1, 1, 0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 0, 0, 0, 1, 0, 1, 2, 1,
       2, 1, 2, 1, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 2, 1])

In [86]:
# Report accuracy on both splits for the predictions currently held in
# train_pred / test_pred (the logistic-regression ones at this point).
train_acc = accuracy_score(y_train, train_pred)
print('훈련 셋 정확도: ', train_acc)
test_acc = accuracy_score(y_test, test_pred)
print('테스트셋 셋 정확도: ', test_acc)

훈련 셋 정확도:  0.9732142857142857
테스트셋 셋 정확도:  0.9473684210526315


In [87]:
# Training-set confusion matrix for the logistic-regression predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_train, train_pred)

array([[38,  0,  0],
       [ 0, 35,  2],
       [ 0,  1, 36]])

In [88]:
# Test-set confusion matrix for the logistic-regression predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_test, test_pred)

array([[12,  0,  0],
       [ 0, 12,  1],
       [ 0,  1, 12]])

# SGDClassifier

In [89]:
# Linear classifier trained with stochastic gradient descent on the raw
# (unscaled) iris features. This rebinds sgd_model, which earlier held
# the breast-cancer pipeline.
# random_state is pinned (same seed as train_test_split) because
# SGDClassifier shuffles the training data; without a seed the
# predictions and accuracy change from run to run.
sgd_model = SGDClassifier(random_state=42)

In [95]:
# Train the SGD classifier on the iris training set.
sgd_model.fit(X_train, y_train)

SGDClassifier()

In [96]:
# Predictions on the training set; train_pred is overwritten (it
# previously held the logistic-regression predictions) and displayed.
train_pred = sgd_model.predict(X_train)
train_pred

array([2, 2, 1, 1, 1, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 2, 0, 2, 1, 2, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 2, 2,
       0, 2, 1, 0, 2, 0, 2, 2, 0, 1, 2, 2, 1, 1, 0, 2, 1, 2, 1, 2, 0, 1,
       0, 2, 1, 2, 1, 2, 2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 0, 2, 2, 0, 0, 2,
       2, 1, 2, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 1, 0, 1, 2, 1, 0, 0, 0, 0,
       1, 0])

In [97]:
# True training labels, shown for a side-by-side comparison with the predictions.
y_train

array([2, 2, 1, 1, 1, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 1, 2, 0, 0, 1, 1, 1,
       0, 1, 2, 0, 2, 1, 2, 0, 0, 1, 0, 2, 0, 0, 1, 0, 1, 0, 0, 1, 2, 2,
       0, 2, 1, 0, 2, 0, 2, 2, 0, 1, 2, 2, 1, 1, 0, 1, 1, 2, 1, 2, 0, 1,
       0, 2, 1, 2, 1, 2, 2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 0, 2, 2, 0, 0, 2,
       2, 1, 2, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 0, 0, 0, 0,
       1, 0])

In [98]:
# Predictions on the held-out test set; test_pred is overwritten and displayed.
test_pred = sgd_model.predict(X_test)
test_pred

array([0, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 1, 0, 1, 2, 1,
       2, 1, 2, 1, 1, 2, 0, 1, 2, 2, 0, 0, 0, 0, 2, 2])

In [99]:
# Report accuracy on both splits for the predictions currently held in
# train_pred / test_pred (the SGD classifier's at this point).
train_acc = accuracy_score(y_train, train_pred)
print('훈련 셋 정확도: ', train_acc)
test_acc = accuracy_score(y_test, test_pred)
print('테스트셋 셋 정확도: ', test_acc)

훈련 셋 정확도:  0.9553571428571429
테스트셋 셋 정확도:  0.8947368421052632


In [100]:
# Training-set confusion matrix for the SGD predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_train, train_pred)

array([[38,  0,  0],
       [ 0, 35,  2],
       [ 0,  3, 34]])

In [101]:
# Test-set confusion matrix for the SGD predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_test, test_pred)

array([[11,  1,  0],
       [ 0, 10,  3],
       [ 0,  0, 13]])