In [129]:
import numpy as np
import pandas as pd
import mglearn

In [130]:
# Data load
# Load the breast cancer dataset bundled with scikit-learn. The returned object
# is read later both by key (cancer_dataset['target']) and by attribute
# (cancer_dataset.data).
from sklearn.datasets import load_breast_cancer
cancer_dataset = load_breast_cancer()

In [131]:
# List the fields available on the loaded dataset object.
dataset_keys = cancer_dataset.keys()
print("Keys of cancer_dataset:\n", dataset_keys)

Keys of cancer_dataset:
 dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [132]:
# Show the class labels and the shape of the target array.
target_names = cancer_dataset.target_names
target_shape = cancer_dataset.target.shape
print("Target names:\n", target_names)
print("Shape of target:", target_shape)

Target names:
 ['malignant' 'benign']
Shape of target: (569,)


In [133]:
# Show the feature names and the shape of the feature-name array
# (i.e. how many features there are).
print("Feature names:\n", cancer_dataset['feature_names'])
print("Shape of feature:", cancer_dataset['feature_names'].shape)

Feature naems:
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Shape of feature: (30,)


설명: feature의 수는 30개, target의 class 수는 2개(malignant, benign)인 것을 알 수 있다.

In [134]:
from sklearn.model_selection import train_test_split

In [135]:
# Data split (Training / Validation / Test)
# Step 1: hold out the test set from the full dataset.
# Step 2: split the remainder into training and validation sets.
# Both splits are stratified on the labels and use the same fixed seed.
x_trainval, x_test, y_trainval, y_test = train_test_split(
    cancer_dataset.data,
    cancer_dataset.target,
    stratify=cancer_dataset.target,
    random_state=1,
)

x_train, x_val, y_train, y_val = train_test_split(
    x_trainval,
    y_trainval,
    stratify=y_trainval,
    random_state=1,
)

In [136]:
# Report the shape of every split (features and labels) in a fixed order.
split_shapes = (
    ("x_train shape:", x_train),
    ("y_train shape:", y_train),
    ("x_val shape:", x_val),
    ("y_val shape:", y_val),
    ("x_test shape:", x_test),
    ("y_test shape:", y_test),
)
for shape_label, split_array in split_shapes:
    print(shape_label, split_array.shape)

x_train shape: (319, 30)
y_train shape: (319,)
x_val shape: (107, 30)
y_val shape: (107,)
x_test shape: (143, 30)
y_test shape: (143,)


설명: shape를 통해 train 데이터, validation 데이터, test 데이터의 instance 수는 각각 319개, 107개, 143개인 것을 알 수 있다.

In [137]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [138]:

# Candidate values for the two hyperparameters being tuned.
md_setting = [1, 3, 5, 10, 20]     # max_depth candidates
msl_setting = [1, 2, 5, 10, 20]    # min_samples_leaf candidates

# Every [max_depth, min_samples_leaf] pair, max_depth varying slowest.
md_msl_settings = [[md, msl] for md in md_setting for msl in msl_setting]

training_accuracy = []
vali_accuracy = []

# Vary the two hyperparameters and compare performance on the validation set.
for md, msl in md_msl_settings:
    # Training (DecisionTreeClassifier)
    clf1 = DecisionTreeClassifier(
        max_depth=md,
        min_samples_leaf=msl,
        random_state=0,
    )
    clf1.fit(x_train, y_train)

    # Predict on the data the model was fitted on and on the validation data.
    y_train_hat = clf1.predict(x_train)
    y_val_hat = clf1.predict(x_val)

    # Evaluation: record one accuracy per setting, in the same order as md_msl_settings.
    training_accuracy.append(accuracy_score(y_train, y_train_hat))
    vali_accuracy.append(accuracy_score(y_val, y_val_hat))
        


In [139]:
# Collect the search results into one table: one row per hyperparameter setting.
result_columns = {
    "max_depth,min_samples_leaf": md_msl_settings,
    "training accuracy": training_accuracy,
    "validation accuracy": vali_accuracy,
}
result = pd.DataFrame(result_columns)

In [140]:
# Training vs. validation accuracy for each hyperparameter setting
result

Unnamed: 0,"max_depth,min_samples_leaf",training accuracy,validation accuracy
0,"[1, 1]",0.924765,0.915888
1,"[1, 2]",0.924765,0.915888
2,"[1, 5]",0.924765,0.915888
3,"[1, 10]",0.924765,0.915888
4,"[1, 20]",0.924765,0.915888
5,"[3, 1]",0.971787,0.943925
6,"[3, 2]",0.968652,0.943925
7,"[3, 5]",0.965517,0.953271
8,"[3, 10]",0.962382,0.943925
9,"[3, 20]",0.949843,0.925234


설명: max_depth가 3이고 min_samples_leaf가 5일 때, validation accuracy가 약 0.953으로 가장 높게 나온다.
max_depth는 tree의 최대 깊이를 의미하고,
min_samples_leaf는 leaf 노드가 가져야 하는 최소 샘플 수를 의미한다.
즉, tree의 최대 깊이를 3으로, leaf 노드의 최소 샘플 수를 5로 정했을 때 validation data에 대해 가장 높은 정확도를 보여준다.

In [141]:
# After selecting the best hyperparameters, check that model's performance on the test set.
# The best setting is read from the search results rather than hard-coded, so
# this cell stays correct if the grid, the split or the seed changes.
# idxmax returns the first row on ties.
best_row = result["validation accuracy"].idxmax()
best_md, best_msl = result.loc[best_row, "max_depth,min_samples_leaf"]

clf2 = DecisionTreeClassifier(max_depth=best_md, min_samples_leaf=best_msl, random_state=0)
clf2.fit(x_train, y_train)

# The test set is used only here, for the final evaluation.
y_test_hat = clf2.predict(x_test)

print("Test accuracy:",accuracy_score(y_test,y_test_hat))

Test accuracy: 0.9300699300699301


In [142]:
# Confusion matrix of the test-set predictions against the true test labels
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_test_hat)

array([[49,  4],
       [ 6, 84]], dtype=int64)

설명: 최적의 hyperparameter를 사용하여 test set에 대한 성능을 확인한 결과, 정확도는 약 93%이다.


아래에서는 confusion matrix를 표현하였다.