In [1]:
import numpy as np
import sklearn

In [2]:
# Toy training set: six 2-D points, one row per sample.
X = np.array([
    [-2, -1], [-5, -1], [-1, -1],
    [1, 1], [3, 1], [5, 2],
])
# Class label for each row of X (1 = speeds up, 2 = slows down).
Y = np.array([1, 1, 1, 2, 2, 2])

In [3]:
from sklearn.naive_bayes import GaussianNB
# Create a Gaussian Naive Bayes classifier and train it on the toy data (X, Y).
clf = GaussianNB()
# fit() is the last expression of the cell, so the notebook displays its return value.
clf.fit(X, Y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [6]:
# Classify one new point (1, 3), passed as a one-row 2-D list; prints the predicted label array.
print(clf.predict([[1, 3]]))

[2]


## sklearn에서 기본 제공하는 위스콘신 유방암 진단 데이터를 활용하여 NB 테스트
- Binary classification dataset (M: 악성(malignant), B: 양성(benign)) — sklearn 데이터에서는 target 값 0이 악성, 1이 양성으로 인코딩되어 있음
- 30개의 속성(feature)

In [8]:
from sklearn.datasets import load_breast_cancer
breast = load_breast_cancer() # load the dataset
dir(breast) # list its fields; the dataset comes back as a dict-like object

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [9]:
# Print the description text that ships with the dataset.
print(breast.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [10]:
# Print the names of the classes.
print(breast.target_names)

['malignant' 'benign']


In [11]:
# Print the full array of class labels, one 0/1 value per sample.
print(breast.target) 

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 1 0 

### 각 딕셔너리 키 값의 의미는?
- breast.DESCR은 데이터셋에 대한 설명(description) 텍스트
- breast.target_names는 각 라벨 값이 의미하는 클래스 이름 (0: malignant, 1: benign)
- breast.target은 각 샘플의 분류 라벨(정답)
- breast.feature_names는 각 feature(열)의 이름
- breast.data는 학습에 사용할 feature 데이터


In [16]:
# Pull the dataset fields into plain variables for the steps that follow.
label_names = breast.target_names      # class names
labels = breast.target                 # class label of each sample
feature_names = breast.feature_names   # name of each feature
features = breast.data                 # feature values used for training

In [18]:
# Confirm the extracted class names.
print(label_names)

['malignant' 'benign']


In [19]:
from sklearn.model_selection import train_test_split

# Split features and labels into train and test sets. test_size=0.1 holds out
# 10% of the whole dataset for testing (not 10% of the training set);
# random_state is fixed so the split is reproducible.
train, test, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=90,
)

In [20]:
# GaussianNB was already imported in an earlier cell; repeating the import is harmless.
from sklearn.naive_bayes import GaussianNB
# New, not-yet-fitted Gaussian Naive Bayes classifier for the breast cancer data.
gnb = GaussianNB()

In [22]:
# Train on the training split; fit() returns the fitted estimator itself,
# so `model` and `gnb` refer to the same object.
model = gnb.fit(train, train_labels)
# Feed the held-out test samples to the trained model to get predicted labels.
preds = model.predict(test)
# Show the predicted labels.
print(preds)

[1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0 0 0 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1
 1 1 1 1 0 0 1 0 1 1 1 0 0 1 1 1 1 0 1 1]


In [24]:
from sklearn.metrics import accuracy_score

# Prediction accuracy: fraction of test samples whose predicted label matches the true label.
print(accuracy_score(y_true=test_labels, y_pred=preds))

0.9473684210526315
