In [None]:
# Gaussian Naive Bayes: basic walkthrough
# Generate random data, view a scatter plot, train the model, then check accuracy and predicted probabilities

# Import Libraries
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Generate some (random) data: 100 samples, 2 features, 2 clusters
X, Y = datasets.make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:,0], X[:,1], c=Y, s=50, cmap='RdBu')
plt.show()

# Train a Naive Bayes Model
model = GaussianNB()
model.fit(X, Y)

# Per-class feature means and variances learned by the model.
# `sigma_` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of `var_`;
# fall back to `sigma_` only on older releases that do not have `var_` yet.
# They are printed explicitly because a bare expression in the middle of a cell is never displayed.
class_means = model.theta_
class_vars = model.var_ if hasattr(model, 'var_') else model.sigma_
print(class_means)  # Mean
print(class_vars)   # Variance

# Overlay the learned class means (large diamonds) on the training scatter plot
plt.scatter(X[:,0], X[:,1], c=Y, s=50, cmap='RdBu')
plt.scatter(class_means[:,0], class_means[:,1], marker='d', c=['r', 'b'], s=200)
plt.show()

# Performance Evaluation (on the same data the model was fitted on)
pred = model.predict(X)
print(pred[:10])
print(Y[:10])

# Class membership probabilities for the first 10 samples
score = model.predict_proba(X)
print(score[:10,:])

# Accuracy
acc = metrics.accuracy_score(Y, pred)
print('Accuracy : ', acc)

# New data set (cluster_std 1.5 -> 2.5), same random_state; scored with the model fitted above
X2, Y2 = datasets.make_blobs(100, 2, centers=2, random_state=2, cluster_std=2.5)

pred2 = model.predict(X2)
print(pred2[:10])
print(Y2[:10])

score2 = model.predict_proba(X2)
print(score2[:10,:])

acc2 = metrics.accuracy_score(Y2, pred2)
print('Accuracy : ', acc2)

In [None]:
# Breast Cancer Wisconsin (Diagnostic) Dataset
# 569 instances (212 Malignant, 357 Benign)
# 30 numerical features (computed from a digitized image of a breast mass)
# 2 classes (Malignant, Benign)

# Import Libraries
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

wisconsin = datasets.load_breast_cancer()

# Inspect what the data looks like.
# Printed explicitly: a bare expression in the middle of a cell is evaluated but never displayed.
print(wisconsin.keys())
print(wisconsin.data[:5])  # first rows only, rather than the whole array
print(wisconsin.data.shape)
print(wisconsin.target_names)

# Hold out 30% of the data as the test set
TrainX, TestX, TrainY, TestY = train_test_split(wisconsin.data, wisconsin.target, test_size=0.3, random_state=0)
print(TrainX.shape)
print(TrainY.shape)
print(TestX.shape)
print(TestY.shape)

# Fit the model on the training data
model = GaussianNB()
model.fit(TrainX,TrainY)
print(model.theta_)

# Check predictions and probabilities on the training data
pred_train = model.predict(TrainX)
print(pred_train[:20])
print(TrainY[:20])

score_train = model.predict_proba(TrainX)
print(score_train[:10,:])

# Check the fitted model on the test data
pred_test = model.predict(TestX)
print(pred_test[:20])
print(TestY[:20])

score_test = model.predict_proba(TestX)
print(score_test[:10,:])

# Accuracy
tr_acc = metrics.accuracy_score(TrainY, pred_train)
ts_acc = metrics.accuracy_score(TestY, pred_test)
print('tr Acc : ', tr_acc)
print('ts Acc : ', ts_acc)

# ROC & AUC analysis (takes predicted probabilities as input, not hard labels).
# Column 1 of the probability matrix is used as the score for pos_label=1.
# NOTE(review): label 1 presumably corresponds to target_names[1] -- confirm against the printout above.
tr_fpr, tr_tpr, tr_th = metrics.roc_curve(TrainY, score_train[:, 1], pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(TestY, score_test[:, 1], pos_label=1)

plt.plot(tr_fpr, tr_tpr, color='b', label='Train')
plt.plot(ts_fpr, ts_tpr, color='r', label='Test')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.show()

# Summarize the ROC curves with AUC
tr_auc = metrics.roc_auc_score(TrainY, score_train[:,1])
print('tr AUC : ', tr_auc)

ts_auc = metrics.roc_auc_score(TestY, score_test[:,1])
print('ts AUC : ', ts_auc)

In [None]:
# Iris Plants Dataset
# 150 instances (50 per class)
# 4 numerical features (sepal length, sepal width, petal length, petal width)
# 3 classes (setosa, versicolor, virginica)

# Import Libraries
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

iris = datasets.load_iris()

# Inspect what the data looks like.
# Printed explicitly: a bare expression in the middle of a cell is evaluated but never displayed.
print(iris.keys())
print(iris.data.shape)
print(iris.target_names)

# Hold out 30% of the data as the test set
TrainX, TestX, TrainY, TestY = train_test_split(iris.data, iris.target, test_size=0.3, random_state=0)
print(TrainX.shape)
print(TrainY.shape)
print(TestX.shape)
print(TestY.shape)

# Fit the model on the training data
model = GaussianNB()
model.fit(TrainX, TrainY)
print(model.theta_)

# Check the fitted model on both the training and the test data
pred_train = model.predict(TrainX)
# Uncomment to compare the full prediction vector with the true labels:
# print(pred_train)
# print(TrainY)
pred_test = model.predict(TestX)
score_train = model.predict_proba(TrainX)
score_test = model.predict_proba(TestX)
# Uncomment to inspect the full probability matrices:
# print(score_train)
# print(score_test)

# Accuracy
tr_acc = metrics.accuracy_score(TrainY, pred_train)
print('tr ACC : ', tr_acc)

ts_acc = metrics.accuracy_score(TestY, pred_test)
print('ts ACC : ', ts_acc)

# Confusion Matrix (rows: true class, columns: predicted class)
tr_cmat = metrics.confusion_matrix(TrainY, pred_train)
print(tr_cmat)

# Kept in its own variable so the train matrix above is not overwritten
ts_cmat = metrics.confusion_matrix(TestY, pred_test)
print(ts_cmat)

# ROC curve & AUC
# The target has three classes, so this is a one-vs-rest curve for class label 1 only:
# samples with label 1 are the positives and column 1 of predict_proba is their score.
tr_fpr, tr_tpr, tr_th = metrics.roc_curve(TrainY, score_train[:,1], pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(TestY, score_test[:,1], pos_label=1)

plt.plot(tr_fpr, tr_tpr, color='b', label='Train')
plt.plot(ts_fpr, ts_tpr, color='r', label='Test')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.show()

# Area under the one-vs-rest (class 1) curves plotted above
tr_auc = metrics.auc(tr_fpr, tr_tpr)
print('tr AUC : ', tr_auc)

ts_auc = metrics.auc(ts_fpr, ts_tpr)
print('ts AUC : ', ts_auc)