# **Gaussian Naive Bayes**

Import Libraries

In [1]:
from sklearn import datasets
import matplotlib.pyplot as plt

Generate some (random) data

In [None]:
# Draw 100 two-dimensional points from two Gaussian blobs; the fixed seed
# keeps the sample identical across runs.
X, Y = datasets.make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    cluster_std=1.5,
    random_state=2,
)

# Plot feature 0 against feature 1, colouring each point by its class label.
plt.scatter(x=X[:, 0], y=X[:, 1], s=50, c=Y, cmap='RdBu')
plt.show()

**Train a Naive Bayes Model**

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit a Gaussian Naive Bayes classifier on the blob data; `fit` returns the
# estimator itself, so ending the cell with `model` shows the same repr.
model = GaussianNB().fit(X, Y)
model

Let's check out the distribution of the classes

In [None]:
# Fitted mean of each feature for each class: one row per class, one column
# per feature, i.e. shape (n_classes, n_features).
model.theta_

In [None]:
# Fitted variance of each feature for each class, shape (n_classes, n_features).
# `sigma_` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `var_`; fall back to `sigma_` only on older releases that predate `var_`.
model.var_ if hasattr(model, 'var_') else model.sigma_

In [None]:
# Data points coloured by class label.
plt.scatter(x=X[:, 0], y=X[:, 1], s=50, c=Y, cmap='RdBu')

# Overlay the fitted per-class means as large diamonds (red, then blue).
plt.scatter(
    x=model.theta_[:, 0],
    y=model.theta_[:, 1],
    s=200,
    c=['r', 'b'],
    marker='d',
)
plt.show()

**Performance Evaluation**

In [None]:
from sklearn import metrics

In [None]:
# Hard class labels and per-class probabilities for the points the model was fitted on.
pred = model.predict(X)
score = model.predict_proba(X)

# First ten predicted labels, then the first ten true labels, for a side-by-side check.
print(pred[:10], Y[:10], sep='\n')

# Class probabilities for the same ten points (one column per class).
print(score[:10])

Accuracy

In [None]:
# Fraction of points whose predicted label equals the true label.
acc = metrics.accuracy_score(y_true=Y, y_pred=pred)
print('Accuracy : ', acc)

In [None]:
# A second sample generated with the same seed and cluster count as before,
# but with a larger spread (cluster_std=2.5 instead of 1.5).
X2, Y2 = datasets.make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    cluster_std=2.5,
    random_state=2,
)

# Score the already-fitted model on the new sample (no refitting).
pred2 = model.predict(X2)
score2 = model.predict_proba(X2)

# First ten predicted labels followed by the first ten true labels.
print(pred2[:10], Y2[:10], sep='\n')

# Class probabilities for the same ten points.
print(score2[:10])

Accuracy

In [None]:
# Accuracy of the original model on the wider-spread sample.
acc2 = metrics.accuracy_score(y_true=Y2, y_pred=pred2)
print('Accuracy : ', acc2)

# **Breast Cancer Wisconsin (Diagnostic) Dataset**
*   569 instances (212 Malignant, 357 Benign)
*   30 numerical features (computed from a digitized image of a breast mass)
*   2 classes (Malignant, Benign)


Import Libraries

In [None]:
import numpy as np
from sklearn import datasets

Load dataset

In [None]:
# Load scikit-learn's bundled Breast Cancer Wisconsin (Diagnostic) dataset.
# The returned object exposes the fields used below (`data`, `target`, `target_names`).
wisconsin = datasets.load_breast_cancer()

In [None]:
# List the fields available on the loaded dataset object.
wisconsin.keys()

In [None]:
# Preview only the first five rows of the feature matrix rather than
# displaying the whole array as the cell output.
wisconsin.data[:5]

In [None]:
# (n_samples, n_features) of the feature matrix; the dataset summary in this
# notebook lists 569 instances and 30 features.
wisconsin.data.shape

In [None]:
# Human-readable names of the target classes.
wisconsin.target_names

**Prepare Data**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the samples for testing; the fixed seed makes the split reproducible.
TrainX, TestX, TrainY, TestY = train_test_split(
    wisconsin.data,
    wisconsin.target,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Sanity-check the split sizes: training features/labels first, then test
# features/labels, one shape per line.
print(TrainX.shape, TrainY.shape, TestX.shape, TestY.shape, sep='\n')

**Train a Naive Bayes Model**

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit Gaussian Naive Bayes on the training split only; `fit` returns the
# estimator, so ending the cell with `model` shows the same repr.
model = GaussianNB().fit(TrainX, TrainY)
model

In [None]:
# Per-class mean of every feature estimated from the training split,
# shape (n_classes, n_features).
model.theta_

In [None]:
# Predicted labels and class probabilities on the training split.
pred_train = model.predict(TrainX)
score_train = model.predict_proba(TrainX)

# First twenty predicted labels, then the matching true labels.
print(pred_train[:20], TrainY[:20], sep='\n')

# Display the class probabilities of the first ten training samples.
score_train[:10]

In [None]:
# Predicted labels and class probabilities on the held-out test split.
pred_test = model.predict(TestX)
score_test = model.predict_proba(TestX)

# First twenty predicted labels, then the matching true labels.
print(pred_test[:20], TestY[:20], sep='\n')

# Display the class probabilities of the first ten test samples.
score_test[:10]

**Performance Evaluation**

In [None]:
from sklearn import metrics

Accuracy

In [None]:
# Accuracy on the data the model was fitted on versus on the held-out data.
tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=pred_train)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=pred_test)

print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

ROC curve & AUC

In [None]:
# ROC points (false-positive rate, true-positive rate, thresholds) for each split,
# scoring with the predicted probability of class 1 and treating class 1 as positive.
tr_fpr, tr_tpr, tr_th = metrics.roc_curve(
    y_true=TrainY,
    y_score=score_train[:, 1],
    pos_label=1,
)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(
    y_true=TestY,
    y_score=score_test[:, 1],
    pos_label=1,
)

In [None]:
import matplotlib.pyplot as plt

# ROC curves for the training and test splits on shared axes.
plt.plot(tr_fpr, tr_tpr, color='b', label='Train')
plt.plot(ts_fpr, ts_tpr, color='r', label='Test')

# Diagonal reference: the ROC of a classifier that guesses at random.
plt.plot([0.0, 1.0], [0.0, 1.0], color='gray', linestyle='--', label='Chance')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])

# Label the figure so it can be read without the surrounding cells.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

In [None]:
# Area under the ROC curve for each split, using the predicted probability of class 1.
tr_auc = metrics.roc_auc_score(y_true=TrainY, y_score=score_train[:, 1])
ts_auc = metrics.roc_auc_score(y_true=TestY, y_score=score_test[:, 1])

print('Training AUC : ', tr_auc)
print('Test AUC : ', ts_auc)

# **Iris Plants Dataset**
*   150 instances (50 per class)
*   4 numerical features (sepal length, sepal width, petal length, petal width)
*   3 classes (setosa, versicolor, virginica)



In [None]:
import numpy as np
from sklearn import datasets
# Load scikit-learn's bundled Iris dataset; `data`, `target` and `target_names`
# are read from this object in the cells below.
iris = datasets.load_iris()

In [None]:
# List the fields available on the loaded dataset object.
iris.keys()

In [None]:
# Human-readable names of the target classes.
iris.target_names

**Prepare Data**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the samples for testing; the fixed seed makes the split reproducible.
# Note: this rebinds the TrainX/TestX/TrainY/TestY names used in the previous section.
TrainX, TestX, TrainY, TestY = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Sanity-check the split sizes: training features/labels first, then test
# features/labels, one shape per line.
print(TrainX.shape, TrainY.shape, TestX.shape, TestY.shape, sep='\n')

**Train a Naive Bayes Model**

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit Gaussian Naive Bayes on the Iris training split; `fit` returns the
# estimator, so ending the cell with `model` shows the same repr.
model = GaussianNB().fit(TrainX, TrainY)
model

In [None]:
# Per-class mean of every feature estimated from the training split,
# shape (n_classes, n_features).
model.theta_

In [None]:
# Predicted labels for the training split, printed above the true labels for comparison.
pred_train = model.predict(TrainX)
print(pred_train, TrainY, sep='\n')

In [None]:
# Predicted labels for the held-out split, printed above the true labels for comparison.
pred_test = model.predict(TestX)
print(pred_test, TestY, sep='\n')

**Performance Evaluation**

In [None]:
from sklearn import metrics

Accuracy

In [None]:
# Accuracy on the data the model was fitted on versus on the held-out data.
tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=pred_train)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=pred_test)

print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

Confusion Matrix

In [None]:
# Confusion matrix on the training split: rows are true classes,
# columns are predicted classes.
tr_cmat = metrics.confusion_matrix(TrainY, pred_train)
print(tr_cmat)

# Confusion matrix on the held-out split, kept in its own variable so the
# training matrix above is not overwritten under a misleading name.
ts_cmat = metrics.confusion_matrix(TestY, pred_test)
print(ts_cmat)