In [65]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [87]:
# Load the breast-cancer dataset that ships with scikit-learn and split the
# returned Bunch into the feature matrix (X) and the class labels (Y).
data = load_breast_cancer()
X, Y = data["data"], data["target"]

# Display the class names as the cell output.
list(data["target_names"])

['malignant', 'benign']

In [126]:
%matplotlib qt
from sklearn.preprocessing import StandardScaler
from matplotlib import interactive
interactive(True)

sc = StandardScaler()
X_scaled = sc.fit_transform(X)

from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(X_scaled)


fig = plt.figure(1, figsize=(12, 12))
ax = fig.add_subplot(111, projection="3d", elev=90, azim=0)

X_reduced = PCA(n_components=3).fit_transform(X)
ax.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    X_reduced[:, 2],
    c=y,
    cmap=plt.cm.Set1,
    edgecolor="k",
    s=20,
)

ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])

plt.show()

### Implement Naive Bayes Classifier from scratch

In [55]:
class NaiveBayes():
    """Gaussian Naive Bayes classifier implemented with NumPy.

    Each feature is modelled, per class, as an independent Gaussian whose mean
    and variance are estimated from the training data. Prediction picks the
    class with the largest log-posterior (log-likelihood + log-prior).

    Fitted attributes (populated by ``fit``), all keyed by ``str(i)`` where
    ``i`` is the position of the class in ``self.classes``:

    * ``classes_mean``     -- per-feature mean of the class's samples
    * ``classes_variance`` -- per-feature variance of the class's samples
    * ``classes_prior``    -- fraction of training samples in the class
    """

    def __init__(self, X, y):
        """Record the problem dimensions and the training labels.

        Args:
            X: 2-D array of shape (num_examples, num_features).
            y: 1-D array-like of class labels, one per row of ``X``.
        """
        self.num_examples, self.num_features = X.shape
        # Keep the labels on the instance so that fit() does not have to rely
        # on a notebook-level global named `y`.
        self.y = np.asarray(y)
        # Sorted distinct labels; index i in this array is "class i".
        self.classes = np.unique(self.y)
        self.num_classes = len(self.classes)
        # Added to every variance to avoid division by zero / log(0).
        self.eps = 1e-6

    def fit(self, X, y=None):
        """Estimate per-class mean, variance and prior from ``X``.

        Args:
            X: 2-D training array of shape (num_examples, num_features).
            y: Optional labels for ``X``. When omitted, the labels passed to
                the constructor are used.
        """
        if y is not None:
            self.y = np.asarray(y)
            self.classes = np.unique(self.y)
            self.num_classes = len(self.classes)
        self.num_examples, self.num_features = X.shape

        self.classes_mean = {}
        self.classes_variance = {}
        self.classes_prior = {}

        # Iterate over the labels that actually occur rather than assuming
        # they are exactly 0..num_classes-1.
        for idx, label in enumerate(self.classes):
            X_c = X[self.y == label]
            key = str(idx)

            self.classes_mean[key] = np.mean(X_c, axis=0)
            self.classes_variance[key] = np.var(X_c, axis=0)
            self.classes_prior[key] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Args:
            X: 2-D array of shape (n, num_features); ``n`` may differ from the
                number of training examples.

        Returns:
            1-D array of length ``n`` holding the predicted labels.
        """
        # Size by the rows being predicted, not by the training-set size.
        probs = np.zeros((X.shape[0], self.num_classes))

        for c in range(self.num_classes):
            key = str(c)
            prior = self.classes_prior[key]
            probs_c = self.density_function(
                X, self.classes_mean[key], self.classes_variance[key]
            )
            # Log-posterior up to a constant shared by all classes.
            probs[:, c] = probs_c + np.log(prior)

        # Map the winning column index back to the original label value.
        return self.classes[np.argmax(probs, 1)]

    def density_function(self, x, mean, sigma):
        """Log-density of a diagonal-covariance Gaussian for each row of ``x``.

        Args:
            x: 2-D array of shape (n, num_features).
            mean: 1-D array of per-feature means.
            sigma: 1-D array of per-feature *variances* (despite the name).

        Returns:
            1-D array of length ``n`` with the log-density of each row.
        """
        # Normalisation term: -(d/2)·log(2π) - ½·Σ log(variance)
        const = -self.num_features / 2 * np.log(2 * np.pi) - 0.5 * np.sum(np.log(sigma + self.eps))
        # Quadratic term: ½·Σ (x - μ)² / variance, summed over features.
        probs = 0.5 * np.sum(np.power(x - mean, 2) / (sigma + self.eps), 1)
        return const - probs

### Training

In [58]:
# Fit the from-scratch classifier, then predict on the same data it was
# trained on (so the figure below is training accuracy).
NBC = NaiveBayes(X, Y)
NBC.fit(X)
preds = NBC.predict(X)

# Accuracy = fraction of rows whose predicted label equals the true label.
print(f"Accuracy: {np.mean(preds == Y)}")

Accuracy: 0.9402460456942003


### Compare our implementation with sklearn 

In [19]:
# Reference run: scikit-learn's GaussianNB, fitted and evaluated on the same
# data as the from-scratch classifier.
NBC_sklearn = GaussianNB()
NBC_sklearn.fit(X, Y)
preds_sklearn = NBC_sklearn.predict(X)

# Accuracy = fraction of rows whose predicted label equals the true label.
print(f"Accuracy: {np.mean(preds_sklearn == Y)}")

Accuracy: 0.9420035149384886
