# Implement the Naive Bayes Classifier

In [1]:
import numpy as np

class NaiveBayes:
  def fit(self, X, y):
    num_examples, num_features = X.shape
    self._classes = np.unique(y)
    num_classes = len(self._classes)
    # Initialize mean, variance and priors as zeroes
    self._means = np.zeros((num_classes, num_features))
    self._variances = np.zeros((num_classes, num_features))
    self._priors = np.zeros(num_classes)
    # Calculate the mean, variance and priors for each class and store them in the class
    for i, c in enumerate(self._classes):
      X_c = X[y==c] # All examples in X with class 'c'
      self._means[i, :] = X_c.mean(axis=0)
      self._variances[i, :] = X_c.var(axis=0)
      self._priors[i] = X_c.shape[0] / num_examples # prior probability, i.e., frequency of each class

  def predict(self, X):
    return np.array([self._predict(x) for x in X])

  def _predict(self, x):
    # Calculate posterior probability for each class
    posteriors = []
    for i in range(len(self._classes)):
      posteriors.append( np.log(self._priors[i]) + np.sum(np.log(self._pdf(i, x))) )
    # Return the index of the class with the highest posterior probability
    return self._classes[np.argmax(posteriors)]

  def _pdf(self, class_index, x):
    # Implement the Probability Density Function (PDF) of a Gaussian Distribution, P(x|c)
    # for each each unique class 'c' present in 'y'.
    mean = self._means[class_index]
    variance = self._variances[class_index]
    numerator = np.exp(-((x - mean) ** 2) / (2 * variance)) + 1e-9 # Avoid taking natural log of 0 warning in the _predict function
    denominator = np.sqrt(2 * np.pi * variance)
    return numerator / denominator

# Load in a dataset and train the model

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

data = datasets.load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

model = NaiveBayes()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test the model

In [3]:
def accuracy(y_test, y_pred):
  acc = np.sum(y_test == y_pred) / len(y_test)
  acc *= 100
  acc = round(acc, 2)
  return acc

def calculate_metrics(y_test, y_pred):
  true_positives = np.sum(np.logical_and(y_test == 1, y_pred == 1))
  false_positives = np.sum(np.logical_and(y_test == 0, y_pred == 1))
  false_negatives = np.sum(np.logical_and(y_test == 1, y_pred == 0))
  precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
  recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
  f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
  confusion_matrix = np.array([[true_positives, false_positives], [false_negatives, len(y_test) - true_positives]])
  return precision, recall, f1_score, confusion_matrix

def print_confusion_matrix(conf_matrix):
  true_positives, false_positives, false_negatives, true_negatives = conf_matrix.ravel()
  print(f"                    Actual Positive    | Actual Negative")
  print(f"Predicted Positive |       {true_positives} (TP)     |    {false_positives} (FP)")
  print(f"Predicted Negative |       {false_negatives} (FN)     |    {true_negatives} (TN)")

In [4]:
acc = accuracy(y_test, y_pred)
precision, recall, f1_score, confusion_matrix = calculate_metrics(y_test, y_pred)

print(f"Accuracy: {acc}%")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Confusion Matrix:")
print_confusion_matrix(confusion_matrix)

Accuracy: 88.6%
Precision: 0.8783783783783784
Recall: 0.9420289855072463
F1 Score: 0.9090909090909092
Confusion Matrix:
                    Actual Positive    | Actual Negative
Predicted Positive |       65 (TP)     |    9 (FP)
Predicted Negative |       4 (FN)     |    49 (TN)
