# Gaussian Naive Bayes

https://www.youtube.com/watch?v=H3EjCKtlVog

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.model_selection import train_test_split

In [2]:
data_raw = pd.read_csv('./data/Breast_cancer_data.csv')
data_raw.head(10)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0
5,12.45,15.7,82.57,477.1,0.1278,0
6,18.25,19.98,119.6,1040.0,0.09463,0
7,13.71,20.83,90.2,577.9,0.1189,0
8,13.0,21.82,87.5,519.8,0.1273,0
9,12.46,24.04,83.97,475.9,0.1186,0


# Correlations

Naive Bayes algorithms perform well on datasets with no correlations in independent features. Removing the correlated features may improve the performance of the algorithm.

Source: https://www.analyticsvidhya.com/blog/2023/01/naive-bayes-algorithms-a-complete-guide-for-beginners/

In [3]:
data_raw.corr()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
mean_radius,1.0,0.323782,0.997855,0.987357,0.170581,-0.730029
mean_texture,0.323782,1.0,0.329533,0.321086,-0.023389,-0.415185
mean_perimeter,0.997855,0.329533,1.0,0.986507,0.207278,-0.742636
mean_area,0.987357,0.321086,0.986507,1.0,0.177028,-0.708984
mean_smoothness,0.170581,-0.023389,0.207278,0.177028,1.0,-0.35856
diagnosis,-0.730029,-0.415185,-0.742636,-0.708984,-0.35856,1.0


In [4]:
data = data_raw[['mean_radius', 'mean_texture', 'mean_smoothness', 'diagnosis']]
data.head(5)

Unnamed: 0,mean_radius,mean_texture,mean_smoothness,diagnosis
0,17.99,10.38,0.1184,0
1,20.57,17.77,0.08474,0
2,19.69,21.25,0.1096,0
3,11.42,20.38,0.1425,0
4,20.29,14.34,0.1003,0


In [5]:
data.corr()

Unnamed: 0,mean_radius,mean_texture,mean_smoothness,diagnosis
mean_radius,1.0,0.323782,0.170581,-0.730029
mean_texture,0.323782,1.0,-0.023389,-0.415185
mean_smoothness,0.170581,-0.023389,1.0,-0.35856
diagnosis,-0.730029,-0.415185,-0.35856,1.0


# Algorithm

In [6]:
train, test = train_test_split(data, test_size=0.2)

In [7]:
features = ['mean_radius', 'mean_texture', 'mean_smoothness']
label_column = 'diagnosis'
labels = train[label_column].unique()

In [8]:
train_by_label = { label: train[train[label_column] == label] for label in labels }

assert(train_by_label[0].shape[0] + train_by_label[1].shape[0] == train.shape[0])

In [9]:
total = train.shape[0]

prior = { label: train_by_label[label].shape[0] / total for label in labels }

assert(prior[0] + prior[1] == 1)

In [10]:
dists = { label: {} for label in labels }

for label in labels:
  for f in features:
    col = train_by_label[label][f]
    dists[label][f] = stats.norm(col.mean(), col.std())

Avoiding underflow (when probabilities are too small) can be done with this trick:

https://en.wikipedia.org/wiki/Log_probability

In [11]:
def classify(input):
  p = dict(prior)

  for label in labels:
    p[label] = np.log(p[label])

    for f in features:
      p[label] += np.log(dists[label][f].pdf(input[f]))

  return max(p, key=lambda k: p[k])

# Test

In [12]:
from sklearn.metrics import confusion_matrix, f1_score

In [13]:
label_correct = test[label_column]
label_predicted = [classify(row) for _, row in test.iterrows()]

print(confusion_matrix(label_correct, label_predicted))
print(f1_score(label_correct, label_predicted))

[[38  9]
 [ 1 66]]
0.9295774647887323
