# Gaussian Naive Bayes

# Imports

In [1]:
import sys

import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

sys.path.append('../')

from utils.preprocessing import train_test_split, kfolds_cross_validation
from utils.output_utils import do_cv_and_get_metrics_classification

# Process

In [2]:
# Load the whole CSV as one float matrix and report its shape.
data = np.genfromtxt('./data/breastcancer.csv', delimiter=',')
print('Shape:', data.shape)

# Every column except the last is a feature; the last column is the label.
X = data[:, :-1]
# Indexing with a list keeps y two-dimensional, i.e. shape (n_samples, 1).
y = data[:, [-1]]

Shape: (569, 31)


In [3]:
# Hold-out split with a fixed seed so the train/test partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, 0.8, random_seed=64825)

# 10-fold cross-validation splits built from the training rows only.
# NOTE(review): shuffle=True is used without a visible seed, so the folds may
# differ between runs - confirm whether kfolds_cross_validation accepts one.
cv_splits = kfolds_cross_validation(data=X_train, n_folds=10, shuffle=True)

# Report the shape of each partition as a quick sanity check.
for caption, split in (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_test shape:', X_test),
    ('y_test shape:', y_test),
):
    print(caption, split.shape)

X_train shape: (455, 30)
y_train shape: (455, 1)
X_test shape: (114, 30)
y_test shape: (114, 1)


In [4]:
class MyGaussianNaiveBayes():
    """Gaussian Naive Bayes classifier for continuous features.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the largest log joint
    probability::

        log P(c) - 1/2 * sum_j log(2*pi*var_cj) - 1/2 * sum_j (x_j - mu_cj)^2 / var_cj

    Parameters
    ----------
    var_smoothing : float, default=1e-13
        Fraction of the largest feature variance of the training data that is
        added to every per-class variance. It keeps the variance strictly
        positive when a feature is constant inside one class, which would
        otherwise produce log(0) and 0/0 in ``predict``. The default is tiny
        so the estimates stay practically equal to the unsmoothed ones. If
        every feature is constant over the whole training set the added term
        is zero and the variance can still be zero.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray, shape (n_classes,)
        Sorted unique labels seen during training.
    class_to_idx_dict : dict
        Maps each label to its row index in the arrays below.
    prob_class : ndarray, shape (n_classes, 1)
        Class priors (relative class frequencies).
    mu : ndarray, shape (n_classes, n_features)
        Per-class feature means.
    std : ndarray, shape (n_classes, n_features)
        Per-class feature standard deviations (after smoothing).
    """

    def __init__(self, var_smoothing=1e-13):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate class priors and per-class feature means / deviations.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X is not two-dimensional or X and y disagree on n_samples.
        """
        X = np.asarray(X, dtype=float)
        # Accept both a flat label vector and a single-column matrix.
        y = np.asarray(y).reshape(-1)

        if X.ndim != 2:
            raise ValueError('X must be 2-dimensional (n_samples, n_features)')
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must contain the same number of samples')

        classes = np.unique(y)
        self.classes_ = classes
        self.class_to_idx_dict = {label: k for k, label in enumerate(classes)}

        n_classes, n_features = len(classes), X.shape[1]
        self.prob_class = np.zeros((n_classes, 1))    # n_classes x 1
        self.mu = np.zeros((n_classes, n_features))   # n_classes x n_features
        self.std = np.zeros((n_classes, n_features))  # n_classes x n_features

        # Variance floor, scaled by the largest feature variance so that it
        # does not depend on the units of the data.
        epsilon = self.var_smoothing * X.var(axis=0).max() if X.size else 0.0

        for k, label in enumerate(classes):
            X_class = X[y == label]
            self.prob_class[k] = X_class.shape[0] / X.shape[0]
            self.mu[k] = X_class.mean(axis=0)
            self.std[k] = np.sqrt(X_class.var(axis=0) + epsilon)

        return self

    def predict(self, X):
        """Return the most probable class for every row of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        Returns
        -------
        ndarray, shape (n_samples, 1)
            Predicted labels, taken from the labels seen in ``fit``.
        """
        X = np.asarray(X, dtype=float)
        n_classes = self.mu.shape[0]
        log_joint = np.empty((X.shape[0], n_classes))

        for k in range(n_classes):
            var = self.std[k] ** 2
            # The prior and the normalisation term do not depend on the
            # sample, so they are computed once per class.
            log_prior = np.log(self.prob_class[k, 0])
            log_norm = -0.5 * np.sum(np.log(2.0 * np.pi * var))
            # Scaled squared distance of every sample to the class mean.
            sq_dist = -0.5 * np.sum((X - self.mu[k]) ** 2 / var, axis=1)
            log_joint[:, k] = log_prior + log_norm + sq_dist

        best = np.argmax(log_joint, axis=1)
        return self.classes_[best].reshape(-1, 1)

In [5]:
# Evaluate the hand-written classifier on the CV folds and the held-out test set.
my_gnb = MyGaussianNaiveBayes()

do_cv_and_get_metrics_classification(
    classifier=my_gnb,
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    title='My Gaussian Naive Bayes'
)

#------------------My Gaussian Naive Bayes-------------------#

--->	Training Metrics
Accuracy Mean:     	0.9355 | Accuracy Std:   	0.0058
Recall Mean:     	0.9664 | Recall Std:       	0.0065
Precision Mean:     	0.9307 | Precision Std:   	0.0046
F1 Score Mean:     	0.9482 | F1 Score Std:   	0.0046

--->	Validation Metrics
Accuracy Mean:     	0.9275 | Accuracy Std:   	0.0446
Recall Mean:     	0.9531 | Recall Std:       	0.0574
Precision Mean:     	0.9295 | Precision Std:   	0.0428
F1 Score Mean:     	0.9402 | F1 Score Std:   	0.0408

--->	Test Metrics
Accuracy:     	0.9649
Recall:     	0.9747
Precision:     	0.9747
F1 Score:     	0.9747


  prob_classes[idx, i] = pred


In [6]:
# Reference run: scikit-learn's GaussianNB on exactly the same folds and test
# set, for comparison with the custom implementation.
# var_smoothing=1e-13 keeps the smoothing term tiny (scikit-learn's default is
# 1e-9) so the reference stays close to a plain per-class variance estimate.
do_cv_and_get_metrics_classification(
    classifier=GaussianNB(var_smoothing=1e-13),
    cv_splits=cv_splits, 
    X_train=X_train, 
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    title='Sklearn - Gaussian Naive Bayes'
)

#---------------Sklearn - Gaussian Naive Bayes---------------#

--->	Training Metrics
Accuracy Mean:     	0.9355 | Accuracy Std:   	0.0058
Recall Mean:     	0.9664 | Recall Std:       	0.0065
Precision Mean:     	0.9307 | Precision Std:   	0.0046
F1 Score Mean:     	0.9482 | F1 Score Std:   	0.0046

--->	Validation Metrics
Accuracy Mean:     	0.9275 | Accuracy Std:   	0.0446
Recall Mean:     	0.9531 | Recall Std:       	0.0574
Precision Mean:     	0.9295 | Precision Std:   	0.0428
F1 Score Mean:     	0.9402 | F1 Score Std:   	0.0408

--->	Test Metrics
Accuracy:     	0.9649
Recall:     	0.9747
Precision:     	0.9747
F1 Score:     	0.9747
