# Bayesian Inference classifier implementation
### imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Reference model from scikit-learn, used for comparison
from sklearn.naive_bayes import GaussianNB

### Loading the breast cancer dataset

In [2]:
from sklearn.datasets import load_breast_cancer
# Load the breast cancer dataset; later cells read its `.data`, `.target`
# and `.feature_names` attributes.
breast_cancer = load_breast_cancer()

### Data preprocessing and splitting

In [3]:
# Assemble features and target into a single frame: feature columns first,
# the label column ('target') last.
# NOTE: `columns` must be a flat list of names. Wrapping it in another list
# makes pandas build a one-level MultiIndex, turning every column label
# into a tuple such as ('target',).
df = pd.DataFrame(
    np.c_[breast_cancer.data, breast_cancer.target],
    columns=list(breast_cancer.feature_names) + ['target'],
)

x = df.iloc[:, :-1]  # feature matrix: every column except the last (30 columns)
y = df.iloc[:, -1]   # target vector of 0s and 1s (0 -> cancer, 1 -> no cancer)

# Hold out 20% of the rows for validation (test_size=0.2); the fixed
# random_state keeps the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=999
)

### Calculating PDF

In [4]:
def gaussian_pdf(x, mean, var):
    """Evaluate the univariate normal density N(x | mean, var) element-wise.

    All three arguments are converted to float arrays and combined with
    NumPy broadcasting, so scalars, vectors and matrices can be mixed as
    long as their shapes are broadcast-compatible.

    Args:
        x: Point(s) at which the density is evaluated.
        mean: Mean(s) of the distribution.
        var: Variance(s) of the distribution (not the standard deviation).

    Returns:
        NumPy array (or NumPy scalar for scalar inputs) of density values.
    """
    sigma_sq = np.asarray(var, dtype=float)
    deviation = np.asarray(x, dtype=float) - np.asarray(mean, dtype=float)

    # 1 / sqrt(2 * pi * var): the factor that makes the density integrate to 1.
    normaliser = 1 / np.sqrt(2 * np.pi * sigma_sq)
    # exp(-(x - mean)^2 / (2 * var)): the bell-shaped part.
    return normaliser * np.exp(-(deviation ** 2) / (2 * sigma_sq))


### Fitting the model

In [5]:
def predict(X, class_stats=None):
    """Predict a class label for every sample with Gaussian naive Bayes.

    For each class the score is
    ``log(prior) + sum_j log N(x_j | mean_j, var_j)`` and the class with the
    highest score is chosen (ties go to the class that comes first in
    ``class_stats``).

    Args:
        X: Samples as an array-like of shape (n_samples, n_features). A 1-D
            input is treated as a single sample.
        class_stats: Mapping ``label -> {'mean': ..., 'var': ..., 'prior': ...}``
            with per-feature means and variances and the class prior.
            Defaults to the module-level ``class_features``.

    Returns:
        1-D ``np.ndarray`` with the predicted class label of each sample
        (empty when ``X`` contains no samples).
    """
    if class_stats is None:
        class_stats = class_features

    X = np.asarray(X, dtype=float)  # accept lists, DataFrame.values, ...
    if X.size == 0:
        return np.array([])
    if X.ndim == 1:
        X = X.reshape(1, -1)

    labels = list(class_stats)
    scores = np.empty((X.shape[0], len(labels)))
    for column, label in enumerate(labels):
        stats = class_stats[label]
        mean = np.asarray(stats['mean'], dtype=float)
        var = np.asarray(stats['var'], dtype=float)
        # Evaluate the Gaussian log-density directly. Taking np.log of the
        # density underflows to log(0) = -inf for samples far from the class
        # mean, which makes every class tie and the comparison meaningless.
        log_likelihood = -0.5 * np.sum(
            np.log(2 * np.pi * var) + (X - mean) ** 2 / var, axis=1
        )
        scores[:, column] = np.log(stats['prior']) + log_likelihood

    # Map the winning column back to its class label; the bare argmax index
    # is only correct when the labels happen to be 0..K-1 in dict order.
    return np.asarray(labels)[np.argmax(scores, axis=1)]

In [6]:
# Estimate the per-class Gaussian parameters from the training split.
n_train_rows = len(X_train)
class_features = {}
for label in np.unique(y_train):
    members = X_train[y_train == label]  # training rows belonging to this class
    class_features[label] = dict(
        mean=members.mean(axis=0),  # per-feature mean within the class
        # Per-feature variance within the class; pandas' .var() defaults to
        # ddof=1, i.e. the sample variance.
        var=members.var(axis=0),
        prior=len(members) / n_train_rows,  # share of training rows in this class
    )

### Testing time 

In [9]:
# Score the hand-written classifier on the held-out validation split.
y_pred = predict(X_val.to_numpy())
accuracy = (y_pred == y_val).mean()  # fraction of validation rows predicted correctly
print(f'our Model Accuracy : {accuracy}')

our Model Accuracy : 0.9298245614035088


### Sklearn accuracy

In [8]:
# Reference run: scikit-learn's Gaussian naive Bayes on the same split.
clf = GaussianNB().fit(X_train, y_train)  # fit() returns the fitted estimator
sklearn_accuracy = clf.score(X_val, y_val)
print("sklearn Model Accuracy  :", sklearn_accuracy)

sklearn Model Accuracy  : 0.9210526315789473
