In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

The dataset can be downloaded from https://www.kaggle.com/kumargh/pimaindiansdiabetescsv

This dataset describes the medical records for Pima Indians
and whether or not each patient will have an onset of diabetes within five years.

Field descriptions follow:

preg = Number of times pregnant

plas = Plasma glucose concentration at 2 hours in an oral glucose tolerance test

pres = Diastolic blood pressure (mm Hg)

skin = Triceps skin fold thickness (mm)

test = 2-Hour serum insulin (mu U/ml)

mass = Body mass index (weight in kg/(height in m)^2)

pedi = Diabetes pedigree function

age = Age (years)

class = Class variable (1:tested positive for diabetes, 0: tested negative for diabetes) 

In [None]:
# Load the Pima Indians diabetes records and preview the first five rows.
# NOTE(review): the preview's column labels (6, 148, 72, ...) look like a
# patient record rather than field names, so the CSV probably has no header
# row and its first record is being consumed as the header. Confirm; if so,
# pass header=None with explicit names and update the cells that select the
# label column as '1'.
df = pd.read_csv(filepath_or_buffer='datasets_14370_19291_pima-indians-diabetes.csv')
df.head(n=5)

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [None]:
# Feature matrix: every column of df except the one labelled '1'
# (used as the class label in this notebook).
X = df.drop(columns=['1'])
X.head(n=5)

Unnamed: 0,6,148,72,35,0,33.6,0.627,50
0,1,85,66,29,0,26.6,0.351,31
1,8,183,64,0,0,23.3,0.672,32
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,5,116,74,0,0,25.6,0.201,30


In [None]:
# Target vector: the column labelled '1', which holds the 0/1 class label.
y = df.loc[:, '1']
y.head(n=5)

0    0
1    1
2    0
3    1
4    0
Name: 1, dtype: int64

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transform is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_tr_scaled, X_te_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Naive Bayes step by step implementation

Calculating the probability of occurrence of each class (1: tested positive for diabetes, 0: tested negative for diabetes), a.k.a. the class priors: the probabilities that a randomly chosen data point belongs to class 1 or to class 0. The log of their ratio is the bias term `b`.

In [None]:
# Class priors estimated from the training labels (p: class 1, q: class 0)
# and their log-odds, which serves as the bias term.
p, q = (y_train == 1).mean(), (y_train == 0).mean()
b = np.log(p / q)
print('bias : ', b)

bias :  -0.6518290312292144


*   b = 0 -> perfectly balanced dataset
*   b > 0 -> bias towards class 1
*   b < 0 -> bias towards class 0

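Per-class feature totals: the column-wise sums of the standardized training features for each class, which play the role of the class occurrence counts $C^0$ and $C^1$ used below.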

In [None]:
# Column-wise totals of the standardized training features, one vector per class.
# NOTE(review): these are sums of scaled values (negative for class 0, as the
# output shows), not occurrence counts.
C0 = X_tr_scaled[(y_train == 0).values].sum(axis=0)
C1 = X_tr_scaled[(y_train == 1).values].sum(axis=0)
C0

array([ -65.03191204, -133.49468241,  -39.57460281,  -24.51936966,
        -46.43610412,  -79.48442979,  -52.36114517,  -67.47982513])

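Then, given the knowledge that a data point is classified as 1, the `conditional likelihood` that a data point will appear in class 1 is
$L(t|1) = \frac{C^{1}_{t}}{N^1}$,
and similarly, the `conditional likelihood` of a data point appearing in class 0 is
$L(t|0) = \frac{C^{0}_{t}}{N^0}$,
where $C^{c}_{t}$ is the class-$c$ total for feature $t$ and $N^c$ is the number of training points in class $c$. The code below additionally adds 1 to each numerator and denominator.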

In [None]:
# Per-class feature totals divided by the class size, with 1 added to both
# the numerator and the denominator.
L0 = (1 + C0) / (1 + (y_train == 0).sum())
L1 = (1 + C1) / (1 + (y_train == 1).sum())

In [None]:
# Display the sign-flipped element-wise ratio of the two per-class vectors.
-(L1 / L0)

array([1.97449625, 1.94359411, 2.0139641 , 2.07751025, 1.99897258,
       1.96348358, 1.98924994, 1.97229414])

In [None]:
# Log of the sign-flipped ratio L1/L0, giving one weight per feature.
# NOTE(review): the negation only keeps the log argument positive because
# L1/L0 is negative for every feature in this run; a feature whose ratio is
# positive would produce NaN here. Revisit whether a count-style ratio is
# appropriate for standardized features.
R = np.log(-(L1 / L0))
print(f' R: {R}')

 R: [0.6803133  0.66453889 0.70010497 0.73117018 0.69263334 0.67472023
 0.68775765 0.6791974 ]


In [None]:
# Hand-rolled classifier on the training split: predict class 1 when the
# linear score (features @ R plus bias) is positive, then report accuracy.
preds_train = (X_tr_scaled @ R + b) > 0
np.mean(preds_train == y_train.values)

0.7096247960848288

In [None]:
# Same decision rule applied to the held-out split, followed by its accuracy.
preds_test = (X_te_scaled @ R + b) > 0
np.mean(preds_test == y_test.values)

0.6688311688311688

# Using scikit-learn implementation of NaiveBayes

In [None]:
# Fit scikit-learn's Gaussian Naive Bayes on the unscaled training split
# (X_train, not X_tr_scaled) and display the fitted estimator.
model = GaussianNB().fit(X_train.values, y_train.values)
model

GaussianNB()

In [None]:
# Training accuracy of the scikit-learn model.
# Note: this rebinds preds_train, replacing the hand-rolled predictions.
preds_train = model.predict(X_train.values)
np.mean(preds_train == y_train.values)

0.7601957585644372

In [None]:
# Held-out accuracy of the scikit-learn model.
# Note: this rebinds preds_test, replacing the hand-rolled predictions.
preds_test = model.predict(X_test.values)
np.mean(preds_test == y_test.values)

0.7467532467532467