In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import matplotlib as plt
from scipy.stats import norm
from sklearn.datasets import make_blobs

In [2]:
# Generate a synthetic dataset: 10,000 samples, 2 features, 2 cluster centers.
# NOTE(review): no random_state is passed, so the points (and every output
# below) change on each run — consider fixing a seed for reproducibility.
X, y = make_blobs(n_samples=10000, n_features=2, centers = 2)


(10000, 2)

In [5]:
# Inspect the generated feature matrix (numpy abbreviates large arrays) ...
print(X)

# ... and confirm its dimensions.
print(X.shape)

[[-3.87551016 -7.31229405]
 [-3.53371397 -8.06872722]
 [-5.31390009 -6.04107791]
 ...
 [-0.16507849 -6.43015525]
 [-4.34834213 -7.00728222]
 [-2.13333118 -7.23296553]]
(10000, 2)


In [3]:
# Display the generated label array (last expression of the cell).
y

array([0, 0, 0, ..., 1, 0, 1])

In [13]:
@dataclass
class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier for two classes (0/1) and two features.

    On construction the data is split into train and test partitions and one
    univariate normal distribution is fitted per (class, feature) pair on the
    training partition.

    Parameters
    ----------
    X : array of shape (n_samples, 2)
        Feature matrix; only columns 0 and 1 are used.
    y : array of shape (n_samples,)
        Class labels; only the values 0 and 1 are modelled.
    test_size : float, default 0.3
        Forwarded to ``train_test_split`` as the held-out fraction.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; set it to make the split
        reproducible. ``None`` keeps the original non-deterministic split.
    """
    X: np.ndarray
    y: np.ndarray
    test_size: float = 0.3
    random_state: "int | None" = None

    def __post_init__(self):
        """Split the data and fit the model as soon as the instance exists."""
        self.dataSplit()
        self.fit()

    def dataSplit(self):
        """Populate ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

        The split is taken from the instance's own ``X`` / ``y`` fields, not
        from module-level variables, so the classifier works on whatever data
        it was constructed with.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            random_state=self.random_state,
        )

    def fitDistribution(self, data):
        """Return a frozen ``scipy.stats.norm`` fitted to ``data``.

        The distribution uses the sample mean and the population standard
        deviation (``np.std`` default, ddof=0) of ``data``.
        """
        mean = np.mean(data)
        std = np.std(data)
        return norm(mean, std)

    def posterior(self, data, prior, dist0, dist1):
        """Return the unnormalized posterior score of one class for a sample.

        Computes ``prior * dist0.pdf(data[0]) * dist1.pdf(data[1])``, i.e. the
        class prior times the per-feature likelihoods under the naive
        independence assumption. The value is proportional to P(class | data)
        but is NOT divided by the evidence, so scores of the two classes do
        not sum to 1.
        """
        return prior * dist0.pdf(data[0]) * dist1.pdf(data[1])

    def fit(self):
        """Estimate class priors and per-class, per-feature normal distributions.

        Sets ``prior_0`` / ``prior_1`` (class frequencies in the training
        partition) and ``dist_X<class><feature>`` (fitted distributions).
        """
        X0_data = self.X_train[self.y_train == 0]
        X1_data = self.X_train[self.y_train == 1]

        n_train = len(self.X_train)
        self.prior_0 = len(X0_data) / n_train
        self.prior_1 = len(X1_data) / n_train

        # Attribute naming: dist_X<class><feature index>.
        self.dist_X00 = self.fitDistribution(X0_data[:, 0])
        self.dist_X01 = self.fitDistribution(X0_data[:, 1])
        self.dist_X10 = self.fitDistribution(X1_data[:, 0])
        self.dist_X11 = self.fitDistribution(X1_data[:, 1])

    def predict(self):
        """Print class scores and the predicted vs. true label for each test sample.

        For every sample in the test partition this prints the two class
        scores from ``posterior`` (labelled ``P(y=k|x)`` although they are
        unnormalized) followed by the true class and the arg-max prediction.
        Nothing is returned.
        """
        for sample, target in zip(self.X_test, self.y_test):
            py0 = self.posterior(sample, self.prior_0, self.dist_X00, self.dist_X01)
            py1 = self.posterior(sample, self.prior_1, self.dist_X10, self.dist_X11)

            print("P(y=0|%s) = %f"%(sample,py0))
            print("P(y=1|%s) = %f"%(sample,py1))

            print("the point belongs to class{} and predicted{}".format(target, np.argmax([py0,py1])))

In [12]:
# Constructing the classifier also splits the data and fits the model
# (both are triggered from __post_init__).
clf = GaussianNaiveBayes(X,y)

In [14]:
# Print, for every held-out sample, both class scores and the true vs. predicted label.
clf.predict()

P(y=0|[-5.16550641 -8.00064212]) = 0.041392
P(y=1|[-5.16550641 -8.00064212]) = 0.000648
the point belongs to class0 and predicted0
P(y=0|[-4.98480064 -8.42190807]) = 0.030365
P(y=1|[-4.98480064 -8.42190807]) = 0.000458
the point belongs to class0 and predicted0
P(y=0|[-5.65855053 -7.96121229]) = 0.025355
P(y=1|[-5.65855053 -7.96121229]) = 0.000169
the point belongs to class0 and predicted0
P(y=0|[-3.73258732 -5.36565476]) = 0.012933
P(y=1|[-3.73258732 -5.36565476]) = 0.027069
the point belongs to class0 and predicted1
P(y=0|[-4.34422167 -7.35097468]) = 0.079587
P(y=1|[-4.34422167 -7.35097468]) = 0.009618
the point belongs to class0 and predicted0
P(y=0|[-1.1079441  -6.05912486]) = 0.000206
P(y=1|[-1.1079441  -6.05912486]) = 0.024752
the point belongs to class1 and predicted1
P(y=0|[-1.55275051 -5.33434747]) = 0.000276
P(y=1|[-1.55275051 -5.33434747]) = 0.028803
the point belongs to class1 and predicted1
P(y=0|[-3.81156336 -5.87670744]) = 0.029982
P(y=1|[-3.81156336 -5.87670744]) = 0.03