In [34]:
import numpy as np
from sklearn.datasets import make_blobs
from dataclasses import dataclass
import sklearn
from scipy.stats import norm
from sklearn.model_selection import train_test_split

In [35]:
make_blobs?

[0;31mSignature:[0m
[0mmake_blobs[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_samples[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_features[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcenters[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcluster_std[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcenter_box[0m[0;34m=[0m[0;34m([0m[0;34m-[0m[0;36m10.0[0m[0;34m,[0m [0;36m10.0[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshuffle[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreturn_centers[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Generate isotropic Gaussian blobs for clustering.

Read more in the :ref:`User Gu

In [36]:
X,y = make_blobs(n_samples = 10000, n_features = 2, centers = 2)

In [37]:
print(X[:5])

[[ 5.74905974 -3.98765136]
 [ 9.27196757 -5.13814732]
 [ 6.30101368 -4.23083817]
 [10.44921614 -6.65233778]
 [10.49850066 -5.74158534]]


In [38]:
print(y[:5])

[0 1 0 1 1]


Points about NB:
- Generative model
- Non-parametric model
- It assumes a gaussian distribution for every column of each class
- It naively assumes that columns are independent
  

- It doesn't have any hyper-parameters

What do we need:
- Prior Distribution
- Likelihood
- Posterior ~ Likelihood * Prior
- Datasplit

Gaussian_PDF = 1/(sqrt(2 * pi * var) * (exp(-(x_i - mean)**2/2var))
p(y_i = k | x_i) ~ P(x_i_0 | y = k) * P(x_i_1 | y = k) * P(y=k)


In [39]:
@dataclass
class GaussianNaiveBayes:
    # def __init__(self, X, y):
    #     self.X = X
    #     self.y = y
    X: np.array
    y: np.array

    def __post_init__(self): #series of steps that you want your class object to execute/ initiate as soon as someone initiates the class without a need to ask the user
        self.dataSplit()
        
        self.X0_train = self.X_train[self.y_train == 0]
        self.X1_train = self.X_train[self.y_train == 1]
        self.y0_train = self.y_train[self.y_train == 0]
        self.y1_train = self.y_train[self.y_train == 1]

        self.fit()
        self.predict()
    #a new thing taught about data class

    def dataSplit(self):
        self.X_train, self.X_test, self.y_train, self.y_test = sklearn.model_selection.train_test_split(self.X, self.y, test_size = 0.3, shuffle = True)

    def fit_distribution(self, x): #we are passing one column at a time
        mean = np.mean(x)
        std = np.std(x)
        dist = norm(mean, std)
        return dist 

    def posterior(self, x, prior, dist_col1, dist_col2):
        return prior * dist_col1.pdf(x[0]) * dist_col2.pdf(x[1]) # dist_col1.pdf(x[0]) is the likelihood of that point in that distribution
        
    def fit(self):
        self.prior_y0 = len(self.y0_train) / len(self.y)
        self.prior_y1 = len(self.y1_train) / len(self.y)

        #Distribution of every column in each class
        
        self.dist_X_0_0 = self.fit_distribution(self.X0_train[:,0])
        self.dist_X_0_1 = self.fit_distribution(self.X0_train[:,1])

        self.dist_X_1_0 = self.fit_distribution(self.X1_train[:,0])
        self.dist_X_1_1 = self.fit_distribution(self.X1_train[:,1])

    def predict(self):

        self.error_count = 0

        for sample, target in zip(self.X_test, self.y_test):
            py0 = self.posterior(sample, self.prior_y0, self.dist_X_0_0, self.dist_X_0_1)
            py1 = self.posterior(sample, self.prior_y1, self.dist_X_1_0, self.dist_X_1_1)

            #print('P(y=0 | %s) = %3.f' % (sample, py0 * 100))
            #print('P(y=1 | %s) = %3.f' % (sample, py1 * 100))

            if np.argmax([py0,py1]) != target:
                self.error_count += 1
        
            #print(" Model predicted class {} and the truth was: {} \n".format(np.argmax([py0,py1]), target))

        print(self.error_count)
        print(len(self.X))

In [40]:
nb = GaussianNaiveBayes(X,y)

62
10000
