In [31]:
!pip install --upgrade openpyxl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from scipy.stats import norm
import random

In [33]:
df = pd.DataFrame(np.random.randint(0,2,size=(1000, 3)), columns = ['X1','X2','y'])
df

Unnamed: 0,X1,X2,y
0,1,0,1
1,1,0,0
2,1,0,0
3,0,0,0
4,1,0,1
...,...,...,...
995,1,0,1
996,0,0,0
997,1,1,0
998,1,0,1


In [35]:
class NaiveBayes:
    def __init__(self,X,y,alpha,LS = False) -> None:
        self.X = X
        self.y = y 
        self.LS = LS
        self.alpha = alpha
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split (X,y, test_size = 0.3, random_state = 0)
            
    def probability(self,X,prior,likearr):
        
        Xhat1, Xhat2 = np.split(X,2)
        Xhat1, Xhat2 = int(Xhat1), int(Xhat2)
        
        if Xhat1 == 1: 
            if Xhat2 == 1:
                return prior*likearr[1]*likearr[3]
            else: 
                return prior*likearr[1]*likearr[2]
        else: 
            if Xhat2 == 1:
                return prior*likearr[0]*likearr[3]
            else: 
                return prior*likearr[0]*likearr[2]
    
    def run_model(self):
        if self.LS == True: 
            k = self.X_train.shape[1]
            print(k)
        else: 
            k= 0 
            alpha = 0
        
        #calculate priors
        self.X0_train = self.X_train[self.y_train ==0]
        self.X1_train = self.X_train[self.y_train ==1] 
               
        self.prior_y0 = len(self.X0_train) / len(self.X_train)
        self.prior_y1 = len(self.X1_train) / len(self.X_train)
        
        print("y0", self.prior_y0)
        print("y1",self.prior_y1)
        
        #split X1 and X2 into arrays, for both y=0 and y=1
        self.X1_X0_train, self.X2_X0_train = zip(*self.X0_train) #split when y = 0 into x1 and x2 
        self.X1_X1_train, self.X2_X1_train = zip(*self.X1_train) #split when y = 1 into x1 and x2 
        self.X1_X0_train = np.asarray(self.X1_X0_train) #convert tuples to arrays
        self.X2_X0_train = np.asarray(self.X2_X0_train)
        self.X1_X1_train = np.asarray(self.X1_X1_train)
        self.X2_X1_train = np.asarray(self.X2_X1_train)
        
        
        #calculate likelihoods 
        self.X1_X0_0 = self.X1_X0_train[self.X1_X0_train==0] #X1 = 0, when y=0
        self.X1_X1_0 = self.X1_X0_train[self.X1_X0_train==1] #X1 = 1, when y=0
        
        self.likehood_X1_0_y0 = (len(self.X1_X0_0)+ alpha) / (len(self.X0_train) + k*alpha)
        self.likehood_X1_1_y0 = (len(self.X1_X1_0)+ alpha) / (len(self.X0_train) + k*alpha)
        #print("X1=0 |y0 ", self.likehood_X1_0_y0)
        #print("X1=1 |y0 ", self.likehood_X1_1_y0)
        
        self.X2_X0_0 = self.X2_X0_train[self.X2_X0_train==0]  #X2 = 0, when y = 0 
        self.X2_X1_0 = self.X2_X0_train[self.X2_X0_train==1]  #X2 = 1, when y = 0 
        
        self.likehood_X2_0_y0 = (len(self.X2_X0_0)+alpha) / (len(self.X0_train)+ k*alpha)
        self.likehood_X2_1_y0 = (len(self.X2_X1_0)+alpha) / (len(self.X0_train)+ k*alpha)
        #print("X2=0 |y0 ",self.likehood_X2_0_y0)
        #print("X2=1 |y0 ",self.likehood_X2_1_y0)
        
        
        self.X1_X0_1 = self.X1_X1_train[self.X1_X1_train==0]   #X1 = 0, when y =1
        self.X1_X1_1 = self.X1_X1_train[self.X1_X1_train==1]   #X1 = 1, when y =1
        
        self.likehood_X1_0_y1 = (len(self.X1_X0_1)+alpha) /(len(self.X1_train)+ k*alpha)
        self.likehood_X1_1_y1 = (len(self.X1_X1_1)+alpha) /(len(self.X1_train)+ k*alpha)
        #print("X1=0 |y1 ", self.likehood_X1_0_y1)
        #print("X1=1 |y1 ", self.likehood_X1_1_y1)
        
        self.X2_X0_1 = self.X2_X1_train[self.X2_X1_train==0]  #X2 = 0, when y = 1
        self.X2_X1_1 = self.X2_X1_train[self.X2_X1_train==1]  #x2 = 1, when y = 1      
        
        self.likehood_X2_0_y1 = (len(self.X2_X0_1)+ alpha)/(len(self.X1_train)+ k*alpha)
        self.likehood_X2_1_y1 = (len(self.X2_X1_1)+ alpha)/(len(self.X1_train)+ k*alpha)
        #print("X2=0 |y1 ",self.likehood_X2_0_y1)
        #print("X2=1 |y1 ",self.likehood_X2_1_y1)
        
        #create an array to hold the values to pass through function
        self.likelihoodarr = np.array([[self.likehood_X1_0_y0,self.likehood_X1_1_y0,
                                        self.likehood_X2_0_y0,self.likehood_X2_1_y0],
                                        [self.likehood_X1_0_y1,self.likehood_X1_1_y1,
                                        self.likehood_X2_0_y1,self.likehood_X2_1_y1]])
        print(self.likelihoodarr)
        
    def predict(self):
        for sample, target in zip(self.X_test, self.y_test): 
            py0 = self.probability(sample, self.prior_y0, self.likelihoodarr[0])
            py1 = self.probability(sample, self.prior_y1, self.likelihoodarr[1])
            
            print("P(y=0|%s) = %.f" % (sample, py0*100))
            print("P(y=1|%s) = %.f" % (sample, py1*100))
            
            print('Model predicted class {} and the truth was: {}\n'.format(np.argmax([py0*100, py1*100]),target))

In [36]:
nb = NaiveBayes(df.values[:,0:-1],df.values[:,-1], alpha = 10**-8,LS = False)
nb.run_model()

y0 0.5042857142857143
y1 0.4957142857142857
[[0.52124646 0.47875354 0.46458924 0.53541076]
 [0.54466859 0.45533141 0.4610951  0.5389049 ]]


In [37]:
nb.predict()

P(y=0|[1 0]) = 11
P(y=1|[1 0]) = 10
Model predicted class 0 and the truth was: 1

P(y=0|[0 1]) = 14
P(y=1|[0 1]) = 15
Model predicted class 1 and the truth was: 0

P(y=0|[1 0]) = 11
P(y=1|[1 0]) = 10
Model predicted class 0 and the truth was: 0

P(y=0|[1 0]) = 11
P(y=1|[1 0]) = 10
Model predicted class 0 and the truth was: 0

P(y=0|[1 0]) = 11
P(y=1|[1 0]) = 10
Model predicted class 0 and the truth was: 0

P(y=0|[0 1]) = 14
P(y=1|[0 1]) = 15
Model predicted class 1 and the truth was: 1

P(y=0|[1 0]) = 11
P(y=1|[1 0]) = 10
Model predicted class 0 and the truth was: 0

P(y=0|[1 0]) = 11
P(y=1|[1 0]) = 10
Model predicted class 0 and the truth was: 0

P(y=0|[0 1]) = 14
P(y=1|[0 1]) = 15
Model predicted class 1 and the truth was: 1

P(y=0|[1 1]) = 13
P(y=1|[1 1]) = 12
Model predicted class 0 and the truth was: 0

P(y=0|[1 1]) = 13
P(y=1|[1 1]) = 12
Model predicted class 0 and the truth was: 0

P(y=0|[1 0]) = 11
P(y=1|[1 0]) = 10
Model predicted class 0 and the truth was: 0

P(y=0|[0 1]) = 1