In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

- it is a generative classification
- P( y = k | x = x') = P(y) * P( x = x' | y = k) #posterior = prior * likelihood

In [2]:
num_rows = 1000 

# Create a DataFrame with specified number of rows
table = pd.DataFrame(index=range(num_rows), columns=['x1', 'x2', 'y'])

# Generate random integers using apply() and a lambda function
table['x1'] = table.apply(lambda _: np.random.randint(0, 2), axis=1)
table['x2'] = table.apply(lambda _: np.random.randint(0, 2), axis=1)
table['y'] = table.apply(lambda _: np.random.randint(0, 2), axis=1)
table

Unnamed: 0,x1,x2,y
0,0,1,1
1,0,0,1
2,1,0,1
3,1,0,1
4,0,1,0
...,...,...,...
995,1,1,0
996,1,0,0
997,0,0,1
998,1,0,0


In [3]:
# Defining target and features
X = table.drop('y', axis=1)  # Replace 'TargetColumn' with the name of your target column
y = table['y']

In [187]:
class NaiveBayes:
    def __init__(self, X, y, a):
        self.X = X
        self.y = y
        self.a = a #laplace smoothing constant

    def data_split(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = 0.3, shuffle = True)
        
    def split_class(self):
        self.X_0 = self.X_train[self.y_train == 0]
        self.X_1 = self.X_train[self.y_train == 1]

    def prior(self):
        self.prior_0 = len(self.X_0) / len(self.X_train)
        self.prior_1 = len(self.X_1) / len(self.X_train)
    

    def likelihood(self, feature_value, feature_index, class_label):
        # Calculating the likelihood for a single feature value given the class label
        if class_label == 0:
            feature_count = self.X_0.iloc[:, feature_index].value_counts().get(feature_value, 0)
            total = len(self.X_0)
        else:
            feature_count = self.X_1.iloc[:, feature_index].value_counts().get(feature_value, 0)
            total = len(self.X_1)
        
        # Applying Laplace smoothing
        return (feature_count + self.a) / (total + self.a * len(np.unique(self.X)))

    
    def posterior(self, prior, X):
        return prior * self.likelihood(X)

    def predict(self):
        self.error_count = 0
        predictions = []

        for (index, sample), target in zip(self.X_test.iterrows(), self.y_test):
            py0 = self.prior_0
            py1 = self.prior_1
            for feature_index in range(self.X_train.shape[1]):  # Assuming features are numerical and start at 0
                py0 *= self.likelihood(sample[feature_index], feature_index, 0)
                py1 *= self.likelihood(sample[feature_index], feature_index, 1)
            
            print('P(y=0 | x1={}, x2={}) = {:.2f}%'.format(sample[0], sample[1], py0*100))
            print('P(y=1 | x1={}, x2={}) = {:.2f}%'.format(sample[0], sample[1], py1*100))

            # Predict based on which probability is greater
            prediction = 0 if py0 > py1 else 1
            predictions.append(prediction)
            print(" Model predicted class {} and the truth was: {} \n".format(prediction, target))
            
            if prediction != target:
                self.error_count += 1

        accuracy = (len(predictions) - self.error_count) / len(predictions)
        print("Accuracy: {:.2f}%".format(accuracy * 100))


    def fit(self):
        self.data_split()
        self.split_class()
        self.prior()
        self.predict()
        
        

In [188]:
nb = NaiveBayes(X, y, 1)

In [189]:
nb.fit()

P(y=0 | x1=0, x2=1) = 22.51%
P(y=1 | x1=0, x2=1) = 25.50%
 Model predicted class 1 and the truth was: 1 

P(y=0 | x1=0, x2=1) = 22.51%
P(y=1 | x1=0, x2=1) = 25.50%
 Model predicted class 1 and the truth was: 0 

P(y=0 | x1=0, x2=0) = 25.77%
P(y=1 | x1=0, x2=0) = 25.93%
 Model predicted class 1 and the truth was: 0 

P(y=0 | x1=0, x2=1) = 22.51%
P(y=1 | x1=0, x2=1) = 25.50%
 Model predicted class 1 and the truth was: 0 

P(y=0 | x1=0, x2=1) = 22.51%
P(y=1 | x1=0, x2=1) = 25.50%
 Model predicted class 1 and the truth was: 1 

P(y=0 | x1=0, x2=1) = 22.51%
P(y=1 | x1=0, x2=1) = 25.50%
 Model predicted class 1 and the truth was: 1 

P(y=0 | x1=0, x2=0) = 25.77%
P(y=1 | x1=0, x2=0) = 25.93%
 Model predicted class 1 and the truth was: 0 

P(y=0 | x1=0, x2=0) = 25.77%
P(y=1 | x1=0, x2=0) = 25.93%
 Model predicted class 1 and the truth was: 1 

P(y=0 | x1=0, x2=0) = 25.77%
P(y=1 | x1=0, x2=0) = 25.93%
 Model predicted class 1 and the truth was: 0 

P(y=0 | x1=0, x2=1) = 22.51%
P(y=1 | x1=0, x2=

  py0 *= self.likelihood(sample[feature_index], feature_index, 0)
  py1 *= self.likelihood(sample[feature_index], feature_index, 1)
  print('P(y=0 | x1={}, x2={}) = {:.2f}%'.format(sample[0], sample[1], py0*100))
  print('P(y=1 | x1={}, x2={}) = {:.2f}%'.format(sample[0], sample[1], py1*100))
