In [38]:
import numpy as np

class BernoulliNaiveBayes:
    """Naive Bayes classifier for binary (0/1 or boolean) features.

    Each feature is modelled as an independent Bernoulli variable per class.
    Feature probabilities are estimated with additive smoothing so that no
    probability is ever exactly 0 or 1.

    Parameters
    ----------
    alpha : float, default=1.0
        Additive smoothing strength. Must be positive. The default of 1.0
        is Laplace smoothing: ``(count + 1) / (n_class_samples + 2)``.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray
        Sorted unique class labels seen during ``fit``.
    class_prior_ : dict
        Maps each class label to its empirical prior probability.
    feature_prob_ : dict
        Maps each class label to a 1-D array holding the smoothed
        probability that each feature equals 1 within that class.
    """

    def __init__(self, alpha=1.0):
        if alpha <= 0:
            raise ValueError("alpha must be positive")
        self.alpha = alpha
        self.class_prior_ = {}
        self.feature_prob_ = {}

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Binary feature matrix (NumPy array or pandas DataFrame).
        y : array-like of shape (n_samples,)
            Class label of every row of ``X``.
        """
        # Work on plain arrays so pandas inputs are handled positionally and
        # the learned parameters are always NumPy arrays.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, _ = X.shape

        self.classes_ = np.unique(y)

        # Start from empty tables so a refit never keeps entries that belong
        # to labels seen only in a previous call.
        self.class_prior_ = {}
        self.feature_prob_ = {}

        for c in self.classes_:
            class_samples = X[y == c]
            n_class = class_samples.shape[0]

            # Empirical prior P(class = c)
            self.class_prior_[c] = n_class / n_samples

            # Smoothed estimate of P(feature = 1 | class = c); the "2" is the
            # number of values a Bernoulli feature can take.
            self.feature_prob_[c] = (class_samples.sum(axis=0) + self.alpha) / (
                n_class + 2 * self.alpha
            )

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Binary feature matrix (NumPy array or pandas DataFrame).

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, taken from ``classes_``. On a tie the class
            that comes first in ``classes_`` wins.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if getattr(self, "classes_", None) is None:
            raise AttributeError(
                "BernoulliNaiveBayes is not fitted yet; call fit() first"
            )

        X = np.asarray(X, dtype=float)

        probs = np.vstack([self.feature_prob_[c] for c in self.classes_])
        log_on = np.log(probs)          # log P(feature = 1 | class)
        log_off = np.log(1.0 - probs)   # log P(feature = 0 | class)
        log_prior = np.log([self.class_prior_[c] for c in self.classes_])

        # sum_j [x_j * log_on_j + (1 - x_j) * log_off_j]
        #   = x . (log_on - log_off) + sum_j log_off_j
        # which scores all samples against all classes in one matrix product.
        scores = X @ (log_on - log_off).T + log_off.sum(axis=1) + log_prior

        return self.classes_[np.argmax(scores, axis=1)]

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        return np.mean(self.predict(X) == np.asarray(y))


In [35]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np

# Load MNIST from OpenML. The dataset version is pinned so reruns fetch the
# same data, and the parser is chosen explicitly, which silences the
# FutureWarning scikit-learn 1.2/1.3 emits about its changing default parser.
mnist = fetch_openml("mnist_784", version=1, parser="auto")

# Binarize the pixels: a pixel is "on" when its intensity reaches the threshold.
# The comparison already yields booleans, so no further cast is needed.
threshold = 128  # You can adjust this threshold as needed
X = mnist.data.astype(int) >= threshold

# Binary target: True for digits 5-9, False for digits 0-4
y = mnist.target.astype(float) >= 5



  warn(


In [39]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for evaluation; the fixed random_state keeps the
# split the same on every run.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=1,
)

In [37]:
# Display the held-out feature table returned by train_test_split
Xtest


Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
6670,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
49567,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
50796,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22310,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
54037,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32138,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
53648,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
64554,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
33812,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [41]:
# Train on the training split, then report accuracy on the held-out split.
# Both calls receive NumPy arrays so training and scoring work on the same
# array type instead of mixing pandas objects with raw arrays.
model = BernoulliNaiveBayes()
model.fit(Xtrain.to_numpy(), ytrain.to_numpy())
accuracy = model.score(Xtest.to_numpy(), ytest.to_numpy())
print(f"Testing accuracy: {accuracy:.2%}")

Testing accuracy: 76.14%
