# Naive Bayes
- P(X|y) --> likelihood of feature given the class
- P(X) --> Evidence/ Total probability
- P(y) --> Prior probability of given class
- P(y|X) --> posterior probability of class y given input data X

- Task: find the posterior probability

- P(y|X) = P(X|y)*P(y) / P(X)
- posterior = likelihood * prior / total prob

- Gaussian Naive Bayes is used when the features are continuous.
- It uses a Gaussian (normal) distribution to estimate the likelihood P(X|y).

The probability density function (PDF) of a Gaussian distribution is given by:

$$
f(x \mid \mu, \sigma^2) = \frac{1}{\sqrt{2 \pi \sigma^2}} e^{-\frac{(x - \mu)^2}{2 \sigma^2}}
$$

Where:
- $x$ is the variable
- $\mu$ is the mean
- $\sigma^2$ is the variance (with $\sigma$ as the standard deviation)


In [1]:
from collections import defaultdict
import numpy as np

In [2]:
class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier for continuous features.

    For every class, each feature is modelled as an independent normal
    distribution whose mean and variance are estimated from the training
    rows of that class. A sample is assigned to the class with the largest
    log-posterior ``log P(y) + sum_i log P(x_i | y)``.

    All scoring is done in log space: evaluating the density first and
    taking its logarithm afterwards underflows to ``log(0) = -inf`` for
    samples far from every class mean, which makes all classes tie.

    Parameters
    ----------
    eps : float, default 1e-6
        Constant added to every per-feature variance before it is used, so
        that a feature that is constant within a class (variance 0) does
        not cause a division by zero.
    """

    def __init__(self, eps=1e-6):
        self.eps = eps
        self.classes = None  # array of unique class labels (set by fit)
        self.class_priors = {}  # class label -> prior probability P(y)
        self.mean = {}  # class label -> per-feature mean vector
        self.var = {}  # class label -> per-feature variance vector

    def fit(self, X, y):
        """Estimate the class priors and per-class feature statistics.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,)
            Class label of each training sample.
        """
        # Accept plain lists as well as arrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        n_samples = X.shape[0]

        # Start from empty tables so that re-fitting discards old classes.
        self.class_priors = {}
        self.mean = {}
        self.var = {}

        for c in self.classes:
            X_c = X[y == c]  # rows whose label is c
            # Prior = share of the training set that belongs to class c.
            self.class_priors[c] = X_c.shape[0] / n_samples
            self.mean[c] = X_c.mean(axis=0)
            self.var[c] = X_c.var(axis=0)

    def _calculate_log_likelihood(self, mean, var, x):
        """Return the element-wise log of the Gaussian density at ``x``."""
        # The same smoothed variance is used in both terms so the result is
        # the log-density of one well-defined normal distribution.
        smoothed_var = var + self.eps
        return -0.5 * (
            np.log(2.0 * np.pi * smoothed_var) + (x - mean) ** 2 / smoothed_var
        )

    def _calculate_likelihood(self, mean, var, x):
        """Return the element-wise Gaussian density at ``x``.

        Kept for callers that want the raw density; prediction uses
        ``_calculate_log_likelihood`` instead to avoid underflow.
        """
        return np.exp(self._calculate_log_likelihood(mean, var, x))

    def _joint_log_likelihood(self, X):
        """Return the log-posterior scores (up to the shared evidence term).

        ``X`` must be 2-D with shape (n_samples, n_features). The result has
        shape (n_samples, n_classes), with columns ordered like
        ``self.classes``.
        """
        # log posterior = log prior + sum over features of log likelihood
        return np.column_stack([
            np.log(self.class_priors[c])
            + np.sum(
                self._calculate_log_likelihood(self.mean[c], self.var[c], X),
                axis=1,
            )
            for c in self.classes
        ])

    def _calculate_posterior(self, X):
        """Return the most probable class label for one sample ``X``."""
        sample = np.asarray(X, dtype=float).reshape(1, -1)
        scores = self._joint_log_likelihood(sample)[0]
        return self.classes[np.argmax(scores)]

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted label of each sample.

        Raises
        ------
        ValueError
            If ``X`` is non-empty and not 2-D.
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            # Nothing to classify.
            return np.empty(0, dtype=self.classes.dtype)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        best = np.argmax(self._joint_log_likelihood(X), axis=1)
        return self.classes[best]

    def accuracy(self, y_true, y_pred):
        """Return the fraction of positions where the two label sequences agree."""
        # Convert first: comparing two Python lists with == yields a single
        # bool instead of an element-wise comparison.
        return np.mean(np.asarray(y_true) == np.asarray(y_pred))

In [3]:
# Dataset helpers from scikit-learn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic classification problem: 1000 samples, 5 features, fixed seed
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    random_state=42,
)

# Hold out 30% of the samples for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# From-scratch classifier defined above
gnb = GaussianNaiveBayes()

# Estimate priors, means and variances from the training split
gnb.fit(X_train, y_train)

In [9]:
# Inspect the first held-out sample (one row of 5 feature values)
X_test[0]

array([ 1.25093198, -1.06429142, -2.23823123, -0.91454716,  1.26128601])

In [4]:
# Predict a class label for every row of the held-out test set
y_pred = gnb.predict(X_test)

# Fraction of test labels that were predicted correctly
accuracy = gnb.accuracy(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.85


## Using the built-in scikit-learn implementation

In [5]:
# scikit-learn pieces: data generation, splitting, reference model, metric
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Same synthetic dataset as before: 1000 samples, 5 features, seed 42
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    random_state=42,
)

# Same 70/30 train/test split (seeded)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Reference implementation.
# NOTE: this rebinds `gnb`, which previously held the from-scratch model.
gnb = GaussianNB()

# Train on the training split
gnb.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = gnb.predict(X_test)

# Share of correct predictions, for comparison with the from-scratch model
accuracy = accuracy_score(y_test, y_pred)
print(f'Gaussian Naive Bayes Accuracy: {accuracy}')


Gaussian Naive Bayes Accuracy: 0.85


### It is used when data is not in continuous nature!