<a href="https://colab.research.google.com/github/Arcturus1804/Machine-Learning/blob/main/Email_Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, issparse

class NaiveBayes:
    """Naive Bayes classifier that scores classes by exact feature-value matches.

    For each class, ``P(feature_i == value | class)`` is estimated as the
    fraction of that class's training rows whose column ``i`` equals the
    observed value.  Those fractions are combined with the class prior to pick
    the most probable class.  No smoothing is applied, so a value never seen
    together with a class gives that class a probability of zero.

    Attributes:
        X: training features as a SciPy CSR matrix (bin indices if binning is on).
        y: copy of the training labels.
        binning: whether features are discretised into quartile bins.
        bins: ``{column index: bin edges}`` learned by ``binvals``.
        priors: ``{label: fraction of training rows with that label}``.
        prior_indices: ``{label: positional row indices of that label}``.
        conditionals: empty dict kept for backward compatibility; never filled.
    """

    def __init__(self, X, y, binning=False):
        """Store the training data and compute the class priors.

        Args:
            X: 2-D feature matrix, dense array-like or SciPy sparse.  It is
                copied, so the caller's object is never modified.
            y: labels, one per row of ``X``; must provide ``.copy()``.
            binning: when true, every column is replaced by its quartile bin
                index and the bin edges are remembered for ``predict``.
        """
        self.X = X.copy().tocsr() if issparse(X) else csr_matrix(X)
        self.y = y.copy()
        self.bins = {}
        if binning:
            self.binvals(self.X)
        self.binning = binning
        self.conditionals = {}
        self.priors = {}
        self.prior_indices = {}
        self.get_priors()

    def binvals(self, values):
        """Discretise every column of ``values`` in place into quartile bins.

        The bin edges of column ``i`` are stored in ``self.bins[i]`` and the
        column is overwritten with the bin index of each entry.

        Args:
            values: sparse matrix supporting column slicing and assignment.
        """
        for i in range(values.shape[1]):
            # Densify the column once; it feeds both the edge computation and the cut.
            column = values[:, i].toarray().flatten()
            self.bins[i] = pd.qcut(column, 4, retbins=True)[1]
            values[:, i] = pd.cut(
                column, self.bins[i], labels=False, include_lowest=True
            )

    def get_priors(self):
        """Fill ``self.priors`` and ``self.prior_indices`` from ``self.y``."""
        # np.asarray makes the comparison positional for lists and pandas Series alike.
        labels = np.asarray(self.y)
        for label in np.unique(labels):
            indices = np.where(labels == label)[0]
            self.prior_indices[label] = indices
            self.priors[label] = indices.shape[0] / labels.shape[0]

    def _class_columns(self, y_val):
        """Return the training rows of class ``y_val`` as a CSC matrix."""
        return self.X[self.prior_indices[y_val]].tocsc()

    @staticmethod
    def _as_dense_row(observation):
        """Flatten a single dense or sparse observation into a 1-D array."""
        if issparse(observation):
            return observation.toarray().ravel()
        return np.asarray(observation).ravel()

    @staticmethod
    def _match_fractions(class_columns, row):
        """Return, per feature, the fraction of class rows equal to ``row[i]``."""
        class_size = class_columns.shape[0]
        fractions = np.empty(row.shape[0], dtype=float)
        for i, value in enumerate(row):
            column = class_columns[:, i].toarray().ravel()
            # Normalise by the class size (not the whole training set) so the
            # result is a genuine conditional probability P(x_i | class).
            fractions[i] = np.count_nonzero(column == value) / class_size
        return fractions

    def get_conditionals(self, observation, y_val):
        """Return the likelihood of one observation under class ``y_val``.

        Args:
            observation: a single observation (dense or sparse, shape
                ``(1, n_features)`` or flat), already binned if the model
                uses binning.
            y_val: a class label present in the training labels.

        Returns:
            float: the product over all features of ``P(x_i | y_val)``.  The
            class prior is not included.

        Raises:
            KeyError: if ``y_val`` is not a known class label.
        """
        row = self._as_dense_row(observation)
        fractions = self._match_fractions(self._class_columns(y_val), row)
        return float(np.prod(fractions))

    def predict(self, observations):
        """Predict the most probable class label for each observation.

        Args:
            observations: 2-D dense array-like or SciPy sparse matrix with the
                same number of columns as the training data.  It is never
                modified.

        Returns:
            numpy.ndarray: one predicted class label per observation, or
            ``None`` (after printing an error message) when the column count
            does not match the training data.
        """
        obs = observations.tocsr() if issparse(observations) else csr_matrix(observations)
        # Validate before binning: binning indexes self.bins per column and
        # would fail with a KeyError on a column the model never saw.
        if obs.shape[1] != self.X.shape[1]:
            print("Error: observation columns not the same rank as data X")
            return None

        if self.binning:
            # astype returns a copy, which protects the caller's data and lets
            # the matrix hold NaN for values outside the learned bin range.
            obs = obs.astype(float)
            for i in range(obs.shape[1]):
                obs[:, i] = pd.cut(
                    obs[:, i].toarray().flatten(), self.bins[i], labels=False, include_lowest=True
                )

        classes = list(self.priors)
        class_columns = [self._class_columns(label) for label in classes]
        log_priors = np.log([self.priors[label] for label in classes])

        predictions = []
        # Score in log space: a product of many small fractions underflows to
        # 0.0 for every class.  log(0) = -inf is the intended "impossible" score.
        with np.errstate(divide="ignore"):
            for r in range(obs.shape[0]):
                row = obs[r].toarray().ravel()
                scores = [
                    log_prior + np.log(self._match_fractions(columns, row)).sum()
                    for log_prior, columns in zip(log_priors, class_columns)
                ]
                # Map the winning position back to its class label.
                predictions.append(classes[int(np.argmax(scores))])
        return np.array(predictions)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# Inputs are taken from the 'text' column and targets from the 'label' column
# of `data` (defined outside this cell).
X, y = data['text'], data['label']

# Convert every document into a TF-IDF weighted feature row.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Keep 20% of the rows aside for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial naive Bayes model on the training rows, then label the held-out rows.
nb_classifier = MultinomialNB().fit(X_train, y_train)
predictions = nb_classifier.predict(X_test)

# Report the share of held-out rows labelled correctly.
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')
# Bare expression so the notebook displays the unrounded value as the cell result.
accuracy






Accuracy: 0.85


array([0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0])