In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the stemmed body and subject feature tables.
# Forward slashes are used because "\d" is not a valid string escape (newer
# Python versions warn about it) and "/" works as a separator on every platform,
# including Windows.
bodies_df = pd.read_csv("emails/dbworld_bodies_stemmed.csv")
subjects_df = pd.read_csv("emails/dbworld_subjects_stemmed.csv")

In [None]:
# Preview the first five rows of bodies_df
bodies_df.head(n=5)

In [None]:
# Preview the first five rows of subjects_df
subjects_df.head(n=5)

In [None]:
# Summary statistics of bodies_df, computed separately for each value of CLASS
bodies_df.groupby(by="CLASS").describe()

In [None]:
# Summary statistics of subjects_df, computed separately for each value of CLASS
subjects_df.groupby(by="CLASS").describe()

In [None]:
# Train test split
# A fixed seed makes the split (and therefore every F1 score below) reproducible
# across Restart & Run All; without it each run evaluates on a different test set.
RANDOM_STATE = 42

# Features are every column except the first and the last; the label is the
# last column, kept as a one-column DataFrame.
bodies_X = bodies_df.iloc[:, 1:-1]
bodies_y = bodies_df.iloc[:, -1:]
bodies_X_train, bodies_X_test, bodies_y_train, bodies_y_test = train_test_split(
    bodies_X, bodies_y, test_size=0.2, random_state=RANDOM_STATE
)

subjects_X = subjects_df.iloc[:, 1:-1]
subjects_y = subjects_df.iloc[:, -1:]
subjects_X_train, subjects_X_test, subjects_y_train, subjects_y_test = train_test_split(
    subjects_X, subjects_y, test_size=0.2, random_state=RANDOM_STATE
)

In [None]:
def F1_Score(y_pred, y_true):
    """Compute the F1 score of binary predictions, treating label 1 as positive.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels (0 or 1). Lists, 1-D arrays and single-column
        ``(n, 1)`` arrays are all accepted; the input is flattened.
    y_true : array-like
        Ground-truth labels, same number of elements as ``y_pred``.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``. Returns ``0.0``
        when there are no true positives, where precision, recall or their
        sum would otherwise be a division by zero.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Flatten both inputs so element-wise comparison is always 1-D vs 1-D,
    # e.g. when y_true comes from a one-column DataFrame's .to_numpy().
    pred = np.asarray(y_pred).ravel()
    true = np.asarray(y_true).ravel()
    if pred.shape != true.shape:
        raise ValueError(
            f"y_pred and y_true must have the same length, got {pred.size} and {true.size}"
        )

    # Only the positive-class counts are needed for F1; true negatives are not.
    tp = int(np.sum((pred == 1) & (true == 1)))
    fp = int(np.sum((pred == 1) & (true == 0)))
    fn = int(np.sum((pred == 0) & (true == 1)))

    # With no true positives either a denominator is zero or precision and
    # recall are both zero; report 0.0 instead of raising ZeroDivisionError.
    if tp == 0:
        return 0.0

    pre = tp / (tp + fp)
    rec = tp / (tp + fn)
    return (2 * pre * rec) / (pre + rec)

In [None]:
# Naive Bayes Classifier
class NaiveBayes:
    """Naive Bayes classifier with additive (Laplace) smoothing.

    ``fit`` estimates a prior per class and a smoothed probability per feature
    and class; ``predict`` scores each sample in log space and returns the
    class with the highest score.

    Parameters
    ----------
    k : number
        Smoothing constant added to every per-class feature sum.
    """

    def __init__(self, k):
        # Prior probability of each class, keyed by class label.
        self.class_probs = {}
        # Smoothed feature probabilities (a pandas Series indexed by column
        # name), keyed by class label.
        self.feature_probs = {}
        # Unique labels seen by fit(); stays None until fit() has been called.
        self.classes = None
        self.k = k

    def fit(self, X, y):
        """Estimate class priors and smoothed feature probabilities.

        Parameters
        ----------
        X : pandas.DataFrame
            One row per sample, one column per feature.
        y : array-like
            One label per row of ``X``. Lists, Series, 1-D arrays and
            single-column ``(n, 1)`` arrays are accepted.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly one label per row of ``X``.
        """
        # Coerce to a flat ndarray: with a plain list, `y == cls` would be a
        # single False and every class would silently end up with no rows.
        y = np.asarray(y).ravel()
        if len(y) != len(X):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )

        # Reset so that refitting does not keep entries from an earlier fit.
        self.class_probs = {}
        self.feature_probs = {}
        self.classes = np.unique(y)

        # for every possible classification output, do this
        for cls in self.classes:
            # Positional row indices of the samples labelled with this class
            cls_indices = np.where(y == cls)[0]
            # Fraction of training samples that carry this label
            self.class_probs[cls] = len(cls_indices) / len(y)

            # Laplacian smoothing for feature probabilities, using the equation and algorithm discussed in class.
            # NOTE(review): the denominator uses the number of rows in the class,
            # whereas the multinomial estimate (as in sklearn's MultinomialNB)
            # normalises by the class's total feature count - confirm this is
            # the intended formula.
            self.feature_probs[cls] = (X.iloc[cls_indices, :].sum(axis=0) + self.k) / (len(cls_indices) + self.k * len(X.columns))

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Samples to classify; columns are matched to the fitted features
            by label.

        Returns
        -------
        list
            One predicted label per row, in row order.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.classes is None:
            raise RuntimeError("NaiveBayes.predict() called before fit()")

        # The logs of the fitted parameters do not depend on the sample, so
        # compute them once rather than once per row and class.
        log_priors = {cls: np.log(self.class_probs[cls]) for cls in self.classes}
        log_feature_probs = {cls: np.log(self.feature_probs[cls]) for cls in self.classes}

        preds = []
        for _, sample in X.iterrows():
            # Score in log space: log P(class) + sum_j x_j * log P(feature_j | class).
            # Working with logs turns the product of probabilities into a sum.
            scores = {
                cls: log_priors[cls] + log_feature_probs[cls].multiply(sample).sum()
                for cls in self.classes
            }
            # Pick the class with the highest score
            preds.append(pd.Series(scores).idxmax())

        return preds

In [None]:
# Evaluate the hand-written NaiveBayes (k=1) on the body features
bodies_bayes = NaiveBayes(k=1)
bodies_bayes.fit(
    bodies_X_train,
    bodies_y_train.to_numpy().ravel(),  # flatten the one-column label frame to 1-D
)
bodies_bayes_preds = bodies_bayes.predict(bodies_X_test)
print(f"F1 Score: {F1_Score(bodies_bayes_preds, bodies_y_test.to_numpy())}")

In [None]:
# Evaluate the hand-written NaiveBayes (k=1) on the subject features
subjects_bayes = NaiveBayes(k=1)
subjects_bayes.fit(
    subjects_X_train,
    subjects_y_train.to_numpy().ravel(),  # flatten the one-column label frame to 1-D
)
subjects_bayes_preds = subjects_bayes.predict(subjects_X_test)
print(f"F1 Score: {F1_Score(subjects_bayes_preds, subjects_y_test.to_numpy())}")

In [None]:
# Baseline: scikit-learn's MultinomialNB on the same body split.
# Note: this rebinds bodies_bayes and bodies_bayes_preds, replacing the
# hand-written model and its predictions.
bodies_bayes = MultinomialNB()
bodies_bayes.fit(
    bodies_X_train,
    bodies_y_train.to_numpy().ravel(),  # flatten the one-column label frame to 1-D
)
bodies_bayes_preds = bodies_bayes.predict(bodies_X_test)
print(f"F1 Score: {F1_Score(bodies_bayes_preds, bodies_y_test.to_numpy())}")

In [None]:
# Baseline: scikit-learn's MultinomialNB on the same subject split.
# Note: this rebinds subjects_bayes and subjects_bayes_preds, replacing the
# hand-written model and its predictions.
subjects_bayes = MultinomialNB()
subjects_bayes.fit(
    subjects_X_train,
    subjects_y_train.to_numpy().ravel(),  # flatten the one-column label frame to 1-D
)
subjects_bayes_preds = subjects_bayes.predict(subjects_X_test)
print(f"F1 Score: {F1_Score(subjects_bayes_preds, subjects_y_test.to_numpy())}")