In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists. Consider making it configurable.
DATA_PATH = 'C:/Users/bhatt/OneDrive/Desktop/whisper-main/Assignments/spam_and_ham_classification.csv/spam_and_ham_classification.csv'
# Only the first N_ROWS rows of the CSV are read.
N_ROWS = 1000

data = pd.read_csv(DATA_PATH, nrows=N_ROWS)

In [3]:
# Raw message texts and their class labels, taken from the 'text' and
# 'label' columns of the loaded frame.
X = data['text'].values
Y = data['label'].values

# Bag-of-words token counts, converted to a dense ndarray because the
# classifier functions below index columns of a plain NumPy array.
# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split, so test-set tokens influence the feature space.
vectorizer = CountVectorizer()
X_vectorised = vectorizer.fit_transform(X).toarray()

# Hold out a fixed fraction of rows for evaluation; the seed keeps the
# split reproducible across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 100
X_train, X_test, Y_train, Y_test = train_test_split(
    X_vectorised,
    Y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

def prior_prob(Y_train, label):
    """Return the fraction of entries in ``Y_train`` that equal ``label``.

    Parameters
    ----------
    Y_train : numpy.ndarray
        Training labels; the length of its first axis is the denominator.
    label : scalar
        Class value to count.

    Returns
    -------
    Count of matching labels divided by the number of training labels.
    """
    n_samples = Y_train.shape[0]
    n_matching = np.sum(Y_train == label)
    return n_matching / n_samples

def likelihood(X_train, Y_train, feat_col, feat_val, label):
    """Estimate P(feature ``feat_col`` == ``feat_val`` | class ``label``).

    The estimate is the share of training rows of class ``label`` whose
    value in column ``feat_col`` is exactly ``feat_val``.

    Parameters
    ----------
    X_train : numpy.ndarray
        2-D feature matrix (rows are samples, columns are features).
    Y_train : numpy.ndarray
        Labels aligned with the rows of ``X_train``.
    feat_col : int
        Column index of the feature.
    feat_val : scalar
        Feature value to match exactly.
    label : scalar
        Class to condition on.

    Returns
    -------
    The matching fraction, or ``1`` when no training row has ``label``.
    """
    class_mask = Y_train == label
    class_size = np.sum(class_mask)

    # Column of the requested feature, restricted to rows of this class.
    feature_values = X_train[class_mask][:, feat_col]
    match_count = np.sum(feature_values == feat_val)

    if class_size == 0:
        # No rows of this class: fall back to a neutral factor of 1.
        return 1
    return match_count / class_size

def posterior(X_train, Y_train, X_test):
    """Score one sample against every class and return the winning label.

    For each class the per-feature likelihood is the fraction of training
    rows of that class whose feature value equals the sample's value (the
    same quantity ``likelihood`` computes one column at a time, evaluated
    here for all columns at once). The class prior is the fraction of
    training rows belonging to the class.

    Parameters
    ----------
    X_train : numpy.ndarray
        2-D training feature matrix (rows are samples).
    Y_train : numpy.ndarray
        Training labels aligned with the rows of ``X_train``.
    X_test : array-like
        1-D feature vector of a single sample; only the first
        ``X_train.shape[1]`` entries are used.

    Returns
    -------
    (label, post_prob)
        ``label`` is the predicted class value taken from
        ``np.unique(Y_train)`` (not its positional index), and
        ``post_prob`` is the list of unnormalised posterior products, one
        per class in ``np.unique(Y_train)`` order.
    """
    classes = np.unique(Y_train)
    n_total = Y_train.shape[0]
    n_features = X_train.shape[1]
    sample = np.asarray(X_test)[:n_features]

    post_prob = []
    log_scores = []

    for label in classes:
        # Select this class's rows once per class instead of once per
        # feature; the mask does not depend on the feature index.
        X_label = X_train[Y_train == label]
        n_label = X_label.shape[0]

        # Per-feature match fractions for all columns in one pass.
        feat_probs = np.sum(X_label == sample, axis=0) / n_label
        prior = n_label / n_total

        post_prob.append(np.prod(feat_probs) * prior)

        # The raw product underflows to 0.0 for wide feature vectors, which
        # would make every class tie. Compare classes on the sum of logs
        # instead; log(0) = -inf is expected for unseen feature values.
        with np.errstate(divide='ignore'):
            log_scores.append(np.sum(np.log(feat_probs)) + np.log(prior))

    # Map the winning position back to the actual class value so that
    # non-integer or non-contiguous labels are predicted correctly.
    best = int(np.argmax(log_scores))
    return classes[best], post_prob

def predict(X_train, Y_train, X_test):
    """Run ``posterior`` on every row of ``X_test`` and collect the results.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training feature matrix, passed through to ``posterior``.
    Y_train : numpy.ndarray
        Training labels, passed through to ``posterior``.
    X_test : numpy.ndarray
        2-D matrix of samples to classify, one per row.

    Returns
    -------
    numpy.ndarray
        The first element of ``posterior``'s return value for each row,
        in row order.
    """
    n_queries = X_test.shape[0]
    return np.array([
        posterior(X_train, Y_train, X_test[row])[0]
        for row in range(n_queries)
    ])

# Classify every row of the held-out split with the functions defined above.
Y_pred = predict(X_train, Y_train, X_test)

acc = accuracy_score(Y_test, Y_pred)

print('Accuracy:', acc)
# classification_report returns a multi-line, column-aligned table. Start it
# on its own line so the title does not shift the table's header row.
print('Classification Report', classification_report(Y_test, Y_pred), sep='\n')

KeyboardInterrupt: 