In [1]:
import pandas as pd

In [3]:
#Load the dataset
URL = "./SMSSpamCollection"
df = pd.read_csv(URL, sep='\t', header=None)
df.columns = ['label', 'sms_message']

# map the label to 0 and 1
df['label'] = df['label'].map({'ham':0, 'spam':1})

#convert message to lower case
df['sms_message'] = df['sms_message'].str.lower()
df.head()

Unnamed: 0,label,sms_message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from collections import defaultdict
import math

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], df['label'], test_size=0.3, random_state=42)

In [7]:
class NaiveBayesClassifier:
    def __init__(self):
        self.class_prob = {}
        self.word_prob = defaultdict(lambda: defaultdict(lambda: 1))
        self.vocabulary = set()
        self.total_docs = 0
        self.total_words = defaultdict(int)
        self.doc_count = defaultdict(int)
    
    def fit(self, X, y):
        # Count document occurrences and words
        for doc, label in zip(X, y):
            self.total_docs += 1
            self.doc_count[label] += 1
            words = doc.split()
            self.total_words[label] += len(words)
            
            for word in words:
                self.vocabulary.add(word)
                self.word_prob[label][word] += 1
        
        # Calculate prior probabilities
        for label in self.doc_count:
            self.class_prob[label] = self.doc_count[label] / self.total_docs
    
    def predict(self, X):
        predictions = []
        for doc in X:
            log_probs = {}
            words = doc.split()
            for label in self.doc_count:
                log_prob = math.log(self.class_prob[label])
                for word in words:
                    word_freq = self.word_prob[label][word]
                    word_prob = word_freq / self.total_words[label]
                    log_prob += math.log(word_prob)
                
                log_probs[label] = log_prob
            
            # Choose the label with the highest log probability
            predictions.append(max(log_probs, key=log_probs.get))
        
        return predictions

In [8]:
# Train and evaluate the model
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

# Evaluate accuracy
accuracy = sum([1 for true, pred in zip(y_test, y_pred) if true == pred]) / len(y_test)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9157
