In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
dataset = pd.read_csv("emails.csv")
print(dataset.shape)

(5728, 2)


In [5]:
random_samples = dataset.sample(10)
print(random_samples)

                                                   text  label
1141  Subject: 5000 full color postcards for $ 329  ...      1
3129  Subject: re : price processes for ng  grant & ...      0
785   Subject: any med for your girl to be happy !  ...      1
2599  Subject: great divide lodge  vince and shirley...      0
4529  Subject: earth day - trash bash  i hardly know...      0
3138  Subject: risk boston  please read the attached...      0
550   Subject: renew your vitality  for the first ti...      1
5171  Subject: the garp 2001 convention : gentle rem...      0
328   Subject: just to her . . .  your message to tj...      1
1250  Subject: re :  good day ,  everybody will love...      1


In [6]:
spam = dataset[dataset["label"] == 1]
ham = dataset[dataset["label"] == 0]
print("Spam Percentage =", (len(spam) / len(dataset)) * 100, "%")
print("Ham Percentage =", (len(ham) / len(dataset)) * 100, "%")

Spam Percentage = 23.88268156424581 %
Ham Percentage = 76.11731843575419 %


In [7]:
# Splitting the data into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(
    dataset["text"], dataset["label"], test_size=0.3, random_state=42, shuffle=True
)

# Checking the sizes of train and test sets
print("Train data size:", train_data.shape[0])
print("Test data size:", test_data.shape[0])

Train data size: 4009
Test data size: 1719


In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import re


# Step 1: Data Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    return text


# Preprocess train emails
preprocessed_train_emails = [preprocess_text(text) for text in train_data]

# Preprocess test emails
preprocessed_test_emails = [preprocess_text(text) for text in test_data]

preprocessed_train_emails[0]

'subject volatility curves  linked from reuters  hi tanya   attached are the live reuters  linked volatility curves  please don  t  re  establish links  as i don  t think that your telerate connection works in  the same way as ours in london  i will get back to you on cleaning up the  historical forward curve database as i complete each metal  we can talk at  5 pm as we agreed   regards   anjam  p  s  i think the fast dial is  830 5383 or 830 35383'

In [9]:
# Step 2: Building the Vocabulary
vocabulary = set()
for email in preprocessed_train_emails:
    words = word_tokenize(email)  # Tokenization using NLTK
    vocabulary.update(words)

In [10]:
# Step 3: Calculating Prior Probabilities
spam_count = sum(1 for label in train_labels if label == 1)
ham_count = sum(1 for label in train_labels if label == 0)
total_count = len(train_labels)

prior_spam = spam_count / total_count
prior_ham = ham_count / total_count
print("total_count=", total_count)
print("prior_spam=", prior_spam)
print("prior_ham=", prior_ham)

total_count= 4009
prior_spam= 0.23122973310052383
prior_ham= 0.7687702668994761


In [11]:
# Step 4: Calculating Conditional Probabilities
spam_word_counts = FreqDist()
ham_word_counts = FreqDist()

for email, label in zip(preprocessed_train_emails, train_labels):
    words = word_tokenize(email)
    if label == 1:
        spam_word_counts.update(words)
    else:
        ham_word_counts.update(words)

spam_word_counts

FreqDist({'_': 9365, 'the': 6466, 'to': 5614, 'and': 4645, 'of': 4022, 'you': 3394, 'a': 3257, 'in': 2787, 'your': 2534, 'for': 2216, ...})

In [13]:
import math


# Step 5: Implementing the Naive Bayes Classifier
def calculate_conditional_probabilities(word_counts):
    # Calculate conditional probabilities with Laplace smoothing
    probabilities = {}
    vocab_size = len(vocabulary)
    total_count = sum(word_counts.values())

    for word in vocabulary:
        count = word_counts[word]
        probability = (count + 1) / (total_count + vocab_size)
        probabilities[word] = probability

    return probabilities


spam_word_probs = calculate_conditional_probabilities(spam_word_counts)
ham_word_probs = calculate_conditional_probabilities(ham_word_counts)

In [14]:
def predict(email):
    words = word_tokenize(preprocess_text(email))
    log_prob_spam = math.log(prior_spam)
    log_prob_ham = math.log(prior_ham)

    for word in words:
        if word in vocabulary:
            log_prob_spam += math.log(spam_word_probs[word])
            log_prob_ham += math.log(ham_word_probs[word])

    return 1 if log_prob_spam > log_prob_ham else 0

In [15]:
# Step 6: Predict labels for test emails
predicted_labels = [predict(email) for email in preprocessed_test_emails]

# Calculate accuracy
correct_predictions = sum(
    1 for predicted, actual in zip(predicted_labels, test_labels) if predicted == actual
)
accuracy = correct_predictions / len(test_labels)

print("Accuracy:", accuracy)

Accuracy: 0.987783595113438
