In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read the labelled e-mail corpus; the path is resolved against the working directory.
DATA_PATH = "spam_ham_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Show the first rows as the cell output.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Stemmer shared by every clean_text call.
stemmer = PorterStemmer()

# Fetch the stopword corpus, then keep the English list as a set for fast
# membership tests.
nltk.download("stopwords")
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalize a raw e-mail into a list of stemmed tokens.

    Steps, in order: lowercase; strip the whole run of leading
    ``subject:`` / ``re:`` / ``fwd:`` markers; blank out e-mail addresses,
    URLs and every character that is not a lowercase letter or whitespace
    (this includes digits); split on whitespace; drop stopwords and
    one-character tokens; stem what is left.

    Uses the module-level ``stop_words`` collection and ``stemmer`` object.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example the NaN that
        pandas uses for a missing cell) is treated as an empty message.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty when nothing survives.
    """
    # A missing cell arrives as a float NaN, which has no .lower().
    if not isinstance(text, str):
        return []

    text = text.lower()

    # Markers stack up ("subject: re : fwd : ..."), so strip the entire
    # leading run instead of only the first one. The text is already
    # lowercase, so no IGNORECASE flag is needed.
    text = re.sub(r'^\s*(?:(?:subject|re|fwd)\s*:\s*)+', '', text)

    # remove emails
    text = re.sub(r'\S+@\S+', ' ', text)

    # remove urls
    text = re.sub(r'http\S+|www\.\S+', ' ', text)

    # remove everything that is not a letter or whitespace; digits fall in
    # this class too, so a separate number pass is unnecessary
    text = re.sub(r'[^a-z\s]', ' ', text)

    # tokenize, drop stopwords and single-character tokens, then stem
    return [
        stemmer.stem(word)
        for word in text.split()
        if word not in stop_words and len(word) > 1
    ]

# Tokenize every message and store the result in a new "tokens" column.
token_lists = df["text"].apply(clean_text)
df["tokens"] = token_lists

# Print raw text next to its tokens for the first three messages.
preview = df[["text", "tokens"]].head(3)
print(preview)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sabya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


                                                text  \
0  Subject: enron methanol ; meter # : 988291\r\n...   
1  Subject: hpl nom for january 9 , 2001\r\n( see...   
2  Subject: neon retreat\r\nho ho ho , we ' re ar...   

                                              tokens  
0  [enron, methanol, meter, follow, note, gave, m...  
1  [hpl, nom, januari, see, attach, file, hplnol,...  
2  [neon, retreat, ho, ho, ho, around, wonder, ti...  


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label so both parts keep the same class
# balance; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label_num"],
    random_state=42,
)

print("Train size:", len(train_df))
print("Test size:", len(test_df))

# Report the class proportions of each part.
for header, part in (
    ("Class distribution in train:", train_df),
    ("Class distribution in test:", test_df),
):
    print(header)
    print(part["label_num"].value_counts(normalize=True))

Train size: 4136
Test size: 1035
Class distribution in train:
label_num
0    0.710106
1    0.289894
Name: proportion, dtype: float64
Class distribution in test:
label_num
0    0.710145
1    0.289855
Name: proportion, dtype: float64


In [None]:
from collections import Counter

# Tally every token occurrence across the training messages only.
token_counts = Counter(tok for doc in train_df["tokens"] for tok in doc)

# Keep tokens whose total occurrence count reaches the threshold.
# Note: this counts occurrences, not the number of documents a token is in.
min_df = 2
kept_words = [word for word, freq in token_counts.items() if freq >= min_df]

# Alphabetical order gives every kept word a stable column index.
kept_words.sort()
vocab = dict(zip(kept_words, range(len(kept_words))))
vocab_size = len(vocab)

print("Vocabulary size:", vocab_size)
print("Sample vocab entries:", list(vocab.items())[:20])

Vocabulary size: 14901
Sample vocab entries: [('aa', 0), ('aaa', 1), ('aalland', 2), ('aarhu', 3), ('aaron', 4), ('ab', 5), ('aba', 6), ('ababa', 7), ('abacha', 8), ('aback', 9), ('abacu', 10), ('abacustech', 11), ('abandon', 12), ('abash', 13), ('abat', 14), ('abazi', 15), ('abb', 16), ('abba', 17), ('abbasi', 18), ('abbi', 19)]


In [None]:
import numpy as np

def vectorize(tokens, vocab):
    """Turn a token list into a bag-of-words count vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    vocab : dict
        Maps each known token to its column index.

    Returns
    -------
    numpy.ndarray
        int64 vector of length ``len(vocab)``; tokens missing from
        ``vocab`` are ignored.
    """
    counts = np.zeros(len(vocab), dtype=np.int64)
    for tok in tokens:
        col = vocab.get(tok)
        if col is None:
            # out-of-vocabulary token: contributes nothing
            continue
        counts[col] += 1
    return counts

def _count_matrix(frame):
    """Stack the count vector of every row's token list into one 2-D array."""
    return np.array([vectorize(doc, vocab) for doc in frame["tokens"]])


# Features and labels for the training part.
X_train, y_train = _count_matrix(train_df), train_df["label_num"].values

# Features and labels for the held-out part, using the same vocabulary.
X_test, y_test = _count_matrix(test_df), test_df["label_num"].values

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (4136, 14901)
Test shape: (1035, 14901)


In [None]:
class NaiveBayesMultinomial:
    """Multinomial Naive Bayes trained on a document-term count matrix.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive smoothing constant added to every word count.

    Attributes (all ``None`` until ``fit`` is called)
    -------------------------------------------------
    classes : ndarray
        Sorted unique labels seen in ``y``.
    class_priors : ndarray
        Fraction of training documents in each class, aligned with ``classes``.
    log_priors : ndarray
        Natural log of ``class_priors``.
    likelihoods : dict
        Maps each class label to the vector of log P(word | class).
    vocab_size : int
        Number of feature columns seen during ``fit``.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_priors = None
        self.log_priors = None  # declared here so the attribute always exists
        self.likelihoods = None
        self.vocab_size = None
        self.classes = None

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class word likelihoods.

        Parameters
        ----------
        X : array-like, shape (n_docs, n_features)
            Word counts per document.
        y : array-like, shape (n_docs,)
            Class label of each document.

        Returns
        -------
        NaiveBayesMultinomial
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` disagree on the number
            of documents.
        """
        # Accept plain lists as well as arrays.
        X = np.asarray(X)
        y = np.asarray(y)

        n_docs, n_features = X.shape
        if y.shape[0] != n_docs:
            raise ValueError(
                f"X has {n_docs} rows but y has {y.shape[0]} labels"
            )

        self.vocab_size = n_features
        self.classes = np.unique(y)

        # Priors: P(C) = count(C) / N
        class_counts = np.array([np.sum(y == c) for c in self.classes])
        self.class_priors = class_counts / n_docs
        self.log_priors = np.log(self.class_priors)

        # Likelihoods: P(word | C) with additive smoothing, stored as logs
        # so that prediction can sum instead of multiplying tiny numbers.
        self.likelihoods = {}
        for c in self.classes:
            word_count = np.sum(X[y == c], axis=0)  # per-word count in class c
            total_count = np.sum(word_count)        # all words in class c
            probs = (word_count + self.alpha) / (total_count + self.alpha * n_features)
            self.likelihoods[c] = np.log(probs)

        return self

In [None]:
class NaiveBayesMultinomial(NaiveBayesMultinomial):  # extend the previous definition
    """Adds ``predict`` on top of the class defined in the earlier cell.

    The class statement subclasses whatever ``NaiveBayesMultinomial`` is
    bound to when the cell runs, so re-running the cell adds another layer
    of inheritance on top of the previous one.
    """

    def predict(self, X):
        """Return the most probable class for each row of ``X``.

        Parameters
        ----------
        X : array-like, shape (n_docs, n_features) or (n_features,)
            Word counts. A single 1-D vector is treated as one document.

        Returns
        -------
        numpy.ndarray
            Predicted labels, one per document, taken from ``self.classes``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.likelihoods is None:
            raise RuntimeError("NaiveBayesMultinomial.predict called before fit")

        # Promote a lone count vector to a one-row matrix instead of
        # iterating over its individual entries.
        X = np.atleast_2d(np.asarray(X))

        # Rows follow the order of self.classes: shape (n_classes, n_features).
        log_likelihood = np.stack([self.likelihoods[c] for c in self.classes])

        # score[d, k] = log P(class k) + sum_w count[d, w] * log P(w | class k)
        scores = X @ log_likelihood.T + self.log_priors

        # argmax returns the first maximum, so ties go to the earlier class.
        return self.classes[np.argmax(scores, axis=1)]

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Fit the classifier on the training counts.
nb = NaiveBayesMultinomial(alpha=1.0)
nb.fit(X_train, y_train)

# Label the held-out messages.
y_pred = nb.predict(X_test)

# Overall accuracy, then per-class precision / recall / F1.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report = classification_report(y_test, y_pred, target_names=["ham", "spam"])
print(report)

Accuracy: 0.970048309178744
              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       735
        spam       0.94      0.95      0.95       300

    accuracy                           0.97      1035
   macro avg       0.96      0.97      0.96      1035
weighted avg       0.97      0.97      0.97      1035



In [None]:
# Classify one hand-written message with the trained model.
new_text = "win money"

# Same preprocessing and vocabulary as the training data; predict expects a
# 2-D input, hence the reshape to a single row.
tokens = clean_text(new_text)
vec = vectorize(tokens, vocab)
vec = vec.reshape(1, -1)

pred = nb.predict(vec)[0]
if pred == 1:
    predicted_label = "spam"
else:
    predicted_label = "ham"
print("Predicted class:", predicted_label)

Predicted class: spam
