<a href="https://colab.research.google.com/github/Demon-Sheriff/Linear-Alg_ML_fs/blob/master/Naive_bayes_Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [213]:
EPSILON = 1e-10  # Small value to avoid log(0)

In [214]:
import numpy as np

""" we can maintain each unique word in a trie and also store the frequency of each word class_wise [LATER]
    TODO:
        1. make all the array operations vectorized
        2. implement trie for word processing
"""

class NaiveBayesSpamClassifier:

    def __init__(self):
        self.class_priors = None
        self.feature_likelihoods = None
        self.n_classes = None

    def fit(self, X_train, y_train):
        X_train = np.array(X_train)
        y_train = np.array(y_train)

        self.n_classes = len(np.unique(y_train))
        n_samples, d = X_train.shape

        self.class_priors = np.zeros(self.n_classes)
        class_count = np.zeros(self.n_classes)

        for label in y_train:
            self.class_priors[label] += 1

        for idx, count in enumerate(self.class_priors):
            class_count[idx] = count
            self.class_priors[idx] /= n_samples

        self.feature_likelihoods = np.zeros((d, self.n_classes))

        for class_ in range(self.n_classes):
            class_indices = np.where(y_train == class_)[0]
            word_count_class = np.sum(X_train[class_indices], axis=0)

            self.feature_likelihoods[:, class_] = (word_count_class + 1) / (class_count[class_] + d)  # +d for vocabulary size
            self.feature_likelihoods[:, class_] = np.maximum(self.feature_likelihoods[:, class_], EPSILON)  # Avoid zero probabilities

        return self

    def predict_prob(self, X_test):
        X_test = np.array(X_test)
        n_samples, d = X_test.shape
        y_pred = []

        for i in range(n_samples):
            log_probs = []
            for label in range(self.n_classes):
                log_prior = np.log(self.class_priors[label])
                log_likelihood = np.sum(
                    X_test[i] * np.log(self.feature_likelihoods[:, label]) +
                    (1 - X_test[i]) * np.log(1 - self.feature_likelihoods[:, label])
                )

                log_likelihood = np.maximum(log_likelihood, EPSILON)
                log_probs.append(log_prior + log_likelihood)

            y_pred.append(np.argmax(log_probs))

        return y_pred


In [215]:
import pandas as pd

In [216]:
import kagglehub

# download latest version
path = kagglehub.dataset_download("venky73/spam-mails-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/venky73/spam-mails-dataset/versions/1


In [217]:
import os
print(os.listdir(path))

['spam_ham_dataset.csv']


In [218]:
data = pd.read_csv(os.path.join(path, "spam_ham_dataset.csv"))

In [219]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [220]:
data.shape

(5171, 4)

In [221]:
data.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
label,0
text,0
label_num,0


In [222]:
data['text']

Unnamed: 0,text
0,Subject: enron methanol ; meter # : 988291\r\n...
1,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,"Subject: photoshop , windows , office . cheap ..."
4,Subject: re : indian springs\r\nthis deal is t...
...,...
5166,Subject: put the 10 on the ft\r\nthe transport...
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,Subject: calpine daily gas nomination\r\n>\r\n...
5169,Subject: industrial worksheets for august 2000...


In [223]:
data[data['label_num'] == 1] # spam mails

Unnamed: 0.1,Unnamed: 0,label,text,label_num
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
7,4185,spam,Subject: looking for medication ? we ` re the ...,1
10,4922,spam,Subject: vocable % rnd - word asceticism\r\nvc...,1
11,3799,spam,Subject: report 01405 !\r\nwffur attion brom e...,1
13,3948,spam,Subject: vic . odin n ^ ow\r\nberne hotbox car...,1
...,...,...,...,...
5159,4381,spam,Subject: pictures\r\nstreamlined denizen ajar ...,1
5161,4979,spam,Subject: penny stocks are about timing\r\nnoma...,1
5162,4162,spam,Subject: anomaly boys from 3881\r\nuosda apapr...,1
5164,4365,spam,Subject: slutty milf wants to meet you\r\ntake...,1


In [224]:
data[data['label_num'] == 0] # non-spam mails

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
5,2949,ham,Subject: ehronline web address change\r\nthis ...,0
...,...,...,...,...
5165,2849,ham,"Subject: fw : crosstex energy , driscoll ranch...",0
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0


In [225]:
data.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


In [226]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [227]:
rand = np.random.randint(0, 5721)
rand

173

In [228]:
import numpy as np

temp_mails = []

for i in range(5):
  rand = np.random.randint(0, 5721)
  if rand != 0:
    rand -= 1

  print(data['text'].iloc[rand])
  print("================================================================================================")
  # np.append(temp_mails, data['text'].iloc[rand])
  temp_mails.append(data['text'].iloc[rand])
  # print(data['label_num'].iloc[rand]

# data['text'].iloc[4]

Subject: meter # 0989814 - gmt , inc . - 1 / 00 production
stephanie ,
the problem with the january payment is that the deal is set up incorrectly
in sitara . for january production deal 153864 shows hplc purchasing gas
from gmt company inc , not gmt inc . someone needs to either correct the deal
for 1 / 00 or put in a new deal , then i can do an accounting arrangement to
make the necessary correction . it is correct for 2 / 00 on deal 156292 . i
emailed daren on this because i see his name in sitara as the contact , but i
am not sure who actually takes care of it .
- - - - - - - - - - - - - - - - - - - - - - forwarded by sherlyn schumack / hou / ect on 03 / 22 / 2000
10 : 27 am - - - - - - - - - - - - - - - - - - - - - - - - - - -
from : thu nguyen 03 / 20 / 2000 03 : 07 pm
to : sherlyn schumack / hou / ect @ ect
cc : stephanie gomes / hou / ect @ ect
subject : meter # 0989814 - gmt , inc . - 1 / 00 production
sherlyn ,
i believe this is your meter . . . . . . . .
- -

In [229]:
# preparing the data pipeline for word preprocessing.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

In [230]:
temp_mails[0]



In [231]:
print(temp_mails[0])

Subject: meter # 0989814 - gmt , inc . - 1 / 00 production
stephanie ,
the problem with the january payment is that the deal is set up incorrectly
in sitara . for january production deal 153864 shows hplc purchasing gas
from gmt company inc , not gmt inc . someone needs to either correct the deal
for 1 / 00 or put in a new deal , then i can do an accounting arrangement to
make the necessary correction . it is correct for 2 / 00 on deal 156292 . i
emailed daren on this because i see his name in sitara as the contact , but i
am not sure who actually takes care of it .
- - - - - - - - - - - - - - - - - - - - - - forwarded by sherlyn schumack / hou / ect on 03 / 22 / 2000
10 : 27 am - - - - - - - - - - - - - - - - - - - - - - - - - - -
from : thu nguyen 03 / 20 / 2000 03 : 07 pm
to : sherlyn schumack / hou / ect @ ect
cc : stephanie gomes / hou / ect @ ect
subject : meter # 0989814 - gmt , inc . - 1 / 00 production
sherlyn ,
i believe this is your meter . . . . . . . .
- -

In [232]:
import re
regex = '\b[a-zA-Z]+\b'

text = temp_mails[0]
text = text.lower()
words = re.findall(r"\b[a-z]+\b", text, re.IGNORECASE)

In [233]:
words

['subject',
 'meter',
 'gmt',
 'inc',
 'production',
 'stephanie',
 'the',
 'problem',
 'with',
 'the',
 'january',
 'payment',
 'is',
 'that',
 'the',
 'deal',
 'is',
 'set',
 'up',
 'incorrectly',
 'in',
 'sitara',
 'for',
 'january',
 'production',
 'deal',
 'shows',
 'hplc',
 'purchasing',
 'gas',
 'from',
 'gmt',
 'company',
 'inc',
 'not',
 'gmt',
 'inc',
 'someone',
 'needs',
 'to',
 'either',
 'correct',
 'the',
 'deal',
 'for',
 'or',
 'put',
 'in',
 'a',
 'new',
 'deal',
 'then',
 'i',
 'can',
 'do',
 'an',
 'accounting',
 'arrangement',
 'to',
 'make',
 'the',
 'necessary',
 'correction',
 'it',
 'is',
 'correct',
 'for',
 'on',
 'deal',
 'i',
 'emailed',
 'daren',
 'on',
 'this',
 'because',
 'i',
 'see',
 'his',
 'name',
 'in',
 'sitara',
 'as',
 'the',
 'contact',
 'but',
 'i',
 'am',
 'not',
 'sure',
 'who',
 'actually',
 'takes',
 'care',
 'of',
 'it',
 'forwarded',
 'by',
 'sherlyn',
 'schumack',
 'hou',
 'ect',
 'on',
 'am',
 'from',
 'thu',
 'nguyen',
 'pm',
 'to',
 

# Using Count Vectoriser for Encoding

In [234]:
from sklearn.pipeline import make_pipeline

In [235]:
vectorizer = CountVectorizer()
vectorizer.fit(words)
print(f"Vocabulary: {vectorizer.vocabulary_}")
vector = vectorizer.transform([text])
print(vector.toarray())

[[ 1  1  1  2  1  1  1  1  1  3  1 11  4  1  1  8  1  2  4  1  6  4  1  1
   2  1  3  1  1  2  1  1  2  1  8  2 11  4  1  1  1 11  1  1  2  1  1  2
   7  2  1  9  2  3  1  1  3  5  1  6  1  2  1  1  2  1  1 55  2  1  1  1
   1 18  1  1  1  1  1 13  1 11 10  5  1  2 18 29  3  4  1  4  5  1  3  4
   1  1  1  1  1  1 31  3  1  5  6 20  1  1  1  1  1  1  1 10  1  9  1  1
   4  3  1  1  2  1  1  1  1  1  2  1  3  4 12  2  1  2  1  3  7  1  2  2
   2  1  2  3  9  1  4  7  2  4  1 17  1  1  3  1  1  2  1  1  1  3  1  1
   1  1  8 12  1  1  1  1  8  1  1  1  1  9  1  1  1  1  1  6  4  2  2  1
   1  1  1  3  3  1  1  4  2  1  1  1  1  6 12  2  1  1  2  7 30  1  3  1
   2  1 11  1  4 10 31  1  1  1  1  1  1  1  1  1  1  1  2  1  1  2  1  7
   1  7 13  4  2  1  1  4  1  3  1  1 12  4]]


In [236]:
import re
regex = '\b[a-zA-Z]+\b'

text = temp_mails[0]
words = re.findall(r"\b[a-z]+\b", text, re.IGNORECASE)

def clean_text(text):
  text = text.lower()
  text = re.findall(r"\b[a-z]+\b", text, re.IGNORECASE)
  text = " ".join(text)
  return text

clean_text(text)



In [237]:
document = []
for i in range(data.shape[0]):
  text = data['text'].iloc[i]
  text = clean_text(text)
  document.append(text)

In [238]:
# split the dataset
from sklearn.model_selection import train_test_split

y = data['label_num']
X_train, X_test, y_train, y_test = train_test_split(document, y, test_size=0.2, random_state=42)

In [239]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

pipeline = Pipeline([
    ('cleaner', FunctionTransformer(lambda x: [clean_text(doc) for doc in document])),
    ('vectoriser', CountVectorizer()),
])

In [240]:
temp_set = X_train[:3]

In [241]:
# pipeline.fit_transform(temp_set)

v = CountVectorizer()
v.fit(temp_set)
temp_set = v.transform(temp_set)

In [242]:
temp_set.toarray()

array([[ 1,  2,  0,  0,  1,  0,  2,  0,  3,  2,  0,  2,  2,  0,  1,  0,
         1,  0,  2,  0,  1,  0,  1,  0,  1,  0,  1,  1,  1,  0,  2,  1,
         0,  3,  1,  1,  1,  1,  1,  0,  3,  0,  4,  0,  0,  2,  0,  0,
         0,  1,  0,  0,  1,  0,  0,  0,  0,  0,  0,  3,  1,  1,  0,  0,
         0,  0,  0,  1,  0,  0,  1,  1,  1,  1,  0,  1,  4,  0,  0,  4,
         0,  1,  0,  2,  0,  1,  2,  0,  0,  1,  0,  1,  0,  2,  4,  0,
         0,  1,  1,  0,  1,  1,  1,  2,  1,  0,  0,  0,  0,  0,  1,  1,
         0,  1,  0,  0,  0,  0,  1,  1,  1,  0,  1,  0,  4,  1,  1,  0,
         1,  1,  1,  1,  1, 10,  3,  0,  1,  1,  6,  0,  0,  0,  0,  0,
         1,  0,  0,  1,  0,  0,  0,  0,  1,  0,  0,  1,  0,  1,  2,  0,
         1,  0,  2,  0,  8],
       [ 0,  0,  0,  1,  0,  3,  3,  0,  0,  0,  1,  0,  0,  0,  0,  2,
         0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,
         1,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,  0,  0,
         0,  0,  1,  1,  0,  2,  1,

In [243]:
y_train[:3]

Unnamed: 0,label_num
5132,0
2067,1
4716,0


In [244]:
clf = NaiveBayesSpamClassifier()
clf.fit(temp_set.toarray(), y_train[:3])

<__main__.NaiveBayesSpamClassifier at 0x7ec04c7632b0>

In [245]:
len(X_train[:3][2])

296

In [246]:
clf.predict(temp_set.toarray())

[0, 0, 0]

In [250]:
cv = CountVectorizer()
cv.fit(X_train)
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

In [251]:
y_train

Unnamed: 0,label_num
5132,0
2067,1
4716,0
4710,0
2268,1
...,...
4426,0
466,0
3092,1
3772,0


In [252]:
clf.fit(X_train.toarray(), y_train)

<__main__.NaiveBayesSpamClassifier at 0x7ec04c7632b0>

In [253]:
y_pred = clf.predict(X_test.toarray())

In [255]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"accuracy: {accuracy}")
print(f"precision: {precision}")

accuracy: 0.7169082125603865
precision: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [256]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[742,   0],
       [293,   0]])

In [258]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

clf = MultinomialNB()
clf.fit(X_train, y_train)

In [259]:
y_pred = clf.predict(X_test)

In [261]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall= recall_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred)

print(f"accuracy: {accuracy}")
print(f"precision: {precision}")
print(f"recall: {recall}")
print(f"f1_score: {f1_score}")

accuracy: 0.9758454106280193
precision: 0.9685314685314685
recall: 0.9453924914675768
f1_score: 0.9568221070811744


# The custom made model is underperforming gotta work it out.