In [1]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import string
import re

Load and filter SpamAssasin dataset
(link: https://www.kaggle.com/datasets/veleon/ham-and-spam-dataset)

In [2]:
def get_body(mail):
    if mail.is_multipart():
        for part in mail.get_payload():
            if part.is_multipart():
                return get_body(part)
            body = part.get_payload()
            if part.get_content_type() == "text/plain":
                return body
    else:
        return mail.get_payload()

In [3]:
import os
import email
hams_l = os.listdir('./hamnspam/ham')
hams = []
subjects = []
for f in hams_l:
    raw_mail = open(f'./hamnspam/ham/{f}', 'r').read()
    mail = email.message_from_string(raw_mail)
#     text = mail.get_payload()
    text = get_body(mail)
    text = text.lower()
    for c in string.punctuation+'0123456789\n':
        text = text.replace(c, ' ')
    for w in stopwords.words('english'):
        text = text.replace(' '+w+' ', ' ')
    tokens = re.split(' +', text)
    while '' in tokens:
        tokens.pop(tokens.index(''))
    hams.append(' '.join(tokens))
    text = re.search('Subject: *(\[.*\])* *(R[eE]:)* *(\[.*\])* *(.*)\n', raw_mail).groups()[-1]
    text = text.lower()
    for c in string.punctuation+'0123456789\n':
        text = text.replace(c, ' ')
    for w in stopwords.words('english'):
        text = text.replace(' '+w+' ', ' ')
    tokens = re.split(' +', text)
    subjects.append(' '.join(tokens))

In [4]:
hams_l = os.listdir('./hamnspam/spam')
spams = []
subjects_s = []
for f in hams_l:
    try:
        raw_mail = open(f'./hamnspam/spam/{f}', 'r').read()
    except:
        continue
    mail = email.message_from_string(raw_mail)
#     text = mail.get_payload()
    text = get_body(mail)
    if text == None:
        continue
    text = text.lower()
    for c in string.punctuation+'0123456789\n':
        text = text.replace(c, ' ')
    for w in stopwords.words('english'):
        text = text.replace(' '+w+' ', ' ')
    tokens = re.split(' +', text)
    while '' in tokens:
        tokens.pop(tokens.index(''))
    spams.append(' '.join(tokens))
    m = re.search('Subject: *(\[.*\])* *(R[eE]:)* *(\[.*\])* *(.*)\n', raw_mail)
    if m != None:
        text = m.groups()[-1]
    else:
        text = ' '
    text = text.lower()
    for c in string.punctuation+'0123456789\n':
        text = text.replace(c, ' ')
    for w in stopwords.words('english'):
        text = text.replace(' '+w+' ', ' ')
    tokens = re.split(' +', text)
    subjects_s.append(' '.join(tokens))

In [5]:
df = pd.DataFrame(data={'subject':subjects+subjects_s, 'text':hams+spams, 'class':[0]*len(hams)+[1]*len(spams)})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3021 entries, 0 to 3020
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  3021 non-null   object
 1   text     3021 non-null   object
 2   class    3021 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 70.9+ KB


In [7]:
print(f'spam: {len(df[df["class"]==1])}')
print(f'not spam: {len(df[df["class"]==0])}')

spam: 470
not spam: 2551


In [8]:
df = df.sample(frac=1)
df

Unnamed: 0,subject,text,class
2536,good earnings report,url http jeremy zawodny com blog archives html...,0
325,interesting article free software licences,translated article full see end post think don...,0
317,the absurdities life,so given apparent commonality occurances compa...,0
174,man kills self home booby traps,url http boingboing net date supplied steve se...,0
2786,from desk george osawa,nigeria electirc power authority federal secre...,1
...,...,...,...
2259,the ozone hole getting smaller leroy getting l...,url http www newsisfree com click date img htt...,0
298,save planet kill people,martin yes confirming said last message ah see...,0
416,another low probability event,we met family parent baby group son born minut...,0
2605,best life insurance lowest cost nticy,html head head body center font face times siz...,1


Vectorization:

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
tfidf_vectorizer = TfidfVectorizer(max_df=500, min_df=10)
tfidf_text =  tfidf_vectorizer.fit_transform(df['text'])
tfidf_subject = tfidf_vectorizer.transform(df['subject'])
x = sparse.hstack([tfidf_text, tfidf_subject])

In [10]:
x.shape

(3021, 9462)

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, df['class'], test_size=0.2)

Naive Bayes Classifier

In [12]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(x_train, y_train)

In [13]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(x_test)
print(classification_report(y_test,pred))
print(confusion_matrix(y_test,pred))
print(accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       505
           1       1.00      0.85      0.92       100

    accuracy                           0.98       605
   macro avg       0.99      0.93      0.95       605
weighted avg       0.98      0.98      0.97       605

[[505   0]
 [ 15  85]]
0.9752066115702479


KNN Classifier

In [14]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(x_train, y_train)

In [15]:
pred = classifier.predict(x_test)
print(classification_report(y_test,pred))
print(confusion_matrix(y_test,pred))
print(accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96       505
           1       0.98      0.59      0.74       100

    accuracy                           0.93       605
   macro avg       0.95      0.79      0.85       605
weighted avg       0.93      0.93      0.92       605

[[504   1]
 [ 41  59]]
0.9305785123966942
