# import lib

In [86]:
import os
import io
from click import Path
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import tarfile
import email
import email.policy
import email.parser
import string
import urllib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import urllib.request
import tarfile
from pathlib import Path
from nltk.corpus import stopwords
nltk.download('stopwords')
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator, TransformerMixin
import urlextract
import re
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
_url = 'https://spamassassin.apache.org/old/publiccorpus/'



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load and fetch the data

In [2]:
def fetch_spam_data():

    ham_url = _url + "20030228_easy_ham.tar.bz2"
    spam_url = _url + "20030228_spam.tar.bz2"

    spam_path = Path(".") / "datasets" / "spam"  # Corrected line
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file = tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

ham_dir, spam_dir = fetch_spam_data()

Downloading datasets/spam/ham.tar.bz2
Downloading datasets/spam/spam.tar.bz2


In [21]:
def load_email(fs):
  with open(fs, 'rb') as f:
    return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_emails = [load_email(f) for f in ham_dir.iterdir()]
spam_emails = [load_email(f) for f in spam_dir.iterdir()]

In [22]:
print(ham_emails[1].get_content().strip())
print(spam_emails[6].get_content().strip())

use Perl Daily Headline Mailer

Using Web Services with Perl and AppleScript
    posted by pudge on Wednesday September 25, @08:12 (links)
    http://use.perl.org/article.pl?sid=02/09/25/129231




Copyright 1997-2002 pudge.  All rights reserved.



You have received this message because you subscribed to it
on use Perl.  To stop receiving this and other
messages from use Perl, or to add more messages
or change your preferences, please go to your user page.

	http://use.perl.org/my/messages/

You can log in and change your preferences from there.
<html><head></head><body bgcolor=black>
<table border=0 cellspacing=0 cellpadding=5 align=center><tr><th bgcolor="#8FB3C5">
<table border=0 cellspacing=0 cellpadding=5 align=center><tr><th bgcolor="#000000">
<table border=0 cellspacing=0 cellpadding=0 align=center>
<tr>
<th><a href="http://psychicrevenue.com/cgi-bin/refer.cgi?pws01014&site=pw">
<img src="http://giftedpsychic.com/images/r1c1.jpg" width=279 height=286 border=0></a></th>
<th><a h

# Prep the Data

In [81]:
# split the data into training and testing sets
X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)  # 80% training, 20% testing
len(y_train)

2401

In [39]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    if isinstance(email, list):
        # If the email is a list of email objects, process each email recursively and join the results
        return ", ".join([get_email_structure(sub_email) for sub_email in email])
    payload = email.get_payload()
    if isinstance(payload, list):
        # If the payload is a list (e.g. multipart), process each part recursively and join the results
        return ", ".join([get_email_structure(sub_email) for sub_email in payload])
    return email.get_content_type()

In [44]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            html = content
    if html:
        return BeautifulSoup(html, "lxml").text

In [56]:
url_extractor = urlextract.URLExtract()
stemmer = nltk.PorterStemmer()
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True,
                 remove_punctuation=True, replace_urls=True,
                 replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [57]:
X_few = x_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'number': 19, 'the': 9, 'is': 8, 'so': 7, 'rpm': 5, 'libvorbisfil': 5, 'thi': 5, 'on': 4, 'libvorbi': 4, 'you': 4, 'and': 3, 'need': 3, 'by': 3, 'a': 3, 'one': 3, 'linux': 3, 'at': 2, 'i': 2, 'm': 2, 'vorbi': 2, 'my': 2, 'depend': 2, 'new': 2, 'onli': 2, 'version': 2, 'url': 2, 'can': 2, 'to': 2, 'then': 2, 'do': 2, 'instal': 2, 'ie': 2, 'sat': 1, 'oct': 1, 'numberpm': 1, 'padraig': 1, 'bradi': 1, 'mention': 1, 'ok': 1, 'upgrad': 1, 'machin': 1, 'get': 1, 'follow': 1, 'u': 1, 'tool': 1, 'numberinumberrpm': 1, 'error': 1, 'fail': 1, 'sdl_mixer': 1, 'xmm': 1, 'tuxrac': 1, 'becaus': 1, 'ha': 1, 'problem': 1, 'in': 1, 'other': 1, 'packag': 1, 'specif': 1, 'rather': 1, 'than': 1, 'gener': 1, 'pain': 1, 'way': 1, 'resolv': 1, 'knowledg': 1, 'download': 1, 'origin': 1, 'remov': 1, 'old': 1, 'uvh': 1, 'assum': 1, 'that': 1, 'want': 1, 'both': 1, 'same': 1, 'time': 1, 'doe': 1, 'whi': 1, 't': 1, 'after': 1, 'have': 1, 'librari': 1, 'alreadi': 1, 'beyond': 1, 'me': 1, 'kate': 1, 

In [67]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1
                            for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)),
                          shape=(len(X), self.vocabulary_size + 1))

In [68]:
preprocess_pipeline = Pipeline([
    ("email_to_wordcount",EmailToWordCounterTransformer()),
    ("wordcount_to_vector",WordCounterToVectorTransformer(1000))
])
X_train_transformed = preprocess_pipeline.fit_transform(x_train)

# Models

## 1. naive bayes classifier

In [83]:
# let's train the model
model = MultinomialNB()
model.fit(X_train_transformed,y_train)

# let's evaluate the model
X_test_transformed = preprocess_pipeline.transform(x_test)
y_pred = model.predict(X_test_transformed)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       497
           1       0.96      0.94      0.95       104

    accuracy                           0.98       601
   macro avg       0.97      0.97      0.97       601
weighted avg       0.98      0.98      0.98       601



## 2. Random Forest

In [87]:
model = RandomForestClassifier()
model.fit(X_train_transformed,y_train)
y_pred = model.predict(X_test_transformed)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       497
           1       1.00      0.90      0.95       104

    accuracy                           0.98       601
   macro avg       0.99      0.95      0.97       601
weighted avg       0.98      0.98      0.98       601

