# Download files

In [1]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
HARD_HAM_URL = DOWNLOAD_ROOT + "20021010_hard_ham.tar.bz2"
HAM_2_URL = DOWNLOAD_ROOT + "20030228_easy_ham_2.tar.bz2"
SPAM_2_URL = DOWNLOAD_ROOT + "20030228_spam_2.tar.bz2"
SPAM_PATH = os.path.join("datasets", "full_spam")

URLS = (
    ("ham.tar.bz2", HAM_URL, "ham"), 
    ("spam.tar.bz2", SPAM_URL, "spam"),
    ("spam2.tar.bz2", SPAM_2_URL, "spam"),
    ("hard_ham.tar.bz2", HARD_HAM_URL, "ham"),
    ("ham2.tar.bz2", HAM_2_URL, "ham")
)

def fetch_spam_data(urls=URLS, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)

    for filename, url, category in urls:
        category_path = os.path.join(spam_path, category)
        if not os.path.isdir(category_path):
            os.makedirs(category_path) 

        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path) 

        with tarfile.open(path) as tar_bz2_file:
            for member in tar_bz2_file.getmembers():
                # Remove subdiretórios e extrai apenas arquivos
                if member.isfile():  
                    member.name = os.path.basename(member.name)  # Remove caminhos extras
                    tar_bz2_file.extract(member, path=category_path)

In [2]:
fetch_spam_data()

In [3]:
HAM_DIR = os.path.join(SPAM_PATH, "ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")

ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [4]:
len(ham_filenames)

4150

In [5]:
len(spam_filenames)

1897

# Parsing the emails

In [6]:
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        # Retorna um objeto EmailMessage
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [7]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [8]:
ham_emails[0].keys()

['Return-Path',
 'Delivered-To',
 'Received',
 'Received',
 'Received',
 'Received',
 'Delivered-To',
 'Received',
 'Received',
 'Received',
 'Received',
 'Received',
 'Received',
 'From',
 'To',
 'Cc',
 'Subject',
 'In-Reply-To',
 'References',
 'MIME-Version',
 'Content-Type',
 'Message-Id',
 'X-Loop',
 'Sender',
 'Errors-To',
 'X-Beenthere',
 'X-Mailman-Version',
 'Precedence',
 'List-Help',
 'List-Post',
 'List-Subscribe',
 'List-Id',
 'List-Unsubscribe',
 'List-Archive',
 'Date']

In [9]:
print(ham_emails[3000].get_content().strip())

On Wednesday 11 September 2002 16:19 CET Justin Mason wrote:
> Malte S. Stretz said:
>[...]
> > I think we should even add new (GA'd) rules to 2.4x (and/or remove old
> > ones) and tag a new 2.50 only if we have a bunch of features worth a
> > "dangerous" big update. I'd say: Yes, you should expect 2.42 and also
> > 2.43+ (but update to 2.41 now).
>
> I would think adding new rules to, or removing broken rules from, 2.4x
> would require some discussion first.  but new GA'd scores are definitely
> worth putting in, as the ones there are too wild.

I think my mail wasn't very clear ;-) My point was that we should continue 
releasing new rules and removing broken ones (all based on discussions on 
this list of course) in the 2.4 branch instead of creating a new 2.5 branch 
everytime we have a bunch of new rules.

A new branch should be openend only if (big) new features are introduced 
(eg. Bayes) or the interface has changed (spam_level_char=x). As the rules 
are under fluent development

In [10]:
print(spam_emails[900].get_content().strip())

<HTML>
<BODY>
</head>

<body bgcolor="#FFFFFF">
<table width="62%" height="218">
  <tr valign="top"> 
    <td height="260"> 
      <p><font size="3"><b>Fortunes are literally being made in this great new 
        marketplace!</b></font></p>
      <p>O<font size="3"></font><font size="3">ver <b>$9 Billion</b> in merchandise 
        was sold on <b>eBay</b> in 2001 by people just like you - <u>right from 
        their homes</u>! </font></p>
      <p><font size="3">Now you too can learn the secrets of <b>successful selling 
        </b>on<b> eBay</b> and <b>make a staggering income</b> from the comfort 
        of <b>your own home</b>. If you are <b>motivated</b>, capable of having 
        an<b> open mind</b>, and can follow simple directions, then <a href="http://www.generaledu.com/ebayepubs">visit 
        us here.</a> If server busy - <a href="http://168.75.161.164/ebayepubs/">alternate.</a></font></p>
      <p><font size="2"> <font size="1">We are strongly against sending unsolicite

## Multipart Emails

Some emails are multipart, which means they have different sections with different kinds of content (plain text, images, HTML...)

In [11]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        # if multipart, returns a list of parts
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [12]:
from collections import Counter

# Stores the frequency of each type of email in a dictionary
def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [13]:
structures_counter(ham_emails).most_common()

[('text/plain', 3837),
 ('text/html', 122),
 ('multipart(text/plain, application/pgp-signature)', 101),
 ('multipart(text/plain, text/html)', 58),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 2),
 ('multipart(text/plain, multipart(text/plain))', 2),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  2),
 ('multipart(text/plain, application/x-pkcs7-signature)', 2),
 ('multipart(text/html)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/x-patch)', 1),
 ('multipart(multipart(text/plain, multipart(text/plain), text/plain), application/pgp-signature)',
  1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/gif, image/gif, image/gif, image/gif)',
  1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(

In [14]:
structures_counter(spam_emails).most_common()

[('text/plain', 816),
 ('text/html', 772),
 ('multipart(text/plain, text/html)', 159),
 ('multipart(text/html)', 49),
 ('multipart(text/plain)', 44),
 ('multipart(multipart(text/html))', 23),
 ('multipart(multipart(text/plain, text/html))', 5),
 ('multipart(text/plain, application/octet-stream, text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 3),
 ('multipart(text/html, text/plain)', 3),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart/alternative', 2),
 ('multipart(text/html, image/jpeg)', 2),
 ('multipart(multipart(text/plain), application/octet-stream)', 2),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/jpeg)',
  1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/gif)',
  1),
 ('text/plain charset=us-ascii', 1),
 ('multipart(multipart(text

It is noticeable that the majority of non-spam emails consist solely of text, while spam emails contain a significant amount of HTML structures. Additionally, the "application/octet-stream" structure (used when the email server or client cannot precisely determine the file type) is more common in spam emails.

## Email Headers

In [15]:
for header, value in spam_emails[0].items():
    print(header, ":",value)

Return-Path : <ilug-admin@linux.ie>
Delivered-To : yyyy@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD	for <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 (IST)
Received : from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100
Received : from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100
Received : from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100
Received : from 64.0.57.142 [202.63.165.34] by bettyjagessa

In [16]:
for header, value in ham_emails[0].items():
    print(header, ":",value)

Return-Path : <exmh-workers-admin@spamassassin.taint.org>
Delivered-To : yyyy@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 7106643C34	for <jm@localhost>; Wed, 21 Aug 2002 08:33:03 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for jm@localhost (single-drop); Wed, 21 Aug 2002 13:33:03 +0100 (IST)
Received : from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7LCXvZ24654 for    <jm-exmh@jmason.org>; Wed, 21 Aug 2002 13:33:57 +0100
Received : from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by    listman.redhat.com (Postfix) with ESMTP id F12A13EA25; Wed, 21 Aug 2002    08:34:00 -0400 (EDT)
Delivered-To : exmh-workers@listman.spamassassin.taint.org
Received : from int-mx1.corp.spamassassin.taint.org (int-mx1.corp.spamassassin.taint.org    [172.16.52.254]

In [17]:
spam_emails[0]["Subject"]

'[ILUG] STOP THE MLM INSANITY'

## Spliting training set and test set

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0]* len(ham_emails) + [1]*len(spam_emails))

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2, random_state=42)

## Preprocessing Functions

First, we need to convert HTML to plain text. Here we can use html2text:

In [19]:
html_spam_emails = [email for email in X_train[y_train==1]
                   if get_email_structure(email) == "text/html"]
                    
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<html>
<body>
<p align="center"><br>
<b><font size="6" face="Tahoma" color="#000080">
Norton SystemWorks 2002<br>Software Suite<br>
Professional Edition</font><br><br>
</b><font face="Arial"><b><font size="5"><font color="#FF0000">
6 Feature-Packed Utilities,</font> 1 Great Price</font><br>
<font size="4">A $300.00+ Combined Retail Value for <font color="#FF0000"> Only $29.99!</font></font><br><br>
<font size="3"><span style="background-color: #FFFF00">
Protect your computer and your valuable information!<br><br>
Don't allow yourself to fall prey to destructive viruses!</span><br><br>
<a href="http://www.1800mailman.com/software/sw.htm">CLICK HERE FOR MORE INFO AND TO ORDER</a></font></b><br><br>
<font size="2">If you wish to unsubscribe from this list, please <a href="http://www.1800mailman.com/removeme.html"> Click Here</a> to be removed.</font></font></p>
</body>
</html> ...


In [20]:
from html2text import html2text
print(html2text(sample_html_spam.get_content())[:1000], "...")

  
**Norton SystemWorks 2002  
Software Suite  
Professional Edition  
  
****6 Feature-Packed Utilities, 1 Great Price  
A $300.00+ Combined Retail Value for  Only $29.99!  
  
Protect your computer and your valuable information!  
  
Don't allow yourself to fall prey to destructive viruses!  
  
[CLICK HERE FOR MORE INFO AND TO
ORDER](http://www.1800mailman.com/software/sw.htm)**  
  
If you wish to unsubscribe from this list, please [ Click
Here](http://www.1800mailman.com/removeme.html) to be removed.

 ...


In [21]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html2text(html)

In [22]:
print(email_to_text(sample_html_spam)[:100])

  
**Norton SystemWorks 2002  
Software Suite  
Professional Edition  
  
****6 Feature-Packed Utili


## Stemming

In [23]:
import nltk

stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
    print(word, "=>", stemmer.stem(word))


Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


We also need to replace URLs with the word "URL", for this we will use urlextract library

In [24]:
import urlextract

url_extractor = urlextract.URLExtract()
print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


## Word Counters

Now we can put all this together intro a transformer that we will use to convert emails to word counters.

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
import re

# Custom Transformer
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    # Fit does nothing, since this transformer doesn't learn anything from the data
    def fit(self, X, y=None):
        return self
    
    # Get word count
    def transform(self, X, y=None):
        X_transformed = []
        
        for email in X:
            
            # Email to text
            text = email_to_text(email) or ""
            
            # Uppercase to Lowercase
            if self.lower_case:
                text = text.lower()
                
            # Replace URLs
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            
            # Replace numbers
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            
            # Replace punctuation
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            
            # Count words
            word_counts = Counter(text.split())
            
            # Stemming
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [26]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'number': 9, 'and': 6, 'the': 5, 'signal': 4, 'to': 4, 'our': 4, 'advanc': 3, 'cell': 3, 'phone': 3, 'free': 3, 'of': 3, 'each': 3, 'your': 3, 'click': 3, 'world': 2, 'most': 2, 'booster': 2, 'hand': 2, 'headset': 2, 'super': 2, 'is': 2, 'technolog': 2, 'antenna': 2, 'now': 2, 'from': 2, 'purchas': 2, 'in': 2, 'less': 2, 'call': 2, 'ani': 2, 'or': 2, 'on': 2, 'at': 2, 'if': 2, 'you': 2, 'web': 2, 's': 1, 'intern': 1, 'adapt': 1, 'all': 1, 'make': 1, 'model': 1, 'mobil': 1, 'normal': 1, 'price': 1, 'base': 1, 'upon': 1, 'unit': 1, 'bonu': 1, 'radiat': 1, 'shield': 1, 'with': 1, 'util': 1, 'same': 1, 'aerospac': 1, 'use': 1, 'by': 1, 'state': 1, 'art': 1, 'satellit': 1, 'dish': 1, 'receiv': 1, 'which': 1, 'result': 1, 'drop': 1, 'dead': 1, 'zone': 1, 'improv': 1, 'recept': 1, 'build': 1, 'elev': 1, 'hallway': 1, 'tunnel': 1, 'mountain': 1, 'place': 1, 'where': 1, 'a': 1, 'weak': 1, 'don': 1, 't': 1, 'miss': 1, 'that': 1, 'urgent': 1, 'love': 1, 'one': 1, 'dure': 1, 'an': 

## Convert Counters to Vectors

In [27]:
from scipy.sparse import csr_matrix

from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
        
    # Learn which words are more common
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items(): 
                total_count[word] += min(count, 10) # Limit the maximum count to 10 to prevent highly frequent words from dominating the count
        # Get the most common words
        most_common = total_count.most_common()[:self.vocabulary_size]
        # Creates dictionary that associates words to numerical indices starting at 1
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        
        return self
    
    # Sparse Matriz
    # Columns: Email, 0, 1, 2, ... n (0 = unknown word)
    # each row contains the number of the email and the frequency of each word
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [28]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 30 stored elements and shape (3, 11)>

In [29]:
X_few_vectors.toarray()

array([[165,   6,   9,   4,   5,   3,   2,   1,   3,   2,   0],
       [325,   6,  11,  11,   7,   0,   1,   8,   3,   6,  10],
       [227,  11,   2,   5,   5,  13,  12,   3,   5,   2,   0]])

In [30]:
vocab_transformer.vocabulary_

{'and': 1,
 'number': 2,
 'to': 3,
 'the': 4,
 'your': 5,
 'you': 6,
 'a': 7,
 'of': 8,
 'on': 9,
 'i': 10}

In [31]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.950) total time=   0.2s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] END ................................ score: (test=0.970) total time=   0.2s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] END ................................ score: (test=0.965) total time=   0.3s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.9s finished


0.9619929734908975

In [33]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 94.34%
Recall: 93.28%


In [35]:
from sklearn.metrics import classification_report


print(classification_report(y_test, y_pred)) # spam 1 ham 0

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3319
           1       0.94      0.93      0.94      1518

    accuracy                           0.96      4837
   macro avg       0.96      0.95      0.95      4837
weighted avg       0.96      0.96      0.96      4837



Using only the easy ham dataset it was possible to achieve
Precision: 95.88%
Recall: 97.89%
    
It seems precision has not decreased that much. On the other hand, recall has dropped a little bit more, which makes sense, since now we have more examples of ham emails that are closer to spam emails, making our model predict more spam emails as ham emails.

# Testing other models

In [36]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
import pandas as pd

clf_models = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(max_iter=1000, random_state=42),
    linear_model.RidgeClassifierCV(),
    linear_model.Perceptron(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True),
    
    #Trees    
    tree.DecisionTreeClassifier() 
    ]

columns = ['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
df_metrics = pd.DataFrame(columns = columns)

for clf in clf_models:
    try:
        clf.fit(X_train_transformed, y_train)
        y_pred = clf.predict(X_test_transformed)
        y_prob = clf.predict_proba(X_test_transformed)[:, 1] if hasattr(clf, "predict_proba") else None
        name = clf.__class__.__name__
        
        metrics = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Balanced Accuracy': balanced_accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1-Score': f1_score(y_test, y_pred),
            'AUC': roc_auc_score(y_test, y_prob) if y_prob is not None else None
        }
        
        df_metrics = pd.concat([df_metrics, pd.DataFrame(metrics, index=[name])])
    
    except Exception as e:
        print(f"Erro ao treinar {clf.__class__.__name__}: {e}")

# Exibindo os resultados
df_metrics

Unnamed: 0,Accuracy,Balanced Accuracy,Precision,Recall,F1-Score,AUC
AdaBoostClassifier,0.939839,0.919343,0.939155,0.864295,0.900172,0.982633
BaggingClassifier,0.927228,0.911404,0.89606,0.868906,0.882274,0.966488
ExtraTreesClassifier,0.966922,0.957487,0.961277,0.932148,0.946488,0.993387
GradientBoostingClassifier,0.951623,0.935436,0.950843,0.891963,0.920462,0.988188
RandomForestClassifier,0.960513,0.94817,0.957271,0.91502,0.935669,0.992854
LogisticRegressionCV,0.961133,0.951303,0.949932,0.924901,0.93725,0.981345
RidgeClassifierCV,0.908414,0.890546,0.862441,0.842556,0.852383,
Perceptron,0.831921,0.843028,0.681234,0.872859,0.765232,
KNeighborsClassifier,0.853628,0.787353,0.889423,0.609354,0.723221,0.909979
SVC,0.737441,0.586691,0.907895,0.181818,0.302964,0.929313


In [37]:
def highlight(s):
  is_max = s == s.max()
  is_second_max = s == s.nlargest(2).iloc[-1]
  is_third_max = s == s.nlargest(3).iloc[-1]

  return ['background-color: #33ff33' if is_max.iloc[i] else
        'background-color: #ffff33' if is_second_max.iloc[i] else
        'background-color: #ff6961' if is_third_max.iloc[i] else
        '' for i in range(len(s))]

df_metrics = df_metrics.style.apply(highlight)
df_metrics

Unnamed: 0,Accuracy,Balanced Accuracy,Precision,Recall,F1-Score,AUC
AdaBoostClassifier,0.939839,0.919343,0.939155,0.864295,0.900172,0.982633
BaggingClassifier,0.927228,0.911404,0.89606,0.868906,0.882274,0.966488
ExtraTreesClassifier,0.966922,0.957487,0.961277,0.932148,0.946488,0.993387
GradientBoostingClassifier,0.951623,0.935436,0.950843,0.891963,0.920462,0.988188
RandomForestClassifier,0.960513,0.94817,0.957271,0.91502,0.935669,0.992854
LogisticRegressionCV,0.961133,0.951303,0.949932,0.924901,0.93725,0.981345
RidgeClassifierCV,0.908414,0.890546,0.862441,0.842556,0.852383,
Perceptron,0.831921,0.843028,0.681234,0.872859,0.765232,
KNeighborsClassifier,0.853628,0.787353,0.889423,0.609354,0.723221,0.909979
SVC,0.737441,0.586691,0.907895,0.181818,0.302964,0.929313


# Hyperparameter Tunning

## Extra Trees Classifier

In [38]:
extra_trees_clf = ensemble.ExtraTreesClassifier(random_state=42)

CV_score = cross_val_score(extra_trees_clf, X_train_transformed, y_train, cv=3, verbose=3)
CV_score

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.955) total time=   0.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] END ................................ score: (test=0.953) total time=   0.8s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.6s remaining:    0.0s


[CV] END ................................ score: (test=0.975) total time=   0.8s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.4s finished


array([0.95544554, 0.9528536 , 0.9751861 ])

In [39]:
extra_trees_clf.fit(X_train_transformed, y_train)
y_pred = extra_trees_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 95.98%
Recall: 92.69%


In [40]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 250, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 6, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 5, 7, 9]
}

extra_trees = ensemble.ExtraTreesClassifier(random_state=42)
grid_extra_trees = GridSearchCV(extra_trees, param_grid)

In [41]:
grid_extra_trees.fit(X_train_transformed, y_train)

In [42]:
grid_extra_trees.best_params_

{'criterion': 'gini',
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 250}

In [43]:
best_model = grid_extra_trees.best_estimator_
y_pred = best_model.predict(X_test_transformed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      3319
           1       0.97      0.93      0.95      1518

    accuracy                           0.97      4837
   macro avg       0.97      0.96      0.96      4837
weighted avg       0.97      0.97      0.97      4837



In [44]:
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 96.59%
Recall: 93.21%


## Random Forest Classifier

In [45]:
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 20, None],    
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],       
    'max_features': ['sqrt', 'log2']
}

log_clf = ensemble.RandomForestClassifier(random_state=42)
grid_random_forest_clf = GridSearchCV(log_clf, param_grid)
grid_random_forest_clf.fit(X_train_transformed, y_train)

grid_random_forest_clf.best_params_

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [46]:
random_forest_clf = ensemble.RandomForestClassifier(random_state=42)

CV_score = cross_val_score(random_forest_clf, X_train_transformed, y_train)
CV_score

array([0.96694215, 0.96694215, 0.96280992, 0.95867769, 0.98760331])

In [47]:
random_forest_clf = ensemble.RandomForestClassifier(random_state=42)

random_forest_clf.fit(X_train_transformed, y_train)
y_pred = random_forest_clf.predict(X_test_transformed)

print(classification_report(y_test, y_pred))
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3319
           1       0.96      0.91      0.94      1518

    accuracy                           0.96      4837
   macro avg       0.96      0.95      0.95      4837
weighted avg       0.96      0.96      0.96      4837

Precision: 95.79%
Recall: 91.37%


In [48]:
best_model = grid_random_forest_clf.best_estimator_
y_pred = best_model.predict(X_test_transformed)

print(classification_report(y_test, y_pred))
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3319
           1       0.96      0.91      0.94      1518

    accuracy                           0.96      4837
   macro avg       0.96      0.95      0.95      4837
weighted avg       0.96      0.96      0.96      4837

Precision: 95.79%
Recall: 91.37%


The hyperparameters obtained through Grid Search are the same as the base model's parameters, which is why both have the same performance.

## Logistic Regression

In [49]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [200, 500, 1000]
}

log_clf = LogisticRegression(random_state=42)
grid_log_clf = GridSearchCV(log_clf, param_grid)
grid_log_clf.fit(X_train_transformed, y_train)

grid_log_clf.best_params_

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

75 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dudac\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\dudac\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dudac\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

{'C': 1, 'max_iter': 200, 'penalty': 'l2', 'solver': 'liblinear'}

In [50]:
log_clf = LogisticRegression(random_state=42, max_iter=1000)
log_clf.fit(X_train_transformed, y_train)
y_pred = log_clf.predict(X_test_transformed)

print(classification_report(y_test, y_pred))
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3319
           1       0.94      0.93      0.94      1518

    accuracy                           0.96      4837
   macro avg       0.96      0.95      0.95      4837
weighted avg       0.96      0.96      0.96      4837

Precision: 94.34%
Recall: 93.28%


In [51]:
best_model = grid_log_clf.best_estimator_
y_pred = best_model.predict(X_test_transformed)

print(classification_report(y_test, y_pred))
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3319
           1       0.94      0.93      0.94      1518

    accuracy                           0.96      4837
   macro avg       0.96      0.95      0.95      4837
weighted avg       0.96      0.96      0.96      4837

Precision: 94.34%
Recall: 93.28%


# Trying to Increase Recall

## Extra Trees Classifier

In [52]:
param_grid ={
    'n_estimators': [100, 250, 500],
    'min_samples_split': [2, 6, 10],
    'min_samples_leaf': [1, 3, 5],
    'class_weight': [{0: 1, 1: w} for w in [1, 3, 5]]
}


grid = GridSearchCV(estimator=ensemble.ExtraTreesClassifier(), param_grid=param_grid, scoring='recall', cv=3)
grid.fit(X_train_transformed, y_train)

grid.best_params_

{'class_weight': {0: 1, 1: 5},
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 250}

In [53]:
clf = grid.best_estimator_
y_pred = clf.predict(X_test_transformed)

print(classification_report(y_test, y_pred))
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      3319
           1       0.88      0.99      0.93      1518

    accuracy                           0.95      4837
   macro avg       0.93      0.96      0.95      4837
weighted avg       0.96      0.95      0.95      4837

Precision: 87.50%
Recall: 98.68%


It was possible to increase recall at the cost of precision, specifically the precision of spam emails. This happened because, by increasing the weight of the spam class, the model is forced to correctly identify more instances of this class, making it less cautious when classifying an email as spam, which decreases precision.

# Conclusion

The choice of the most suitable model depends on the goals we want to achieve. If we wanted to maximize the amount of spam identified, we could opt for the last model, which has a higher recall.

However, if we wanted to ensure the correctness of predictions and avoid losing important emails that could be incorrectly classified as spam, we should consider models with higher precision.