In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
import tarfile
import urllib.request

In [3]:
# Remote location of the SpamAssassin public corpus archives and the local
# directory they are downloaded to and extracted in.
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(ham_url = HAM_URL, spam_url = SPAM_URL, spam_path = SPAM_PATH):
    """Download the ham and spam archives (if missing) and extract them.

    Args:
        ham_url: URL of the ham archive; saved as ``ham.tar.bz2``.
        spam_url: URL of the spam archive; saved as ``spam.tar.bz2``.
        spam_path: Directory that receives both the archives and their
            extracted contents. Created if it does not exist.

    An archive already present on disk is not downloaded again, but it is
    re-extracted on every call.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(spam_path, exist_ok=True)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: where the interpreter
                # supports extraction filters, refuse members that would
                # escape spam_path or create special files.
                tar_bz2_file.extractall(path=spam_path, filter="data")
            else:
                tar_bz2_file.extractall(path=spam_path)

In [4]:
fetch_spam_data()

In [5]:
# Directories the ham and spam message files are listed from.
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")

# Sorted entry names longer than 20 characters.
# NOTE(review): the length cut-off presumably skips non-message helper files
# in the corpus directories -- confirm against the extracted archive.
ham_filenames = sorted(
    entry for entry in os.listdir(HAM_DIR) if len(entry) > 20
)
spam_filenames = sorted(
    entry for entry in os.listdir(SPAM_DIR) if len(entry) > 20
)


In [6]:
len(ham_filenames)

2500

In [7]:
len(spam_filenames)

500

In [8]:
import email
# Imported explicitly: load_email uses email.parser.BytesParser, and the
# submodule should not be assumed to have been loaded by some other import.
import email.parser
import email.policy

def load_email(is_spam, filename, spam_path = SPAM_PATH):
    """Parse one message file from the corpus into an email object.

    Args:
        is_spam: Truthy to read from the ``spam`` sub-directory, falsy to
            read from ``easy_ham``.
        filename: Name of the message file inside that sub-directory.
        spam_path: Directory containing the ``spam`` and ``easy_ham``
            sub-directories.

    Returns:
        The message parsed from the file's bytes with ``email.policy.default``.
    """
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [9]:
# Parse every listed message file, keeping ham and spam in separate lists.
ham_emails = [
    load_email(is_spam=False, filename=ham_name)
    for ham_name in ham_filenames
]
spam_emails = [
    load_email(is_spam=True, filename=spam_name)
    for spam_name in spam_filenames
]


In [10]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a single string.

    Args:
        email: A message object, or a ``str`` (returned unchanged).

    Returns:
        ``"multipart(<part>, <part>, ...)"`` when the payload is a list, with
        each part described recursively; otherwise the message's content type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Not split into sub-messages: report the content type itself.
        return email.get_content_type()

    described_parts = ", ".join(map(get_email_structure, parts))
    return "multipart({})".format(described_parts)

In [11]:
from collections import Counter

def structures_counter(emails):
    """Count how many of ``emails`` share each structure string.

    Args:
        emails: Iterable of messages accepted by ``get_email_structure``.

    Returns:
        A ``Counter`` mapping each structure string to its number of
        occurrences.
    """
    return Counter(get_email_structure(message) for message in emails)

In [12]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [13]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [14]:
for header, value in spam_emails[0].items():
    print(header, ": ", value)

Return-Path :  <12a1mailbot1@web.de>
Delivered-To :  zzzz@localhost.spamassassin.taint.org
Received :  from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received :  from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received :  from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From :  12a1mailbot1@web.de
Received :  from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To :  dcek1a1@netsgo.com
Subject :  Life Insurance - Why Pay More?
Date :  Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version :  1.0
Message-ID :  <0103c1042001882DD_IT7@dd_it7>
Content-Type :  text/html; charset="iso-8859-1"
Content-Transfer-

In [17]:
print(spam_emails[0]['Subject'])

Life Insurance - Why Pay More?


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# All parsed messages in one object array: ham first, then spam.
X = np.array(ham_emails + spam_emails, dtype = object)
# Labels in the same order as X: 0 for each ham message, 1 for each spam message.
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))


In [23]:
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size = 0.2, random_state= 42)

## Converting HTML to plain text

In [24]:
import re
from html import unescape

In [25]:
def html_to_plain_text(html):
    """Reduce an HTML string to rough plain text using regular expressions.

    Args:
        html: The HTML markup as a ``str``.

    Returns:
        The text with the ``<head>`` section removed, every opening ``<a ...>``
        tag replaced by `` HYPERLINK ``, all remaining tags dropped, runs of
        blank lines collapsed to one newline, and HTML entities unescaped.
    """
    # All patterns are raw strings so that sequences such as \s reach the
    # regex engine verbatim instead of being read as string escapes.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [29]:
# Training-split spam messages whose structure is exactly 'text/html'.
html_spam_emails = [
    message
    for message in train_X[train_y == 1]
    if get_email_structure(message) == 'text/html'
]
sample_html_spam = html_spam_emails[5]
print(sample_html_spam.get_content().strip(), "....")

<html><body><center>

<table bgcolor="663399" border="2" width="999" cellspacing="0" cellpadding="0">
  <tr>
    <td colspan="3" width="999"> <hr><font color="yellow"> 
<center>
<font size="7"> 
<br><center><b>Get 12 FREE VHS or DVDs! </b><br>
<table bgcolor="white" border="2" width="500">
  <tr>    <td>
 <font size="7"> <font color="003399"><center>Click <a href="http://www.bozomber.com/porno/index.html"> HERE For Details!</a>
<font size="5"><br>
</td></tr></table> <br> 

<table bgcolor="#CCFF33" border="2" width="600">
  <tr>    <td><center><center><font size="6"><font color="6633CC"><br>
We Only Have HIGH QUALITY <br>Porno Movies to Choose From!<br><br>
 
 "This is a <i>VERY SPECIAL, LIMITED TIME OFFER</i>."<br><br> Get up to 12 DVDs absolutely FREE,<br> with<a href="http://www.bozomber.com/porno/index.html"> NO COMMITMENT!</a> 
 <br><br>
There's <b>no better deal anywhere</b>.<br>
There's <i>no catches</i> and <i>no gimmicks</i>. <br>You only pay for the shipping,<br> and the DVDs 

In [30]:
#after parsing HTML
print(html_to_plain_text(sample_html_spam.get_content()))


Get 12 FREE VHS or DVDs!
  Click  HYPERLINK  HERE For Details!
We Only Have HIGH QUALITY Porno Movies to Choose From!
 "This is a VERY SPECIAL, LIMITED TIME OFFER." Get up to 12 DVDs absolutely FREE, with HYPERLINK  NO COMMITMENT!
There's no better deal anywhere.
There's no catches and no gimmicks. You only pay for the shipping, and the DVDs are absolutely free!
Take a Peak at our HYPERLINK   Full Catalog!
 High quality cum filled titles such as:
 HYPERLINK  500 Oral Cumshots 5
Description: 500 Oral Cum Shots! I need hot jiz on my face! Will you cum in my mouth?
 Dozens of Dirty Hardcore titles such as:
 HYPERLINK  Amazing Penetrations No. 17
Description: 4 full hours of amazing penetrations with some of the most beautiful women in porn!
 From our "Sexiest Innocent Blondes" collections:
 HYPERLINK  Audition Tapes
Description: Our girls go from cute, young and innocent, to screaming sex goddess
 beggin' to have massive cocks in their tight, wet pussies and asses!



In [37]:
def email_to_text(email):
    """Return the text of a message, preferring plain text over HTML.

    Walks the message parts in order, ignoring every part that is neither
    ``text/plain`` nor ``text/html``.

    Args:
        email: A message object exposing ``walk()``.

    Returns:
        The content of the first ``text/plain`` part if there is one;
        otherwise the last ``text/html`` part converted with
        ``html_to_plain_text``; ``None`` when the message has no non-empty
        text part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Decoding problems (e.g. encoding issues): fall back to the raw
            # payload. KeyboardInterrupt and SystemExit are not Exception
            # subclasses, so they still propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [38]:
print(email_to_text(sample_html_spam))


Get 12 FREE VHS or DVDs!
  Click  HYPERLINK  HERE For Details!
We Only Have HIGH QUALITY Porno Movies to Choose From!
 "This is a VERY SPECIAL, LIMITED TIME OFFER." Get up to 12 DVDs absolutely FREE, with HYPERLINK  NO COMMITMENT!
There's no better deal anywhere.
There's no catches and no gimmicks. You only pay for the shipping, and the DVDs are absolutely free!
Take a Peak at our HYPERLINK   Full Catalog!
 High quality cum filled titles such as:
 HYPERLINK  500 Oral Cumshots 5
Description: 500 Oral Cum Shots! I need hot jiz on my face! Will you cum in my mouth?
 Dozens of Dirty Hardcore titles such as:
 HYPERLINK  Amazing Penetrations No. 17
Description: 4 full hours of amazing penetrations with some of the most beautiful women in porn!
 From our "Sexiest Innocent Blondes" collections:
 HYPERLINK  Audition Tapes
Description: Our girls go from cute, young and innocent, to screaming sex goddess
 beggin' to have massive cocks in their tight, wet pussies and asses!



## stemming

In [31]:
import nltk

In [32]:
# Porter stemmer shared by the cells below; print the stem of a few related words.
stemmer = nltk.PorterStemmer()
for word in (
    "Computations", "Computation", "Computing",
    "Computed", "Compute", "Compulsive",
):
    print(word, "=>", stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [34]:
import urlextract

In [36]:
url_extractor = urlextract.URLExtract()
print(url_extractor.find_urls("Will it detect https://www.kakraninsights.com"))

['https://www.kakraninsights.com']


## Transform emails to word counts

In [39]:
from sklearn.base import BaseEstimator, TransformerMixin

In [53]:
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Convert each email into a ``Counter`` of the words in its text.

    Every constructor flag is stored under the same name. ``transform``
    consults all of them except ``strip_headers``, which is stored but not
    read anywhere in this class.
    """

    def __init__(self, strip_headers=True, lower_case = True, remove_punctuation=True,
                replace_urls = True, replace_numbers = True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an array holding one word-count ``Counter`` per email in ``X``.

        The enabled steps run in this order: lower-casing, URL replacement
        (token ``URL``), number replacement (token ``NUMBER``), collapsing
        runs of non-word characters into a space, splitting on whitespace
        and finally stemming.
        """
        word_counters = []

        for message in X:
            body = email_to_text(message) or ""

            if self.lower_case:
                body = body.lower()

            if self.replace_urls and url_extractor is not None:
                # Longest URLs first, so that a URL containing a shorter one
                # is replaced as a whole.
                found_urls = sorted(set(url_extractor.find_urls(body)), key=len, reverse=True)
                for found_url in found_urls:
                    body = body.replace(found_url, "URL")

            if self.replace_numbers:
                body = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', body)

            if self.remove_punctuation:
                body = re.sub(r'\W+', ' ', body, flags = re.M)

            counts = Counter(body.split())

            if self.stemming and stemmer is not None:
                # Merge the counts of all words that share a stem.
                stem_counts = Counter()
                for token, occurrences in counts.items():
                    stem_counts[stemmer.stem(token)] += occurrences
                counts = stem_counts

            word_counters.append(counts)

        return np.array(word_counters)


In [56]:
X_few = train_X[:3]

In [57]:
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),
       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'ha': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesu': 2, 'some': 1, 'interest': 1, 'quot': 1, 'url': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'thi': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'becom

## build dictionary and word vectors

In [58]:
from scipy.sparse import csr_matrix

In [70]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Map word-count ``Counter`` objects onto rows of a sparse count matrix.

    ``fit`` learns ``vocabulary_``, a dict from word to column index starting
    at 1. ``transform`` sends every word missing from that dict to column 0.
    """

    def __init__(self, vocabulary_size = 1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Build ``vocabulary_`` from the most common words in ``X``.

        Each word contributes at most 10 per email to its total, and the
        ``vocabulary_size`` words with the highest totals are kept.
        """
        capped_totals = Counter()
        for counter in X:
            for term, occurrences in counter.items():
                capped_totals[term] += min(occurrences, 10)

        top_terms = capped_totals.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {
            term: column
            for column, (term, _total) in enumerate(top_terms, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)``."""
        row_indices, col_indices, values = [], [], []

        for row_index, counter in enumerate(X):
            for term, occurrences in counter.items():
                row_indices.append(row_index)
                # Column 0 collects every word outside the vocabulary.
                col_indices.append(self.vocabulary_.get(term, 0))
                values.append(occurrences)

        matrix_shape = (len(X), self.vocabulary_size + 1)
        return csr_matrix((values, (row_indices, col_indices)), shape = matrix_shape)
        

In [71]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [73]:
vocab_transformer.vocabulary_

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'url': 5,
 'all': 6,
 'in': 7,
 'christian': 8,
 'on': 9,
 'by': 10}

## training

In [74]:
from sklearn.pipeline import Pipeline

In [75]:
# Chain the two custom transformers: emails -> word counts -> sparse count vectors.
preprocess_pipeline = Pipeline(
    steps=[
        ("email_to_wordcount", EmailToWordCounterTransformer()),
        ("wordcount_to_vector", WordCounterToVectorTransformer()),
    ]
)

In [76]:
train_X_trfmd = preprocess_pipeline.fit_transform(train_X)

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [86]:
log_clf = LogisticRegression(solver="lbfgs", max_iter = 1000, random_state= 42)

In [87]:
from sklearn.model_selection import cross_val_score

In [92]:
Logistic_score = cross_val_score(log_clf, train_X_trfmd, train_y, cv=3,verbose=3 )

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.981) total time=   0.2s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] END ................................ score: (test=0.984) total time=   0.3s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] END ................................ score: (test=0.990) total time=   0.4s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.9s finished


In [93]:
Logistic_score.mean()

0.985

In [91]:
bayes_clf = GaussianNB()

In [97]:
bayes_score = cross_val_score(bayes_clf, train_X_trfmd.toarray(), train_y, cv=3,verbose=3 )

[CV] END ................................ score: (test=0.877) total time=   0.1s
[CV] END ................................ score: (test=0.800) total time=   0.0s
[CV] END ................................ score: (test=0.812) total time=   0.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


In [99]:
bayes_score.mean()

0.8300000000000001