# Modules

In [32]:
import email
import email.parser
import email.policy
import os
import re
import tarfile
import urllib.request

from bs4 import BeautifulSoup


# Data

## Decompress

In [36]:
# Method 1
# Manually download the archive files into the 'datasets/raw' directory,
# then run this cell to decompress them into the 'datasets/raw_decompressed' directory.
def unpack():
	"""Extract every '.tar.bz2' archive in 'datasets/raw' into 'datasets/raw_decompressed'.

	Only files whose name *ends* with '.tar.bz2' are treated as archives, so
	leftovers such as 'spam.tar.bz2.part' are skipped instead of being opened
	as (invalid) archives. Archives are processed in sorted order so repeated
	runs behave the same way.

	Prints a short progress message before and after the extraction.

	Raises:
		FileNotFoundError: if 'datasets/raw' does not exist.
		tarfile.TarError: if one of the archives cannot be read.
	"""
	print('unpacking...')
	archiveFiles = sorted( i for i in os.listdir('datasets/raw') if i.endswith('.tar.bz2') )
	# Extraction filters are only available on Python versions that ship them;
	# when present, the 'data' filter refuses members that would be written
	# outside the target directory.
	extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
	for i in archiveFiles:
		with tarfile.open(f'datasets/raw/{i}', 'r:bz2') as tar:
			tar.extractall('datasets/raw_decompressed', **extract_kwargs)
	print('done!')

def check_if_already_unpacked():
	"""Unpack the archives, asking for confirmation first if output already exists.

	When 'datasets/raw_decompressed' is missing or empty, unpack() is called
	right away. Otherwise the user is prompted until they answer 'yes'
	(unpack again) or 'no' (do nothing); the answer is compared
	case-insensitively.
	"""
	target_has_files = (
		os.path.exists('datasets/raw_decompressed')
		and os.listdir('datasets/raw_decompressed') != []
	)
	if not target_has_files:
		# Nothing extracted yet: no need to ask.
		unpack()
		return

	print('The target directory is not empty.')
	while True:
		answer = input("Do you want to unpack the archived files again? \nPlease type 'yes' or 'no'").lower()
		if answer == 'yes':
			unpack()
			return
		if answer == 'no':
			print('exiting the program.')
			return
		# Anything else: ask again.
		print('Please try again by typing a valid input.')

# Entry point for Method 1: the archives must already be in 'datasets/raw'.
if not os.path.exists('datasets/raw') or os.listdir('datasets/raw') == []:
	print("""Directory "datasets/raw" doesn't exist or it doesn't have any files! Please download the files into this directory and then continue""")
else:
	check_if_already_unpacked()


The target directory is not empty.
exiting the program.


In [48]:
# Method 2
# Download the archives from the online directory with the 'urllib.request' library,
# then decompress them.

# Create the download directory; exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join('datasets', 'raw'), exist_ok=True)

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
HAM_FILE = os.path.join('datasets', 'raw', 'easy_ham.tar.bz2')
SPAM_FILE = os.path.join('datasets', 'raw', 'spam.tar.bz2')

# Extraction filters are only available on Python versions that ship them;
# when present, the 'data' filter refuses members that would be written
# outside the target directory.
EXTRACT_OPTIONS = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

for url, archive_path in [[HAM_URL, HAM_FILE], [SPAM_URL, SPAM_FILE]]:
    # Skip the network round-trip when the archive is already on disk, so
    # "Restart & Run All" stays cheap.
    if not os.path.isfile(archive_path):
        # Download under a temporary name and rename once complete, so an
        # interrupted download is never mistaken for a finished archive.
        partial_path = archive_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, archive_path)
    with tarfile.open(archive_path, 'r:bz2') as tar:
        tar.extractall(os.path.join('datasets', 'raw_decompressed'), **EXTRACT_OPTIONS)

## Read data

In [8]:
# How many files do we have?
# Directories holding the decompressed ham and spam emails.
HAM_DIR, SPAM_DIR = (
    os.path.join('datasets', 'raw_decompressed', corpus)
    for corpus in ('easy_ham', 'spam')
)

def only_emails(path):
    """
    List the email file names found in a directory.

    Every entry of `path` is returned except the 'cmds' technical file.
    (An alternative filter would be to keep only the file names that have
    more than 20 characters.)

    The names are returned sorted: os.listdir() gives no ordering
    guarantee, and later cells pick emails by position, so a stable order
    is needed for the notebook to be reproducible.

    Parameters:
        path: directory to scan.

    Returns:
        A sorted list of file names (not full paths).
    """
    return sorted(i for i in os.listdir(path) if i != 'cmds')

# Collect the file names of both corpora and report how many there are.
spam_email_filenames = only_emails(SPAM_DIR)
ham_email_filenames = only_emails(HAM_DIR)
print("Number of entries in SPAM: {}".format(len(spam_email_filenames)))
print("Number of entries in HAM:  {}".format(len(ham_email_filenames)))

Number of entries in SPAM: 500
Number of entries in HAM:  2500


In [9]:
# We can use Python's "email" module to parse the emails from text: headers, body, links, etc.

def load_email( pathDir, filename ):
    """Parse one email file into a message object.

    The file is opened in binary mode and handed to
    email.parser.BytesParser configured with email.policy.default.

    Parameters:
        pathDir: directory containing the email file.
        filename: name of the file inside `pathDir`.

    Returns:
        The message object produced by BytesParser.parse().
    """
    # email.parser is imported explicitly at the top of the notebook rather
    # than relying on another module having loaded it as a side effect.
    parser = email.parser.BytesParser(policy=email.policy.default)
    full_path = os.path.join(pathDir, filename)
    with open(full_path, 'rb') as f:
        return parser.parse(f)

# Parse every ham and spam file; HAM_DIR / SPAM_DIR are the same directories
# the file names were listed from.
ham_emails = [load_email(HAM_DIR, name) for name in ham_email_filenames]
spam_emails = [load_email(SPAM_DIR, name) for name in spam_email_filenames]

In [15]:
# Get some info from one example email:
# print the content of the ham email at index 1, with surrounding whitespace stripped.
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [16]:
def get_email_structure(email):
    """Describe the MIME layout of an email as a string.

    A plain string is returned unchanged. A message whose payload is a list
    of parts is rendered as "multipart(<part>, <part>, ...)", where each part
    is described recursively. Any other message is described by its content
    type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Single-part message (or one whose payload was not split into parts).
        return email.get_content_type()

    part_descriptions = [get_email_structure(part) for part in parts]
    return "multipart({})".format(", ".join(part_descriptions))
from collections import Counter

def structures_counter(emails):
    """Count how many of the given emails share each structure string.

    Returns a Counter keyed by the value of get_email_structure() for
    each email.
    """
    return Counter(get_email_structure(message) for message in emails)

# Structures found among the ham emails, most frequent first.
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [17]:
# Structures found among the spam emails, most frequent first.
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [23]:
# Check email headers: print every (name, value) header pair of the first spam email.
for i,j in spam_emails[0].items():
    print(f"{i}: {j}")

Return-Path: <12a1mailbot1@web.de>
Delivered-To: zzzz@localhost.spamassassin.taint.org
Received: from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received: from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received: from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From: 12a1mailbot1@web.de
Received: from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To: dcek1a1@netsgo.com
Subject: Life Insurance - Why Pay More?
Date: Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version: 1.0
Message-ID: <0103c1042001882DD_IT7@dd_it7>
Content-Type: text/html; charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable

In [25]:
# Only the "Subject" header of the first spam email
spam_emails[0]['Subject']

'Life Insurance - Why Pay More?'

In [31]:
# Only the body content of the first spam email (HTML for this message)
spam_emails[0].get_content()

'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n<HTML><HEAD>\n<META content="text/html; charset=windows-1252" http-equiv=Content-Type>\n<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>\n<BODY><!-- Inserted by Calypso -->\n<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none \nstyle="COLOR: black; DISPLAY: none" width="100%">\n  <TBODY>\n  <TR>\n    <TD colSpan=3>\n      <HR color=black noShade SIZE=1>\n    </TD></TR></TD></TR>\n  <TR>\n    <TD colSpan=3>\n      <HR color=black noShade SIZE=1>\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT \ncolor=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 \nface="Copperplate Gothic Bold" size=5 PTSIZE="10">\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 \nface="Copperplate Gothic Bold" size=5 PTSIZE="10">\n<CENTER>Why Spend More Than You Have To?\n<CENTER><FONT co

In [43]:
# Use BeautifulSoup to strip the HTML markup from the first spam email,
# then collapse each run of newlines into a single newline.
soup = BeautifulSoup(spam_emails[0].get_content(), "html.parser")
text = re.sub('\n+', '\n', soup.get_text())
text

"\nSave up to 70% on Life Insurance.\nWhy Spend More Than You Have To?\nLife Quote Savings\nEnsuring your \n      family's financial security is very important. Life Quote Savings makes \n      buying life insurance simple and affordable. We Provide FREE Access to The \n      Very Best Companies and The Lowest Rates.\nLife Quote Savings is FAST, EASY and \n            SAVES you money! Let us help you get started with the best values in \n            the country on new coverage. You can SAVE hundreds or even thousands \n            of dollars by requesting a FREE quote from Lifequote Savings. Our \n            service will take you less than 5 minutes to complete. Shop and \n            compare. SAVE up to 70% on all types of Life insurance! \nClick Here For Your \n            Free Quote!\nProtecting your family is the best investment you'll ever \n          make!\nIf you are in receipt of this email \n      in error and/or wish to be removed from our list, PLEASE CLICK HERE AND TYPE RE