# Exercise 4

Apache SpamAssassin dataset

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

## Getting the data

In [8]:
import tarfile
from pathlib import Path
import urllib.request

def fetch_spam_data():
    """Download and unpack the ham and spam archives when they are missing.

    For each corpus, the archive is downloaded and extracted into
    ``../data/spam`` only if the corresponding extracted directory
    (``easy_ham`` or ``spam``) does not exist there yet.

    Returns:
        list[Path]: the ``easy_ham`` and ``spam`` directories, in that order.
    """
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / ".." / "data" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        # The extracted directory doubles as the "already fetched" marker.
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            # The context manager closes the archive even if extraction fails.
            # NOTE(review): the archive is fetched over plain HTTP and extracted
            # without member filtering; consider an extraction filter if the
            # runtime's tarfile supports one.
            with tarfile.open(path) as tar_bz2_file:
                tar_bz2_file.extractall(path=spam_path)
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [9]:
# fetch_spam_data returns the two extracted directories, ham first.
ham_dir, spam_dir = fetch_spam_data()

In [10]:
def load_spam_data(directory):
    """Read every file under ``directory`` (recursively) into a list of strings.

    Files are visited in sorted order, directory by directory, so the result
    is the same on every machine regardless of filesystem listing order.

    Args:
        directory: path (``str`` or ``os.PathLike``) of the folder to walk.

    Returns:
        list[str]: the full text of each file, decoded as latin1.
    """
    # NOTE(review): every file found is treated as an email; if the corpus
    # folders contain non-email helper files they are loaded too. Confirm.
    emails = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            # latin1 maps every byte to a character, so decoding never fails.
            with open(file_path, "r", encoding="latin1") as email_file:
                emails.append(email_file.read())
    return emails

In [11]:
# Read both corpora into memory as lists of raw message strings.
ham_data = load_spam_data(ham_dir)
spam_data = load_spam_data(spam_dir)

# Report how many messages each class contributed.
print(f"Number of spam emails: {len(spam_data)}")
print(f"Number of ham emails: {len(ham_data)}")

Number of spam emails: 501
Number of ham emails: 2502


In [12]:
# Sanity check: each loaded message should be a plain string.
type(ham_data[0])

str

In [13]:
def create_email_dataframe(emails, label):
    """Build a two-column frame pairing each email with a label.

    Args:
        emails: sequence of email texts; becomes the ``Email`` column.
        label: value for the ``Label`` column (a scalar is repeated for
            every row).

    Returns:
        pd.DataFrame: columns ``Email`` and ``Label``, one row per email.
    """
    columns = {"Email": emails, "Label": label}
    return pd.DataFrame(columns)

# Label convention: 0 = ham, 1 = spam.
ham_df = create_email_dataframe(ham_data,  0)
spam_df = create_email_dataframe(spam_data, 1)

# Stack ham on top of spam; ignore_index renumbers the rows 0..n-1.
email_df = pd.concat([ham_df, spam_df], ignore_index=True)

# Random peek at a few rows (unseeded, so it differs on every run).
email_df.sample(10)

Unnamed: 0,Email,Label
2883,From bellaii@hotmail.com Fri Sep 20 11:41:09 ...,1
1354,From spamassassin-talk-admin@lists.sourceforge...,0
809,From fork-admin@xent.com Mon Sep 30 17:56:33 ...,0
1164,From exmh-users-admin@redhat.com Fri Aug 23 1...,0
1570,From razor-users-admin@lists.sourceforge.net ...,0
2775,From FreeSoftware-6680b00@yahoo.com Wed Sep 1...,1
1026,From exmh-users-admin@redhat.com Thu Sep 19 1...,0
418,From fork-admin@xent.com Wed Sep 4 11:42:09 ...,0
2258,From rssfeeds@jmason.org Fri Oct 4 11:02:12 ...,0
1287,From rpm-list-admin@freshrpms.net Mon Sep 2 ...,0


In [14]:
# Print the labels of rows 2500..2510, around where the ham rows end and
# the spam rows begin in the concatenated frame.
for row_label in range(2500, 2511):
    print(email_df.loc[row_label, "Label"])

0
0
1
1
1
1
1
1
1
1
1


In [17]:
# Shuffling the data.
# sample(frac=1) returns all rows in random order; random_state fixes the
# permutation for a given input order. The original index labels are kept.
# NOTE: email_df is overwritten, so re-running this cell shuffles the
# already-shuffled frame and yields a different order.

email_df = email_df.sample(frac=1, random_state=42)
email_df.shape

(3003, 2)

In [19]:
# Display the shuffled frame; the index column shows the pre-shuffle labels.
email_df

Unnamed: 0,Email,Label
1531,From craig@hughes-family.org Sat Oct 5 12:38...,0
1212,From rpm-list-admin@freshrpms.net Tue Oct 1 ...,0
2247,From rssfeeds@jmason.org Fri Oct 4 11:01:58 ...,0
1782,From pudge@perl.org Wed Sep 11 13:43:29 2002\...,0
1582,From razor-users-admin@lists.sourceforge.net ...,0
...,...,...
2807,From jzlin@bacalhau.com.br Mon Sep 16 00:11:0...,1
2760,Return-Path: ler@lerami.lerctr.org\nDelivery-D...,1
433,From fork-admin@xent.com Thu Sep 5 11:31:38 ...,0
915,From fork-admin@xent.com Fri Aug 23 11:08:31 ...,0


In [42]:
# Splitting the dataset into X and y
# X holds the raw email text and y the 0/1 labels; both keep email_df's index.

X = email_df.loc[:, "Email"]
y = email_df.loc[:, "Label"]

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (3003,)
Shape of y: (3003,)


### Preprocessing the data

*Preprocessing steps:*
- lower casing
- removal of punctuation
- removal of stopwords
- removal of frequent words
- stemming
- lemmatisation
- conversion of emoticons to words
- removal of words
- removal of html tags
- spelling correction
- removal of rare words

In [43]:
# lower casing
# str.lower is applied to each email; X is overwritten with the result.

X = X.apply(str.lower)

In [44]:
# removal of punctuation
# string.punctuation = !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

import string 

def remove_punctuation(text):
    """Return ``text`` with each ``string.punctuation`` character turned into a space.

    Replacing (rather than deleting) keeps the pieces on either side of a
    punctuation mark apart, and the output has the same length as the input.
    """
    # Translation table mapping every punctuation character to a single space.
    to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    return text.translate(to_space)

# Replace punctuation with spaces in every email; X is overwritten.
X = X.apply(remove_punctuation)

In [54]:
# removal of stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer

# Make sure the NLTK stopword corpus is available locally.
nltk.download("stopwords")
# A set gives constant-time membership tests in remove_stopwords.
stop_words = set(stopwords.words("english"))

def tokenization(text):
    """Split ``text`` into tokens with NLTK's ``WhitespaceTokenizer``.

    Args:
        text: the string to tokenize.

    Returns:
        The tokens produced by ``WhitespaceTokenizer().tokenize(text)``.
    """
    return WhitespaceTokenizer().tokenize(text)

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``stop_words``.

    Args:
        text: an iterable of tokens (despite the name, not a raw string).

    Returns:
        list: the kept tokens, in their original order.
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Finn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
# Tokenize each email, then drop stopwords. After this cell every element
# of X is a list of tokens rather than a string, so the cell cannot be re-run
# without first rebuilding X.
X = X.apply(tokenization)
X = X.apply(remove_stopwords)

In [58]:
# Inspect one processed email. NOTE(review): X keeps the shuffled frame's
# integer index, so this presumably selects the row labelled 0, not the first
# row in shuffled order; confirm this is intended.
X[0]

['exmh',
 'workers',
 'admin',
 'redhat',
 'com',
 'thu',
 'aug',
 '22',
 '12',
 '36',
 '23',
 '2002',
 'return',
 'path',
 'exmh',
 'workers',
 'admin',
 'spamassassin',
 'taint',
 'org',
 'delivered',
 'zzzz',
 'localhost',
 'netnoteinc',
 'com',
 'received',
 'localhost',
 'localhost',
 '127',
 '0',
 '0',
 '1',
 'phobos',
 'labs',
 'netnoteinc',
 'com',
 'postfix',
 'esmtp',
 'id',
 'd03e543c36',
 'zzzz',
 'localhost',
 'thu',
 '22',
 'aug',
 '2002',
 '07',
 '36',
 '16',
 '0400',
 'edt',
 'received',
 'phobos',
 '127',
 '0',
 '0',
 '1',
 'localhost',
 'imap',
 'fetchmail',
 '5',
 '9',
 '0',
 'zzzz',
 'localhost',
 'single',
 'drop',
 'thu',
 '22',
 'aug',
 '2002',
 '12',
 '36',
 '16',
 '0100',
 'ist',
 'received',
 'listman',
 'spamassassin',
 'taint',
 'org',
 'listman',
 'spamassassin',
 'taint',
 'org',
 '66',
 '187',
 '233',
 '211',
 'dogma',
 'slashnull',
 'org',
 '8',
 '11',
 '6',
 '8',
 '11',
 '6',
 'esmtp',
 'id',
 'g7mbyrz04811',
 'zzzz',
 'exmh',
 'spamassassin',
 'taint',

In [None]:
from collections import Counter
cnt = Counter()
for text in 