# Exercise 4

Apache SpamAssasin dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

## Getting the data

In [2]:
import tarfile
from pathlib import Path
import urllib.request

def fetch_spam_data():
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / ".." / "data" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file = tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [3]:
ham_dir, spam_dir = fetch_spam_data()

In [4]:
def load_spam_data(directory):
    emails = []
    for root, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding="latin1") as email_file:
                    emails.append(email_file.read())
    return emails

In [5]:
ham_data = load_spam_data(ham_dir)
spam_data = load_spam_data(spam_dir)

print(f"Number of spam emails: {len(spam_data)}")
print(f"Number of ham emails: {len(ham_data)}")

Number of spam emails: 501
Number of ham emails: 2502


In [6]:
type(ham_data[0])

str

In [7]:
def create_email_dataframe(emails, label):
    df = pd.DataFrame({
        "Email": emails,
        "Label": label
    })
    return df

ham_df = create_email_dataframe(ham_data,  0)
spam_df = create_email_dataframe(spam_data, 1)

email_df = pd.concat([ham_df, spam_df], ignore_index=True)

email_df.sample(10)

Unnamed: 0,Email,Label
1329,From spamassassin-commits-admin@lists.sourcefo...,0
2633,From andijeanehuaa@yahoo.com Wed Aug 28 10:43...,1
181,From timc@2ubh.com Wed Aug 28 10:54:56 2002\n...,0
2810,From tps@insiq.us Mon Sep 16 00:27:17 2002\nR...,1
343,From fork-admin@xent.com Tue Aug 27 17:34:36 ...,0
627,From fork-admin@xent.com Thu Sep 19 11:04:59 ...,0
1926,From rssfeeds@jmason.org Thu Sep 26 16:31:16 ...,0
929,From tony@svanstrom.com Fri Aug 23 11:05:51 2...,0
2755,From arnoldm@aol.com Mon Sep 9 19:31:32 2002...,1
555,From fork-admin@xent.com Wed Sep 11 14:21:29 ...,0


In [8]:
for i in range (2500, 2511):
    print(email_df.loc[i, "Label"])

0
0
1
1
1
1
1
1
1
1
1


In [9]:
# shuffeling the data

email_df = email_df.sample(frac=1, random_state=42)
email_df.shape

(3003, 2)

In [10]:
email_df

Unnamed: 0,Email,Label
2211,From rssfeeds@jmason.org Thu Oct 3 12:24:36 ...,0
2905,From 107664.1420@actionsports.co.uk Sat Sep 2...,1
1411,From glynn.clements@virgin.net Wed Sep 4 18:...,0
251,From ilug-admin@linux.ie Fri Sep 6 11:40:19 ...,0
794,From fork-admin@xent.com Mon Sep 30 13:52:43 ...,0
...,...,...
1638,Return-Path: guido@python.org\nDelivery-Date: ...,0
1095,From rpm-list-admin@freshrpms.net Mon Sep 9 ...,0
1130,From exmh-workers-admin@redhat.com Mon Sep 23...,0
1294,From rpm-list-admin@freshrpms.net Tue Oct 8 ...,0


In [18]:
# Splitting the dataset into X and y

X = email_df.loc[:, "Email"]
y = email_df.loc[:, "Label"]

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (3003,)
Shape of y: (3003,)


### Preprocessing the data

*Preprocessing steps:*
- lower casing
- removal of punctuations
- removal of stopwords
- removal of frequent words
- stemming
- lemmatisation
- conversion of emoticons to words
- removal of words
- removal of html tags
- spelling correction
- removal of rare words

In [19]:
# lower casing

X_lower = X.apply(str.lower)

In [20]:
# removal of punctuation
# string.puntuation = {!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~}

import string 

def remove_punctuation(text):
    return "".join([ch if ch not in string.punctuation else " " for ch in text])

X_lower_nopunc = X_lower.apply(remove_punctuation)

In [21]:
# removal of stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def tokenization(text):
    tk = WhitespaceTokenizer()
    return tk.tokenize(text)

def remove_stopwords(text):
    return [token for token in text if token not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Finn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
X_lower_nopunc_token = X_lower_nopunc.apply(tokenization)
X_lower_nopunc_token_nostopw = X_lower_nopunc_token.apply(remove_stopwords)

In [23]:
# removal of frequent words. Alternative to removing stopwords.

from collections import Counter
cnt = Counter()
for text in X_lower_nopunc_token:
    for word in text:
        cnt[word] += 1

In [38]:
frequent_words = set([w for (w, wc) in cnt.most_common(20)])

def remove_frequent_words(text):
    return [token for token in text if token not in frequent_words]

In [40]:
X_lower_nopunc_token_nofreqw = X_lower_nopunc_token.apply(remove_frequent_words)

In [59]:
# removal of rare words

n_rare_words = 200
rare_words = set([w for (w, wc) in cnt.most_common()[len(cnt.most_common())-n_rare_words:]])

def remove_rare_words(text):
    return [word for word in text if word not in rare_words]

In [60]:
X_lower_nopunc_token_nofreqw_norarew = X_lower_nopunc_token_nofreqw.apply(remove_rare_words)

In [50]:
# Stemming

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    return [stemmer.stem(word) for word in text]

In [51]:
text = ["happier", "dog", "bye", "misserable", "running"]
print(stem_words(text))

['happier', 'dog', 'bye', 'misser', 'run']


In [64]:
# lemmatization 

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return [lemmatizer.lemmatize(word) for word in text]

In [84]:
text = ["happier", "dogs", "best", "sources", "running"]
print(lemmatize_words(text))

['happier', 'dog', 'best', 'source', 'running']


**Preprocessing Pipeline Order**
1. Lower casing
2. Replacing URLs with "URL"
3. Replacing numbers with "Number"
4. Replacing emoticons with words
5. Removal of html tags
6. Removal of punctuation
7. Tokenization
8. Lemmatization
9. Removal of frequent words
10. Removal of rare words

In [104]:
import re
import string
from nltk.stem import WordNetLemmatizer

def replace_urls(text):
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_pattern, 'URL', text)

def replace_numbers(text):
    number_pattern = r'\d+'
    return re.sub(number_pattern, 'NUMBER', text)

def replace_emoticons(text):
    emoticons = {
        ':)': 'smile',
        ':(': 'sad',
        ':D': 'grin',
        ':P': 'playful',
        ':-)': 'smile',
        ':-(': 'sad',
    }
    pattern = re.compile('|'.join(re.escape(emoticon) for emoticon in emoticons.keys()))
    return pattern.sub(lambda match: emoticons[match.group(0)], text)

def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

def remove_punctuation(text):
    return "".join([ch if ch not in string.punctuation else " " for ch in text])

lemmatizer = WordNetLemmatizer()
def lemmatize_words(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

def preprocess_email(email:str):
    text = email.lower() # lower casing
    text = replace_urls(text) # replacing URLs
    text = replace_numbers(text) # replacing numbers 
    text = replace_emoticons(text) # replacing emoticons
    text = remove_html_tags(text) # removing html tags
    text = remove_punctuation(text) # removing punctuation
    tokens = text.split(" ") # tokenization with " "
    tokens = lemmatize_words(tokens) # lemmatization

In [105]:
# custom transformer for removing frequent and rare words

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from collections import Counter
cnt = Counter()
for text in X_lower_nopunc_token:
    for word in text:
        cnt[word] += 1

class FreqRareWordRemover(BaseEstimator, TransformerMixin):
    def __init__(self, n_frequent_words:int, n_rare_words:int):
        self.n_frequent_words = n_frequent_words
        self.n_rare_words = n_rare_words

    def fit(self, X, y=None):
        X = check_array(X, dtype=object)
        self.cnt_ = Counter()
        for _, text in X.iterrows():
            for token in text:
                self.cnt_[token] += 1
        self.FREQUENT_WORDS_ = set([w for (w, wc) in cnt.most_common(self.n_frequent_words)])
        self.RARE_WORDS_ = set([w for (w, wc) in cnt.most_common()[len(self.cnt_.most_common())-self.n_rare_words:]])
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        check_is_fitted(self)
        X = check_array(X, dtype=object)
        assert self.n_features_in_ == X.shape[1]
        filtered_texts = []
        for _, text in X.iterrows():
            filtered_row = [word for word in text if (word not in self.FREQUENT_WORDS_ and word not in self.RARE_WORDS_)]
            filtered_texts.append(filtered_row)
        return pd.DataFrame(filtered_texts)

In [106]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

preprocessing = Pipeline([
    ("basic_preprocessing", FunctionTransformer(func=lambda X: X.apply(preprocess_email))),
    ("freqRareWordRomover", FreqRareWordRemover(20, 200))
])

In [107]:
preprocessed_emails = preprocessing.fit_transform(X)

ValueError: Expected 2D array, got 1D array instead:
array=[None None None ... None None None].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.