In [1]:
import time
# Remember the wall-clock start so the total run time can be printed in the last cell.
startTime = time.time()

# Magic

In [2]:
# ! pip install pandas pandas-profiling catboost seaborn xgboost scikit-learn nltk

# Import

In [3]:
from multiprocessing import Pool

import pandas as pd
from bs4 import BeautifulSoup
# from pandas_profiling import ProfileReport
import os
import email
import email.policy
import seaborn as sns
import numpy as np
import catboost
import xgboost
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import nltk
import re
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

In [4]:
# Download the NLTK stopword corpus up front; the stemming step further down
# reads it through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/finn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load Data

In [5]:
path = './Data/'
# list of loaded emails
mails = []
# list of all available labels (each label is a sub-directory of `path`)
labels = ['easy_ham','hard_ham', 'spam', 'spam_2']
# list of labels for the loaded emails (parallel to `mails`)
content_labels = []

# iterate over the directory structure by combining path with the labels
for label in labels:
    label_directory = os.path.join(path, label)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the DataFrame (and everything derived from it) stable across runs
    filenames = sorted(os.listdir(label_directory))
    for file in filenames:
        # latin1 maps every byte to a character, so reading never raises a decode error.
        # The context manager closes the handle again (the files were left open before).
        with open(os.path.join(label_directory, file), 'r', encoding='latin1') as f:
            content = f.read()
        mails.append(content)
        content_labels.append(label)

# create the DataFrame from the list of mails and labels
df = pd.DataFrame({'emails': mails, 'label': content_labels})

# Data Analysis
Get a rough overview of the loaded data.

In [6]:
# Report how many files each raw data directory contains.
for description, directory in (
    ('Spam', './Data/spam/'),
    ('Spam 2', './Data/spam_2/'),
    ('easy Ham', './Data/easy_ham/'),
    ('hard Ham', './Data/hard_ham/'),
):
    file_count = len(os.listdir(directory))
    print(f"Size of {description} Data: {file_count}")

Size of Spam Data: 1002
Size of Spam 2 Data: 1398
Size of easy Ham Data: 5052
Size of hard Ham Data: 501


In [7]:
# ProfileReport(df, title="Pandas Profiling Report")

## Data Cleanup
### Remove Duplicates
Duplicates are removed within each label separately rather than across the whole dataset at once. A subsequent pass over the complete dataset then checks whether duplicates also exist across label boundaries.

In [8]:
print('Mails per label in the DataFrame before removal of duplicates:')
display(df.groupby('label').count())
# Drop duplicates within each label: two rows are duplicates only if both the
# label and the email text match. Unlike assigning a de-duplicated slice back
# through a boolean mask (which merely blanks the dropped rows to NaN and leaves
# them in the frame), this really removes the rows.
df = df.drop_duplicates(subset=['label', 'emails'])
print('Mails per label in the DataFrame after removal of duplicates: ')
display(df.groupby('label').count())

Mails per label in the DataFrame before removal of duplicates:


Unnamed: 0_level_0,emails
label,Unnamed: 1_level_1
easy_ham,5052
hard_ham,501
spam,1002
spam_2,1398


Mails per label in the DataFrame after removal of duplicates: 


Unnamed: 0_level_0,emails
label,Unnamed: 1_level_1
easy_ham,4911
hard_ham,468
spam,992
spam_2,1398


Finding: Several of the datasets contained duplicates. Removing these duplicate entries prevents them from being weighted multiple times and thereby skewing the models.

In [9]:
# Second pass: drop duplicated email texts regardless of their label and
# show the remaining number of mails per label.
df = df.drop_duplicates(subset='emails', keep='first')
print('Shape of df after removal of duplicates: ')
mails_per_label = df.groupby('label').count()
display(mails_per_label)

Shape of df after removal of duplicates: 


Unnamed: 0_level_0,emails
label,Unnamed: 1_level_1
easy_ham,4911
hard_ham,468
spam,992
spam_2,1398


Finding: Removing duplicates across all labels at once shows that duplicates exist only within a class, not across class boundaries.


### Remove empty Cells
A row is removed if any cell in that row is empty.

In [10]:
# Show the per-label counts, drop every row that has at least one missing
# value, and show the counts again for comparison.
print('Mails per label in the DataFrame before the removal of empty rows: ')
counts_before = df.groupby('label').count()
display(counts_before)
df = df.dropna(how='any')
print('Mails per label in the DataFrame after the removal of empty rows: ')
counts_after = df.groupby('label').count()
display(counts_after)

Mails per label in the DataFrame before the removal of empty rows: 


Unnamed: 0_level_0,emails
label,Unnamed: 1_level_1
easy_ham,4911
hard_ham,468
spam,992
spam_2,1398


Mails per label in the DataFrame after the removal of empty rows: 


Unnamed: 0_level_0,emails
label,Unnamed: 1_level_1
easy_ham,4911
hard_ham,468
spam,992
spam_2,1398


Finding: There were no empty Rows in the DataFrame. -> no empty emails

### Create proper labels
Spam and not spam are the classes that should later be classified. The current labels ('easy_ham', 'hard_ham', 'spam', 'spam_2') should be preserved for later analysis, so they are copied into a separate column. The four labels are first collapsed manually into 'spam' and 'ham'; a label encoder then converts these two classes into integers.

In [11]:
# keep the original four-way label for later analysis
df['former_label'] = df['label']
# collapse every label containing 'spam' into the single class 'spam' ...
is_spam = df['label'].str.contains('spam') == True
df.loc[is_spam, 'label'] = 'spam'
# ... and every label containing 'ham' into the single class 'ham'
is_ham = df['label'].str.contains('ham') == True
df.loc[is_ham, 'label'] = 'ham'

In [12]:
# Fit the encoder on the string classes, then replace them by their integer codes.
label_encoder = LabelEncoder().fit(df['label'])
df['label'] = label_encoder.transform(df['label'])

## Transform loaded String messages into E-Mail objects

In [13]:
# Parse every raw message string into an email message object.
def _parse_raw_email(raw_message):
    return email.message_from_string(raw_message, policy=email.policy.EmailPolicy(utf8=True))

df['emails'] = df['emails'].transform(_parse_raw_email)

## Extract features from the E-Mail Object

In [14]:
# TODO instead of counting how often an emailtype is present build a string that shows how the types are nested
def extract_features_from_email(emails: email.message.EmailMessage):
    """Collect simple features from one parsed email.

    Parameters
    ----------
    emails : email.message.EmailMessage
        A parsed message (possibly multipart).

    Returns
    -------
    pandas.Series
        ``is_multipart`` (bool), one ``content_type_<mime type>`` counter per
        content type met while walking the message, ``content`` (the text of all
        ``text/plain`` and ``text/html`` parts joined by a single space; absent
        when the message has no such part) and ``subject`` (the subject header
        of the top-level message, ``None`` when the header is missing).
    """
    # The dictionary is later turned into the Series. Whether the message is
    # multipart can be read directly from the object.
    values = {"is_multipart": bool(emails.is_multipart())}

    def add_to_dict(key, value):
        """Store `value` under `key`; on repeated keys append strings (space separated) or add numbers."""
        if key not in values:
            values[key] = value
        elif isinstance(value, str):
            values[key] += " " + value
        else:
            values[key] += value

    # Walk all parts of the message. Multipart messages contain several parts with
    # their own headers (see RFC 2045, RFC 2046, RFC 2047, RFC 4288, RFC 4289 and RFC 2049).
    for part in emails.walk():
        message_content_type = part.get_content_type()
        # count how often each content type occurs in the message
        add_to_dict('content_type_' + message_content_type, 1)
        if message_content_type not in ['text/plain', 'text/html']:
            # Nothing to extract from this part. The children of a multipart
            # container are visited by walk() on their own.
            continue
        try:
            message_content = part.get_content()
        except Exception:
            # Decoding failed (e.g. unknown charset): fall back to the raw payload.
            # Only Exception is caught so that KeyboardInterrupt/SystemExit still propagate.
            message_content = part.get_payload()
        if message_content_type == 'text/html':
            # reduce the HTML to its plain text with bs4
            try:
                message_content = BeautifulSoup(message_content, 'html.parser').text
            except Exception:
                message_content = "empty"
        add_to_dict('content', message_content)
    # assumption: the subjects of the different parts of one message do not differ significantly
    add_to_dict('subject', emails['subject'])
    return pd.Series(values)

In [16]:
# Extract the features of every parsed email and attach them to df as new columns.
# A content type that does not occur in a given email yields NaN in its counter
# column; fillna(0) turns those into the correct count of 0.
email_features = df.apply(lambda row: extract_features_from_email(row['emails']), axis=1)
email_features = email_features.fillna(0)
df = df.join(email_features)

In [18]:
# count how often each ignored character is ignored

# string_of_all_ignored_characters = str([' '.join(df['content'].transform(lambda x: message_content_cleanup(str(x))).transform(lambda x: re.sub('[a-zA-Z]', '', str(x))))]).replace(' ','')
# count = {}
# for i in string_of_all_ignored_characters:
#     if i in count: #check if it exists in dictionary
#         count[i] += 1
#     else:
#         count[i] = 1 #first occurrence of character
# print(f'number of ignored characters: {len(count.keys())}')
# dict(sorted(count.items(), key=lambda item: item[1], reverse=True))

In [19]:
# for message in df[df['is_multipart'] == True].iloc[0].emails.walk():
#     print(message['subject'])
#     print(message.get_content_type())
#     print(message)
#     print('---------')

In [20]:
def message_content_cleanup(message_content: str):
    """Normalize a raw text for later tokenization.

    The text is lower-cased, tabs and line breaks become spaces, the punctuation
    marks ``. , ! ?`` are surrounded by spaces so they stand apart from the
    neighbouring words, and every run of spaces is collapsed to a single space.

    Parameters
    ----------
    message_content : str
        The raw text.

    Returns
    -------
    str
        The normalized text.
    """
    message_content = message_content.lower()
    # replace line breaks and tabs by spaces
    for whitespace in ('\t', '\n'):
        message_content = message_content.replace(whitespace, ' ')
    # separate punctuation from the surrounding text
    for mark in ('.', ',', '!', '?'):
        message_content = message_content.replace(mark, ' ' + mark + ' ')
    # Collapse runs of spaces of ANY length. A single replace('  ', ' ') only halves
    # a run, so three or more consecutive spaces used to survive as a double space.
    return re.sub(' {2,}', ' ', message_content)

In [21]:
# create word stemmer object for usage in the function
stemmer = SnowballStemmer("english")
# Load the English stopword list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension re-read the list for
# every single word and searched it linearly.
english_stopwords = frozenset(stopwords.words('english'))

def build_word_stems(message_content: str):
    """Reduce a text to the space-separated stems of its non-stopword words.

    Every character that is not an ASCII letter is treated as a separator.
    Words found in the English NLTK stopword list are dropped (the comparison is
    case-sensitive), all remaining words are stemmed with the Snowball stemmer.

    Parameters
    ----------
    message_content : str
        The text to process.

    Returns
    -------
    str
        The stems joined by single spaces.
    """
    words = re.sub('[^a-zA-Z]', ' ', message_content).split()
    return ' '.join(stemmer.stem(word) for word in words if word not in english_stopwords)

In [22]:
def parallelize_dataframe(df, func, n_cores=os.cpu_count()):
    """Apply `func` to row-wise chunks of `df` in worker processes and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to process.
    func : callable
        Takes one chunk (a DataFrame) and returns a DataFrame. It is sent to the
        worker processes, so it has to be picklable.
    n_cores : int, optional
        Number of worker processes / chunks. Defaults to the CPU count.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in the original row order.
    """
    # os.cpu_count() may return None; never start more workers than there are rows
    # and never fewer than one.
    n_cores = max(1, min(n_cores or 1, len(df)))
    # Split the row POSITIONS instead of the frame itself: np.array_split on a
    # DataFrame goes through a deprecated pandas code path.
    chunk_positions = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[positions] for positions in chunk_positions]
    # The context manager shuts the pool down even when `func` raises in a worker.
    with Pool(n_cores) as pool:
        return pd.concat(pool.map(func, df_split))

def process_content_and_subject(local_df):
    """Clean and stem the 'content' and 'subject' columns of `local_df`.

    Both columns are overwritten with their cleaned text, and the stems of the
    cleaned text are stored in 'content_stemmed' and 'subject_stemmed'.
    The frame is modified in place and also returned.
    """
    def _clean(raw_value):
        return message_content_cleanup(str(raw_value))

    def _stem(cleaned_value):
        return build_word_stems(str(cleaned_value))

    for column in ('content', 'subject'):
        local_df[column] = local_df[column].transform(_clean)
        local_df[column + '_stemmed'] = local_df[column].transform(_stem)
    return local_df

In [24]:
# Clean and stem the 'content' and 'subject' columns, with the rows split
# across a pool of worker processes.
df = parallelize_dataframe(df, process_content_and_subject)

In [26]:
# Bag-of-words counts for the stemmed content and the stemmed subject.
# NOTE(review): content is limited to 100 terms while subject gets 2000 - confirm these are not swapped.
content_count_vectorizer = CountVectorizer(max_features=100)
subject_count_vectorizer = CountVectorizer(max_features=2000)


def _prefixed_terms(prefix, vectorizer):
    """Return the fitted vocabulary of `vectorizer` with `prefix` in front of every term."""
    return [prefix + term for term in vectorizer.get_feature_names_out()]


content_texts = df['content_stemmed'].to_numpy()
subject_texts = df['subject_stemmed'].to_numpy()

# fit_transform yields a sparse matrix; toarray() makes it dense
content_vectorized = content_count_vectorizer.fit_transform(content_texts).toarray()
subject_vectorized = subject_count_vectorizer.fit_transform(subject_texts).toarray()

X_content = pd.DataFrame(content_vectorized, columns=_prefixed_terms("content_", content_count_vectorizer))
X_subject = pd.DataFrame(subject_vectorized, columns=_prefixed_terms("subject_", subject_count_vectorizer))

In [28]:
X = X_content.join(X_subject)
# Select the content-type counter columns (anchored, so only columns that START
# with the prefix match).
content_type_features = df.filter(regex='^content_type_')
# DataFrame.join aligns on index LABELS. df still carries its original row labels
# (with gaps from the dropped rows) whereas the vectorized frames are numbered
# from zero, so a plain join attaches the counters to the wrong rows and leaves
# NaN where no label matches. Both frames hold the same rows in the same order,
# therefore align them by position.
assert len(content_type_features) == len(X), "feature frames must have one row per email"
content_type_features.index = X.index
X = X.join(content_type_features)
X.shape

(7769, 2122)

In [29]:
# Target vector: the label-encoded 'label' column of df.
y = df['label']
y.shape

(7769,)

In [30]:
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
# explicit names instead of a wildcard import, so it is visible where each classifier comes from
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB, GaussianNB, MultinomialNB

# Candidate models, each paired with a (currently empty) dict of hyperparameters.
# Every estimator that accepts a seed gets random_state=42 so results are reproducible.
estimators_and_hyperparameters=[
    (CatBoostClassifier(random_state=42, thread_count=-1, silent= True),{}),
    # XGBoost has no `silent` parameter; `verbosity=0` is what switches its logging off
    (XGBClassifier(random_state=42, n_jobs=-1, verbosity=0),{}),
    (SVC(kernel='linear',random_state=42),{}),
    (SVC(kernel='poly',random_state=42),{}),
    (SVC(kernel='rbf',random_state=42),{}),
    (SVC(kernel='sigmoid',random_state=42),{}),
    #(SVC(kernel='precomputed',random_state=42),{}),
    (BernoulliNB(),{}),
    #(CategoricalNB(),{}),
    (ComplementNB(),{}),
    (GaussianNB(),{}),
    (MultinomialNB(),{}),
    (DecisionTreeClassifier(random_state=42),{}),
    (KNeighborsClassifier(n_jobs=-1),{}),
    (RandomForestClassifier(random_state=42, n_jobs=-1), {}),
    # seeded like the other estimators; SGD shuffles the training data
    (SGDClassifier(random_state=42),{})
]

In [31]:
# explicit names instead of a wildcard import, so it is visible where each scaler comes from
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)

# Feature transformations that are compared against each other.
scalers = [
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    # seeded for reproducibility (QuantileTransformer may subsample the data)
    QuantileTransformer(output_distribution='uniform', random_state=42),
    QuantileTransformer(output_distribution='normal', random_state=42),
    RobustScaler(),
    StandardScaler(),
]

In [32]:
# Replace every remaining NaN in the feature matrix with 0.
X = X.fillna(0)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from statistics import mean

# Evaluate every scaler/estimator combination with stratified 10-fold cross-validation
# and print the mean F1 score. The splitter is identical for all combinations,
# so it is created once.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for scaler in scalers:
    scaler_name = scaler.__class__.__name__
    print(f'Current Sclaer: {scaler_name}')
    for model, _hyperparameters in estimators_and_hyperparameters:
        model_name = model.__class__.__name__
        try:
            # Scaler and model go into ONE pipeline, so the scaler is fitted on the
            # training folds only. Fitting it on the complete data before splitting
            # leaks information from the validation folds into training.
            # cross_val_score clones the pipeline, so the objects in the lists stay unfitted.
            pipeline = make_pipeline(scaler, model)
            scores = cross_val_score(pipeline, X, y, scoring='f1', cv=skf, n_jobs=-1)
            print(f'F1 score for {model_name}: {mean(scores)}')
        except Exception as e:
            print(e)
            # `estimator.__scaler__` does not exist (the list entry is a tuple), so the
            # old message raised an AttributeError inside the handler and aborted the loop.
            print(f'Skipping the combination of {scaler_name} and {model_name}')
        print('-----------------------------------------------------------------')

Current Sclaer: MaxAbsScaler


In [None]:
# Total wall-clock run time of the notebook since the first cell.
executionTime = time.time() - startTime
print(f'Execution time in seconds: {executionTime}')