In [1]:
# Required packages:
from typing import Any, List, Tuple, Union
from numpy import ndarray

# For preprocessing text
import string
import re
from nltk.corpus import stopwords

# For training the model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# For evaluating the model
from sklearn import metrics

In [21]:
def load_data(path: str, encoding: str = "mbcs") -> Tuple[List[str], List[int]]:
    """Loads emails and their labels from a tab-separated file.

    The first line of the file is treated as a header and skipped. Every
    other non-empty line is one datapoint whose first three tab-separated
    fields are ID, Label and Email (content). Empty lines are ignored.

    Args:
        path: Path to file from which to load data.
        encoding: Text encoding used to decode the file. The default
            ("mbcs") is the codec the notebook was written with; Python only
            provides it on Windows, so pass an explicit encoding such as
            "utf-8" or "cp1252" on other platforms.

    Returns:
        List of email contents and a list of labels corresponding to each
        email (1 if the label field is exactly "spam", 0 otherwise).

    Raises:
        IndexError: If a non-empty data line has fewer than three fields.
    """

    emails: List[str] = []
    labels: List[int] = []

    with open(path, "r", encoding=encoding) as file:
        # Skip the header; the default keeps an empty file from raising.
        next(file, None)

        for raw_line in file:
            # Drop the line terminator up front so it can neither leak into
            # the email text nor be confused with a closing quote.
            row = raw_line.rstrip("\n")
            if not row:
                continue

            fields = row.split("\t")    # Fields are separated by tabs
            content = fields[2]

            # Remove one pair of enclosing double quotes, but only when both
            # are really there (also correct for a last line without "\n").
            if len(content) >= 2 and content[0] == '"' and content[-1] == '"':
                content = content[1:-1]
            emails.append(content)

            # "spam" --> 1, any other label --> 0
            labels.append(1 if fields[1] == "spam" else 0)

    return emails, labels

In [3]:
# Stopword sets keyed by language, filled on first use so that
# `stopwords.words` is called once per language instead of once per email.
_STOP_WORD_CACHE = {}


def _get_stop_words(language: str = "english") -> frozenset:
    """Returns the NLTK stopword list for `language` as a cached frozenset."""
    cached = _STOP_WORD_CACHE.get(language)
    if cached is None:
        cached = frozenset(stopwords.words(language))
        _STOP_WORD_CACHE[language] = cached
    return cached


def preprocess(doc: str) -> str:
    """Preprocesses text to prepare it for feature extraction.

    The text is lowercased, every character that is not a lowercase ASCII
    letter or a digit is replaced by a space, the result is split on
    whitespace, and English stopwords are dropped.

    Args:
        doc: String comprising the unprocessed contents of some email file.

    Returns:
        String comprising the corresponding preprocessed text: the remaining
        tokens joined by single spaces (empty string if nothing remains).
    """

    ## TOKENIZATION ##

    # Lowercase first so a single character class covers all letters.
    lowered = doc.lower()

    # [^a-z0-9] matches anything that is NOT a letter a-z or a digit 0-9;
    # replacing it with a space turns punctuation into token boundaries.
    cleaned = re.sub("[^a-z0-9]", " ", lowered)

    # Split on runs of whitespace.
    tokens = cleaned.split()

    ## STOPWORD REMOVAL ##

    # Set membership keeps the filter linear in the number of tokens.
    stop_words = _get_stop_words()
    kept_tokens = [token for token in tokens if token not in stop_words]

    return " ".join(kept_tokens)

In [17]:
def preprocess_multiple(docs: List[str]) -> List[str]:
    """Runs `preprocess` over a whole collection of emails.

    Args:
        docs: List of strings, each holding the unprocessed contents of one
            email file.

    Returns:
        A new list with the preprocessed text of every input email, in the
            same order as `docs`.
    """

    # One preprocessed string per input document, order preserved.
    return [preprocess(raw_text) for raw_text in docs]

In [11]:
def extract_features(
    train_dataset: List[str], test_dataset: List[str]
) -> Tuple[ndarray, ndarray]:
    """Extracts term-count feature vectors from preprocessed train and test
    datasets.

    A single CountVectorizer is fitted on the training split only and then
    reused to encode the test split, so both matrices share the same
    vocabulary (columns).

    Args:
        train_dataset: List of strings, each consisting of the preprocessed
            email content.
        test_dataset: List of strings, each consisting of the preprocessed
            email content.

    Returns:
        A tuple (train_doc_term_matrix, test_doc_term_matrix) holding the
        document-term matrices produced by the vectorizer for the train and
        test split respectively.
        NOTE(review): scikit-learn documents CountVectorizer output as a
        SciPy sparse matrix rather than a dense ndarray; the annotation is
        left untouched for compatibility.
    """

    count_vect = CountVectorizer()

    ## CREATION OF THE DOCUMENT-TERM MATRICES ##

    # Learn the vocabulary from the training emails and count their terms.
    train_doc_term_matrix = count_vect.fit_transform(train_dataset)

    # Only transform (no fitting) the test emails, so they are encoded
    # against the training vocabulary.
    test_doc_term_matrix = count_vect.transform(test_dataset)

    return train_doc_term_matrix, test_doc_term_matrix

In [12]:
def train(X: ndarray, y: List[int]) -> object:
    """Fits a TF-IDF + multinomial naive Bayes pipeline on feature vectors.

    Args:
        X: Numerical array-like object (2D) holding one row per instance.
        y: Numerical array-like object (1D) holding one label per instance.

    Returns:
        The fitted pipeline object, ready to predict over unseen sets of
            instances.
    """

    # Steps run in order: the transformer feeds its output to the classifier.
    steps = [
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
    classifier = Pipeline(steps)

    # Fit every step of the pipeline on the training data.
    classifier.fit(X, y)

    return classifier

In [13]:
def evaluate(
    y: List[int], y_pred: List[int]
) -> Tuple[float, float, float, float]:
    """Scores predicted labels against the true labels of a dataset.

    Args:
        y: Numerical array-like object (1D) holding the true labels.
        y_pred: Numerical array-like object (1D) holding the predicted
            labels.

    Returns:
        A tuple of four values, in this order: recall, precision, F_1, and
            accuracy.
    """

    # The order of the scorers fixes the order of the returned tuple.
    scorers = (
        metrics.recall_score,
        metrics.precision_score,
        metrics.f1_score,
        metrics.accuracy_score,
    )

    return tuple(scorer(y, y_pred) for scorer in scorers)


In [22]:
# Load both splits. load_data returns (emails, labels), with labels encoded
# as 1 for "spam" and 0 for anything else.
print("Loading data...")
train_data_raw, train_labels = load_data("data/train.tsv")
test_data_raw, test_labels = load_data("data/test.tsv")
print("Data loaded")

# Sanity check: show the first raw training email in full.
print(train_data_raw[0])

Loading data...
Data loaded
Received: from NAHOU-MSMBX01V ([192.168.110.39]) by NAHOU-MSMBX05V.corp.enron.com with Microsoft SMTPSVC(5.0.2195.1600); Fri, 29 Jun 2001 08:36:10 -0500 X-MimeOLE: Produced By Microsoft Exchange V6.0.4418.65 content-class: urn:content-classes:message Subject: FW: June 29 -- BNA, Inc. Daily Labor Report MIME-Version: 1.0 Content-Type: text/plain; Content-Transfer-Encoding: binary Date: Fri, 29 Jun 2001 08:36:09 -0500 Message-ID: <77DA52C3FD86904D8209C9750CD310B9C79BB3@NAHOU-MSMBX01V.corp.enron.com> X-MS-Has-Attach: X-MS-TNEF-Correlator: <77DA52C3FD86904D8209C9750CD310B9C79BB3@NAHOU-MSMBX01V.corp.enron.com> Thread-Topic: June 29 -- BNA, Inc. Daily Labor Report Thread-Index: AcEAUaYbkE2KMWxCEdWxEABQi+MJ2QATr4SA From: ""Hu, Sylvia"" <Sylvia.Hu@ENRON.com> To: ""Acevedo, Felecia"" <Felecia.Acevedo@ENRON.com>, ""Brown, MeCole"" <MeCole.Brown@ENRON.com>, ""Cash, Michelle"" <Michelle.Cash@ENRON.com>, ""Castellano, Bonne"" <Bonne.Castellano@ENRON.com>, ""Johnson, Rick

In [16]:
# Clean every raw email (lowercasing, punctuation removal, stopword
# filtering) so it is ready for feature extraction.
# NOTE(review): the saved output of this cell shows a printed list, but no
# statement here prints one -- the output looks stale; re-run to refresh.
print("Processing data...")
train_data = preprocess_multiple(train_data_raw)
test_data = preprocess_multiple(test_data_raw)
print("Data processed")

Processing data...
Data processed
['received nahou msmbx01v 192 168 110 39 nahou msmbx05v corp enron com microsoft smtpsvc 5 0 2195 1600 fri 29 jun 2001 08 36 10 0500 x mimeole produced microsoft exchange v6 0 4418 65 content class urn content classes message subject fw june 29 bna inc daily labor report mime version 1 0 content type text plain content transfer encoding binary date fri 29 jun 2001 08 36 09 0500 message id 77da52c3fd86904d8209c9750cd310b9c79bb3 nahou msmbx01v corp enron com x ms attach x ms tnef correlator 77da52c3fd86904d8209c9750cd310b9c79bb3 nahou msmbx01v corp enron com thread topic june 29 bna inc daily labor report thread index aceauaybke2kmwxcedwxeabqi mj2qatr4sa hu sylvia sylvia hu enron com acevedo felecia felecia acevedo enron com brown mecole mecole brown enron com cash michelle michelle cash enron com castellano bonne bonne castellano enron com johnson rick rick johnson enron com lynch drew drew c lynch enron com parker gilda gilda parker enron com sullivan 

In [24]:
# Build the document-term matrices. extract_features fits the vectorizer on
# the training emails only and reuses it to encode the test emails.
print("Extracting features...")
train_feature_vectors, test_feature_vectors = extract_features(train_data, test_data)
print("Features Extracted")

Extracting features...
Features Extracted


In [25]:
# Fit the TF-IDF + MultinomialNB pipeline on the training features/labels.
print("Training...")
classifier = train(train_feature_vectors, train_labels)
print("Classifier trained")

Training...
Classifier trained


In [26]:
# Predict labels for the held-out test features with the trained pipeline.
print("Applying model on test data...")
predicted_labels = classifier.predict(test_feature_vectors)

# Compare predictions with the true test labels; evaluate returns
# (recall, precision, F1, accuracy) in that order.
print("Evaluating")
recall, precision, f1, accuracy = evaluate(test_labels, predicted_labels)

# Report each metric as "<name>:<tab><value>".
print(f"Recall:\t{recall}")
print(f"Precision:\t{precision}")
print(f"F1:\t{f1}")
print(f"Accuracy:\t{accuracy}")

Applying model on test data...
Evaluating
Recall:	0.9987439815783965
Precision:	0.9611200644641418
F1:	0.9795708859460014
Accuracy:	0.9759603769026335


In [29]:
# Show only the first 120 characters of the first raw training email.
print(train_data_raw[0][0:120])

"Received: from NAHOU-MSMBX01V ([192.168.110.39]) by NAHOU-MSMBX05V.corp.enron.com with Microsoft SMTPSVC(5.0.2195.1600)
