In [1]:
# Required packages:
from typing import Any, List, Tuple, Union
from numpy import ndarray

# For preprocessing text
import string
import re
from nltk.corpus import stopwords

# For training the model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# For evaluating the model
from sklearn import metrics

In [2]:
def load_data(
    path: str, encoding: Union[str, None] = None
) -> Tuple[List[str], List[int]]:
    """Loads labeled emails from a tab-separated file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is a datapoint containing ID, Label and Email
    (content), separated by "\t".

    Args:
        path: Path to file from which to load data.
        encoding: Text encoding used to read the file. If None, "mbcs" is
            used when the running Python provides that codec, otherwise
            "latin-1".

    Returns:
        List of email contents and a list of labels corresponding to each
        email (1 if the label column equals "spam", 0 otherwise).

    Raises:
        IndexError: If a non-blank data line has fewer than three
            tab-separated fields.
    """
    import codecs  # Local import: only needed to probe codec availability.

    if encoding is None:
        # The "mbcs" codec is only registered on Windows. Opening the file
        # with it elsewhere raises LookupError, so fall back to latin-1,
        # which can decode any byte sequence.
        try:
            codecs.lookup("mbcs")
            encoding = "mbcs"
        except LookupError:
            encoding = "latin-1"

    emails = []
    labels = []

    with open(path, "r", encoding=encoding) as file:
        next(file, None)  # Skip the header line (no-op for an empty file).
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no datapoint.
                continue

            # Drop the line terminator so it does not end up in the email
            # text, then split the line into its tab-separated columns.
            fields = line.rstrip("\n").split("\t")
            emails.append(fields[2])

            # Encode the label as 1 for "spam" and 0 for anything else.
            labels.append(1 if fields[1] == "spam" else 0)

    return emails, labels

In [3]:
# Matches every character that is not a lowercase ASCII letter or a digit.
_NON_ALNUM_RE = re.compile("[^a-z0-9]")

# Cache for the stopword set so the corpus is read only once per session.
_STOP_WORDS_CACHE = {}


def _english_stop_words() -> frozenset:
    """Returns the NLTK English stopwords as a set, loading them only once."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocess(doc: str) -> str:
    """Preprocesses text to prepare it for feature extraction.

    The text is lowercased, every character other than a-z and 0-9 is
    replaced by a space, the result is split on whitespace, and English
    stopwords are removed.

    Args:
        doc: String comprising the unprocessed contents of some email file.

    Returns:
        String comprising the remaining tokens separated by single spaces.
    """

    ## TOKENIZATION ##

    # Lowercase BEFORE filtering characters: the pattern only keeps a-z and
    # 0-9, so filtering first would erase every uppercase letter.
    lowered = doc.lower()

    # Replace punctuation (and any other non-alphanumeric character) with
    # spaces, then split on whitespace.
    tokens = _NON_ALNUM_RE.sub(" ", lowered).split()

    ## STOPWORD REMOVAL ##

    stop_words = _english_stop_words()
    kept = [word for word in tokens if word not in stop_words]

    # Join with spaces so the tokens stay separable for later splitting.
    return " ".join(kept)

In [4]:
def preprocess_multiple(docs: List[str]) -> List[str]:
    """Runs ``preprocess`` over every document in a collection.

    A progress line with the running count is printed after each document.

    Args:
        docs: List of strings, each consisting of the unprocessed contents
            of some email file.

    Returns:
        List of strings, each comprising the corresponding preprocessed
            text, in the same order as ``docs``.
    """

    cleaned = []

    # Number the documents from 1 so the progress line shows how many have
    # been handled so far.
    for position, raw_text in enumerate(docs, start=1):
        cleaned.append(preprocess(raw_text))
        print("Doc preprocessed: ", position)

    return cleaned

In [5]:
def _count_terms(dataset: List[str], term_index: dict) -> ndarray:
    """Builds the document-term count matrix of a dataset for a fixed vocabulary."""
    from numpy import zeros  # Local import: the file only imports ndarray.

    matrix = zeros((len(dataset), len(term_index)), dtype=int)

    for row, doc in enumerate(dataset):
        for term in doc.split():
            column = term_index.get(term)
            # Terms outside the vocabulary have no column and are ignored.
            if column is not None:
                matrix[row, column] += 1

    return matrix


def extract_features(
    train_dataset: List[str], test_dataset: List[str]
) -> Tuple[ndarray, ndarray]:
    """Extracts feature vectors from a preprocessed train and test datasets.

    The vocabulary is built from the whitespace-separated terms of the
    training documents only, in order of first appearance. Both datasets
    are then encoded as term-count matrices over that vocabulary.

    Args:
        train_dataset: List of strings, each consisting of the preprocessed
            email content.
        test_dataset: List of strings, each consisting of the preprocessed
            email content.

    Returns:
        A tuple of two integer arrays (train matrix, test matrix). Each has
        one row per document and one column per vocabulary term; an entry
        is the number of times the term occurs in the document. Terms that
        occur only in the test documents are not counted.
    """
    ## EXTRACT VOCABULARY ##
    # Map each term to its column index. A dict keeps first-seen order and
    # gives constant-time membership checks.
    term_index = {}

    for doc in train_dataset:
        for term in doc.split():  # Split document by whitespaces
            if term not in term_index:
                term_index[term] = len(term_index)

    ## CREATION OF THE DOCUMENT-TERM MATRIX ##
    # Each row is counted from its own document. The same vocabulary is used
    # for the test split so that columns line up between the two matrices.
    train_doc_term_matrix = _count_terms(train_dataset, term_index)
    test_doc_term_matrix = _count_terms(test_dataset, term_index)

    return train_doc_term_matrix, test_doc_term_matrix

In [6]:
def train(X: ndarray, y: List[int]) -> object:
    """Fits a TF-IDF + multinomial Naive Bayes pipeline on feature vectors.

    Args:
        X: Numerical array-like object (2D) representing the instances.
        y: Numerical array-like object (1D) representing the labels.

    Returns:
        The fitted pipeline, which can predict labels for unseen sets of
            instances.
    """

    # Two named stages: a TF-IDF transformer followed by the classifier.
    steps = [
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
    classifier = Pipeline(steps)

    # Fit all stages on the training data.
    classifier.fit(X, y)

    return classifier

In [7]:
def evaluate(
    y: List[int], y_pred: List[int]
) -> Tuple[float, float, float, float]:
    """Scores predicted labels against the true labels of a dataset.

    Args:
        y: Numerical array-like object (1D) representing the true labels.
        y_pred: Numerical array-like object (1D) representing the predicted
            labels.

    Returns:
        A tuple of four values: recall, precision, F_1, and accuracy.
    """

    # The order of the scorers fixes the order of the returned values.
    scorers = (
        metrics.recall_score,
        metrics.precision_score,
        metrics.f1_score,
        metrics.accuracy_score,
    )

    return tuple(scorer(y, y_pred) for scorer in scorers)


In [None]:
# End-to-end run: load both splits, preprocess, build features, train the
# classifier, predict on the test split and report the metrics.

# Read the raw email texts and their labels for both splits.
print("Loading data...")
train_data_raw, train_labels = load_data("data/train.tsv")
test_data_raw, test_labels = load_data("data/test.tsv")

# Apply the same text preprocessing to both splits.
print("Processing data...")
train_data = preprocess_multiple(train_data_raw)
test_data = preprocess_multiple(test_data_raw)

# Turn the preprocessed texts into feature matrices for both splits.
print("Extracting features...")
train_feature_vectors, test_feature_vectors = extract_features(train_data, test_data)

# Fit the model on the training features and labels.
print("Training...")
classifier = train(train_feature_vectors, train_labels)

# Predict a label for every test document.
print("Applying model on test data...")
predicted_labels = classifier.predict(test_feature_vectors)

# Compare the predictions with the true test labels.
print("Evaluating")
recall, precision, f1, accuracy = evaluate(test_labels, predicted_labels)

# Report each metric as a tab-separated "name: value" line.
print(f"Recall:\t{recall}")
print(f"Precision:\t{precision}")
print(f"F1:\t{f1}")
print(f"Accuracy:\t{accuracy}")

Loading data...
Processing data...
Doc preprocessed:  1
Doc preprocessed:  2
Doc preprocessed:  3
Doc preprocessed:  4
Doc preprocessed:  5
Doc preprocessed:  6
Doc preprocessed:  7
Doc preprocessed:  8
Doc preprocessed:  9
Doc preprocessed:  10
Doc preprocessed:  11
Doc preprocessed:  12
Doc preprocessed:  13
Doc preprocessed:  14
Doc preprocessed:  15
Doc preprocessed:  16
Doc preprocessed:  17
Doc preprocessed:  18
Doc preprocessed:  19
Doc preprocessed:  20
Doc preprocessed:  21
Doc preprocessed:  22
Doc preprocessed:  23
Doc preprocessed:  24
Doc preprocessed:  25
Doc preprocessed:  26
Doc preprocessed:  27
Doc preprocessed:  28
Doc preprocessed:  29
Doc preprocessed:  30
Doc preprocessed:  31
Doc preprocessed:  32
Doc preprocessed:  33
Doc preprocessed:  34
Doc preprocessed:  35
Doc preprocessed:  36
Doc preprocessed:  37
Doc preprocessed:  38
Doc preprocessed:  39
Doc preprocessed:  40
Doc preprocessed:  41
Doc preprocessed:  42
Doc preprocessed:  43
Doc preprocessed:  44
Doc pr

Doc preprocessed:  363
Doc preprocessed:  364
Doc preprocessed:  365
Doc preprocessed:  366
Doc preprocessed:  367
Doc preprocessed:  368
Doc preprocessed:  369
Doc preprocessed:  370
Doc preprocessed:  371
Doc preprocessed:  372
Doc preprocessed:  373
Doc preprocessed:  374
Doc preprocessed:  375
Doc preprocessed:  376
Doc preprocessed:  377
Doc preprocessed:  378
Doc preprocessed:  379
Doc preprocessed:  380
Doc preprocessed:  381
Doc preprocessed:  382
Doc preprocessed:  383
Doc preprocessed:  384
Doc preprocessed:  385
Doc preprocessed:  386
Doc preprocessed:  387
Doc preprocessed:  388
Doc preprocessed:  389
Doc preprocessed:  390
Doc preprocessed:  391
Doc preprocessed:  392
Doc preprocessed:  393
Doc preprocessed:  394
Doc preprocessed:  395
Doc preprocessed:  396
Doc preprocessed:  397
Doc preprocessed:  398
Doc preprocessed:  399
Doc preprocessed:  400
Doc preprocessed:  401
Doc preprocessed:  402
Doc preprocessed:  403
Doc preprocessed:  404
Doc preprocessed:  405
Doc preproc

Doc preprocessed:  730
Doc preprocessed:  731
Doc preprocessed:  732
Doc preprocessed:  733
Doc preprocessed:  734
Doc preprocessed:  735
Doc preprocessed:  736
Doc preprocessed:  737
Doc preprocessed:  738
Doc preprocessed:  739
Doc preprocessed:  740
Doc preprocessed:  741
Doc preprocessed:  742
Doc preprocessed:  743
Doc preprocessed:  744
Doc preprocessed:  745
Doc preprocessed:  746
Doc preprocessed:  747
Doc preprocessed:  748
Doc preprocessed:  749
Doc preprocessed:  750
Doc preprocessed:  751
Doc preprocessed:  752
Doc preprocessed:  753
Doc preprocessed:  754
Doc preprocessed:  755
Doc preprocessed:  756
Doc preprocessed:  757
Doc preprocessed:  758
Doc preprocessed:  759
Doc preprocessed:  760
Doc preprocessed:  761
Doc preprocessed:  762
Doc preprocessed:  763
Doc preprocessed:  764
Doc preprocessed:  765
Doc preprocessed:  766
Doc preprocessed:  767
Doc preprocessed:  768
Doc preprocessed:  769
Doc preprocessed:  770
Doc preprocessed:  771
Doc preprocessed:  772
Doc preproc

Doc preprocessed:  1096
Doc preprocessed:  1097
Doc preprocessed:  1098
Doc preprocessed:  1099
Doc preprocessed:  1100
Doc preprocessed:  1101
Doc preprocessed:  1102
Doc preprocessed:  1103
Doc preprocessed:  1104
Doc preprocessed:  1105
Doc preprocessed:  1106
Doc preprocessed:  1107
Doc preprocessed:  1108
Doc preprocessed:  1109
Doc preprocessed:  1110
Doc preprocessed:  1111
Doc preprocessed:  1112
Doc preprocessed:  1113
Doc preprocessed:  1114
Doc preprocessed:  1115
Doc preprocessed:  1116
Doc preprocessed:  1117
Doc preprocessed:  1118
Doc preprocessed:  1119
Doc preprocessed:  1120
Doc preprocessed:  1121
Doc preprocessed:  1122
Doc preprocessed:  1123
Doc preprocessed:  1124
Doc preprocessed:  1125
Doc preprocessed:  1126
Doc preprocessed:  1127
Doc preprocessed:  1128
Doc preprocessed:  1129
Doc preprocessed:  1130
Doc preprocessed:  1131
Doc preprocessed:  1132
Doc preprocessed:  1133
Doc preprocessed:  1134
Doc preprocessed:  1135
Doc preprocessed:  1136
Doc preprocessed

Doc preprocessed:  1458
Doc preprocessed:  1459
Doc preprocessed:  1460
Doc preprocessed:  1461
Doc preprocessed:  1462
Doc preprocessed:  1463
Doc preprocessed:  1464
Doc preprocessed:  1465
Doc preprocessed:  1466
Doc preprocessed:  1467
Doc preprocessed:  1468
Doc preprocessed:  1469
Doc preprocessed:  1470
Doc preprocessed:  1471
Doc preprocessed:  1472
Doc preprocessed:  1473
Doc preprocessed:  1474
Doc preprocessed:  1475
Doc preprocessed:  1476
Doc preprocessed:  1477
Doc preprocessed:  1478
Doc preprocessed:  1479
Doc preprocessed:  1480
Doc preprocessed:  1481
Doc preprocessed:  1482
Doc preprocessed:  1483
Doc preprocessed:  1484
Doc preprocessed:  1485
Doc preprocessed:  1486
Doc preprocessed:  1487
Doc preprocessed:  1488
Doc preprocessed:  1489
Doc preprocessed:  1490
Doc preprocessed:  1491
Doc preprocessed:  1492
Doc preprocessed:  1493
Doc preprocessed:  1494
Doc preprocessed:  1495
Doc preprocessed:  1496
Doc preprocessed:  1497
Doc preprocessed:  1498
Doc preprocessed

Doc preprocessed:  1827
Doc preprocessed:  1828
Doc preprocessed:  1829
Doc preprocessed:  1830
Doc preprocessed:  1831
Doc preprocessed:  1832
Doc preprocessed:  1833
Doc preprocessed:  1834
Doc preprocessed:  1835
Doc preprocessed:  1836
Doc preprocessed:  1837
Doc preprocessed:  1838
Doc preprocessed:  1839
Doc preprocessed:  1840
Doc preprocessed:  1841
Doc preprocessed:  1842
Doc preprocessed:  1843
Doc preprocessed:  1844
Doc preprocessed:  1845
Doc preprocessed:  1846
Doc preprocessed:  1847
Doc preprocessed:  1848
Doc preprocessed:  1849
Doc preprocessed:  1850
Doc preprocessed:  1851
Doc preprocessed:  1852
Doc preprocessed:  1853
Doc preprocessed:  1854
Doc preprocessed:  1855
Doc preprocessed:  1856
Doc preprocessed:  1857
Doc preprocessed:  1858
Doc preprocessed:  1859
Doc preprocessed:  1860
Doc preprocessed:  1861
Doc preprocessed:  1862
Doc preprocessed:  1863
Doc preprocessed:  1864
Doc preprocessed:  1865
Doc preprocessed:  1866
Doc preprocessed:  1867
Doc preprocessed

Doc preprocessed:  2174
Doc preprocessed:  2175
Doc preprocessed:  2176
Doc preprocessed:  2177
Doc preprocessed:  2178
Doc preprocessed:  2179
Doc preprocessed:  2180
Doc preprocessed:  2181
Doc preprocessed:  2182
Doc preprocessed:  2183
Doc preprocessed:  2184
Doc preprocessed:  2185
Doc preprocessed:  2186
Doc preprocessed:  2187
Doc preprocessed:  2188
Doc preprocessed:  2189
Doc preprocessed:  2190
Doc preprocessed:  2191
Doc preprocessed:  2192
Doc preprocessed:  2193
Doc preprocessed:  2194
Doc preprocessed:  2195
Doc preprocessed:  2196
Doc preprocessed:  2197
Doc preprocessed:  2198
Doc preprocessed:  2199
Doc preprocessed:  2200
Doc preprocessed:  2201
Doc preprocessed:  2202
Doc preprocessed:  2203
Doc preprocessed:  2204
Doc preprocessed:  2205
Doc preprocessed:  2206
Doc preprocessed:  2207
Doc preprocessed:  2208
Doc preprocessed:  2209
Doc preprocessed:  2210
Doc preprocessed:  2211
Doc preprocessed:  2212
Doc preprocessed:  2213
Doc preprocessed:  2214
Doc preprocessed

Doc preprocessed:  2536
Doc preprocessed:  2537
Doc preprocessed:  2538
Doc preprocessed:  2539
Doc preprocessed:  2540
Doc preprocessed:  2541
Doc preprocessed:  2542
Doc preprocessed:  2543
Doc preprocessed:  2544
Doc preprocessed:  2545
Doc preprocessed:  2546
Doc preprocessed:  2547
Doc preprocessed:  2548
Doc preprocessed:  2549
Doc preprocessed:  2550
Doc preprocessed:  2551
Doc preprocessed:  2552
Doc preprocessed:  2553
Doc preprocessed:  2554
Doc preprocessed:  2555
Doc preprocessed:  2556
Doc preprocessed:  2557
Doc preprocessed:  2558
Doc preprocessed:  2559
Doc preprocessed:  2560
Doc preprocessed:  2561
Doc preprocessed:  2562
Doc preprocessed:  2563
Doc preprocessed:  2564
Doc preprocessed:  2565
Doc preprocessed:  2566
Doc preprocessed:  2567
Doc preprocessed:  2568
Doc preprocessed:  2569
Doc preprocessed:  2570
Doc preprocessed:  2571
Doc preprocessed:  2572
Doc preprocessed:  2573
Doc preprocessed:  2574
Doc preprocessed:  2575
Doc preprocessed:  2576
Doc preprocessed

Doc preprocessed:  2902
Doc preprocessed:  2903
Doc preprocessed:  2904
Doc preprocessed:  2905
Doc preprocessed:  2906
Doc preprocessed:  2907
Doc preprocessed:  2908
Doc preprocessed:  2909
Doc preprocessed:  2910
Doc preprocessed:  2911
Doc preprocessed:  2912
Doc preprocessed:  2913
Doc preprocessed:  2914
Doc preprocessed:  2915
Doc preprocessed:  2916
Doc preprocessed:  2917
Doc preprocessed:  2918
Doc preprocessed:  2919
Doc preprocessed:  2920
Doc preprocessed:  2921
Doc preprocessed:  2922
Doc preprocessed:  2923
Doc preprocessed:  2924
Doc preprocessed:  2925
Doc preprocessed:  2926
Doc preprocessed:  2927
Doc preprocessed:  2928
Doc preprocessed:  2929
Doc preprocessed:  2930
Doc preprocessed:  2931
Doc preprocessed:  2932
Doc preprocessed:  2933
Doc preprocessed:  2934
Doc preprocessed:  2935
Doc preprocessed:  2936
Doc preprocessed:  2937
Doc preprocessed:  2938
Doc preprocessed:  2939
Doc preprocessed:  2940
Doc preprocessed:  2941
Doc preprocessed:  2942
Doc preprocessed

Doc preprocessed:  3255
Doc preprocessed:  3256
Doc preprocessed:  3257
Doc preprocessed:  3258
Doc preprocessed:  3259
Doc preprocessed:  3260
Doc preprocessed:  3261
Doc preprocessed:  3262
Doc preprocessed:  3263
Doc preprocessed:  3264
Doc preprocessed:  3265
Doc preprocessed:  3266
Doc preprocessed:  3267
Doc preprocessed:  3268
Doc preprocessed:  3269
Doc preprocessed:  3270
Doc preprocessed:  3271
Doc preprocessed:  3272
Doc preprocessed:  3273
Doc preprocessed:  3274
Doc preprocessed:  3275
Doc preprocessed:  3276
Doc preprocessed:  3277
Doc preprocessed:  3278
Doc preprocessed:  3279
Doc preprocessed:  3280
Doc preprocessed:  3281
Doc preprocessed:  3282
Doc preprocessed:  3283
Doc preprocessed:  3284
Doc preprocessed:  3285
Doc preprocessed:  3286
Doc preprocessed:  3287
Doc preprocessed:  3288
Doc preprocessed:  3289
Doc preprocessed:  3290
Doc preprocessed:  3291
Doc preprocessed:  3292
Doc preprocessed:  3293
Doc preprocessed:  3294
Doc preprocessed:  3295
Doc preprocessed

Doc preprocessed:  3604
Doc preprocessed:  3605
Doc preprocessed:  3606
Doc preprocessed:  3607
Doc preprocessed:  3608
Doc preprocessed:  3609
Doc preprocessed:  3610
Doc preprocessed:  3611
Doc preprocessed:  3612
Doc preprocessed:  3613
Doc preprocessed:  3614
Doc preprocessed:  3615
Doc preprocessed:  3616
Doc preprocessed:  3617
Doc preprocessed:  3618
Doc preprocessed:  3619
Doc preprocessed:  3620
Doc preprocessed:  3621
Doc preprocessed:  3622
Doc preprocessed:  3623
Doc preprocessed:  3624
Doc preprocessed:  3625
Doc preprocessed:  3626
Doc preprocessed:  3627
Doc preprocessed:  3628
Doc preprocessed:  3629
Doc preprocessed:  3630
Doc preprocessed:  3631
Doc preprocessed:  3632
Doc preprocessed:  3633
Doc preprocessed:  3634
Doc preprocessed:  3635
Doc preprocessed:  3636
Doc preprocessed:  3637
Doc preprocessed:  3638
Doc preprocessed:  3639
Doc preprocessed:  3640
Doc preprocessed:  3641
Doc preprocessed:  3642
Doc preprocessed:  3643
Doc preprocessed:  3644
Doc preprocessed

Doc preprocessed:  3951
Doc preprocessed:  3952
Doc preprocessed:  3953
Doc preprocessed:  3954
Doc preprocessed:  3955
Doc preprocessed:  3956
Doc preprocessed:  3957
Doc preprocessed:  3958
Doc preprocessed:  3959
Doc preprocessed:  3960
Doc preprocessed:  3961
Doc preprocessed:  3962
Doc preprocessed:  3963
Doc preprocessed:  3964
Doc preprocessed:  3965
Doc preprocessed:  3966
Doc preprocessed:  3967
Doc preprocessed:  3968
Doc preprocessed:  3969
Doc preprocessed:  3970
Doc preprocessed:  3971
Doc preprocessed:  3972
Doc preprocessed:  3973
Doc preprocessed:  3974
Doc preprocessed:  3975
Doc preprocessed:  3976
Doc preprocessed:  3977
Doc preprocessed:  3978
Doc preprocessed:  3979
Doc preprocessed:  3980
Doc preprocessed:  3981
Doc preprocessed:  3982
Doc preprocessed:  3983
Doc preprocessed:  3984
Doc preprocessed:  3985
Doc preprocessed:  3986
Doc preprocessed:  3987
Doc preprocessed:  3988
Doc preprocessed:  3989
Doc preprocessed:  3990
Doc preprocessed:  3991
Doc preprocessed

Doc preprocessed:  4330
Doc preprocessed:  4331
Doc preprocessed:  4332
Doc preprocessed:  4333
Doc preprocessed:  4334
Doc preprocessed:  4335
Doc preprocessed:  4336
Doc preprocessed:  4337
Doc preprocessed:  4338
Doc preprocessed:  4339
Doc preprocessed:  4340
Doc preprocessed:  4341
Doc preprocessed:  4342
Doc preprocessed:  4343
Doc preprocessed:  4344
Doc preprocessed:  4345
Doc preprocessed:  4346
Doc preprocessed:  4347
Doc preprocessed:  4348
Doc preprocessed:  4349
Doc preprocessed:  4350
Doc preprocessed:  4351
Doc preprocessed:  4352
Doc preprocessed:  4353
Doc preprocessed:  4354
Doc preprocessed:  4355
Doc preprocessed:  4356
Doc preprocessed:  4357
Doc preprocessed:  4358
Doc preprocessed:  4359
Doc preprocessed:  4360
Doc preprocessed:  4361
Doc preprocessed:  4362
Doc preprocessed:  4363
Doc preprocessed:  4364
Doc preprocessed:  4365
Doc preprocessed:  4366
Doc preprocessed:  4367
Doc preprocessed:  4368
Doc preprocessed:  4369
Doc preprocessed:  4370
Doc preprocessed

Doc preprocessed:  4684
Doc preprocessed:  4685
Doc preprocessed:  4686
Doc preprocessed:  4687
Doc preprocessed:  4688
Doc preprocessed:  4689
Doc preprocessed:  4690
Doc preprocessed:  4691
Doc preprocessed:  4692
Doc preprocessed:  4693
Doc preprocessed:  4694
Doc preprocessed:  4695
Doc preprocessed:  4696
Doc preprocessed:  4697
Doc preprocessed:  4698
Doc preprocessed:  4699
Doc preprocessed:  4700
Doc preprocessed:  4701
Doc preprocessed:  4702
Doc preprocessed:  4703
Doc preprocessed:  4704
Doc preprocessed:  4705
Doc preprocessed:  4706
Doc preprocessed:  4707
Doc preprocessed:  4708
Doc preprocessed:  4709
Doc preprocessed:  4710
Doc preprocessed:  4711
Doc preprocessed:  4712
Doc preprocessed:  4713
Doc preprocessed:  4714
Doc preprocessed:  4715
