In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Environment Setup

In [None]:
# Colab setup if on colab.
# Only a missing google.colab package means "not on Colab"; a failing
# drive.mount() is a real error and is allowed to surface instead of silently
# falling back to a path that does not exist.
try:
    from google.colab import drive
except ImportError:
    mount_point = ""
else:
    mount_point = "/content/gdrive"
    drive.mount(mount_point)

# data location
path = mount_point + "/My Drive/CISC452/Project/enron2/"

Mounted at /content/gdrive


In [None]:
# Read every message under ham/ and spam/ into one labelled DataFrame.
# Rows are gathered in a plain list and converted once at the end: growing a
# DataFrame row by row copies the whole frame on every call, and
# DataFrame.append is no longer available in pandas 2.x.
records = []
for label_dir, is_spam in (('ham', False), ('spam', True)):
    # Directory listing order is arbitrary; sorting the names makes the seeded
    # shuffle below give the same row order on every machine.
    for entry_name in sorted(os.listdir(path + label_dir)):
        file_name = path + label_dir + '/' + entry_name
        # `with` closes the handle even if reading fails.
        with open(file_name, mode='r', encoding='latin1') as f:
            txt = f.read().lower()
        records.append({'Text': txt, 'IsSpam': is_spam})

df = pd.DataFrame(records, columns=['Text', 'IsSpam'])
# Labels are stored as 0.0 / 1.0 floats, the representation the rest of the
# notebook (and its displayed outputs) works with.
df['IsSpam'] = df['IsSpam'].astype(float)

df = df.sample(frac=1, random_state=20221122).reset_index(drop=True)
df.head()

In [None]:
# Hold out 20% of the messages for evaluation; the split is seeded.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=20221122)
# Rebind to the re-indexed frames instead of mutating the split results in
# place, so re-running the cell is idempotent. NOTE(review): this presumably
# also avoids pandas' SettingWithCopyWarning when columns are added to these
# frames later - confirm on the pandas version in use.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_test.head()

Unnamed: 0,Text,IsSpam
0,subject: wallstreet pulse\ngood day to all bro...,1.0
1,subject: re : anita dupont resume\noooopppss !...,0.0
2,"subject: hiring aram at a vp level\nrick ,\ni ...",0.0
3,subject: seeking your partnership\ndear partne...,1.0
4,"subject: wharton tiger team agenda\nfriends ,\...",0.0


# Tokenizers

The default tokenizer (torchtext's `basic_english`) splits the text into full words.

In [None]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Word-level tokenizer provided by torchtext.
tokenizer = get_tokenizer("basic_english")


def build_vocabulary(dataframe):
    """Lazily yield the token list of each message in ``dataframe['Text']``."""
    yield from (tokenizer(message) for message in dataframe['Text'])


# The vocabulary is built from the training split only. "<UNK>" is the single
# special token and is also what out-of-vocabulary lookups resolve to.
torch_vocab = build_vocab_from_iterator(
    build_vocabulary(df_train),
    min_freq=1,
    specials=["<UNK>"],
)
torch_vocab.set_default_index(torch_vocab["<UNK>"])

In [None]:
# Size of the word-level vocabulary built from the training split.
print(len(torch_vocab))

36274


In [None]:
# Word-level tokens of the first training message.
print(tokenizer(df_train['Text'][0]))

['subject', 'template', 'for', 'pricing', 'the', 'right', 'of', 'first', 'refusal', 'shelley', 'and', 'chris', ',', 'i', 'have', 'set', 'up', 'a', 'template', 'for', 'pricing', 'rofrs', '.', 'the', 'rofr', 'is', 'priced', 'as', 'a', 'series', 'forward', 'start', 'options', '.', 'a', 'forward', 'start', 'option', 'gives', 'the', 'holder', 'the', 'right', 'to', 'exercise', 'the', 'option', 'but', 'the', 'strike', 'price', 'is', 'set', 'at', 'the', 'money', 'in', 'future', 'before', 'the', 'option', 'expiration', '.', 'the', 'feature', 'mimics', 'the', 'matching', 'the', 'best', 'bid', 'in', 'the', 'rofr', '.', 'the', 'underlying', 'for', 'the', 'option', 'is', 'the', 'best', 'bid', ',', 'which', 'should', 'be', 'closely', 'related', 'to', 'the', 'price', 'differential', 'between', 'the', 'two', 'hubs', 'that', 'the', 'pipeline', 'connects', '.', 'therefore', 'the', 'rofr', 'is', 'case', 'dependent', ',', 'as', 'vince', 'pointed', 'out', '.', 'the', 'volatility', 'can', 'be', 'estimated',

My custom tokenizer uses byte-pair encoding to split text into sub-words, chosen by frequency, instead of full words. This allows the tokens to better represent the meanings of similar words. For example, 'email' might be tokenized as a full word, but 'emailing' might be tokenized into email-ing. 

The benefit of this approach is that it vastly decreases the vocabulary size, and thus the dimensionality, of the input data (on this dataset, this method provides a vocabulary of between 1000 and 1100 unique tokens), while still preserving more information than a naive character-delimited tokenization method would provide. 

The drawback is that rare words such as names might be tokenized rather meaninglessly (for example, deathridge as de-a-thr-id-ge). 

The Tokenizer class takes two arguments - the max vocabulary size `vocab_size` and the minimum frequency for byte-pair tokenization `min_freq`. The training ends when either the vocabulary size reaches `vocab_size`, or the most common byte-pair occurs `min_freq` times or fewer.

In [None]:
class Tokenizer(object):
    """Byte-pair-encoding (BPE) sub-word tokenizer.

    ``learn`` repeatedly replaces the most frequent pair of adjacent symbols
    in the training words with a single fresh character. ``tokenize`` replays
    those replacements on new text, so a tokenized message is a string in
    which every character is one token. ``decode`` reverses the replacements.

    Attributes:
        vocab_size: upper bound on the number of symbols in the vocabulary.
        min_freq: merging stops once the most frequent pair occurs
            ``min_freq`` times or fewer.
        encoding: ``{symbol pair: merged symbol}`` in the order learned.
        decoding: ``{merged symbol: symbol pair}`` in reverse learned order.
        vocab: set of every symbol (original characters and merged symbols).
    """

    def __init__(self, vocab_size=2048, min_freq=500):
        # A non-positive limit falls back to the default of 2048.
        if vocab_size < 1:
            self.vocab_size = 2048
        else:
            self.vocab_size = vocab_size
        self.min_freq = min_freq
        self.encoding = {}
        self.decoding = {}
        self.vocab = set()

    # Generate an encoding mapping from a dataset
    def learn(self, train):
        """Learn the pair merges from ``train['Text']``.

        Args:
            train: DataFrame (or mapping) whose ``'Text'`` entry iterates over
                the training strings.

        Prints the number of distinct words and the final vocabulary size.
        Any previously learned state is discarded first, so calling ``learn``
        again starts from scratch instead of colliding with old symbols.
        """
        self.encoding = {}
        self.decoding = {}
        self.vocab = set()

        # Word -> occurrence count (around 33000 words in the enron2 set).
        # Newlines are treated as spaces and the text is split on single spaces.
        # Note: appending an end-of-word marker to each word (to tell the 'ed'
        # in 'learned' from the 'ed' in 'education') was tried and abandoned:
        # the dataset is too small, and it left otherwise common pairs too
        # rare to be merged.
        corpus = {}
        for text in train['Text']:
            for word in text.replace('\n', ' ').split(' '):
                corpus[word] = corpus.get(word, 0) + 1

        print("Corpus size: ", len(corpus))
        # Populate the vocabulary with initial characters
        for word in corpus:
            self.vocab.update(word)

        # First code point used for merged symbols. It starts at 256, or just
        # above the largest character seen when the training text already
        # contains characters at or beyond 256, so a merged symbol can never
        # coincide with a character of the training text.
        next_symbol = max(256, max((ord(ch) for ch in self.vocab), default=0) + 1)

        # Compress byte-pairs until the vocab size reaches the allowed limit
        while len(self.vocab) < self.vocab_size:
            # Count the occurrences of all adjacent pairs, weighted by how
            # often the containing word occurs.
            byte_pairs = {}
            for word, count in corpus.items():
                for i in range(len(word) - 1):
                    pair = word[i:i+2]
                    byte_pairs[pair] = byte_pairs.get(pair, 0) + count

            # Nothing left to merge (every word is at most one symbol long).
            if not byte_pairs:
                break

            # Gets the most frequent byte pair
            most_frequent = max(byte_pairs, key=byte_pairs.get)
            # Ends the loop if the most frequent byte pair is rarer than allowed
            if byte_pairs[most_frequent] <= self.min_freq:
                break

            # Adds the most frequent byte pair to the encoding
            symbol = chr(next_symbol)
            self.encoding[most_frequent] = symbol
            self.vocab.add(symbol)
            # Apply the merge by building a new dict. Re-keying the dict while
            # iterating over it raises RuntimeError on current CPython and
            # otherwise may skip or revisit words.
            corpus = {
                word.replace(most_frequent, symbol): count
                for word, count in corpus.items()
            }
            next_symbol += 1

        # Generate the decoding mapping from the encoding mapping. The order
        # is reversed so that the latest merges are undone first.
        self.decoding = dict((v, k) for k, v in reversed(list(self.encoding.items())))
        print("Vocab size: ", len(self.vocab))

    # Tokenize a single string of text
    def tokenize(self, txt):
        """Return ``txt`` with newlines turned into spaces and every learned
        merge applied in the order it was learned."""
        result = txt.replace('\n', ' ')
        for pair, symbol in self.encoding.items():
            result = result.replace(pair, symbol)

        return result

    # Add a tokenized row to an existing pandas df
    def tokenize_all(self, df, text_column='Text'):
        """Add a ``'CustomTokenizer'`` column to ``df`` (in place) holding the
        tokenized form of ``df[text_column]``."""
        df['CustomTokenizer'] = df[text_column].apply(self.tokenize)

    # Decode a single tokenized string (bars=False if you only want the raw decoded text)
    def decode(self, tokenized, bars=True):
        """Expand a tokenized string back into plain text.

        Args:
            tokenized: string produced by ``tokenize``.
            bars: when True, a ``'|'`` is inserted around every token before
                expanding, which makes the token boundaries visible.

        Returns:
            The decoded text (with ``'|'`` separators if ``bars`` is True).
        """
        # Previously `txt` was only assigned when bars was True, so
        # decode(..., bars=False) failed with UnboundLocalError.
        txt = tokenized.replace('', '|') if bars else tokenized
        for symbol, pair in self.decoding.items():
            txt = txt.replace(symbol, pair)
        return txt

In [None]:
# Fit the byte-pair tokenizer on the training split only, using the class
# defaults (vocab_size=2048, min_freq=500).
custom_tokenizer = Tokenizer()
custom_tokenizer.learn(df_train)

Corpus size:  36325
Vocab size:  1066


In [None]:
# Attach both token representations to the train and the test frame.
for frame in (df_train, df_test):
    frame['DefaultTokenizer'] = frame['Text'].apply(tokenizer)
    custom_tokenizer.tokenize_all(frame)
df_train.head()

Unnamed: 0,Text,IsSpam,DefaultTokenizer,CustomTokenizer
0,subject: template for pricing the right of fir...,0.0,"[subject, template, for, pricing, the, right, ...","ƈ ŀmƥŐ Ğ ħěƪ Ċ Ѕ ę ͹ ăfńď ĸǨȏ Ė ĤǏ , i Ŭ ̰ ƾ a..."
1,"subject: new resume\ndear vince ,\ni am so gra...",0.0,"[subject, new, resume, dear, vince, ,, i, am, ...","ƈ Ǘ ϋ ο vłĢ , i ģ Ɔ ųŐʖ Ğ Ō ȬĞȕ . i ăȌ űǺƧŐ Ĝ ..."
2,subject: easily lose weight / build muscle / r...,1.0,"[subject, easily, lose, weight, /, build, musc...",ƈ eĒĝy ĬĨ ĭȗ / ˖ĝd mńcĘ / ăŁĨ Ņƪ ! 2ǻ31 Ē Ĩć Ă...
3,subject: re : university of texas conference o...,0.0,"[subject, re, university, of, texas, conferenc...","ƈ ă : Ͷ ę ŀxĒ ̡ Ă ɂ fłǡ , ƇȫҜ ǟ vłĢ , i ģ ͮțƪ ..."
4,subject: re : replied resume\nvince / sally\ni...,0.0,"[subject, re, replied, resume, vince, /, sally...",ƈ ă : ăpƅđ ϋ vłĢ / sȌ ē űƠĎs Ř Ċ Şpƅđ Ш Ă ŉ Ɓ ...


Visualization for the tokenizer's results + sanity check

In [None]:
# Round-trip one held-out message through the custom tokenizer.
row = 0
sanity = df_test['Text'][row]
tokenized = custom_tokenizer.tokenize(sanity)
print(sanity)
# The label has to come from the same frame as the text: row 0 of `df` is a
# different message than row 0 of `df_test`.
print('spam' if df_test['IsSpam'][row] else 'not spam')
print(tokenized)
# decode() with its default bars=True inserts '|' around every token and then
# expands the tokens back to text, showing where the token boundaries fall.
tokenized = custom_tokenizer.decode(tokenized)
print(tokenized)

subject: wallstreet pulse
good day to all broker ' s , day trader ' s and investor ' s world s . tock report
has become famous with some great stoc ? k picks in the otc , small cap
market ' s ! ! ! ! ! ! ! ! ! ! here at world stoc ? k report we work on what we here
from the street . rumor ' s circulating and keeping the focus on the company ' s
news . we pick our companies based on there growth potential . we focus on
stoc ? ks that have great potential to move up in price ! ! ! while giving
you liquitity .
our latest pick is cdgt .
sy , mbol : cdgt
current price : $ 3 . 90
short term 7 day projection : $ 8 - 9
we give it to you again as a gift . this company is doing incredible things .
thay have cash and have made great strategic aquisitions .
current price $ 3 . 85 to $ 4 . 00 . word on the sreet is strong buy .
this company has dropped big new ' s in the past .
who ' s to say they don ' t have another big one .
* * * * * * * * * * * * * press release * * * * * * * * * * * * * * * *

# Additional Preparation

Transforming the pandas dfs into pytorch datasets

In [None]:
import torch

# Token -> integer id for the custom tokenizer. The vocabulary is a set of
# strings, and set iteration order depends on string hashing, which Python
# randomises per process; sorting gives every token the same id on every run.
vocab_list = sorted(custom_tokenizer.vocab)
custom_vocab_dict = {token: index for index, token in enumerate(vocab_list)}

class DataFromDF(torch.utils.data.Dataset):
    """Fixed-length integer dataset built from a tokenized DataFrame.

    Args:
        dataframe: frame with an ``'IsSpam'`` label column and the token
            column named by ``tokenizer_type``.
        tokenizer_type: ``'CustomTokenizer'`` selects ``custom_vocab_dict``;
            any other value selects ``torch_vocab``. It is also the name of
            the column that is read.
        max_tokens: every row is truncated or right-padded to this length.

    Attributes:
        x: int32 tensor of shape ``(len(dataframe), max_tokens)`` with token ids.
        y: int64 tensor of shape ``(len(dataframe),)`` with the labels.

    Index 0 is used both for padding and for tokens missing from the
    vocabulary. NOTE(review): ``custom_vocab_dict`` also assigns id 0 to one
    real token, so in the custom representation that token is
    indistinguishable from padding/unknown - consider reserving id 0.
    """

    def __init__(self, dataframe, tokenizer_type, max_tokens=200):
        # Convert through a NumPy array so the result does not depend on the
        # frame's index labels.
        self.y = torch.tensor(dataframe['IsSpam'].astype('int64').to_numpy(), dtype=torch.long)
        # Using index representations as a middleman for text vectorization
        vocab_dict = custom_vocab_dict if tokenizer_type == 'CustomTokenizer' else torch_vocab

        rows = []
        for text in dataframe[tokenizer_type]:
            ids = [vocab_dict[ch] if ch in vocab_dict else 0 for ch in text][:max_tokens]
            # A non-positive repeat count yields [], so full rows stay as they are.
            rows.append(ids + [0] * (max_tokens - len(ids)))
        # reshape keeps the (rows, max_tokens) shape even for an empty frame.
        self.x = torch.tensor(rows, dtype=torch.int).reshape(len(rows), max_tokens)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [None]:
# One dataset per (tokenizer, split) combination.
default_train, default_test = (
    DataFromDF(split, 'DefaultTokenizer') for split in (df_train, df_test)
)
custom_train, custom_test = (
    DataFromDF(split, 'CustomTokenizer') for split in (df_train, df_test)
)
print(default_train.x)
print(custom_train.x)

tensor([[   24,  4037,    15,  ...,     0,     0,     0],
        [   24,    84,   254,  ...,     0,     0,     0],
        [   24,  1814,  1577,  ...,     0,     0,     0],
        ...,
        [   24,  9563, 31795,  ...,     0,     0,     0],
        [   24,  3745,   188,  ...,   511,   125,   199],
        [   24,    59,    17,  ...,    35,   579,   112]], dtype=torch.int32)
tensor([[ 594,    0,  672,  ...,    0,  390,    0],
        [ 594,    0, 1025,  ..., 1020, 1020,  603],
        [ 594,    0,    5,  ...,    0,  171,  937],
        ...,
        [ 594,    0,  901,  ...,  779,  568,    0],
        [ 594,    0,  312,  ...,  888,    0,  769],
        [ 594,    0,  312,  ...,  551,    5,    6]], dtype=torch.int32)


Data loading

In [None]:
from torch.utils.data import DataLoader
# Change here to define a batch size.
# Despite its name, n_batches is the number of samples per batch (it is passed
# as batch_size), not the number of batches.
n_batches = 10
train_loader_def, test_loader_def = (
    DataLoader(dataset, batch_size=n_batches) for dataset in (default_train, default_test)
)
train_loader_cus, test_loader_cus = (
    DataLoader(dataset, batch_size=n_batches) for dataset in (custom_train, custom_test)
)

In [None]:
# Peek at the first batch only, to confirm the tensor shapes the model will
# receive: inputs and labels.
for x, y in train_loader_def:
    print(x.shape, y.shape)
    break

torch.Size([10, 200]) torch.Size([10])


# Prediction

PyTorch-based RNN models

In [None]:
from torch import nn
from torch.nn import functional as F

embed_len = 50   # width of each token embedding vector
hidden_dim = 50  # size of the RNN hidden state
n_layers=1       # number of stacked RNN layers

# Only the length of this list is used (it sizes the embedding table of the
# custom-tokenizer model).
custom_vocab = list(custom_tokenizer.vocab)
# Class names passed to the classification reports as target_names.
target_classes = ['ham', 'spam']

class RNNClassifier(nn.Module):
    """Embedding -> RNN -> linear classifier over token-id sequences.

    Args:
        vocab: any sized collection; only ``len(vocab)`` is used, as the
            number of rows in the embedding table.

    Layer sizes come from the module-level ``embed_len``, ``hidden_dim``,
    ``n_layers`` and ``target_classes``.
    """

    def __init__(self, vocab):
        super(RNNClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_len)
        self.rnn = nn.RNN(input_size=embed_len, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, len(target_classes))

    def forward(self, X_batch):
        """Return one row of class logits per sequence in ``X_batch``.

        ``X_batch`` holds token ids with the batch on the first axis
        (the RNN is created with ``batch_first=True``).
        """
        embeddings = self.embedding_layer(X_batch)
        # No explicit initial hidden state: nn.RNN then starts from zeros on
        # the input's device. Drawing a fresh random state on every call made
        # predictions for the same input differ from one call to the next and
        # tied the model to the CPU.
        output, _ = self.rnn(embeddings)
        # Classify from the RNN output at the last time step.
        return self.linear(output[:, -1])

In [None]:
# Instance used to inspect the architecture; the embedding table has one row
# per entry of the word-level vocabulary.
default_model = RNNClassifier(torch_vocab)

default_model

RNNClassifier(
  (embedding_layer): Embedding(36274, 50)
  (rnn): RNN(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=2, bias=True)
)

In [None]:
# List every sub-module of the word-level model with its parameter shapes.
for module in default_model.children():
    print(f"Layer : {module}")
    print("Parameters : ")
    for weight in module.parameters():
        print(weight.shape)
    print()

Layer : Embedding(36274, 50)
Parameters : 
torch.Size([36274, 50])

Layer : RNN(50, 50, batch_first=True)
Parameters : 
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50])

Layer : Linear(in_features=50, out_features=2, bias=True)
Parameters : 
torch.Size([2, 50])
torch.Size([2])



In [None]:
# Instance used to inspect the architecture; the embedding table has one row
# per symbol of the byte-pair vocabulary.
custom_model = RNNClassifier(custom_vocab)

custom_model

RNNClassifier(
  (embedding_layer): Embedding(1066, 50)
  (rnn): RNN(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=2, bias=True)
)

In [None]:
# List every sub-module of the byte-pair model with its parameter shapes.
for module in custom_model.children():
    print(f"Layer : {module}")
    print("Parameters : ")
    for weight in module.parameters():
        print(weight.shape)
    print()

Layer : Embedding(1066, 50)
Parameters : 
torch.Size([1066, 50])

Layer : RNN(50, 50, batch_first=True)
Parameters : 
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50])

Layer : Linear(in_features=50, out_features=2, bias=True)
Parameters : 
torch.Size([2, 50])
torch.Size([2])



In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    """Evaluate ``model`` on ``val_loader`` and print loss and accuracy.

    Args:
        model: module mapping an input batch to class logits.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        val_loader: iterable of ``(inputs, targets)`` batches.

    Returns:
        ``(mean_loss, accuracy)``: the unweighted mean of the per-batch losses
        and the fraction of correctly predicted samples. Both are also
        printed, so callers that ignore the return value see the same output.
    """
    # Evaluate in eval mode (matters as soon as the model contains dropout or
    # normalisation layers) and restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            Y_true, Y_preds, losses = [], [], []
            for X, Y in val_loader:
                preds = model(X)
                losses.append(loss_fn(preds, Y).item())

                Y_true.append(Y)
                Y_preds.append(preds.argmax(dim=-1))

            Y_true = torch.cat(Y_true)
            Y_preds = torch.cat(Y_preds)
    finally:
        model.train(was_training)

    mean_loss = torch.tensor(losses).mean().item()
    accuracy = (Y_preds == Y_true).sum().item() / len(Y_true)

    print("Valid Loss : {:.3f}".format(mean_loss))
    print("Valid Acc  : {:.3f}".format(accuracy))
    return mean_loss, accuracy


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    After each pass the mean of the per-batch training losses is printed,
    followed by the validation loss and accuracy on ``val_loader``.
    """
    for _epoch in range(epochs):
        batch_losses = []
        for inputs, targets in tqdm(train_loader):
            batch_loss = loss_fn(model(inputs), targets)
            batch_losses.append(batch_loss.item())

            # Clear stale gradients, backpropagate, then update the weights.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        mean_loss = torch.tensor(batch_losses).mean()
        print(f"Train Loss : {mean_loss:.3f}")
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [None]:
from torch.optim import Adam

epochs = 15
learning_rate = 0.001

# Seed torch before the model is created so that the weight initialisation,
# and with it the training run, is repeatable. The value matches the seed
# used for the data shuffle and split.
torch.manual_seed(20221122)

loss_fn = nn.CrossEntropyLoss()
default_model = RNNClassifier(torch_vocab)
optimizer = Adam(default_model.parameters(), lr=learning_rate)

TrainModel(default_model, loss_fn, optimizer, train_loader_def, test_loader_def, epochs)

100%|██████████| 469/469 [00:17<00:00, 26.80it/s]


Train Loss : 0.543
Valid Loss : 0.539
Valid Acc  : 0.754


100%|██████████| 469/469 [00:18<00:00, 25.09it/s]


Train Loss : 0.501
Valid Loss : 0.525
Valid Acc  : 0.753


100%|██████████| 469/469 [00:22<00:00, 21.32it/s]


Train Loss : 0.458
Valid Loss : 0.537
Valid Acc  : 0.735


100%|██████████| 469/469 [00:19<00:00, 23.79it/s]


Train Loss : 0.416
Valid Loss : 0.524
Valid Acc  : 0.764


100%|██████████| 469/469 [00:19<00:00, 24.39it/s]


Train Loss : 0.383
Valid Loss : 0.523
Valid Acc  : 0.767


100%|██████████| 469/469 [00:18<00:00, 25.17it/s]


Train Loss : 0.373
Valid Loss : 0.534
Valid Acc  : 0.763


100%|██████████| 469/469 [00:18<00:00, 25.98it/s]


Train Loss : 0.341
Valid Loss : 0.540
Valid Acc  : 0.776


100%|██████████| 469/469 [00:16<00:00, 27.86it/s]


Train Loss : 0.335
Valid Loss : 0.542
Valid Acc  : 0.776


100%|██████████| 469/469 [00:18<00:00, 25.72it/s]


Train Loss : 0.317
Valid Loss : 0.552
Valid Acc  : 0.780


100%|██████████| 469/469 [00:17<00:00, 26.87it/s]


Train Loss : 0.310
Valid Loss : 0.552
Valid Acc  : 0.781


100%|██████████| 469/469 [00:17<00:00, 27.27it/s]


Train Loss : 0.333
Valid Loss : 0.594
Valid Acc  : 0.747


100%|██████████| 469/469 [00:15<00:00, 29.82it/s]


Train Loss : 0.319
Valid Loss : 0.572
Valid Acc  : 0.772


100%|██████████| 469/469 [00:16<00:00, 27.82it/s]


Train Loss : 0.342
Valid Loss : 0.557
Valid Acc  : 0.755


100%|██████████| 469/469 [00:17<00:00, 27.10it/s]


Train Loss : 0.323
Valid Loss : 0.559
Valid Acc  : 0.769


100%|██████████| 469/469 [00:16<00:00, 28.11it/s]


Train Loss : 0.310
Valid Loss : 0.565
Valid Acc  : 0.769


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def MakePredictions(model, loader):
    """Run ``model`` over ``loader`` and collect labels and predictions.

    Args:
        model: module mapping an input batch to class logits.
        loader: iterable of ``(inputs, targets)`` batches.

    Returns:
        ``(actual, predicted)`` as NumPy arrays: the concatenated targets and
        the index of the largest logit for every sample.
    """
    # Inference needs no autograd graph. Without no_grad every batch's graph
    # was kept alive until the final concatenation.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            Y_shuffled, Y_preds = [], []
            for X, Y in loader:
                # softmax is monotonic, so the argmax of the raw logits is the
                # predicted class.
                Y_preds.append(model(X).argmax(dim=-1))
                Y_shuffled.append(Y)
    finally:
        model.train(was_training)
    Y_preds, Y_shuffled = torch.cat(Y_preds), torch.cat(Y_shuffled)

    return Y_shuffled.detach().numpy(), Y_preds.detach().numpy()

# Evaluate the word-level model on the enron2 test split.
Y_actual, Y_preds = MakePredictions(default_model, test_loader_def)

print("---MODEL WITH DEFAULT TOKENIZER---")
print(f"Test Accuracy : {accuracy_score(Y_actual, Y_preds)}")
print(
    "\nClassification Report : ",
    classification_report(Y_actual, Y_preds, target_names=target_classes),
    sep="\n",
)
print("\nConfusion Matrix : ", confusion_matrix(Y_actual, Y_preds), sep="\n")

---MODEL WITH DEFAULT TOKENIZER---
Test Accuracy : 0.7687713310580204

Classification Report : 
              precision    recall  f1-score   support

         ham       0.77      0.97      0.86       857
        spam       0.73      0.22      0.34       315

    accuracy                           0.77      1172
   macro avg       0.75      0.60      0.60      1172
weighted avg       0.76      0.77      0.72      1172


Confusion Matrix : 
[[831  26]
 [245  70]]


In [None]:
# Seed torch before the model is created so that the weight initialisation,
# and with it the training run, is repeatable. The value matches the seed
# used for the data shuffle and split.
torch.manual_seed(20221122)

custom_model = RNNClassifier(custom_vocab)
optimizer = Adam(custom_model.parameters(), lr=learning_rate)

TrainModel(custom_model, loss_fn, optimizer, train_loader_cus, test_loader_cus, epochs)

100%|██████████| 469/469 [00:08<00:00, 55.64it/s]


Train Loss : 0.563
Valid Loss : 0.570
Valid Acc  : 0.734


100%|██████████| 469/469 [00:07<00:00, 59.37it/s]


Train Loss : 0.520
Valid Loss : 0.551
Valid Acc  : 0.737


100%|██████████| 469/469 [00:08<00:00, 53.91it/s]


Train Loss : 0.480
Valid Loss : 0.485
Valid Acc  : 0.788


100%|██████████| 469/469 [00:08<00:00, 55.80it/s]


Train Loss : 0.440
Valid Loss : 0.510
Valid Acc  : 0.778


100%|██████████| 469/469 [00:08<00:00, 57.65it/s]


Train Loss : 0.501
Valid Loss : 0.534
Valid Acc  : 0.759


100%|██████████| 469/469 [00:08<00:00, 56.77it/s]


Train Loss : 0.425
Valid Loss : 0.533
Valid Acc  : 0.771


100%|██████████| 469/469 [00:08<00:00, 56.57it/s]


Train Loss : 0.391
Valid Loss : 0.533
Valid Acc  : 0.799


100%|██████████| 469/469 [00:08<00:00, 53.68it/s]


Train Loss : 0.345
Valid Loss : 0.493
Valid Acc  : 0.798


100%|██████████| 469/469 [00:09<00:00, 48.78it/s]


Train Loss : 0.316
Valid Loss : 0.476
Valid Acc  : 0.803


100%|██████████| 469/469 [00:08<00:00, 56.04it/s]


Train Loss : 0.295
Valid Loss : 0.476
Valid Acc  : 0.824


100%|██████████| 469/469 [00:08<00:00, 56.74it/s]


Train Loss : 0.270
Valid Loss : 0.599
Valid Acc  : 0.777


100%|██████████| 469/469 [00:08<00:00, 56.40it/s]


Train Loss : 0.296
Valid Loss : 0.450
Valid Acc  : 0.828


100%|██████████| 469/469 [00:08<00:00, 53.85it/s]


Train Loss : 0.353
Valid Loss : 0.492
Valid Acc  : 0.816


100%|██████████| 469/469 [00:08<00:00, 55.80it/s]


Train Loss : 0.263
Valid Loss : 0.477
Valid Acc  : 0.837


100%|██████████| 469/469 [00:08<00:00, 53.65it/s]


Train Loss : 0.211
Valid Loss : 0.476
Valid Acc  : 0.832


In [None]:
# Evaluate the byte-pair model on the enron2 test split.
Y_actual, Y_preds = MakePredictions(custom_model, test_loader_cus)

print("---MODEL WITH CUSTOM TOKENIZER---")
print(f"Test Accuracy : {accuracy_score(Y_actual, Y_preds)}")
print(
    "\nClassification Report : ",
    classification_report(Y_actual, Y_preds, target_names=target_classes),
    sep="\n",
)
print("\nConfusion Matrix : ", confusion_matrix(Y_actual, Y_preds), sep="\n")

---MODEL WITH CUSTOM TOKENIZER---
Test Accuracy : 0.8327645051194539

Classification Report : 
              precision    recall  f1-score   support

         ham       0.89      0.88      0.88       857
        spam       0.68      0.70      0.69       315

    accuracy                           0.83      1172
   macro avg       0.79      0.79      0.79      1172
weighted avg       0.83      0.83      0.83      1172


Confusion Matrix : 
[[754 103]
 [ 93 222]]


# Sanity check with the enron1 set

In [None]:
# Load the enron1 set from the sibling directory of the enron2 data.
path1 = path.replace('enron2', 'enron1')

# Rows are gathered in a plain list and converted once at the end: growing a
# DataFrame row by row copies the whole frame on every call, and
# DataFrame.append is no longer available in pandas 2.x.
records1 = []
for label_dir, is_spam in (('ham', False), ('spam', True)):
    # Sorted so the row order does not depend on the directory listing order.
    for entry_name in sorted(os.listdir(path1 + label_dir)):
        file_name = path1 + label_dir + '/' + entry_name
        # `with` closes the handle even if reading fails.
        with open(file_name, mode='r', encoding='latin1') as f:
            txt = f.read().lower()
        records1.append({'Text': txt, 'IsSpam': is_spam})

df1 = pd.DataFrame(records1, columns=['Text', 'IsSpam'])
# Labels are stored as 0.0 / 1.0 floats, the representation the rest of the
# notebook works with.
df1['IsSpam'] = df1['IsSpam'].astype(float)

# Seeded like the enron2 shuffle so the displayed rows are repeatable.
df1 = df1.sample(frac=1, random_state=20221122).reset_index(drop=True)
df1.head()

Unnamed: 0,Text,IsSpam
0,"subject: hpl nom for december 22 , 2000\n( see...",0.0
1,subject: eol application id and password\ndarr...,0.0
2,subject: re : meter 984229 - roos common point...,0.0
3,subject: info\nneh b 27 q 71 tojlmjuob 2 wj jl...,1.0
4,subject: brand new teenager peeing\nyou don ' ...,1.0


In [None]:
# Tokenize enron1 with the tokenizers and vocabularies fitted on enron2;
# nothing is re-fitted here.
custom_tokenizer.tokenize_all(df1)
df1['DefaultTokenizer'] = df1['Text'].apply(tokenizer)

enron1_default, enron1_custom = (
    DataFromDF(df1, column) for column in ('DefaultTokenizer', 'CustomTokenizer')
)

df1.head()

Unnamed: 0,Text,IsSpam,CustomTokenizer,DefaultTokenizer
0,"subject: hpl nom for december 22 , 2000\n( see...",0.0,"ƈ hƥ nė Ğ ĩĢʰ Ж , ơ ( ʊ ΰ fˑ : hƥnl Ж2 . xls )...","[subject, hpl, nom, for, december, 22, ,, 2000..."
1,subject: eol application id and password\ndarr...,0.0,"ƈ eŃ űƥѱ Œ Ė pȡƒd dĎƐ , Ō Œ Ė pȡƒd Ğ eŃ űƥѱ Č ...","[subject, eol, application, id, and, password,..."
2,subject: re : meter 984229 - roos common point...,0.0,ƈ ă : Ҩą 9842͍ - ƜЕ ǓĂ Šłt - ̭ĩ zƁ ̸ vǡ : ƶ ŵ ...,"[subject, re, meter, 984229, -, roos, common, ..."
3,subject: info\nneh b 27 q 71 tojlmjuob 2 wj jl...,1.0,ƈ łǲ œh b ΅ q 71 ċjlmjuҝ 2 wj jlƜ 87 d 2 452 p...,"[subject, info, neh, b, 27, q, 71, tojlmjuob, ..."
4,subject: brand new teenager peeing\nyou don ' ...,1.0,ƈ ȫĖ Ǘ ŀćŅą Ơeƪ Ĝ ɠ ' t Ǫ Ĳ Ź Ĵģ . : ) iyʄubĂĄ...,"[subject, brand, new, teenager, peeing, you, d..."


In [None]:
# Batch the enron1 datasets with the same batch size as the enron2 loaders
# (n_batches is passed as batch_size).
enron1_loader_def = DataLoader(enron1_default, batch_size=n_batches)
enron1_loader_cus  = DataLoader(enron1_custom, batch_size=n_batches)

In [None]:
# Evaluate the word-level model on the whole enron1 set.
Y_actual, Y_preds = MakePredictions(default_model, enron1_loader_def)

# This cell evaluates default_model, so the header must name the default
# tokenizer (it previously said CUSTOM).
print("---MODEL WITH DEFAULT TOKENIZER---")
print("Test Accuracy : {}".format(accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds))

---MODEL WITH CUSTOM TOKENIZER---
Test Accuracy : 0.715583913379737

Classification Report : 
              precision    recall  f1-score   support

         ham       0.72      0.98      0.83      3672
        spam       0.59      0.07      0.12      1500

    accuracy                           0.72      5172
   macro avg       0.65      0.52      0.47      5172
weighted avg       0.68      0.72      0.62      5172


Confusion Matrix : 
[[3602   70]
 [1401   99]]


In [None]:
# Evaluate the byte-pair model on the whole enron1 set.
Y_actual, Y_preds = MakePredictions(custom_model, enron1_loader_cus)

print("---MODEL WITH CUSTOM TOKENIZER---")
print(f"Test Accuracy : {accuracy_score(Y_actual, Y_preds)}")
print(
    "\nClassification Report : ",
    classification_report(Y_actual, Y_preds, target_names=target_classes),
    sep="\n",
)
print("\nConfusion Matrix : ", confusion_matrix(Y_actual, Y_preds), sep="\n")

---MODEL WITH CUSTOM TOKENIZER---
Test Accuracy : 0.7097834493426141

Classification Report : 
              precision    recall  f1-score   support

         ham       0.82      0.76      0.79      3672
        spam       0.50      0.59      0.54      1500

    accuracy                           0.71      5172
   macro avg       0.66      0.67      0.66      5172
weighted avg       0.73      0.71      0.72      5172


Confusion Matrix : 
[[2785  887]
 [ 614  886]]
