In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Environment Setup

Change the data location where indicated to locate the data files

In [None]:
# Colab setup if on colab.
# Only the import is guarded: a missing `google.colab` package means we are not
# on Colab and should fall back to a local path. Any other failure (for example
# the Drive mount itself failing) is allowed to surface instead of being
# silently swallowed and turned into a wrong data path.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: no mount prefix.
    mount_point = ""
else:
    mount_point = "/content/gdrive"
    drive.mount(mount_point)

# data location (change this to point at the folder holding the ham/ and spam/ directories)
path = mount_point + "/My Drive/CISC452/Project/enron2/"

Mounted at /content/gdrive


In [None]:
def _read_emails(folder, is_spam):
    """Read every file in `folder` and return one record per e-mail.

    Each record is a dict with the keys 'Text' (the file content decoded as
    latin1, newlines replaced by spaces, lower-cased) and 'IsSpam' (the label
    passed in by the caller).
    """
    records = []
    # Context managers close the directory iterator and each file even if a read fails.
    with os.scandir(folder) as entries:
        for entry in entries:
            if not entry.is_file():
                # Skip sub-directories; open() would fail on them.
                continue
            with open(entry.path, mode='r', encoding='latin1') as f:
                txt = f.read().replace('\n', ' ').lower()
            records.append({'Text': txt, 'IsSpam': is_spam})
    return records


# Collect all rows first and build the frame once. Growing a DataFrame row by
# row is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
records = _read_emails(path + 'ham', False) + _read_emails(path + 'spam', True)
df = pd.DataFrame(records, columns=['Text', 'IsSpam'])
# Keep the label as 0.0 / 1.0, matching what this cell produced previously.
df['IsSpam'] = df['IsSpam'].astype(float)

# Shuffle with a fixed seed so ham and spam are interleaved reproducibly.
df = df.sample(frac=1, random_state=20221122).reset_index(drop=True)
df.head()

Unnamed: 0,Text,IsSpam
0,"subject: at your serrvice hello , welcome to p...",1.0
1,subject: i want to mentor you - no charge this...,1.0
2,subject: mid summer flag special : free shippi...,1.0
3,subject: perfect visual solution for your busi...,1.0
4,subject: enroncredit . com - credit pricing me...,0.0


In [None]:
# Hold out 20% of the shuffled data for testing; the seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=20221122)
# Re-number rows from 0 in each split. Reassigning (rather than inplace=True)
# gives each name a fresh frame, so columns added later are not flagged by
# pandas as writes to a slice of `df`.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
test.head()

Unnamed: 0,Text,IsSpam
0,subject: wallstreet pulse good day to all brok...,1.0
1,subject: re : anita dupont resume oooopppss ! ...,0.0
2,"subject: hiring aram at a vp level rick , i wa...",0.0
3,subject: seeking your partnership dear partner...,1.0
4,"subject: wharton tiger team agenda friends , a...",0.0


# Tokenizer

Uses byte-pair encoding to tokenize text into sub-words, chosen by frequency, instead of full words. This allows the tokens to better represent the meanings of similar words. For example, 'email' might be tokenized as a full word, but 'emailing' might be tokenized into email-ing. 

The benefit of this approach is that it vastly decreases the vocabulary size, and thus the dimensionality, of the input data (on this dataset, this method provides a vocabulary of between 1000 and 1100 unique tokens), while still preserving more information than a naive character-delimited tokenization method would provide. 

The drawback is that rare words such as names might be tokenized rather meaninglessly (for example, deathridge as de-a-thr-id-ge). 

The Tokenizer class takes two arguments - the max vocabulary size `vocab_size` and the minimum frequency for byte-pair tokenization `min_freq`. Training ends when either the vocabulary size reaches `vocab_size`, or the most common byte-pair occurs no more than `min_freq` times.

In [None]:
class Tokenizer(object):
    """Byte-pair-encoding (BPE) sub-word tokenizer.

    `learn` repeatedly finds the most frequent pair of adjacent symbols in the
    training words and replaces it with a single, previously unused character.
    `tokenize` replays those replacements on new text, and `decode` undoes them.

    Attributes (all populated by `learn`):
        vocab_size: upper bound on the number of distinct symbols.
        min_freq:   a pair is merged only if it occurs more than this many times.
        encoding:   dict mapping a two-symbol string to its replacement
                    character, in the order the merges were learned.
        decoding:   dict mapping a replacement character back to its pair,
                    in reverse merge order.
        vocab:      set of all symbols (original characters plus merge tokens).
    """

    def __init__(self, vocab_size=2048, min_freq=500):
        """Store the training limits; a `vocab_size` below 1 falls back to 2048."""
        if vocab_size < 1:
            self.vocab_size = 2048
        else:
            self.vocab_size = vocab_size
        self.min_freq = min_freq
        self.encoding = {}
        self.decoding = {}
        self.vocab = set()

    # Generate an encoding mapping from a dataset
    def learn(self, train, text_column='Text'):
        """Learn the byte-pair merges from `train[text_column]`.

        Args:
            train: pandas DataFrame holding the training texts.
            text_column: name of the column containing the raw text.

        Prints the number of distinct words and the final vocabulary size.
        Any previously learned state is discarded, so calling `learn` again
        starts from scratch.
        """
        # Start from a clean slate. Merge tokens are numbered from a fixed
        # starting point, so leftovers from an earlier run would clash.
        self.encoding = {}
        self.decoding = {}
        self.vocab = set()

        # Generate a corpus of words (word -> occurrence count) from the training set
        corpus = {}
        for txt in train[text_column]:
            for word in txt.replace('\n', ' ').split(' '):
                # Append an End-Of-Word byte to each word. Choosing space to take
                # advantage of existing preprocessing. This allows the tokenizer to
                # distinguish between similar tokens like the 'ed' in 'learned' and
                # the 'ed' in 'education'.
                wordw = word + ' '
                corpus[wordw] = corpus.get(wordw, 0) + 1

        print("Corpus size: ", len(corpus))
        # Populate the vocabulary with initial characters
        for word in corpus:
            self.vocab.update(word)

        # First code point used for merge tokens. 256 is past everything latin1
        # can produce; if the training text contains higher code points, start
        # above those instead so a token never equals a character seen in training.
        unused_byte = max(256, max(map(ord, self.vocab), default=0) + 1)

        # Compress byte-pairs until the vocab size reaches the allowed limit
        while len(self.vocab) < self.vocab_size:
            # Counts the occurrences of all byte-pairs in the corpus
            byte_pairs = {}
            for word, count in corpus.items():
                for i in range(len(word) - 1):
                    pair = word[i:i+2]
                    byte_pairs[pair] = byte_pairs.get(pair, 0) + count

            # Nothing left to merge (empty corpus, or every word is one symbol)
            if not byte_pairs:
                break

            # Gets the most frequent byte pair
            most_frequent = max(byte_pairs, key=byte_pairs.get)
            # Ends the loop if the most frequent byte pair is rarer than allowed
            if byte_pairs[most_frequent] <= self.min_freq:
                break

            # Adds the most frequent byte pair to the encoding
            token = chr(unused_byte)
            self.encoding[most_frequent] = token
            self.vocab.add(token)

            # Applies the encoding to the corpus. A new dict is built because
            # changing the keys of a dict while iterating over it is unsafe
            # (RuntimeError or silently skipped entries, depending on version).
            merged = {}
            for word, count in corpus.items():
                new_word = word.replace(most_frequent, token)
                merged[new_word] = merged.get(new_word, 0) + count
            corpus = merged

            # Increment the ord of the unused byte
            unused_byte += 1

        # Generate the decoding mapping from the encoding mapping. Reversed so
        # that the latest merges are expanded first when decoding.
        self.decoding = dict((v, k) for k, v in reversed(list(self.encoding.items())))
        print("Vocab size: ", len(self.vocab))

    # Tokenize a single string of text
    def tokenize(self, txt):
        """Return `txt` with newlines turned into spaces and all learned merges applied in order."""
        result = txt.replace('\n', ' ')
        for pair, token in self.encoding.items():
            result = result.replace(pair, token)

        return result

    # Add a tokenized row to an existing pandas df
    def tokenize_all(self, df, text_column='Text'):
        """Add a 'Tokenized' column to `df` in place, computed from `df[text_column]`."""
        df['Tokenized'] = df[text_column].apply(self.tokenize)

    # Decode a single tokenized string (bars=False if you only want the raw decoded text)
    def decode(self, tokenized, bars=True):
        """Expand a tokenized string back into plain text.

        With `bars=True` a '|' is placed around every token before expansion, so
        the output shows where the token boundaries were. With `bars=False` the
        plain decoded text is returned.
        """
        # str.replace('', '|') inserts a bar before every character and at the end.
        txt = tokenized.replace('', '|') if bars else tokenized
        for token, pair in self.decoding.items():
            txt = txt.replace(token, pair)
        return txt

In [None]:
# Build a tokenizer capped at 2048 symbols that only merges pairs occurring
# more than 256 times, then learn the merges from the training split.
tokenizer = Tokenizer(vocab_size=2048, min_freq=256)
tokenizer.learn(train)

Corpus size:  36325
Vocab size:  1925


In [None]:
# Add a 'Tokenized' column to both splits (tokenize_all modifies the frame it is given),
# then preview the training frame.
tokenizer.tokenize_all(train)
tokenizer.tokenize_all(test)
train.head()

Unnamed: 0,Text,IsSpam,Tokenized
0,subject: template for pricing the right of fir...,0.0,ƶtƁǞТńֱĄƛҸĮϸĒfŧŇŪĹɽģĵݗĐħŉԆЭʔĝtƁǞТńֱĦѭāĈĄƛĦfĻĩՙ...
1,"subject: new resume dear vince , i am so grate...",0.0,ƶɬ˩mƛвܰcƛĐħƏȋƩīeћńŶ؍ūĈħږֵТļƾȝтƛƦŶɛČάlƛĈǈļŉԆǗȩŊ...
2,subject: easily lose weight / build muscle / r...,1.0,ƶַخŔsƛǟϳĚוćmŧȍƛĚĒʹƛŨĞǃ212ՍŝŞŭđnbŻĐcbāĐcnƨĐģƀϢơ...
3,subject: re : university of texas conference o...,0.0,ƶrƛėϨĮף͈ݮđʒǲݒĐʄʯԢȦܰcƛĐħƏձȝĄƛʫƛđ٩ťā˪ȫǂŴħťĜķ̷ĞĄƛ...
4,subject: re : replied resume vince / sally it ...,0.0,ƶrƛėĒpűĠ˩mƛܰcƛĚsɎšƜɟ͹źĄƛȃpűĠv̉đůĆƛĩƞĽƛתĈǻŉϞwƛǯ...


Visualization for the tokenizer's results + sanity check

In [None]:
# Show the test row labelled 1 in three forms: the raw text, its tokenized
# string, and the decoded string with '|' marking the token boundaries.
print('Raw:', test.loc[1, 'Text'])
print('Tokenized:', test.loc[1, 'Tokenized'])
print('Visualization:', tokenizer.decode(test.loc[1, 'Tokenized']))

Raw: subject: re : anita dupont resume oooopppss ! please disregard the e - mail below as i see now that shirley has already sent the job description to norma villarreal . my apologies . irma alvarez ext . 3 - 1543 - - - - - - - - - - - - - - - - - - - - - - forwarded by sheila walton / hou / ect on 08 / 07 / 2000 11 : 20 pm - - - - - - - - - - - - - - - - - - - - - - - - - - - from : sheila walton 08 / 07 / 2000 10 : 58 pm to : vince j kaminski / hou / ect @ ect cc : shirley crenshaw / hou / ect @ ect , norma villarreal / hou / ect @ ect subject : re : anita dupont resume mr . kaminski , in sheila walton ' s absence ( she is on vacation this week ) , please go ahead and have shirley crenshaw forward the job description that you would like posted to norma villarreal for handling . if you or shirley require any additional assistance before forwarding your material to norma , please feel free to contact me . thank you . irma alvarez ena hr coordinator ext . 3 - 1543 vince j kaminski 08 /