### Name: Andrew Tran
### Completed Date: 12/20/2024
### `Proof-of-Concept (PoC)` for Simple `ML` Problem - Classic `Spam` or `Ham` Email Classification

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import spacy_universal_sentence_encoder
import contractions
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import NamedTuple, List, Union

from src.utils.logging import LOGGER

In [2]:
# Smoke test: emit a single INFO record to confirm the project logger is importable and working.
LOGGER.info("Testing Logger")

[32m[2024-12-19 22:10:41] INFO     Testing Logger[0m


In [3]:
class CONSTANTS:
    """Namespace for notebook-wide configuration constants.

    Values are read directly off the class (e.g. ``CONSTANTS.NUM_SPLITS``);
    the class is not meant to be instantiated. It is deliberately a plain
    class: none of the attributes are annotated, so neither ``@dataclass``
    nor ``NamedTuple`` would register them as fields, and combining the two
    only obscures what the class is for.
    """

    # Number of folds used by the stratified k-fold splitter.
    NUM_SPLITS = 5
    # Seed passed to every randomised component so that runs are repeatable.
    RANDOM_STATE = 2024
    # 1-based fold numbers: 1..NUM_SPLITS inclusive.
    RANGE_ITERATION_CROSS_VALIDATE = range(1, NUM_SPLITS + 1)

In [4]:
# Read the raw spam/ham e-mail dataset from its relative path and preview it.
RAW_DATA_CSV_PATH = "../data/spam_emails_data.csv"
df = pd.read_csv(RAW_DATA_CSV_PATH)
df

Unnamed: 0,label,text
0,Spam,viiiiiiagraaaa\nonly for the ones that want to...
1,Ham,got ice thought look az original message ice o...
2,Spam,yo ur wom an ne eds an escapenumber in ch ma n...
3,Spam,start increasing your odds of success & live s...
4,Ham,author jra date escapenumber escapenumber esca...
...,...,...
193847,Ham,on escapenumber escapenumber escapenumber rob ...
193848,Spam,we have everything you need escapelong cialesc...
193849,Ham,hi quick question say i have a date variable i...
193850,Spam,thank you for your loan request which we recie...


In [5]:
# Capitalise every column header (e.g. "label" -> "Label", "text" -> "Text").
df.columns = [column_name.capitalize() for column_name in df.columns]
df

Unnamed: 0,Label,Text
0,Spam,viiiiiiagraaaa\nonly for the ones that want to...
1,Ham,got ice thought look az original message ice o...
2,Spam,yo ur wom an ne eds an escapenumber in ch ma n...
3,Spam,start increasing your odds of success & live s...
4,Ham,author jra date escapenumber escapenumber esca...
...,...,...
193847,Ham,on escapenumber escapenumber escapenumber rob ...
193848,Spam,we have everything you need escapelong cialesc...
193849,Ham,hi quick question say i have a date variable i...
193850,Spam,thank you for your loan request which we recie...


In [6]:
# Target column(s) the model has to predict.
y_columns = ["Label"]
print(f"Y Columns: {y_columns}")

# Every column that is not a target is treated as an input feature.
target_column_names = set(y_columns)
X_columns = [col for col in df.columns if col not in target_column_names]
print(f"X Columns: {X_columns}")

Y Columns: ['Label']
X Columns: ['Text']


In [7]:
# for i, (text, label) in enumerate(zip(df["Text"], df["Label"]), start=1):
#     print(f"Row - {i}; Label: {label}; Text: \"{text}\"")

In [8]:
# Class-balance check: count how many rows carry each label value.
df["Label"].value_counts()

Label
Ham     102160
Spam     91692
Name: count, dtype: int64

In [9]:
# Stratified k-fold splitter; shuffling is seeded so the folds are the same on every run.
K_FOLD = StratifiedKFold(
    n_splits=CONSTANTS.NUM_SPLITS,
    shuffle=True,
    random_state=CONSTANTS.RANDOM_STATE,
)
K_FOLD

StratifiedKFold(n_splits=5, random_state=2024, shuffle=True)

In [10]:
# Download the NLTK resources this notebook relies on: the stopword lists,
# the WordNet corpus and the English averaged-perceptron POS tagger.
for nltk_resource in ("stopwords", "wordnet", "averaged_perceptron_tagger_eng"):
    nltk.download(nltk_resource)

# English stopword vocabulary, kept as a set for fast membership tests
nltk_en_stopwords = set(stopwords.words("english"))
print(f"STOPWORDS SET: {nltk_en_stopwords}")

nltk_word_lemmatizer = WordNetLemmatizer()
display(nltk_word_lemmatizer)

# Tokens are maximal runs of word characters (\w+), so punctuation never ends up in a token.
# Contractions are expanded in nltk_process_column before this tokenizer is applied.
nltk_regex_tokenizer = RegexpTokenizer(r'\w+')

def lemmatizer_convert_penn_treeback_parts_of_speech(pt_pos: str) -> str:
    """Translate a Penn Treebank POS tag into a one-letter lemmatizer POS code.

    Only the first character of the tag is inspected:
    ``J*`` -> ``"a"`` (adjective), ``V*`` -> ``"v"`` (verb),
    ``R*`` -> ``"r"`` (adverb). Noun tags (``N*``) and every other tag,
    including the empty string, map to ``"n"`` (noun).

    Args:
        pt_pos: Penn Treebank tag, e.g. ``"NN"``, ``"VBD"``, ``"JJ"``.

    Returns:
        One of ``"n"``, ``"a"``, ``"v"`` or ``"r"``.
    """
    # Nouns are the fallback, so they need no explicit entry in the table.
    first_letter_to_pos = {"J": "a", "V": "v", "R": "r"}
    return first_letter_to_pos.get(pt_pos[:1], "n")

# Syntactically processing each passed-in column using Python NLTK (mostly) and the Contractions library
def _nltk_process_text(raw_text: str) -> str:
    """Run one text entry through the contraction/tokenize/stopword/lemmatize pipeline."""
    # Expand contractions one whitespace-separated word at a time.
    expanded_words: List[str] = []
    for word in raw_text.strip().split():
        contractions_fixed: Union[List[str], str] = contractions.fix(word)
        if isinstance(contractions_fixed, list):
            expanded_words.extend(contractions_fixed)
        else:
            expanded_words.append(contractions_fixed)

    # Split into word tokens; the \w+ tokenizer discards punctuation.
    word_tokens: List[str] = nltk_regex_tokenizer.tokenize(" ".join(expanded_words))

    # Lowercase every token and drop the stopwords.
    lowercase_tokens = [token.lower() for token in word_tokens]
    kept_tokens: List[str] = [token for token in lowercase_tokens if token not in nltk_en_stopwords]

    # POS-tag the surviving tokens so that each one is lemmatized with the matching part of speech.
    lemmas: List[str] = [
        nltk_word_lemmatizer.lemmatize(
            token,
            pos=lemmatizer_convert_penn_treeback_parts_of_speech(penn_treebank_tag),
        )
        for token, penn_treebank_tag in pos_tag(kept_tokens)
    ]
    return " ".join(lemmas)


def nltk_process_column(dataframe: pd.DataFrame, col_name: str) -> pd.Series:
    """Clean every entry of a text column and return the cleaned column.

    Each entry has its contractions expanded, is tokenized with
    ``nltk_regex_tokenizer``, lowercased, stripped of the stopwords in
    ``nltk_en_stopwords``, POS-tagged and lemmatized with
    ``nltk_word_lemmatizer``; the resulting lemmas are joined with single
    spaces.

    Args:
        dataframe: Frame holding the column to process. It is not modified.
        col_name: Name of the text column inside ``dataframe``.

    Returns:
        A Series of processed strings, one per row, carrying the same index
        as ``dataframe`` so that assigning it back to the frame lines up
        row-for-row even when the index is not the default 0..n-1 range.

    Raises:
        AssertionError: If ``col_name`` is not a column of ``dataframe``.
    """
    # Assert statement to guard against potentially passing a column name in the dataframe which does not exist
    assert col_name in dataframe, f"Error: Column {col_name} not in the dataframe passed to this function!"

    nltk_processed_entries: List[str] = []
    for sentence_entry_text in dataframe[col_name]:
        if isinstance(sentence_entry_text, str):
            raw_text = sentence_entry_text
        elif pd.api.types.is_scalar(sentence_entry_text) and pd.isna(sentence_entry_text):
            # Missing text (None / NaN) becomes an empty entry; str() would otherwise
            # turn it into the literal word "nan" / "none" and feed that to the model.
            raw_text = ""
        else:
            raw_text = str(sentence_entry_text)
        nltk_processed_entries.append(_nltk_process_text(raw_text))

    # Reuse the caller's index: a fresh RangeIndex would misalign on `df[col] = result`.
    return pd.Series(nltk_processed_entries, index=dataframe.index)

# print("-----------------")
# Replace every feature column with its NLTK-processed version.
# NOTE: this overwrites the raw text inside `df`, so re-running this cell
# feeds the already-processed text through the pipeline a second time.
for col_name in X_columns:
    df[col_name] = nltk_process_column(df, col_name)
df

STOPWORDS SET: {'above', 'i', "wouldn't", "you'd", "you've", 'while', 'shan', 't', 'again', 'which', 'on', 'can', 'in', "doesn't", 've', 'ourselves', 'have', 'against', 'been', 'our', "didn't", 'that', 'isn', 'couldn', 'should', 'weren', "you'll", "shan't", 'herself', 'this', 'ours', 'because', 'itself', 'to', 'few', 'why', 'haven', 'both', 'being', 'over', 'those', "mightn't", 'it', 'themselves', 'from', 'me', 'do', "don't", "couldn't", 'any', 'mightn', 'by', 'off', "you're", 'before', 'below', 'but', 's', 'will', 'you', 'yourselves', 'same', 'of', 'after', 'o', 'her', 'too', 'its', 'has', 'be', 'for', 'ma', 'whom', "mustn't", 'once', "needn't", "wasn't", 'theirs', 'about', 'under', 'then', 'more', 'shouldn', 'when', 'through', 'won', "haven't", 'm', 'now', 'nor', 'wasn', "should've", 'we', 'his', 're', 'what', 'at', 'yours', "hadn't", "shouldn't", 'here', 'until', 'ain', 'himself', 'hers', 'y', 'd', 'them', 'as', "hasn't", 'if', 'did', 'or', 'how', 'they', 'during', 'a', 'out', 'does

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


<WordNetLemmatizer>

Unnamed: 0,Label,Text
0,Spam,viiiiiiagraaaa one want make scream prodigy sc...
1,Ham,get ice think look az original message ice ope...
2,Spam,yo wom ne ed escapenumber ch n b e th n f h er...
3,Spam,start increase odds success live sexually heal...
4,Ham,author jra date escapenumber escapenumber esca...
...,...,...
193847,Ham,escapenumber escapenumber escapenumber rob dix...
193848,Spam,everything need escapelong cialescapenumbers s...
193849,Ham,hi quick question say date variable data frame...
193850,Spam,thank loan request recieved escapenumber escap...


In [13]:
# Encode the string labels as integers.
# In the output shown for this cell, row 0 (Spam) is encoded as 1 and row 1 (Ham) as 0.
LABEL_ENCODER = LabelEncoder()
y_encoded = LABEL_ENCODER.fit_transform(df["Label"])
y_encoded

array([1, 0, 1, ..., 0, 1, 0])

In [14]:
# Materialise every cross-validation fold up front. Keys follow the pattern
# "<X|y>_<train|test>[_Text]_<fold number>", with fold numbers starting at 1.
k_fold_splits_dict = {}
fold_index_pairs = K_FOLD.split(X=pd.DataFrame(df["Text"]), y=df["Label"])
for fold_number, (train_rows, test_rows) in enumerate(fold_index_pairs, start=1):
    df_train, df_test = df.iloc[train_rows], df.iloc[test_rows]

    k_fold_splits_dict.update({
        # Features: the fold's rows with the target column(s) removed.
        f"X_train_Text_{fold_number}": df_train.drop(columns=y_columns),
        f"X_test_Text_{fold_number}": df_test.drop(columns=y_columns),
        # Targets: the integer-encoded labels at the same row positions.
        f"y_train_{fold_number}": y_encoded[train_rows],
        f"y_test_{fold_number}": y_encoded[test_rows],
    })

In [15]:
# Decision-tree classifier, seeded with the shared random state so repeated runs build the same tree.
DT_CLASSIFIER = DecisionTreeClassifier(random_state=CONSTANTS.RANDOM_STATE)
DT_CLASSIFIER

In [None]:
# Models gathered in one list; at this point it holds only the decision tree.
MODELS = [DT_CLASSIFIER]
MODELS