### Name: Andrew Tran
### Completed Date: 12/20/2024
### `Proof-of-Concept (PoC)` for Simple `ML` Problem - Classic `Spam` or `Ham` Email Classification

In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import spacy_universal_sentence_encoder
import contractions
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import NamedTuple, List, Union

from src.utils.logging import LOGGER

In [2]:
# Emit a single INFO record as a quick check that the project LOGGER imported above works
LOGGER.info("Testing Logger")

[32m[2024-12-19 23:12:02] LN#1   : INFO     - Testing Logger[0m


In [3]:
class CONSTANTS:
    """Notebook-wide configuration values, read as class attributes (e.g. ``CONSTANTS.NUM_SPLITS``).

    This is deliberately a plain namespace class. The values carry no type
    annotations, so stacking ``@dataclass`` on a ``NamedTuple`` base never
    turned them into fields; it only produced an empty tuple subclass with
    dataclass machinery bolted on. Attribute access is unchanged.
    """

    # Number of folds used by the stratified k-fold splitter.
    NUM_SPLITS = 5
    # Seed passed to the fold splitter and the classifier.
    RANDOM_STATE = 2024
    # 1-based fold numbers: 1..NUM_SPLITS inclusive.
    RANGE_ITERATION_CROSS_VALIDATE = range(1, NUM_SPLITS + 1)

In [5]:
# Load the tab-separated corpus; the file has no header row, so the two columns are named here.
# NOTE(review): pandas' default quote handling is in effect, so a stray double quote inside a
# message could merge neighbouring rows - confirm the row count against the raw file.
df = pd.read_csv(
    "../data/spam_ham_data.csv",
    sep="\t",
    header=None,
    names=["label", "text"],
)
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Capitalise the column labels ("label" -> "Label", "text" -> "Text")
df.columns = df.columns.map(str.capitalize)
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
# Target column(s)
y_columns = ["Label"]
# Every column that is not a target is treated as a feature
X_columns = [column for column in df.columns if column not in y_columns]

print(f"Y Columns: {y_columns}")
print(f"X Columns: {X_columns}")

Y Columns: ['Label']
X Columns: ['Text']


In [8]:
# for i, (text, label) in enumerate(zip(df["Text"], df["Label"]), start=1):
#     print(f"Row - {i}; Label: {label}; Text: \"{text}\"")

In [9]:
# Inspect the class balance of the target column; the data is heavily skewed towards "ham"
df["Label"].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
# Stratified, shuffled k-fold splitter; fold count and seed come from CONSTANTS
K_FOLD = StratifiedKFold(n_splits=CONSTANTS.NUM_SPLITS, shuffle=True, random_state=CONSTANTS.RANDOM_STATE)
K_FOLD

StratifiedKFold(n_splits=5, random_state=2024, shuffle=True)

In [11]:
# Fetch the NLTK resources used below: the stopword list, WordNet, and the English POS-tagger model
for nltk_resource in ("stopwords", "wordnet", "averaged_perceptron_tagger_eng"):
    nltk.download(nltk_resource)

# English stopword vocabulary, held as a set for fast membership tests
nltk_en_stopwords = set(stopwords.words("english"))
print(f"STOPWORDS SET: {nltk_en_stopwords}")

nltk_word_lemmatizer = WordNetLemmatizer()
display(nltk_word_lemmatizer)

# r'\w+' keeps only runs of word characters, so punctuation (apostrophes included) never ends up
# inside a token; contractions are expanded before tokenizing, so none are split apart here
nltk_regex_tokenizer = RegexpTokenizer(r'\w+')

def lemmatizer_convert_penn_treeback_parts_of_speech(pt_pos: str) -> str:
    """Translate a Penn Treebank tag into the one-letter POS code handed to the lemmatizer.

    Tags are matched on their leading letter: ``J*`` -> ``"a"`` (adjective),
    ``V*`` -> ``"v"`` (verb), ``R*`` -> ``"r"`` (adverb). Every other tag,
    nouns included, yields ``"n"``.
    """
    prefix_to_pos_code = (("J", "a"), ("V", "v"), ("R", "r"))
    for tag_prefix, pos_code in prefix_to_pos_code:
        if pt_pos.startswith(tag_prefix):
            return pos_code
    # Noun tags and all unrecognised tags share the noun fallback
    return "n"

# Syntactically processing each passed-in column using Python NLTK (mostly) and the Contractions library
def _nltk_process_text(raw_text: object) -> str:
    """Normalise one text entry: expand contractions, tokenize, drop stopwords, lowercase, lemmatize."""
    # Expand contractions word by word. NOTE: the entry is passed through str(), so a missing
    # value (NaN) is processed as the literal text "nan".
    expanded_words: List[str] = []
    for word in str(raw_text).strip().split():
        fixed: Union[List[str], str] = contractions.fix(word)
        if isinstance(fixed, list):
            expanded_words.extend(fixed)
        else:
            expanded_words.append(fixed)

    # Split the expanded sentence into word tokens
    word_tokens: List[str] = nltk_regex_tokenizer.tokenize(" ".join(expanded_words))

    # Lowercase every token and discard stopwords
    lowered_tokens = (token.lower() for token in word_tokens)
    content_tokens: List[str] = [token for token in lowered_tokens if token not in nltk_en_stopwords]

    # POS-tag the remaining tokens so each one is lemmatized with the matching part of speech
    lemmas: List[str] = [
        nltk_word_lemmatizer.lemmatize(
            token, pos=lemmatizer_convert_penn_treeback_parts_of_speech(penn_treebank_tag)
        )
        for token, penn_treebank_tag in pos_tag(content_tokens)
    ]
    return " ".join(lemmas).rstrip()


def nltk_process_column(dataframe: pd.DataFrame, col_name: str) -> pd.Series:
    """Return the NLTK-normalised version of ``dataframe[col_name]``.

    Each entry has its contractions expanded, is tokenized, stripped of English
    stopwords, lowercased and lemmatized, then joined back into one string.

    Args:
        dataframe: Frame holding the text column.
        col_name: Name of the column to process.

    Returns:
        A Series of processed strings that carries ``dataframe``'s own index, so
        assigning it back (``df[col] = nltk_process_column(df, col)``) lines up
        row-for-row even when the frame does not have a default 0..n-1 index.

    Raises:
        AssertionError: If ``col_name`` is not a column of ``dataframe``.
    """
    # Guard against a column name that does not exist in the dataframe
    assert col_name in dataframe, f"Error: Column {col_name} not in the dataframe passed to this function!"

    nltk_processed_entries: List[str] = [_nltk_process_text(entry) for entry in dataframe[col_name]]
    # Reuse the caller's index: a fresh RangeIndex would be aligned by label on assignment and
    # produce NaN or shifted rows for filtered or re-indexed frames.
    return pd.Series(nltk_processed_entries, index=dataframe.index)

# print("-----------------")
# Replace every feature column with its NLTK-normalised counterpart
for col_name in X_columns:
    df[col_name] = nltk_process_column(dataframe=df, col_name=col_name)
df

STOPWORDS SET: {'is', 'if', 'up', 'because', 'until', 'didn', 'y', 'wouldn', "don't", "hadn't", 'ourselves', 'couldn', "wouldn't", 'hadn', 'has', 'they', 'whom', 'yours', 'doing', 'at', 'again', 'such', 'themselves', 'by', 'more', 'd', 'with', 'where', 'are', 've', 'most', 'you', 'those', "needn't", 'yourself', 'nor', 'through', 'ain', "she's", 'but', 'off', 'will', 'than', 're', 'what', 'them', 'himself', 'there', 'for', 'our', 'all', "isn't", 'mightn', 'his', 'during', 'him', 'm', 'over', 'we', 'her', 'was', 'll', "weren't", 'very', 'which', 'my', 'before', 'hasn', 'below', 'or', "didn't", 'other', 'of', 'yourselves', 'it', 'been', "you'd", 'won', "that'll", 'hers', 'can', 'aren', "shan't", 'as', 'ma', 'she', 'having', 'down', 'after', "won't", 'am', 'theirs', 'about', 'had', 'both', 'these', 'to', 'only', 'that', "you'll", 'each', "haven't", "mightn't", 'mustn', 'he', "aren't", 'haven', 'and', 'own', 'further', 'do', 'a', 'their', 'o', "it's", 'herself', 'above', "mustn't", 'shan', 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


<WordNetLemmatizer>

Unnamed: 0,Label,Text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,dun say early hor c already say
4,ham,nah think go usf life around though
...,...,...
5567,spam,2nd time try 2 contact 750 pound prize 2 claim...
5568,ham,ü b go esplanade fr home
5569,ham,pity mood suggestion
5570,ham,guy bitch act like would interested buying som...


In [12]:
# Encode the string labels as integers ("ham" -> 0, "spam" -> 1)
LABEL_ENCODER = LabelEncoder().fit(df["Label"])
y_encoded = LABEL_ENCODER.transform(df["Label"])
y_encoded

array([0, 0, 1, ..., 0, 0, 0])

In [13]:
# Materialise every stratified fold up front, keyed by split role and 1-based fold number
k_fold_splits_dict = {}
fold_index_pairs = K_FOLD.split(X=pd.DataFrame(df["Text"]), y=df["Label"])
for i, (train_index, test_index) in enumerate(fold_index_pairs, start=1):
    df_train = df.iloc[train_index]
    df_test = df.iloc[test_index]

    k_fold_splits_dict.update({
        # Features: everything except the target column(s)
        f"X_train_Text_{i}": df_train.drop(columns=y_columns),
        f"X_test_Text_{i}": df_test.drop(columns=y_columns),
        # Targets: the integer-encoded labels at the same row positions
        f"y_train_{i}": y_encoded[train_index],
        f"y_test_{i}": y_encoded[test_index],
    })

In [14]:
# Decision-tree classifier; only the seed is set, every other hyperparameter keeps its default
DT_CLASSIFIER = DecisionTreeClassifier(random_state=CONSTANTS.RANDOM_STATE)
DT_CLASSIFIER

In [15]:
# List of estimators; currently holds only the decision tree
MODELS = [DT_CLASSIFIER]
MODELS

[DecisionTreeClassifier(random_state=2024)]