### Name: Andrew Tran
### Completed Date: 12/20/2024
### `Proof-of-Concept (PoC)` for Simple `ML` Problem - Classic `Spam` or `Ham` Email Classification

In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import classification_report
import contractions
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from spacy.tokens.doc import Doc
from thinc.types import Floats1d
import spacy_universal_sentence_encoder
import pandas as pd
import numpy as np
import pickle
import time
from dataclasses import dataclass
from typing import NamedTuple, List, Dict, Tuple, Union, cast

In [2]:
@dataclass
class CONSTANTS(NamedTuple):
    NUM_KFOLD_SPLITS = 5
    RANDOM_STATE = 2024
    RANGE_ITERATION_CROSS_VALIDATE = range(1, NUM_KFOLD_SPLITS + 1)

In [3]:
SPACY_USE_NLP_MODEL = spacy_universal_sentence_encoder.load_model("en_use_md")
display(SPACY_USE_NLP_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


<spacy.lang.en.English at 0x3325a8a40>

In [4]:
df = pd.read_csv("../data/spam_ham_data.csv", delimiter="\t", header=None, names=["label", "text"])
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
df.columns = df.columns.str.capitalize()
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
y_columns = ["Label"]
print(f"Y Columns: {y_columns}")

X_columns = [col for col in df.columns if col not in set(y_columns)]
print(f"X Columns: {X_columns}")

Y Columns: ['Label']
X Columns: ['Text']


In [7]:
df["Label"].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
K_FOLD = StratifiedKFold(n_splits=CONSTANTS.NUM_KFOLD_SPLITS, shuffle=True, random_state=CONSTANTS.RANDOM_STATE)
K_FOLD

StratifiedKFold(n_splits=5, random_state=2024, shuffle=True)

In [9]:
TEST_SIZE = 1 / CONSTANTS.NUM_KFOLD_SPLITS
print(f"TEST_SIZE = {TEST_SIZE}")
SHUFFLE_SPLIT = ShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=CONSTANTS.RANDOM_STATE)
SHUFFLE_SPLIT

TEST_SIZE = 0.2


ShuffleSplit(n_splits=1, random_state=2024, test_size=0.2, train_size=None)

## Syntactic `Preprocessing`

In [10]:
# Downloading NLTK dependenices
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger_eng")

# NLTK stopwords vocabulary
nltk_en_stopwords = set(stopwords.words("english"))
print(f"STOPWORDS SET: {nltk_en_stopwords}")

nltk_word_lemmatizer = WordNetLemmatizer()
display(nltk_word_lemmatizer)

# Regex here from documentation catches all punctuation but avoids contractions (resource link provided above for reference)
nltk_regex_tokenizer = RegexpTokenizer(r'\w+')

def lemmatizer_convert_penn_treeback_parts_of_speech(pt_pos: str) -> str:
    if pt_pos.startswith("N"): # noun case
        return "n"
    elif pt_pos.startswith("J"): # adjective case
        return "a"
    elif pt_pos.startswith("V"): # verb case
        return "v"
    elif pt_pos.startswith("R"): # adverb base
        return "r"
    else: # All other cases - default to noun case
        return "n"

# Syntactically processing each passed-in column using Python NLTK (mostly) and Contraction Libaries
def nltk_process_column(dataframe: pd.DataFrame, col_name: str) -> pd.Series:
    # Assert statement to guard against potentially passing a column name in the dataframe which does not exist
    assert col_name in dataframe, f"Error: Column {col_name} not in the dataframe passed to this function!"
    
    nltk_processed_entries: List[str] = []
    for i, sentence_entry_text in enumerate(dataframe[col_name]):
        # print(f"[{i}] Old {col_name}: {sentence_entry_text}")

        # Resolving contractions by breaking them up if seen in each sentence
        sentence_entry_text_contractions_fixed: List[str] = []
        for word in str(sentence_entry_text).strip().split():
            contractions_fixed = cast(
                Union[Tuple[str, List[Tuple[str, str]]], str], contractions.fix(word)
            )
            # Flatten the tuple (Tuple[str, List[Tuple[str, str]]]) into a list of strings
            if isinstance(contractions_fixed, Tuple):
                (tuple_string, list_of_tuples) = contractions_fixed
                
                # Flatten the list of tuples into individual strings and append to the result
                flattened_list = [tuple_string]
                flattened_list.extend([item for sublist in list_of_tuples for item in sublist])
                sentence_entry_text_contractions_fixed.extend(flattened_list)
            else: # isinstance(contractions_fixed, str)
                sentence_entry_text_contractions_fixed.append(contractions_fixed)
        
        # Splitting each sentence into words
        sentence_entry_word_tokens: List[str] = nltk_regex_tokenizer.tokenize(" ".join(sentence_entry_text_contractions_fixed).rstrip())
        # print(f"Contraction + Tokenize: {sentence_entry_word_tokens}")
        
        # Removing stopwords and lowercasing all words in each sentence
        sentence_entry_stopwords_filtered_tokens: List[str] = []
        for word in sentence_entry_word_tokens:
            if word.lower() not in nltk_en_stopwords:
                sentence_entry_stopwords_filtered_tokens.append(word.lower())
        # print(f"Stopwords + Lowercase: {sentence_entry_stopwords_filtered_tokens}")
        
        # Parts-Of-Speech Tagging and Lemmatization of all words in each sentence
        sentence_entry_lemmatized_text: List[str] = []
        sentence_entry_pos_tagged_tokens = pos_tag(sentence_entry_stopwords_filtered_tokens)
        for (word, penn_treeback_part_of_speech) in sentence_entry_pos_tagged_tokens:
            lemmatizer_part_of_speech = lemmatizer_convert_penn_treeback_parts_of_speech(penn_treeback_part_of_speech)
            sentence_entry_lemmatized_text.append(nltk_word_lemmatizer.lemmatize(word, pos=lemmatizer_part_of_speech))
        
        # Joining all NLTK-processed words in each sentence back into one string    
        final_nltk_processed_sentence = " ".join(sentence_entry_lemmatized_text).rstrip()
        # print(f"[{i}] New {col_name}: {final_nltk_processed_sentence}")
        # print("-----------------")
        nltk_processed_entries.append(final_nltk_processed_sentence)
    return pd.Series(nltk_processed_entries)

# print("-----------------")
for col_name in X_columns:
    df[col_name] = nltk_process_column(df, col_name)
display(df)

STOPWORDS SET: {'now', 'an', 'why', 'other', 'not', 'didn', 'haven', 'a', 'to', 'y', 'so', "should've", 'have', 'yourselves', 'll', 'did', 'he', 'ourselves', 'about', 'what', "hasn't", "you've", 'above', 'up', 'once', 'your', 'i', 'theirs', "shouldn't", 've', 'for', 'she', "aren't", 'can', 'shan', 'no', 'own', "couldn't", 'but', 'been', 'down', 'or', 'by', 'd', 'hadn', 't', 'between', 'wouldn', "you're", 'being', 'at', 'his', 'most', 'mustn', 'couldn', "didn't", 'because', 'all', 'if', 'itself', 'any', 'are', 'them', "you'd", 'has', 'it', 'themselves', 'under', 'that', "wouldn't", "it's", 'in', 'needn', 'these', 'through', 'won', 'weren', 'is', 'him', 'those', 'such', 'myself', 'do', 'wasn', "don't", 'yourself', 'nor', "doesn't", "won't", 'we', 'where', 'each', 'which', 'himself', 'there', 'you', 'the', 're', 'while', 'shouldn', 'doing', 'as', 'below', 'hasn', 'hers', 'be', 'of', "wasn't", 'only', 'her', 'off', 'should', 'few', 'don', 'ma', 'over', 'very', "shan't", 'against', 'further

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/andrewtran/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


<WordNetLemmatizer>

Unnamed: 0,Label,Text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,dun say early hor c already say
4,ham,nah think go usf life around though
...,...,...
5567,spam,2nd time try 2 contact 750 pound prize 2 claim...
5568,ham,ü b go esplanade fr home
5569,ham,pity mood suggestion
5570,ham,guy bitch act like would interested buying som...


In [11]:
LABEL_ENCODER = LabelEncoder()
y_encoded = LABEL_ENCODER.fit_transform(df["Label"])
y_encoded

array([0, 0, 1, ..., 0, 0, 0])

In [12]:
label_encoding_mapping: Dict[int, str] = {index: label for index, label in enumerate(LABEL_ENCODER.classes_)}
print(f"Label Encoding Mapping: {label_encoding_mapping}")

Label Encoding Mapping: {0: 'ham', 1: 'spam'}


In [13]:
with open("../models/classification_label_encoder_mapping_results.pkl", "wb") as label_encoder_file:
    pickle.dump(label_encoding_mapping, label_encoder_file)

In [14]:
k_fold_splits_dict = {}
for i, (train, test) in enumerate(K_FOLD.split(X=pd.DataFrame(df["Text"]), y=df["Label"]), start=1):
    # df_train: pd.DataFrame = df.iloc[train]
    # df_test: pd.DataFrame = df.iloc[test]
    
    k_fold_splits_dict[f"X_train_Text_{i}"] = df["Text"][train]
    k_fold_splits_dict[f"X_test_Text_{i}"] = df["Text"][test]
    k_fold_splits_dict[f"y_train_{i}"] = y_encoded[train]
    k_fold_splits_dict[f"y_test_{i}"] = y_encoded[test]

## Hyperparameter Tuning for `Decision Tree` Classifier (using `GridSearchCV`)

In [15]:
def convert_column_to_spacy_use_sentence_embeddings(column: pd.Series) -> List[Floats1d]:
    docs: List[Doc] = []
    for (idx, item) in column.items():
        docs.append(SPACY_USE_NLP_MODEL(item))
    return list(map(lambda doc: doc.vector, docs))

In [16]:
# dt_classifier_param_grid = {
# 	"criterion": ["gini", "entropy", "log_loss"],
#     "splitter": ["best", "random"],
#     "max_depth": [None, 5, 10, 20, 50],
#     "min_samples_split": [2, 5, 10, 20],
#     "min_samples_leaf": [1, 2, 5, 10],
#     "max_features": [None, "sqrt", "log2"],
#     "max_leaf_nodes": [None, 5, 10, 20, 50],
#     "min_impurity_decrease": [0.0, 0.01, 0.1],
# }

# DT_CLASSIFIER_BASE_MODEL = DecisionTreeClassifier(random_state=CONSTANTS.RANDOM_STATE)
# display(DT_CLASSIFIER_BASE_MODEL)

# grid_search_dt_classifier = GridSearchCV(estimator=DT_CLASSIFIER_BASE_MODEL,
#                                         param_grid=dt_classifier_param_grid, 
#                                         cv=SHUFFLE_SPLIT,
#                                         n_jobs=-1,
#                                         verbose=1,
#                                         scoring="accuracy")
# grid_search_dt_classifier.fit(X=convert_column_to_spacy_use_sentence_embeddings(df["Text"]), y=y_encoded)

# gs_dt_classifier_best_params: Dict = grid_search_dt_classifier.best_params_
# print("Best DT Parameters:", gs_dt_classifier_best_params)
# print("Best DT Cross-Validation Accuracy:", grid_search_dt_classifier.best_score_)

In [17]:
# # Saving variables to a file
# with open("../models/dt_grid_search_best_params.pkl", "wb") as save_params_file:
#     pickle.dump(gs_dt_classifier_best_params, save_params_file) 

In [18]:
# # Loading variables from a file
# with open("../models/dt_grid_search_best_params.pkl", "rb") as load_params_file:
#     gs_dt_classifier_best_params: Dict = pickle.load(load_params_file)

In [19]:
# DT_CLASSIFIER = DecisionTreeClassifier(random_state=CONSTANTS.RANDOM_STATE,
#                                     criterion=gs_dt_classifier_best_params.get("criterion", "gini"),
#                                     splitter=gs_dt_classifier_best_params.get("splitter", "best"),
#                                     max_depth=gs_dt_classifier_best_params.get("max_depth", None),
#                                     min_samples_split=gs_dt_classifier_best_params.get("min_samples_split", 2),
#                                     min_samples_leaf=gs_dt_classifier_best_params.get("min_samples_leaf", 1),
#                                     max_features=gs_dt_classifier_best_params.get("max_features", None),
#                                     max_leaf_nodes=gs_dt_classifier_best_params.get("max_leaf_nodes", None),
#                                     min_impurity_decrease=gs_dt_classifier_best_params.get("min_impurity_decrease", 0.0))
# display(DT_CLASSIFIER)

In [20]:
DT_CLASSIFIER = DecisionTreeClassifier(random_state=CONSTANTS.RANDOM_STATE)
display(DT_CLASSIFIER)

## Machine Learning: `Model Performance Evaluation`

In [21]:
# Compute the performance of the Machine Learning classifier with this "Ham vs. Spam" Email Classification Dataset
def evaluate_classifier_performance(ml_model: DecisionTreeClassifier) -> None:
    print()
    print("-" * 100)
    print(f"Using {ml_model.__class__.__name__} for Performance Evaluation:")
    display(ml_model)
    
    for i in CONSTANTS.RANGE_ITERATION_CROSS_VALIDATE:
        start_time = time.time()
    
        X_train_Text_series: pd.Series = k_fold_splits_dict[f"X_train_Text_{i}"]
        text_train_docs: List[Doc] = []
        for (idx, item) in X_train_Text_series.items():
            text_train_docs.append(SPACY_USE_NLP_MODEL(item))
        X_train_Text_use: List[Floats1d] = list(map(lambda doc: doc.vector, text_train_docs))
    
        X_test_Text_series: pd.Series = k_fold_splits_dict[f"X_test_Text_{i}"]
        text_test_docs: List[Doc] = []
        for (idx, item) in X_test_Text_series.items():
            text_test_docs.append(SPACY_USE_NLP_MODEL(item))
        X_test_Test_use: List[Floats1d] = list(map(lambda doc: doc.vector, text_test_docs))
        
        y_train: pd.Series = k_fold_splits_dict[f"y_train_{i}"]
        y_true: pd.Series = k_fold_splits_dict[f"y_test_{i}"]
        
        X_train: np.ndarray = np.asarray(X_train_Text_use)
        X_test: np.ndarray = np.asarray(X_test_Test_use)
        
        ml_model.fit(X=X_train, y=y_train)
        y_pred: np.ndarray = ml_model.predict(X=X_test)
        
        print(f"Classification Report for Iteration {i}:")
        clf_report_dict = classification_report(y_true, y_pred, output_dict=True)
        print(type(clf_report_dict))
        print(clf_report_dict)
        display(clf_report_dict)
        
        end_time = time.time()
        time_taken = end_time - start_time
        print(f"Time Taken for Iteration #{i}: {time_taken:.2f} seconds", "" if time_taken < 60.00 else f"(or {time_taken // 60:.2f} minutes and {time_taken % 60:.2f} seconds)")
        print()
        
    return None

In [22]:
MODELS = [DT_CLASSIFIER]
for model in MODELS:
    evaluate_classifier_performance(model)


----------------------------------------------------------------------------------------------------
Using DecisionTreeClassifier for Performance Evaluation:


Classification Report for Iteration 1:
<class 'dict'>
{'0': {'precision': 0.974816369359916, 'recall': 0.9626943005181348, 'f1-score': 0.9687174139728885, 'support': 965.0}, '1': {'precision': 0.7777777777777778, 'recall': 0.84, 'f1-score': 0.8076923076923077, 'support': 150.0}, 'accuracy': 0.9461883408071748, 'macro avg': {'precision': 0.8762970735688469, 'recall': 0.9013471502590673, 'f1-score': 0.8882048608325981, 'support': 1115.0}, 'weighted avg': {'precision': 0.9483089355147853, 'recall': 0.9461883408071748, 'f1-score': 0.9470548436212408, 'support': 1115.0}}


{'0': {'precision': 0.974816369359916,
  'recall': 0.9626943005181348,
  'f1-score': 0.9687174139728885,
  'support': 965.0},
 '1': {'precision': 0.7777777777777778,
  'recall': 0.84,
  'f1-score': 0.8076923076923077,
  'support': 150.0},
 'accuracy': 0.9461883408071748,
 'macro avg': {'precision': 0.8762970735688469,
  'recall': 0.9013471502590673,
  'f1-score': 0.8882048608325981,
  'support': 1115.0},
 'weighted avg': {'precision': 0.9483089355147853,
  'recall': 0.9461883408071748,
  'f1-score': 0.9470548436212408,
  'support': 1115.0}}

Time Taken for Iteration #1: 12.43 seconds 

Classification Report for Iteration 2:
<class 'dict'>
{'0': {'precision': 0.974012474012474, 'recall': 0.9709844559585492, 'f1-score': 0.9724961079398028, 'support': 965.0}, '1': {'precision': 0.8169934640522876, 'recall': 0.8333333333333334, 'f1-score': 0.8250825082508251, 'support': 150.0}, 'accuracy': 0.9524663677130045, 'macro avg': {'precision': 0.8955029690323808, 'recall': 0.9021588946459413, 'f1-score': 0.898789308095314, 'support': 1115.0}, 'weighted avg': {'precision': 0.9528888403855431, 'recall': 0.9524663677130045, 'f1-score': 0.9526646819726757, 'support': 1115.0}}


{'0': {'precision': 0.974012474012474,
  'recall': 0.9709844559585492,
  'f1-score': 0.9724961079398028,
  'support': 965.0},
 '1': {'precision': 0.8169934640522876,
  'recall': 0.8333333333333334,
  'f1-score': 0.8250825082508251,
  'support': 150.0},
 'accuracy': 0.9524663677130045,
 'macro avg': {'precision': 0.8955029690323808,
  'recall': 0.9021588946459413,
  'f1-score': 0.898789308095314,
  'support': 1115.0},
 'weighted avg': {'precision': 0.9528888403855431,
  'recall': 0.9524663677130045,
  'f1-score': 0.9526646819726757,
  'support': 1115.0}}

Time Taken for Iteration #2: 4.66 seconds 

Classification Report for Iteration 3:
<class 'dict'>
{'0': {'precision': 0.9648397104446742, 'recall': 0.966839378238342, 'f1-score': 0.9658385093167702, 'support': 965.0}, '1': {'precision': 0.782312925170068, 'recall': 0.7718120805369127, 'f1-score': 0.777027027027027, 'support': 149.0}, 'accuracy': 0.940754039497307, 'macro avg': {'precision': 0.8735763178073711, 'recall': 0.8693257293876273, 'f1-score': 0.8714327681718985, 'support': 1114.0}, 'weighted avg': {'precision': 0.9404263432939415, 'recall': 0.940754039497307, 'f1-score': 0.9405845498363646, 'support': 1114.0}}


{'0': {'precision': 0.9648397104446742,
  'recall': 0.966839378238342,
  'f1-score': 0.9658385093167702,
  'support': 965.0},
 '1': {'precision': 0.782312925170068,
  'recall': 0.7718120805369127,
  'f1-score': 0.777027027027027,
  'support': 149.0},
 'accuracy': 0.940754039497307,
 'macro avg': {'precision': 0.8735763178073711,
  'recall': 0.8693257293876273,
  'f1-score': 0.8714327681718985,
  'support': 1114.0},
 'weighted avg': {'precision': 0.9404263432939415,
  'recall': 0.940754039497307,
  'f1-score': 0.9405845498363646,
  'support': 1114.0}}

Time Taken for Iteration #3: 4.57 seconds 

Classification Report for Iteration 4:
<class 'dict'>
{'0': {'precision': 0.9710444674250258, 'recall': 0.9730569948186528, 'f1-score': 0.9720496894409938, 'support': 965.0}, '1': {'precision': 0.8231292517006803, 'recall': 0.8120805369127517, 'f1-score': 0.8175675675675675, 'support': 149.0}, 'accuracy': 0.9515260323159784, 'macro avg': {'precision': 0.8970868595628531, 'recall': 0.8925687658657022, 'f1-score': 0.8948086285042807, 'support': 1114.0}, 'weighted avg': {'precision': 0.9512604753757193, 'recall': 0.9515260323159784, 'f1-score': 0.9513873589570258, 'support': 1114.0}}


{'0': {'precision': 0.9710444674250258,
  'recall': 0.9730569948186528,
  'f1-score': 0.9720496894409938,
  'support': 965.0},
 '1': {'precision': 0.8231292517006803,
  'recall': 0.8120805369127517,
  'f1-score': 0.8175675675675675,
  'support': 149.0},
 'accuracy': 0.9515260323159784,
 'macro avg': {'precision': 0.8970868595628531,
  'recall': 0.8925687658657022,
  'f1-score': 0.8948086285042807,
  'support': 1114.0},
 'weighted avg': {'precision': 0.9512604753757193,
  'recall': 0.9515260323159784,
  'f1-score': 0.9513873589570258,
  'support': 1114.0}}

Time Taken for Iteration #4: 4.22 seconds 

Classification Report for Iteration 5:
<class 'dict'>
{'0': {'precision': 0.9668737060041408, 'recall': 0.9678756476683937, 'f1-score': 0.9673744174003107, 'support': 965.0}, '1': {'precision': 0.7905405405405406, 'recall': 0.785234899328859, 'f1-score': 0.7878787878787878, 'support': 149.0}, 'accuracy': 0.9434470377019749, 'macro avg': {'precision': 0.8787071232723407, 'recall': 0.8765552734986264, 'f1-score': 0.8776266026395493, 'support': 1114.0}, 'weighted avg': {'precision': 0.943288749402636, 'recall': 0.9434470377019749, 'f1-score': 0.9433664741339669, 'support': 1114.0}}


{'0': {'precision': 0.9668737060041408,
  'recall': 0.9678756476683937,
  'f1-score': 0.9673744174003107,
  'support': 965.0},
 '1': {'precision': 0.7905405405405406,
  'recall': 0.785234899328859,
  'f1-score': 0.7878787878787878,
  'support': 149.0},
 'accuracy': 0.9434470377019749,
 'macro avg': {'precision': 0.8787071232723407,
  'recall': 0.8765552734986264,
  'f1-score': 0.8776266026395493,
  'support': 1114.0},
 'weighted avg': {'precision': 0.943288749402636,
  'recall': 0.9434470377019749,
  'f1-score': 0.9433664741339669,
  'support': 1114.0}}

Time Taken for Iteration #5: 3.93 seconds 



In [23]:
# Save the FInal Decision Tree Classifier ML Model to a file (for later usage - in production)
with open("../models/dt_classifier_model.pkl", "wb") as save_model_file:
    pickle.dump(DT_CLASSIFIER, save_model_file)