Question 2


In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import graphviz
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import export_graphviz

from collections import Counter
import time

# Load the cleaned dataset
file_path = "./sms_cleaned.csv"
df = pd.read_csv(file_path)

# Set a fixed random state for reproducibility
random_state = 42  # Fixed seed
print(f'Random State = {random_state}')

# pandas reads empty CSV fields back as NaN, so a message that was cleaned down
# to an empty string upstream would reach the vectorizer as a float NaN rather
# than text. Drop such rows (and rows without a label) into a separate frame so
# that `df` still holds exactly what was loaded from disk.
required_columns = ["cleaned_sms_message", "label"]
model_df = df.dropna(subset=required_columns)
n_dropped = len(df) - len(model_df)
if n_dropped:
    # Only reported when something was actually removed, so the cell output is
    # unchanged for a file with no missing values.
    print(f"Dropped {n_dropped} row(s) with a missing message or label")

# Train-Test Split (80:20) with stratified sampling
X = model_df["cleaned_sms_message"]
y = model_df["label"]

# stratify=y keeps the class mix of `y` in both partitions; the fixed seed makes
# the shuffle repeatable across runs.
train_corpus, test_corpus, train_label, test_label = train_test_split(
    np.array(X), np.array(y), test_size=0.20, stratify=y, shuffle=True, random_state=random_state
)

# Count how many examples of each label landed in the training and test splits
trd = dict(Counter(train_label))
tsd = dict(Counter(test_label))
print("\nClass Distribution in Training and Test Sets:")

# One row per label seen in the training split: [label, train count, test count]
distribution_rows = [[label_name, train_count, tsd[label_name]] for label_name, train_count in trd.items()]
distribution_table = pd.DataFrame(
    distribution_rows,
    columns=['Target Label', 'Train Count', 'Test Count'],
)
# Largest training class first
print(distribution_table.sort_values(by=['Train Count'], ascending=False))

# Ham-to-spam ratio in each split, rounded to two decimals for display
train_ham_spam_ratio = round(trd['ham'] / trd['spam'], 2)
test_ham_spam_ratio = round(tsd['ham'] / tsd['spam'], 2)
print("\nProportion of ham to spam in training:", train_ham_spam_ratio)
print("Proportion of ham to spam in testing:", test_ham_spam_ratio)

# Feature Engineering - Bag of Words (BoW)
# The vocabulary is learned from the training corpus only; the test corpus is
# then encoded against that same vocabulary, so both matrices share columns.
cv = CountVectorizer(max_df=1.0, min_df=0.0)
cv_train_features = cv.fit_transform(train_corpus)
cv_test_features = cv.transform(test_corpus)

train_shape = cv_train_features.shape
test_shape = cv_test_features.shape
print("\nBOW Model:> Train features shape:", train_shape, " Test features shape:", test_shape)

# One extracted term per feature column
vocabulary_terms = cv.get_feature_names_out()
print("Number of Terms Extracted (BoW):", len(vocabulary_terms))

# Train a Decision Tree Classifier
# Hyper-parameters gathered in one place; the seed is the shared random_state.
tree_params = {
    "criterion": "gini",
    "splitter": "best",
    "max_depth": 5,
    "random_state": random_state,
}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(cv_train_features, train_label)

# Perform Cross-Validation on Training Data
# cross_val_score fits fresh clones of `clf` on each fold, so the model fitted
# above is left as-is. With an integer `cv` and a classifier, scikit-learn uses
# stratified folds.
cv_scores = cross_val_score(clf, cv_train_features, train_label, cv=5)
mean_cv_accuracy = np.mean(cv_scores)
print("\nDecision Tree Cross-Validation Accuracy (5-Fold):", cv_scores)
print("Mean CV Accuracy:", mean_cv_accuracy)

# Evaluate on Test Set
y_pred = clf.predict(cv_test_features)

# All three metrics compare the held-out labels against the predictions above.
test_accuracy = accuracy_score(y_true=test_label, y_pred=y_pred)
# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=test_label, y_pred=y_pred)
# target_names are matched by position to the sorted label values, so this list
# assumes the labels are exactly 'ham' and 'spam'.
class_report = classification_report(
    y_true=test_label,
    y_pred=y_pred,
    target_names=['ham', 'spam'],
)

# Print each metric under its heading, in a fixed order
for heading, metric_value in (
    ("\nDecision Tree Test Accuracy:", test_accuracy),
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", class_report),
):
    print(heading, metric_value)

# Decision Tree Visualization
# Label split nodes with the vocabulary terms behind each feature column.
tree_feature_names = cv.get_feature_names_out()
dot_data = export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=tree_feature_names,
    class_names=["ham", "spam"],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Hand the DOT source to graphviz and render it to disk
graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # Saves as 'decision_tree.pdf'
print("\nDecision tree visualization saved as decision_tree.pdf")


Random State = 42

Class Distribution in Training and Test Sets:
  Target Label  Train Count  Test Count
0          ham         3847         963
1         spam          598         149

Proportion of ham to spam in training: 6.43
Proportion of ham to spam in testing: 6.46

BOW Model:> Train features shape: (4445, 4539)  Test features shape: (1112, 4539)
Number of Terms Extracted (BoW): 4539

Decision Tree Cross-Validation Accuracy (5-Fold): [0.91788526 0.9223847  0.92688414 0.92463442 0.92575928]
Mean CV Accuracy: 0.9235095613048369

Decision Tree Test Accuracy: 0.9262589928057554

Confusion Matrix:
 [[956   7]
 [ 75  74]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.93      0.99      0.96       963
        spam       0.91      0.50      0.64       149

    accuracy                           0.93      1112
   macro avg       0.92      0.74      0.80      1112
weighted avg       0.93      0.93      0.92      1112


Decision tree vi