Question 3 

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from gensim.models import Word2Vec
from scipy.sparse import hstack

# Load cleaned dataset
file_path = "sms_cleaned.csv"  # Ensure file is in the same directory
sms_df = pd.read_csv(file_path)

# Train-Test Split (80:20) with stratified sampling
X = sms_df["cleaned_sms_message"]
y = sms_df["label"]

random_state = 189357722
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=random_state
)

# Define vectorizers
vectorizers = {
    "Bag of Words": CountVectorizer(),
    "TF-IDF": TfidfVectorizer()
}

# Train Word2Vec model
w2v_model = Word2Vec(sentences=[text.split() for text in X_train], vector_size=100, window=5, min_count=1, workers=4)

# Function to get average Word2Vec embeddings for a sentence
def get_w2v_features(text, model, vector_size=100):
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(vector_size)

# Get Word2Vec features for training & testing sets
X_train_w2v = np.array([get_w2v_features(text, w2v_model) for text in X_train])
X_test_w2v = np.array([get_w2v_features(text, w2v_model) for text in X_test])

# Define classifiers
models = {
    "Decision Tree": DecisionTreeClassifier(criterion="gini", max_depth=5, splitter="best", random_state=random_state),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "SVM": SVC(kernel="linear", probability=True, class_weight={"ham": 1, "spam": 2}, random_state=random_state)
}

# Store results
results = []

# Train and evaluate models using Bag of Words & TF-IDF
for vec_name, vectorizer in vectorizers.items():
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    for model_name, model in models.items():
        # Train model
        model.fit(X_train_vec, y_train)

        # Cross-validation accuracy
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        cv_scores = cross_val_score(model, X_train_vec, y_train, cv=cv, scoring="accuracy")

        # Test set evaluation
        y_pred = model.predict(X_test_vec)
        test_accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred, target_names=["ham", "spam"], output_dict=True)

        # Extract recall for spam class
        spam_recall = class_report["spam"]["recall"]

        # Store results
        results.append({
            "Feature Model": vec_name,
            "Classifier": model_name,
            "Mean Train Accuracy (CV)": np.mean(cv_scores),
            "Test Accuracy": test_accuracy,
            "Spam Recall": spam_recall,
            "Confusion Matrix": conf_matrix
        })

# Train & Evaluate Word2Vec Models (Skip Naïve Bayes)
for model_name, model in models.items():
    if model_name == "Naive Bayes":
        print(f"Skipping {model_name} for Word2Vec (it does not support negative values).")
        continue  # Skip Naïve Bayes for Word2Vec
    
    model.fit(X_train_w2v, y_train)
    
    # Cross-validation accuracy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(model, X_train_w2v, y_train, cv=cv, scoring="accuracy")

    # Test set evaluation
    y_pred = model.predict(X_test_w2v)
    test_accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, target_names=["ham", "spam"], output_dict=True)

    # Extract recall for spam class
    spam_recall = class_report["spam"]["recall"]

    # Store results
    results.append({
        "Feature Model": "Word2Vec",
        "Classifier": model_name,
        "Mean Train Accuracy (CV)": np.mean(cv_scores),
        "Test Accuracy": test_accuracy,
        "Spam Recall": spam_recall,
        "Confusion Matrix": conf_matrix
    })

# Convert results to DataFrame for comparison
results_df = pd.DataFrame(results)

# Display best model based on test accuracy
best_model = results_df.sort_values(by="Test Accuracy", ascending=False).iloc[0]
print("\nBest Model Performance:")
print(best_model)

# Display all model results
print("\nAll Model Results:")
print(results_df)


Skipping Naive Bayes for Word2Vec (it does not support negative values).


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Best Model Performance:
Feature Model                              TF-IDF
Classifier                                    SVM
Mean Train Accuracy (CV)                 0.975253
Test Accuracy                            0.986511
Spam Recall                              0.926174
Confusion Matrix            [[959, 4], [11, 138]]
Name: 7, dtype: object

All Model Results:
   Feature Model           Classifier  Mean Train Accuracy (CV)  \
0   Bag of Words        Decision Tree                  0.920360   
1   Bag of Words          Naive Bayes                  0.970754   
2   Bag of Words  Logistic Regression                  0.977278   
3   Bag of Words                  SVM                  0.972553   
4         TF-IDF        Decision Tree                  0.930259   
5         TF-IDF          Naive Bayes                  0.956130   
6         TF-IDF  Logistic Regression                  0.951856   
7         TF-IDF                  SVM                  0.975253   
8       Word2Vec        Decis