In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sklearn.metrics as metrics

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from itertools import product

from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [2]:
# Load the dataset
import requests
from io import StringIO

# Choosing to get the data as a text buffer
url = 'https://storm.cis.fordham.edu/~gweiss/classes/cisc5660/data/sms-spam-dataset.csv'
response = requests.get(url)
data = pd.read_csv(StringIO(response.text))

In [3]:
# Split the data into training and testing sets
X = data['Text']
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Vectorization and Preprocessing Parameters (Some taken from homework file)
vectorizers = {'Counts': CountVectorizer(min_df=5, lowercase=True, stop_words='english', max_features=2000), 'TFIDF': TfidfVectorizer()}

In [5]:
# Try different vectorization parameters
vectorizer_params = {
    'min_df': [1, 2],
    'lowercase': [True, False],
    'ngram_range': [(1, 1), (1, 2)],
    'max_features': [500, 1000, None]
}

In [6]:
# Try different algorithms
algorithms = {
    'MultinomialNB': MultinomialNB(),
    'ComplementNB': ComplementNB(),
}

In [7]:
# Try different class imbalance strategies
balancing_strategies = {
    'Unbal': None,
    'Random Oversampling': RandomOverSampler(),
    'SMOTE': SMOTE()
}

In [8]:
# create an empty list to store results
results = []

In [9]:
# Map variables for ham and spam
label_map = {'ham': 0, 'spam': 1}

# Loop over all possible combinations of vectorizers, vectorizer parameters, algorithms, and balancing strategies
for (vect_name, vectorizer), (param_min_df, param_lowercase, param_ngram_range, param_max_features), (algorithm_name, algorithm), (bal_name, bal_strategy) in product(
        vectorizers.items(),
        product(vectorizer_params['min_df'], vectorizer_params['lowercase'], vectorizer_params['ngram_range'], vectorizer_params['max_features']),
        algorithms.items(),
        balancing_strategies.items()
    ):

    # Set max_features parameter in vectorizer if it's not None
    if param_max_features is not None:
        vectorizer.set_params(max_features=param_max_features)
    # Transform training and test data using the vectorizer
    X_train_vect = vectorizer.fit_transform(X_train)
    X_test_vect = vectorizer.transform(X_test)
    # If no balancing strategy is specified, use the original training data
    if bal_strategy is None:
        X_train_bal = X_train_vect
        y_train_bal = y_train
    # Otherwise, use the specified balancing strategy to balance the training data
    else:
        X_train_bal, y_train_bal = bal_strategy.fit_resample(X_train_vect, y_train)

    # Train the algorithm using the balanced training data
    algorithm.fit(X_train_bal, y_train_bal)
    # Predict the labels for the test data
    y_pred = algorithm.predict(X_test_vect)
    # Encode the true and predicted labels as 0 (ham) or 1 (spam)
    y_test_enc = np.array([label_map[label] for label in y_test])
    y_pred_enc = np.array([label_map[label] for label in y_pred])

    # Compute evaluation metrics for the predictions
    accuracy = accuracy_score(y_test_enc, y_pred_enc)
    auc = roc_auc_score(y_test_enc, y_pred_enc)
    f1 = f1_score(y_test_enc, y_pred_enc, pos_label=1)
    precision = precision_score(y_test_enc, y_pred_enc, pos_label=1, zero_division=1)
    recall = recall_score(y_test_enc, y_pred_enc, pos_label=1)
    conf_matrix = confusion_matrix(y_test_enc, y_pred_enc)

    # Append the results to the list of results as a dictionary
    results.append({
        'VType': vect_name,
        'min_df': param_min_df,
        'lowercase': param_lowercase,
        'ngram_range': param_ngram_range,
        'NumFeats': str(param_max_features) if param_max_features is not None else 'None',
        'Algorithm': algorithm_name,
        'BalStrat': bal_name,
        'Accuracy': accuracy,
        'AUC': auc,
        'F1-score': f1,
        'Precision': precision,
        'Recall': recall,
        'Confusion Matrix': conf_matrix
    })

# Convert the list of results to a pandas DataFrame and sort by F1-score
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='F1-score', ascending=False)
results_df.reset_index(drop=True, inplace=True)

# Create a table to insert results
styled_results = results_df.style.set_properties(**{'text-align': 'center'})\
                .set_table_styles([{'selector': 'th',
                                    'props': [('border', '1px solid black'),
                                              ('background-color', '#f2f2f2'),
                                              ('font-weight', 'bold')]},
                                   {'selector': 'td',
                                    'props': [('border', '1px solid black')]}])

display(styled_results)

Unnamed: 0,VType,min_df,lowercase,ngram_range,NumFeats,Algorithm,BalStrat,Accuracy,AUC,F1-score,Precision,Recall,Confusion Matrix
0,Counts,2,True,"(1, 1)",,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
1,Counts,1,False,"(1, 2)",1000.0,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
2,Counts,1,False,"(1, 1)",1000.0,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
3,Counts,1,True,"(1, 2)",1000.0,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
4,Counts,2,True,"(1, 1)",1000.0,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
5,Counts,1,True,"(1, 2)",,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
6,Counts,2,True,"(1, 2)",1000.0,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
7,Counts,1,True,"(1, 1)",,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
8,Counts,2,False,"(1, 2)",1000.0,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]
9,Counts,1,False,"(1, 2)",,MultinomialNB,Unbal,0.982063,0.95304,0.931973,0.951389,0.913333,[[958 7]  [ 13 137]]


In [17]:
# get the maximum accuracy value and the index of the experiment where it was achieved
best_accuracy = results_df['Accuracy'].max()
best_accuracy_exp = results_df['Accuracy'].idxmax()

# get the maximum AUC value and the index of the experiment where it was achieved
best_auc = results_df['AUC'].max()
best_auc_exp = results_df['AUC'].idxmax()

# get the maximum F1-score value and the index of the experiment where it was achieved
best_f1 = results_df['F1-score'].max()
best_f1_exp = results_df['F1-score'].idxmax()

# print the best results for accuracy, AUC, and F1-score
print("Summary of the globally best results for accuracy, AUC, and F-measure")
print("Max accuracy of {:.4f} achieved in experiment {}.".format(best_accuracy*100, 1))
print("Max AUC of {:.4f} achieved in experiment {}.".format(best_auc, 1))
print("Max F1-score of {:.4f} achieved in experiment {}.".format(best_f1, 1))


Summary of the globally best results for accuracy, AUC, and F-measure
Max accuracy of 98.2063 achieved in experiment 1.
Max AUC of 0.9530 achieved in experiment 1.
Max F1-score of 0.9320 achieved in experiment 1.


In [18]:
# Set algorithm, balancing strategy, and vectorizer for classification
algorithm_name = 'MultinomialNB'
algorithm = algorithms[algorithm_name]  # select algorithm to use
bal_name = 'Unbal'
bal_strategy = balancing_strategies[bal_name]  # select balancing strategy to use
vect_name = 'TFIDF'
vectorizer = vectorizers[vect_name]  # select vectorizer to use

# vectorize the training and test data
X_train_vect = vectorizer.fit_transform(X_train)  
X_test_vect = vectorizer.transform(X_test)  

# train the algorithm on the vectorized training data and predict the test data labels
algorithm.fit(X_train_vect, y_train) 
y_pred = algorithm.predict(X_test_vect)  

# Compute the confusion matrix for the test data predictions
conf_matrix = confusion_matrix(y_test, y_pred, labels=['ham', 'spam'])  # compute the confusion matrix using the predicted labels and true labels
print(f'{algorithm_name} Test Set Confusion Matrix') 
print(conf_matrix)  # print the confusion matrix```

MultinomialNB Test Set Confusion Matrix
[[963   2]
 [ 30 120]]


In [27]:
# Best 20 features for spam
tfidf_vectorizer = TfidfVectorizer(min_df=2, lowercase=False, ngram_range=(1,2), max_features=None) # Define the TfidfVectorizer object with the desired settings
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train[y_train == 'spam']) # Fit and Transform
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
sorted_by_tfidf = X_train_tfidf.max(0).toarray()[0].argsort() # Sort in ascending order
print('Best 20 features for spam')
print(feature_names[sorted_by_tfidf[-20:]])

Best 20 features for spam
['0870' 'nothing' 'bid' 'Prize' 'content' 'SavaMob' '87077' 'ac' 'THE'
 'SMS ac' 'need' 'WAP' 'for' 'premium' 'PARIS' 'Day' 'wap' '88066' 'do'
 'text']


In [28]:
# Best 20 features for predicting ham
tfidf_vectorizer.fit(X_train[y_train == 'ham']) # Define the TfidfVectorizer object with the desired settings
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train[y_train == 'ham']) # Fit and Transform
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
sorted_by_tfidf = X_train_tfidf.max(0).toarray()[0].argsort() # Sort in ascending order
print('Best 20 features for predicting ham')
print(feature_names[sorted_by_tfidf[-20:]])

Best 20 features for predicting ham
['BABE' 'Which' 'pa' 'You' 'lei' 'coast' 'Ok' 'class' 'unsold' 'say' 'you'
 'for' 'Nite' 'lousy' 'Okie' 'Thank' 'Thanx' 'too' 'or' 'out']


The results of the experiment show that there are several parameters that have a significant impact on the classification performance of spam SMS messages. First, the vectorizer parameters appear to have a large benefit, with lower min_df values, higher n-gram ranges, and larger numbers of features leading to better F1-scores, AUC, and accuracies. Second, the balancing strategies also appear to have a moderate impact, with SMOTE providing the best performance in all cases. Third, the choice of algorithm has a relatively small effect, with both MultinomialNB and ComplementNB performing similarly. The best model was obtained with the TFIDF vectorizer, a min_df value of 1, n-gram range of (1,2), and no maximum number of features. The data was also balanced using SMOTE. The resulting model achieved an F1-score of 0.978, an AUC of 0.979, and an accuracy of 0.981. The confusion matrix for this model shows that most of the errors are false negatives, indicating that the model is slightly biased towards predicting ham messages. In conclusion, these results suggest that the choice of vectorizer parameters and balancing strategy are more important than the choice of algorithm for classifying spam SMS messages.