In [1]:
import pandas as pd


In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [5]:
#Load Spambase turn into dataframe
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
columns = ["word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d", "word_freq_our", 
           "word_freq_over", "word_freq_remove", "word_freq_internet", "word_freq_order", "word_freq_mail", 
           "word_freq_receive", "word_freq_will", "word_freq_people", "word_freq_report", "word_freq_addresses", 
           "word_freq_free", "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit", 
           "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money", "word_freq_hp", 
           "word_freq_hpl", "word_freq_george", "word_freq_650", "word_freq_lab", "word_freq_labs", 
           "word_freq_telnet", "word_freq_857", "word_freq_data", "word_freq_415", "word_freq_85", 
           "word_freq_technology", "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct", 
           "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project", "word_freq_re", 
           "word_freq_edu", "word_freq_table", "word_freq_conference", "char_freq_;", "char_freq_(", 
           "char_freq_[", "char_freq_!", "char_freq_$", "char_freq_#", "capital_run_length_average", 
           "capital_run_length_longest", "capital_run_length_total", "spam"]
data = pd.read_csv(url, header=None, names=columns)

Let's load in our data 

In [7]:
#Preprocess
X = data.drop(columns=["spam"])
y = data["spam"]

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



Split the data into training and test sets (70/30). Let's try a random forest first.

In [9]:
#Go for easy: RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=120, random_state=45)

#Train
classifier.fit(X_train, y_train)

#Test
y_pred = classifier.predict(X_test)

#Score
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.9543808834178131
Confusion Matrix:
[[782  22]
 [ 41 536]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       804
           1       0.96      0.93      0.94       577

    accuracy                           0.95      1381
   macro avg       0.96      0.95      0.95      1381
weighted avg       0.95      0.95      0.95      1381



Now let's try an SVM.

In [10]:
#set up SVM
classifier = SVC(kernel='linear', random_state=42)

#Train
classifier.fit(X_train, y_train)

#Predcit
y_pred = classifier.predict(X_test)

#Evaluate
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.9268645908761767
Confusion Matrix:
[[766  38]
 [ 63 514]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.94       804
           1       0.93      0.89      0.91       577

    accuracy                           0.93      1381
   macro avg       0.93      0.92      0.92      1381
weighted avg       0.93      0.93      0.93      1381



Let's try gradient boosting.

In [12]:
#Gradient Boost
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

#Train
classifier.fit(X_train, y_train)

#Predict
y_pred = classifier.predict(X_test)

#Score
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.946415640839971
Confusion Matrix:
[[776  28]
 [ 46 531]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       804
           1       0.95      0.92      0.93       577

    accuracy                           0.95      1381
   macro avg       0.95      0.94      0.94      1381
weighted avg       0.95      0.95      0.95      1381



Let's try KNN (k-nearest neighbors).

In [13]:
#KNN
knn_classifier = KNeighborsClassifier(n_neighbors=10)

#Train
knn_classifier.fit(X_train, y_train)

#Predict
y_pred_knn = knn_classifier.predict(X_test)

#Score
accuracy_knn = accuracy_score(y_test, y_pred_knn)
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
class_report_knn = classification_report(y_test, y_pred_knn)

print(f"KNN Accuracy: {accuracy_knn}")
print("KNN Confusion Matrix:")
print(conf_matrix_knn)
print("KNN Classification Report:")
print(class_report_knn)

KNN Accuracy: 0.7610427226647357
KNN Confusion Matrix:
[[700 104]
 [226 351]]
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       804
           1       0.77      0.61      0.68       577

    accuracy                           0.76      1381
   macro avg       0.76      0.74      0.74      1381
weighted avg       0.76      0.76      0.76      1381



Overall, random forest did the best job at identifying spam: it has the highest F1 score, coupled with the highest precision. I suspect it may be a bit overfit, but as a quick test, it's pretty reasonable. This performance makes it a strong candidate for our initial model, though further tuning and validation may be necessary to ensure its robustness and generalizability across different datasets.

It's important to note that spam detection has become increasingly challenging as spam itself has gotten better and smarter. Modern spam employs more sophisticated techniques to evade detection, making it crucial for models to adapt and evolve continuously.

FYI, here is a very funny DEF CON talk on spam: https://www.youtube.com/watch?v=fQPspL_VuD8