<a href="https://colab.research.google.com/github/Annabelle5474/SMS-and-Email-Spam-Detection/blob/main/SMS_and_Email_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import requests, zipfile
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


# Data Preprocessing

In [None]:
# Load SMS Spam Collection Dataset
url_sms = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url_sms, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to zipfile.
response.raise_for_status()
with zipfile.ZipFile(BytesIO(response.content), 'r') as z:
    with z.open("SMSSpamCollection") as f:
        # The member is read as a headerless, tab-separated file: label<TAB>message.
        # NOTE(review): with pandas' default quote handling 5572 rows are loaded;
        # lines containing unbalanced double quotes may be getting merged --
        # confirm against the dataset's documented size and consider
        # quoting=csv.QUOTE_NONE if rows are missing.
        data_sms = pd.read_csv(f, sep='\t', names=["label", "message"], header=None)
# Encode the textual class labels as integers: spam -> 1, ham -> 0.
data_sms['label'] = data_sms['label'].map({'spam': 1, 'ham': 0})

In [None]:
# Load Enron Spam Dataset
url_enron = "https://github.com/mwiechmann/enron_spam_data/raw/master/enron_spam_data.zip"
response = requests.get(url_enron)
with zipfile.ZipFile(BytesIO(response.content), 'r') as z:
    file_name = [name for name in z.namelist() if name.endswith(".csv")][0]
    with z.open(file_name) as f:
        data_enron = pd.read_csv(f)

In [None]:
# Show the shape and the first rows of each raw dataset, SMS first, then Enron.
for caption, frame in (("data_sms count: ", data_sms), ("data_enron count: ", data_enron)):
    print(caption, frame.shape)
    print(frame.head())

data_sms count:  (5572, 2)
   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
data_enron count:  (33716, 5)
   Message ID                       Subject  \
0           0  christmas tree farm pictures   
1           1      vastar resources , inc .   
2           2  calpine daily gas nomination   
3           3                    re : issue   
4           4     meter 7268 nov allocation   

                                             Message Spam/Ham        Date  
0                                                NaN      ham  1999-12-10  
1  gary , production from the high island larger ...      ham  1999-12-13  
2             - calpine daily gas nomination 1 . doc      ham  1999-12-1

In [None]:
# Preprocess data_enron
# Keep only the message body and the class column, renamed to the
# "message"/"label" names used by data_sms, and encode spam -> 1, ham -> 0.
# rename/assign each return a new DataFrame, so nothing is written into a slice
# of the original frame (which is what raises pandas' SettingWithCopyWarning).
data_enron = (
    data_enron[['Message', 'Spam/Ham']]
    .rename(columns={'Message': 'message', 'Spam/Ham': 'label'})
    .assign(label=lambda frame: frame['label'].map({"spam": 1, "ham": 0}))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_enron['label'] = data_enron['label'].map({"spam": 1, "ham": 0})


In [None]:
# Combine both datasets
# Stack the SMS rows on top of the Enron rows and renumber the index from 0.
frames_to_merge = [data_sms, data_enron]
data = pd.concat(frames_to_merge, ignore_index=True)

# Check data: first rows, overall shape, and class balance.
for summary in (data.head(), data.shape, data["label"].value_counts()):
    print(summary)

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
(39288, 2)
label
0    21370
1    17918
Name: count, dtype: int64


# Preparations for Model Training

In [None]:
# Text Preprocessing
# Stemmer used to reduce each token to its stem.
stemmer = PorterStemmer()

# Fetch the NLTK stop-word corpus, then keep the English list as a set
# so membership tests during tokenization are constant-time.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def preprocess_text(text):
    """Normalize one message into a space-joined string of stemmed tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is split on whitespace, stop
    words (module-level ``stop_words``) are dropped, and each remaining token
    is stemmed with the module-level ``stemmer``.

    Args:
        text: The raw message. Any non-string value (NaN, None, numbers, ...)
            is treated as missing text.

    Returns:
        The cleaned message as a single string; ``""`` for non-string input
        or when no tokens survive filtering.
    """
    # Guard against every non-string value, not just float NaN: None or an
    # int would otherwise fail on .lower() below.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', lowered)
    kept = [stemmer.stem(word) for word in alnum_only.split() if word not in stop_words]
    return ' '.join(kept)

# Ensure no null values in 'message' column before applying the function.
# Built as a non-mutating chain (no inplace=True): rows with a missing message
# are dropped, then the message column is replaced by its cleaned form.
data = (
    data.dropna(subset=['message'])
    .assign(message=lambda frame: frame['message'].astype(str).apply(preprocess_text))
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Feature Extraction
# Target vector: the integer class label of every message.
y = data['label']

# TF-IDF features over the cleaned messages, vocabulary capped at 5000 terms.
vec = TfidfVectorizer(max_features=5000)
X = vec.fit_transform(data['message'])

In [None]:
# Split dataset into train, validation, and test sets
# The cleaned messages are split (70% train, 15% validation, 15% test) *before*
# vectorizing, and the TF-IDF vocabulary and IDF weights are then learned from
# the training messages only. Fitting the vectorizer on the full corpus lets
# validation and test rows influence the features (data leakage), which makes
# the reported scores optimistic.
msg_train, msg_temp, y_train, y_temp = train_test_split(data['message'], y, test_size=0.3, random_state=42)
msg_val, msg_test, y_val, y_test = train_test_split(msg_temp, y_temp, test_size=0.5, random_state=42)

vec = TfidfVectorizer(max_features=5000)
X_train = vec.fit_transform(msg_train)  # learn vocabulary/IDF on training data
X_val = vec.transform(msg_val)          # reuse the fitted vocabulary unchanged
X_test = vec.transform(msg_test)

# Model Setup

In [None]:
# Logistic Regression
# Default hyperparameters; fit() returns the estimator itself.
log_reg = LogisticRegression().fit(X_train, y_train)
log_reg

In [None]:
# Multinomial Naive Bayes
# Default hyperparameters; fit() returns the estimator itself.
nb_model = MultinomialNB().fit(X_train, y_train)
nb_model


In [None]:
# MLP
# Scale features without centering (with_mean=False), then feed them to a
# multi-layer perceptron with one hidden layer of 100 units.
feature_scaler = StandardScaler(with_mean=False)
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp = make_pipeline(feature_scaler, mlp_classifier)
mlp.fit(X_train, y_train)

In [None]:
# Support Vector Machine
# Linear SVM with default hyperparameters; fit() returns the estimator itself.
svm_model = LinearSVC().fit(X_train, y_train)
svm_model

In [None]:
# Evaluate
def evaluate_model(model, X_test, y_test, model_name):
    """Print accuracy, precision, recall and F1 of a fitted model.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels matching ``X_test``.
        model_name: Label printed in the report header.

    Returns:
        None. The report is written to standard output, each metric with four
        decimal places, followed by a blank line.
    """
    predicted = model.predict(X_test)

    print(f"{model_name} Performance:")

    accuracy = accuracy_score(y_test, predicted)
    print(f"Accuracy: {accuracy:.4f}")

    precision = precision_score(y_test, predicted)
    print(f"Precision: {precision:.4f}")

    recall = recall_score(y_test, predicted)
    print(f"Recall: {recall:.4f}")

    f1 = f1_score(y_test, predicted)
    print(f"F1-score: {f1:.4f}\n")

# Run Models

In [None]:
# Evaluate Models on the Validation Set (optional tuning stage)
validation_runs = [
    (log_reg, "Logistic Regression (Validation)"),
    (mlp, "Multi-Layer Perceptron (Validation)"),
    (nb_model, "Multinomial Naive Bayes (Validation)"),
    (svm_model, "Support Vector Machine (Validation)"),
]
# Report every fitted model against the same held-out validation split.
for fitted_model, report_title in validation_runs:
    evaluate_model(fitted_model, X_val, y_val, report_title)

Logistic Regression (Validation) Performance:
Accuracy: 0.9700
Precision: 0.9691
Recall: 0.9643
F1-score: 0.9667

Multi-Layer Perceptron (Validation) Performance:
Accuracy: 0.9608
Precision: 0.9577
Recall: 0.9552
F1-score: 0.9565

Multinomial Naive Bayes (Validation) Performance:
Accuracy: 0.9484
Precision: 0.9174
Recall: 0.9734
F1-score: 0.9446

Support Vector Machine (Validation) Performance:
Accuracy: 0.9723
Precision: 0.9736
Recall: 0.9647
F1-score: 0.9691



In [None]:
# Evaluate Models on the Test Set (final performance)
test_runs = [
    (log_reg, "Logistic Regression (Test)"),
    (mlp, "Multi-Layer Perceptron (Test)"),
    (nb_model, "Multinomial Naive Bayes (Test)"),
    (svm_model, "Support Vector Machine (Test)"),
]
# Report every fitted model against the same held-out test split.
for fitted_model, report_title in test_runs:
    evaluate_model(fitted_model, X_test, y_test, report_title)

Logistic Regression (Test) Performance:
Accuracy: 0.9587
Precision: 0.9585
Recall: 0.9485
F1-score: 0.9535

Multi-Layer Perceptron (Test) Performance:
Accuracy: 0.9585
Precision: 0.9494
Recall: 0.9581
F1-score: 0.9537

Multinomial Naive Bayes (Test) Performance:
Accuracy: 0.9334
Precision: 0.8965
Recall: 0.9616
F1-score: 0.9279

Support Vector Machine (Test) Performance:
Accuracy: 0.9642
Precision: 0.9665
Recall: 0.9527
F1-score: 0.9596

