In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [None]:

# Load and clean data
# Read the raw CSV, keep only the label/text columns under friendlier names,
# and encode the label as an integer (ham -> 0, spam -> 1).
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)


In [None]:

# Download NLTK stopwords (default path used), then build the two helpers
# that clean_text reads: the English stop-word set and a Porter stemmer.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Text preprocessing function
def clean_text(text):
    """Normalise one message for the bag-of-words / sequence models.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter with a space, split on whitespace, discard tokens found in
    the module-level ``stop_words`` set, stem the remaining tokens with the
    module-level ``stemmer``, and re-join them with single spaces.

    Args:
        text: The raw message string.

    Returns:
        The cleaned message as a single space-separated string (empty if no
        token survives).
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', lowered)

    kept_stems = []
    for token in letters_only.split():
        # Stop words are filtered on the un-stemmed token.
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))

    return ' '.join(kept_stems)

# Run the cleaner over every raw message and keep the result in a new column.
df['cleaned_message'] = df['message'].map(clean_text)


In [None]:

# TF-IDF vectorization
# Target labels (0/1) and a TF-IDF matrix over unigrams + bigrams of the
# cleaned messages, with the vocabulary capped at 5000 features.
y = df['label']
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = tfidf.fit_transform(df['cleaned_message'])


In [None]:

# Train/test split for traditional ML models
# Split the cleaned text *before* vectorising: the TF-IDF vocabulary and IDF
# weights are learned from the training messages only and then applied to the
# held-out messages, so no test-set statistics leak into the features.
# With the same row count, test_size and random_state, this selects the same
# rows as splitting a pre-built feature matrix would, so y_train / y_test are
# unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_message'], y, test_size=0.2, random_state=42
)
X_train = tfidf.fit_transform(text_train)  # refit on training text only
X_test = tfidf.transform(text_test)        # reuse the training vocabulary/IDF



In [None]:
# Naive Bayes
# fit() returns the estimator itself, so construction and training are chained.
nb = MultinomialNB().fit(X_train, y_train)
nb_acc = nb.score(X_test, y_test)  # accuracy on the held-out split



In [None]:
# Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_acc = lr.score(X_test, y_test)  # accuracy on the held-out split



In [None]:
# Support Vector Machine
# Linear-kernel SVC; fit() returns the estimator itself, so the calls are chained.
svm = SVC(kernel='linear').fit(X_train, y_train)
svm_acc = svm.score(X_test, y_test)  # accuracy on the held-out split



In [None]:
# Tokenization and padding for LSTM
# Turn each cleaned message into a sequence of integer word ids, then pad or
# truncate every sequence to exactly 100 ids.
messages_for_lstm = df['cleaned_message']
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(messages_for_lstm)
sequences = tokenizer.texts_to_sequences(messages_for_lstm)
X_padded = pad_sequences(sequences, maxlen=100)



In [None]:
# Train/test split for LSTM
# Split the cleaned text first and build the word index from the training
# messages only, so the vocabulary is not influenced by held-out messages.
# Same test_size / random_state as the classic-model split.
text_train_lstm, text_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    df['cleaned_message'], y, test_size=0.2, random_state=42
)

# A fresh Tokenizer is required: fit_on_texts accumulates counts across calls,
# so re-fitting an instance already fitted on the full corpus would keep the
# test-set words. Settings match the tokenization cell (5000 words, length 100).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text_train_lstm)
X_train_lstm = pad_sequences(tokenizer.texts_to_sequences(text_train_lstm), maxlen=100)
X_test_lstm = pad_sequences(tokenizer.texts_to_sequences(text_test_lstm), maxlen=100)



In [None]:
# LSTM model
# Embedding -> LSTM -> single sigmoid unit for binary (ham/spam) classification.
# `input_length` is intentionally not passed to Embedding: the argument is
# deprecated in Keras 3 and the sequence length is taken from the padded
# input when the model is first called.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# 10% of the training rows are held out by Keras for per-epoch validation.
model.fit(X_train_lstm, y_train_lstm, epochs=16, batch_size=64, validation_split=0.1)
# evaluate() returns [loss, accuracy] for the single metric compiled above.
lstm_acc = model.evaluate(X_test_lstm, y_test_lstm)[1]


Epoch 1/16




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 262ms/step - accuracy: 0.8767 - loss: 0.3513 - val_accuracy: 0.9619 - val_loss: 0.1298
Epoch 2/16
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 210ms/step - accuracy: 0.9844 - loss: 0.0534 - val_accuracy: 0.9709 - val_loss: 0.0915
Epoch 3/16
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 202ms/step - accuracy: 0.9951 - loss: 0.0234 - val_accuracy: 0.9731 - val_loss: 0.1085
Epoch 4/16
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 246ms/step - accuracy: 0.9990 - loss: 0.0066 - val_accuracy: 0.9686 - val_loss: 0.1029
Epoch 5/16
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 201ms/step - accuracy: 0.9991 - loss: 0.0045 - val_accuracy: 0.9686 - val_loss: 0.1078
Epoch 6/16
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 192ms/step - accuracy: 0.9988 - loss: 0.0053 - val_accuracy: 0.9753 - val_loss: 0.1264
Epoch 7/16
[1m63/63[0m [32m━━━

In [None]:

# Accuracy results
# One line per model, four decimal places, preceded by a blank line.
accuracy_lines = (
    f"\nNaive Bayes Accuracy: {nb_acc:.4f}",
    f"Logistic Regression Accuracy: {lr_acc:.4f}",
    f"SVM Accuracy: {svm_acc:.4f}",
    f"LSTM Accuracy: {lstm_acc:.4f}",
)
print("\n".join(accuracy_lines))




Naive Bayes Accuracy: 0.9740
Logistic Regression Accuracy: 0.9570
SVM Accuracy: 0.9821
LSTM Accuracy: 0.9821


In [None]:
from sklearn.metrics import classification_report

# Compute the test-set predictions in this cell so the reports do not depend
# on variables left over from an earlier kernel session (the y_pred_* names
# are not defined anywhere else in the notebook).
y_pred_nb = nb.predict(X_test)
y_pred_lr = lr.predict(X_test)
y_pred_svm = svm.predict(X_test)
# The network ends in a single sigmoid unit, so predict() yields one
# probability per message; threshold at 0.5 and flatten to 1-D class labels.
y_pred_lstm = (model.predict(X_test_lstm) > 0.5).astype(int).ravel()

print("Naive Bayes:\n", classification_report(y_test, y_pred_nb))
print("Logistic Regression:\n", classification_report(y_test, y_pred_lr))
print("SVM:\n", classification_report(y_test, y_pred_svm))
print("LSTM:\n", classification_report(y_test_lstm, y_pred_lstm))



Naive Bayes:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.89       150

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Logistic Regression:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.97      0.70      0.81       150

    accuracy                           0.96      1115
   macro avg       0.96      0.85      0.89      1115
weighted avg       0.96      0.96      0.95      1115

SVM:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.9