In [1]:
# ------------------------------------
# LSTM for SMS Spam Classification
# ------------------------------------

import os
import urllib.request

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download SMS Spam dataset
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
filename = "sms.tsv"
urllib.request.urlretrieve(url, filename)

# Load dataset
df = pd.read_csv("sms.tsv", sep="\t", names=["label", "message"])
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

texts = df['message'].values
labels = df['label'].values

# Tokenization
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences
MAXLEN = 100
X = pad_sequences(sequences, maxlen=MAXLEN)
y = np.array(labels)

# Train-test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM model
model = models.Sequential([
    layers.Embedding(10000, 64, input_length=MAXLEN),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train
history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=64,
                    validation_data=(x_test, y_test))

# Final evaluation
loss, acc = model.evaluate(x_test, y_test)
print("Final Test Accuracy:", acc)


Epoch 1/5




[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.8475 - loss: 0.4340 - val_accuracy: 0.9767 - val_loss: 0.0863
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9848 - loss: 0.0655 - val_accuracy: 0.9892 - val_loss: 0.0385
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9958 - loss: 0.0208 - val_accuracy: 0.9910 - val_loss: 0.0339
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9971 - loss: 0.0099 - val_accuracy: 0.9919 - val_loss: 0.0376
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9990 - loss: 0.0064 - val_accuracy: 0.9919 - val_loss: 0.0442
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9918 - loss: 0.0495
Final Test Accuracy: 0.9919282793998718


In [2]:
# ---------------------------
# Inference on Test Samples
# ---------------------------

def predict_message(msg):
    seq = tokenizer.texts_to_sequences([msg])
    padded = pad_sequences(seq, maxlen=MAXLEN)
    pred = model.predict(padded)[0][0]
    label = "SPAM" if pred > 0.5 else "HAM (Not spam)"
    return pred, label

sample_messages = [
    "Free entry in 2 a weekly contest to win a brand new car!!!",
    "Can we meet today evening?",
    "URGENT! Your account has been suspended. Verify now!",
    "Hey bro, I will call you after class."
]

for m in sample_messages:
    score, label = predict_message(m)
    print(f"\nMessage: {m}")
    print(f"Prediction Score: {score:.4f} → {label}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step

Message: Free entry in 2 a weekly contest to win a brand new car!!!
Prediction Score: 0.9784 → SPAM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step

Message: Can we meet today evening?
Prediction Score: 0.0001 → HAM (Not spam)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step

Message: URGENT! Your account has been suspended. Verify now!
Prediction Score: 0.4417 → HAM (Not spam)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step

Message: Hey bro, I will call you after class.
Prediction Score: 0.0003 → HAM (Not spam)
