In [5]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


In [6]:
data = pd.read_csv("Spam-Classification.csv")
data

Unnamed: 0,CLASS,SMS
0,ham,"said kiss, kiss, i can't do the sound effects..."
1,ham,&lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF.
2,spam,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3,spam,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
4,spam,**FREE MESSAGE**Thanks for using the Auction S...
...,...,...
1495,ham,"Yup, no need. I'll jus wait 4 e rain 2 stop."
1496,ham,Yup... From what i remb... I think should be c...
1497,ham,Yup... How Ã¼ noe leh...
1498,ham,Yup... Ok i go home look at the timings then i...


In [7]:
ham = data[data['CLASS'] == 'ham']
spam = data[data['CLASS'] == 'spam']
ham_sampled = ham.sample(len(spam), random_state=42)
balanced_data = pd.concat([ham_sampled, spam], axis=0).sample(frac=1, random_state=42)
balanced_data['CLASS'] = balanced_data['CLASS'].map({'ham': 0, 'spam': 1})


In [10]:
messages = balanced_data['SMS']
labels = balanced_data['CLASS']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(messages)
sequences = tokenizer.texts_to_sequences(messages)
vocab_size = len(tokenizer.word_index) + 1

max_length = 50  # Adjust based on message length distribution
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [11]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    padded_sequences, labels, test_size=0.15, random_state=42, stratify=labels
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, random_state=42, stratify=y_train_val
)


In [12]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=max_length))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [13]:
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/5
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.4994 - loss: 0.6937 - val_accuracy: 0.5079 - val_loss: 0.6909
Epoch 2/5
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.7524 - loss: 0.5634 - val_accuracy: 0.9162 - val_loss: 0.3055
Epoch 3/5
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.9287 - loss: 0.2632 - val_accuracy: 0.9372 - val_loss: 0.2512
Epoch 4/5
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9453 - loss: 0.2125 - val_accuracy: 0.9372 - val_loss: 0.2326
Epoch 5/5
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9342 - loss: 0.2569 - val_accuracy: 0.8901 - val_loss: 0.3128


In [15]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Test Accuracy: 92.89%


In [28]:
def classify_message(message):
    sequence = tokenizer.texts_to_sequences([message])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')
    prediction = model.predict(padded)
    return "Spam" if prediction > 0.5 else "Ham"

sample_message = "Congratulations! You've won a free ticket. Reply YES to claim."
#sample_message ="**FREE MESSAGE**Thanks for using the Auction S"
#sample_message ="<Forwarded from 21870000>Hi - this is your Mai"
print(classify_message(sample_message))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Spam
