In [4]:
!pip install numpy pandas matplotlib scikit-learn keras tensorflow





[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping


In [8]:
# Load dataset
data = pd.read_csv(r"C:\Users\Aryan Deshpande\Downloads\spam.csv", encoding='ISO-8859-1')

# Drop unnecessary columns
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

# Rename columns for clarity
data.columns = ['label', 'message']

# Remove duplicate entries
data = data.drop_duplicates()

# Encode labels (ham = 0, spam = 1)
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Display data sample
data.sample(5)


Unnamed: 0,label,message
4580,0,Not course. Only maths one day one chapter wit...
1045,0,"Come round, it's ."
209,0,Both :) i shoot big loads so get ready!
3135,0,Can you let me know details of fri when u find...
3751,0,Why are u up so early?


In [9]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['message'])
X = tokenizer.texts_to_sequences(data['message'])

# Padding to make all sequences uniform in length
X = pad_sequences(X, maxlen=100) 

# Convert labels into numpy array
y = np.array(data['label'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Define model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=100),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()




In [11]:
# Define early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, callbacks=[early_stop])


Epoch 1/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.9135 - loss: 0.2942 - val_accuracy: 0.9836 - val_loss: 0.0566
Epoch 2/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9924 - loss: 0.0403 - val_accuracy: 0.9691 - val_loss: 0.1049
Epoch 3/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9920 - loss: 0.0245 - val_accuracy: 0.9894 - val_loss: 0.0414
Epoch 4/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9992 - loss: 0.0070 - val_accuracy: 0.9826 - val_loss: 0.0721
Epoch 5/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9995 - loss: 0.0029 - val_accuracy: 0.9845 - val_loss: 0.0634
Epoch 6/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9998 - loss: 0.0034 - val_accuracy: 0.9845 - val_loss: 0.0646


<keras.src.callbacks.history.History at 0x239d8af5670>

In [12]:
# Evaluate model performance
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9889 - loss: 0.0519
Test Accuracy: 98.94%


In [15]:
model.save("spam_classifier.keras")

In [18]:
import pickle

# Save the tokenizer
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)

In [19]:
from keras.models import load_model

model = load_model("spam_classifier.keras")  # Use .keras format

In [20]:
import pickle

# Load the saved tokenizer
with open("tokenizer.pkl", "rb") as handle:
    tokenizer = pickle.load(handle)

In [21]:
# New SMS message to test
new_message = ["You have won a free lottery! Call now to claim your prize."]

# Convert to sequence
sequence = tokenizer.texts_to_sequences(new_message)

# Pad the sequence
padded_sequence = pad_sequences(sequence, maxlen=100)

# Predict using the trained model
prediction = model.predict(padded_sequence)

# Convert probability to label
label = "Spam" if prediction[0][0] > 0.5 else "Not Spam"

print(f"Prediction: {label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
Prediction: Spam


In [22]:
test_messages = [
    "Congratulations! You have won a cash prize. Click the link to claim.",
    "Hey, how are you? Let's meet up later.",
    "URGENT: Your bank account will be blocked unless you verify immediately."
]

# Convert to sequences
test_sequences = tokenizer.texts_to_sequences(test_messages)

# Pad sequences
padded_test_sequences = pad_sequences(test_sequences, maxlen=100)

# Predict
predictions = model.predict(padded_test_sequences)

# Convert probabilities to labels
labels = ["Spam" if pred > 0.5 else "Not Spam" for pred in predictions]

# Display results
for msg, label in zip(test_messages, labels):
    print(f"Message: {msg}\nPrediction: {label}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step
Message: Congratulations! You have won a cash prize. Click the link to claim.
Prediction: Spam

Message: Hey, how are you? Let's meet up later.
Prediction: Not Spam

Message: URGENT: Your bank account will be blocked unless you verify immediately.
Prediction: Not Spam

