In [1]:
!pip install numpy pandas matplotlib scikit-learn keras tensorflow



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [3]:
# Load dataset
data = pd.read_csv(r"C:\Users\Aryan Deshpande\OneDrive\Documents\22BAI10095 - VIT Bhopal University\Projects\SMS Spam Classifier (Using CNN)\sample_texts.csv", encoding='ISO-8859-1')

# Drop unnecessary columns
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

# Rename columns for clarity
data.columns = ['label', 'message']

# Remove duplicate entries
data = data.drop_duplicates()

# Encode labels (ham = 0, spam = 1)
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Display data sample
data.sample(5)

Unnamed: 0,label,message
1473,0,"Will do, you gonna be at blake's all night? I ..."
2632,0,I WILL CAL YOU SIR. In meeting
2663,1,8007 FREE for 1st week! No1 Nokia tone 4 ur mo...
5411,0,I ask if u meeting da ge tmr nite...
4855,0,yes baby! I need to stretch open your pussy!


In [4]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['message'])
X = tokenizer.texts_to_sequences(data['message'])

# Padding to make all sequences uniform in length
X = pad_sequences(X, maxlen=100) 

# Convert labels into numpy array
y = np.array(data['label'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Define model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=100),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()



In [6]:
# Define early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, callbacks=[early_stop])

Epoch 1/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.8762 - loss: 0.3372 - val_accuracy: 0.9816 - val_loss: 0.0596
Epoch 2/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.9918 - loss: 0.0444 - val_accuracy: 0.9903 - val_loss: 0.0439
Epoch 3/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.9965 - loss: 0.0190 - val_accuracy: 0.9855 - val_loss: 0.0501
Epoch 4/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.9988 - loss: 0.0113 - val_accuracy: 0.9855 - val_loss: 0.0573
Epoch 5/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.9999 - loss: 0.0016 - val_accuracy: 0.9865 - val_loss: 0.0515


<keras.src.callbacks.history.History at 0x199f854f770>

In [7]:
# Evaluate model performance
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9901 - loss: 0.0526
Test Accuracy: 99.03%


In [8]:
model.save("spam_classifier.keras")

In [9]:
import pickle

# Save the tokenizer
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)

In [10]:
from keras.models import load_model

model = load_model("spam_classifier.keras")  # Use .keras format

In [11]:
import pickle

# Load the saved tokenizer
with open("tokenizer.pkl", "rb") as handle:
    tokenizer = pickle.load(handle)

In [12]:
# New SMS message to test
new_message = ["You have won a free lottery! Call now to claim your prize."]

# Convert to sequence
sequence = tokenizer.texts_to_sequences(new_message)

# Pad the sequence
padded_sequence = pad_sequences(sequence, maxlen=100)

# Predict using the trained model
prediction = model.predict(padded_sequence)

# Convert probability to label
label = "Spam" if prediction[0][0] > 0.5 else "Not Spam"

print(f"Prediction: {label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381ms/step
Prediction: Spam


In [13]:
# Get user input
user_message = input("Enter the message you want to check for spam: ")

# Convert to sequence
test_sequence = tokenizer.texts_to_sequences([user_message])  # Note: Wrap in a list

# Pad sequence
padded_test_sequence = pad_sequences(test_sequence, maxlen=100)

# Predict
prediction = model.predict(padded_test_sequence)

# Convert probability to label
label = "Spam" if prediction[0][0] > 0.5 else "Not Spam"  # Assuming binary classification

# Display result
print(f"\nMessage: {user_message}\nPrediction: {label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step

Message: Please pick up the call
Prediction: Not Spam
