In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score
import os

print(os.getcwd())

dataset_path = "/content/Spam-Classification.csv"
data = pd.read_csv(dataset_path)


data.rename(columns={"CLASS": "label", "SMS": "message"}, inplace=True)

data = pd.get_dummies(data, columns=["label"], prefix=["label"])

print(data.columns)

messages = data['message']
labels = data[['label_ham', 'label_spam']]

X_train, X_test, y_train, y_test = train_test_split(messages, labels, test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding="post", truncating="post")
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding="post", truncating="post")

model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=max_length),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_test_padded, y_test))

loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

def predict_message(message):
    sequence = tokenizer.texts_to_sequences([message])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding="post", truncating="post")
    prediction = model.predict(padded_sequence)[0]
    return "Spam" if prediction[1] > prediction[0] else "Ham"

sample_message = "Congratulations! You've won a $1000 Walmart gift card. Call now!"
result = predict_message(sample_message)
print(f"The message '{sample_message}' is classified as: {result}")

/content
Index(['message', 'label_ham', 'label_spam'], dtype='object')
Epoch 1/5




[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 76ms/step - accuracy: 0.5068 - loss: 0.6945 - val_accuracy: 0.4767 - val_loss: 0.6939
Epoch 2/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 66ms/step - accuracy: 0.5065 - loss: 0.6944 - val_accuracy: 0.4767 - val_loss: 0.6943
Epoch 3/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 66ms/step - accuracy: 0.5185 - loss: 0.6931 - val_accuracy: 0.4767 - val_loss: 0.6944
Epoch 4/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 96ms/step - accuracy: 0.4758 - loss: 0.6948 - val_accuracy: 0.4767 - val_loss: 0.6933
Epoch 5/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 66ms/step - accuracy: 0.4774 - loss: 0.6965 - val_accuracy: 0.4767 - val_loss: 0.6946
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.4523 - loss: 0.6958
Test Accuracy: 47.67%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step
The mes