In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split
import joblib

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Spam Email Detection/Spam Email Detection - spam.csv', usecols=['v1', 'v2'], encoding='utf-8')
df.columns = ['Target', 'Email']
df

Unnamed: 0,Target,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df.loc[2]['Email']

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [5]:
df['Target'].value_counts()

Target
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
df.isna().sum()

Target    0
Email     0
dtype: int64

In [7]:
X = df['Email']
y = df['Target'].map({'ham':0, 'spam':1})

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [10]:
VOCAB_SIZE = 200
encoder = keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(X)

In [12]:
encoder

<keras.src.layers.preprocessing.text_vectorization.TextVectorization at 0x784781dc7d00>

In [14]:
model = keras.Sequential([
    keras.layers.Input(shape=(1), dtype='string'),
    encoder,
    keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [15]:
model.compile(loss=keras.losses.BinaryCrossentropy(),
              optimizer=keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [16]:
history = model.fit(X_train, y_train, epochs=10,
                    validation_data=(X_val, y_val),
                    batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
model.evaluate(X_test, y_test)



[0.0856490284204483, 0.9766815900802612]

In [18]:
# Example text for prediction
text_to_predict ="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

# Preprocess the text using the encoder
encoded_text = pd.Series(text_to_predict)

# Predict using the trained model
predicted_logits = model.predict(encoded_text)
predicted_logits



array([[0.98651075]], dtype=float32)

In [19]:
predicted_probability_spam = predicted_logits[0][0]
print(f"The predicted probability of being spam: {predicted_probability_spam}")

# Convert probabilities to class labels
predicted_class = "spam" if predicted_probability_spam > 0.5 else "ham"

# Print the prediction
print(f"The predicted class for the text '{text_to_predict}' is: {predicted_class}")


The predicted probability of being spam: 0.9865107536315918
The predicted class for the text 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's' is: spam


In [3]:
# model.save('model.keras')
load_model = keras.models.load_model('/content/drive/MyDrive/Spam Email Detection/model.keras')

In [21]:
# load_model.evaluate(X_test, y_test)



[0.0856490284204483, 0.9766815900802612]

In [4]:
# Example text for prediction
text_to_predict ="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

# Preprocess the text using the encoder
encoded_text = pd.Series(text_to_predict)

# Predict using the trained model
predicted_logits = load_model.predict(encoded_text)
predicted_logits



array([[0.98651075]], dtype=float32)