In [6]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# 0. Download stopwords if not already done
nltk.download('stopwords')

# 1. Load the data
data = pd.read_csv('Emails.csv')  # ensure this file exists in working dir
data = data[['label', 'text']].dropna()  # drop missing values, changed 'message' to 'text'
data['label'] = data['label'].map({'ham': 0, 'spam': 1})  # map labels

# 2. Text cleaning function
stop_words = set(stopwords.words('english'))
def clean_text(text: str) -> str:
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words]
    return ' '.join(tokens)

data['cleaned'] = data['text'].apply(clean_text) # changed 'message' to 'text'

# 3. Tokenize and pad sequences
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(data['cleaned'])

sequences = tokenizer.texts_to_sequences(data['cleaned'])
X = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post')
y = data['label'].values

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Build the Sequential model
model = models.Sequential([
    layers.Embedding(input_dim=MAX_VOCAB, output_dim=16, input_length=MAX_LEN),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

# 6. Train the model with validation split
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
    verbose=2
)

# 7. Evaluate on test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\n✅ Test Accuracy: {accuracy * 100:.2f}%")

# 8. Prediction helper function
def predict_email(text: str) -> str:
    text_clean = clean_text(text)
    seq = tokenizer.texts_to_sequences([text_clean])
    pad = pad_sequences(seq, maxlen=MAX_LEN, truncating='post')
    pred = model.predict(pad, verbose=0)[0][0]
    return "SPAM" if pred > 0.5 else "HAM"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


None
Epoch 1/10
117/117 - 3s - 25ms/step - accuracy: 0.7886 - loss: 0.5748 - val_accuracy: 0.8357 - val_loss: 0.4317
Epoch 2/10
117/117 - 0s - 4ms/step - accuracy: 0.8681 - loss: 0.3537 - val_accuracy: 0.9010 - val_loss: 0.2376
Epoch 3/10
117/117 - 0s - 4ms/step - accuracy: 0.9363 - loss: 0.2113 - val_accuracy: 0.9710 - val_loss: 0.1454
Epoch 4/10
117/117 - 1s - 5ms/step - accuracy: 0.9651 - loss: 0.1401 - val_accuracy: 0.9758 - val_loss: 0.1026
Epoch 5/10
117/117 - 0s - 4ms/step - accuracy: 0.9758 - loss: 0.1012 - val_accuracy: 0.9855 - val_loss: 0.0801
Epoch 6/10
117/117 - 1s - 5ms/step - accuracy: 0.9828 - loss: 0.0772 - val_accuracy: 0.9783 - val_loss: 0.0717
Epoch 7/10
117/117 - 0s - 4ms/step - accuracy: 0.9871 - loss: 0.0643 - val_accuracy: 0.9928 - val_loss: 0.0579
Epoch 8/10
117/117 - 1s - 5ms/step - accuracy: 0.9874 - loss: 0.0530 - val_accuracy: 0.9831 - val_loss: 0.0507
Epoch 9/10
117/117 - 1s - 6ms/step - accuracy: 0.9879 - loss: 0.0472 - val_accuracy: 0.9807 - val_loss: 0.