In [1]:
import re
import json
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [2]:
# 1. Load Data (Adjusted for spam.txt)
# Most spam.txt files are Tab Separated. If yours uses commas, change sep='\t' to sep=','
# The file is read without a header row: the first column becomes "label",
# the second "message".
df = pd.read_csv("spam.txt", sep='\t', names=["label", "message"], encoding="latin-1")

# Encode the text labels as 0 (ham) / 1 (spam).
# Series.map with a dict yields NaN for every value that is not a key, so a
# wrong separator, a header row or a differently-cased label would otherwise
# flow into training as NaN targets without any error. Fail fast instead.
label_ids = df["label"].map({"ham": 0, "spam": 1})
unmapped = label_ids.isna()
if unmapped.any():
    examples = df.loc[unmapped, "label"].unique()[:5].tolist()
    raise ValueError(
        f"{int(unmapped.sum())} row(s) in spam.txt have a label other than "
        f"'ham'/'spam' (examples: {examples!r}); check the separator and file format."
    )
df["label"] = label_ids

def clean_text(text):
    """Normalize a raw message into lowercase, letters-only, single-spaced text.

    The input is converted with ``str()`` first, so non-string values are
    cleaned via their string form. Every character that is not a lowercase
    ASCII letter or whitespace is deleted (not replaced by a space), runs of
    whitespace are collapsed to one space, and leading/trailing whitespace
    is removed.

    Args:
        text: The value to clean; any object accepted by ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    lowered = str(text).lower()
    # Drop digits, punctuation and any non a-z symbol outright.
    letters_and_spaces = re.sub(r"[^a-z\s]", "", lowered)
    # Squeeze whitespace runs, then trim both ends.
    single_spaced = re.sub(r"\s+", " ", letters_and_spaces)
    return single_spaced.strip()

# Add a "cleaned" column holding the normalized form of every message.
df["cleaned"] = df["message"].map(clean_text)

In [3]:
# 2. Preprocessing
VOCAB_SIZE = 8000  # passed to the tokenizer as num_words and to the Embedding layer as its input size
MAX_LEN = 100      # length every sequence is padded/truncated to

# Build the word -> index vocabulary from the cleaned messages.
# "<OOV>" is the placeholder token used for out-of-vocabulary words.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=VOCAB_SIZE,
)
tokenizer.fit_on_texts(df["cleaned"])

In [4]:
# SAVE TOKENIZER for JS
# tokenizer.word_index contains every word seen while fitting, not only the
# top VOCAB_SIZE ones. texts_to_sequences() replaces any word whose index is
# >= num_words with the OOV token, and the Embedding layer only has VOCAB_SIZE
# rows, so exporting the full mapping would hand the JS side indices the model
# cannot accept. Export only the indices the model was actually trained on.
# NOTE(review): assumes the JS consumer falls back to "<OOV>" for words that
# are missing from the file - confirm against the front-end code.
exported_word_index = {
    word: index
    for word, index in tokenizer.word_index.items()
    if index < VOCAB_SIZE
}
with open('tokenizer.json', 'w') as f:
    json.dump(exported_word_index, f)

# Convert every cleaned message to a list of word indices, then pad (at the
# end, because padding="post") to exactly MAX_LEN.
X_seq = tokenizer.texts_to_sequences(df["cleaned"])
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN, padding="post")

In [5]:
# 3. Model
# Stack: embedding -> average over the sequence axis -> small dense layer ->
# single sigmoid unit.
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 16, input_length=MAX_LEN))  # Smaller embedding for light web use
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train on the full padded dataset; the returned History object is the cell output.
model.fit(x=X_pad, y=df["label"], batch_size=32, epochs=10)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1abc7d33ee0>

In [6]:
# 4. Save Model
# Write the trained network to an HDF5 file (format chosen by the ".h5" suffix),
# then confirm on stdout.
model.save(filepath="spam_model.h5")
print("Model and Tokenizer saved!")

Model and Tokenizer saved!


  saving_api.save_model(
