In [13]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [14]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Constants
VOCAB_SIZE = 3000
MAX_LEN = 100
EMBEDDING_DIM = 128

# NEW: List of common spam keywords
SPAM_KEYWORDS = [
    'free', 'win', 'won', 'winner', 'cash', 'prize', 'claim', 'congratulations',
    'urgent', 'offer', 'only', 'click', 'debt', 'call', 'reply', 'stop',
    'sex', 'explicit', 'account', 'credit', 'loan', 'guarantee', 'money', '100%',
    'million', 'dollars', 'pounds', 'new', 'customer'
]



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
# Text preprocessing
def preprocess_text(text):
    has_spam_keyword = any(keyword in text.lower() for keyword in SPAM_KEYWORDS)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    if has_spam_keyword:
        tokens.insert(0, 'spamsignal')
    return ' '.join(tokens)

# Load and prepare data
def load_data():
    !wget -q https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
    !wget -q https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
    
    train_df = pd.read_csv('train-data.tsv', sep='\t', header=None, names=['label', 'message'])
    test_df = pd.read_csv('valid-data.tsv', sep='\t', header=None, names=['label', 'message'])
    train_df['label'] = train_df['label'].map({'ham': 0, 'spam': 1})
    test_df['label'] = test_df['label'].map({'ham': 0, 'spam': 1})
    return train_df, test_df



In [16]:
# Build the model
def build_model():
    model = Sequential([
        Embedding(VOCAB_SIZE + 1, EMBEDDING_DIM, input_length=MAX_LEN),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(32)),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', 'Precision', 'Recall']
    )
    
    return model

# Main training function
def train_and_save_model():
    print("Starting model training...")
    train_df, test_df = load_data()
    all_messages = pd.concat([train_df['message'], test_df['message']])
    all_labels = pd.concat([train_df['label'], test_df['label']])
    all_messages_processed = all_messages.apply(preprocess_text)
    X_train, X_val, y_train, y_val = train_test_split(
        all_messages_processed, all_labels, test_size=0.2, random_state=42, stratify=all_labels)
    
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
    tokenizer.word_index['spamsignal'] = len(tokenizer.word_index) + 1
    tokenizer.fit_on_texts(X_train)
    
    train_sequences = tokenizer.texts_to_sequences(X_train)
    train_padded = pad_sequences(train_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    val_sequences = tokenizer.texts_to_sequences(X_val)
    val_padded = pad_sequences(val_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    
    model = build_model()
    
    history = model.fit(
        train_padded,
        y_train,
        epochs=10,
        validation_data=(val_padded, y_val),
        callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],
        batch_size=64,
        verbose=1
    )
    
    print("\nTraining complete.")
    return model, tokenizer

# This will run the training and return the model and tokenizer
model, tokenizer = train_and_save_model()

Starting model training...
Epoch 1/10




[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 222ms/step - Precision: 0.3946 - Recall: 0.1701 - accuracy: 0.8399 - loss: 0.4007 - val_Precision: 0.9739 - val_Recall: 0.7467 - val_accuracy: 0.9632 - val_loss: 0.1204
Epoch 2/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 225ms/step - Precision: 0.9735 - Recall: 0.8774 - accuracy: 0.9811 - loss: 0.0775 - val_Precision: 0.9643 - val_Recall: 0.9000 - val_accuracy: 0.9821 - val_loss: 0.0518
Epoch 3/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 225ms/step - Precision: 0.9791 - Recall: 0.9506 - accuracy: 0.9906 - loss: 0.0392 - val_Precision: 0.9853 - val_Recall: 0.8933 - val_accuracy: 0.9839 - val_loss: 0.0509
Epoch 4/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 224ms/step - Precision: 0.9884 - Recall: 0.9756 - accuracy: 0.9948 - loss: 0.0218 - val_Precision: 0.9854 - val_Recall: 0.9000 - val_accuracy: 0.9848 - val_loss: 0.0633
Epoch 5/10
[1m70/70[0m [32m━

In [17]:

    # Gradio interface
    def predict_message(text):
        try:
            # Preprocess
            processed_text = preprocess_text(text)
            # Tokenize
            sequence = tokenizer.texts_to_sequences([processed_text])
            padded = pad_sequences(sequence, maxlen=MAX_LEN, padding='post', truncating='post')
            # Predict
            prediction = model.predict(padded, verbose=0)[0][0]
            # Format results
            label = "SPAM" if prediction > 0.5 else "HAM"
            confidence = prediction if label == "SPAM" else 1 - prediction
            color = "#FF5733" if label == "SPAM" else "#33FF57"
            
            return (label, 
                    f"{confidence:.2%}", 
                    f"<div style='background-color:{color}; padding:20px; border-radius:10px; text-align:center;'>{label}</div>")
        except Exception as e:
            print(f"Error: {e}")
            return ("ERROR", "0%", "<div style='background-color:#CCCCCC; padding:20px; border-radius:10px; text-align:center;'>ERROR</div>")
    
    # Create interface
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 📱 SMS Spam Classifier")
        gr.Markdown("Enter a message to check if it's spam or legitimate (ham)")
        
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Message", lines=3)
                submit_btn = gr.Button("Classify", variant="primary")
                gr.Examples(
                    examples=[
                        "WINNER!! You won 1 million dollars! Click here to claim!",
                        "Hi, how are you doing today?",
                        "URGENT: Your bank account has been compromised",
                        "Your package will arrive tomorrow"
                    ],
                    inputs=input_text
                )
            
            with gr.Column():
                output_label = gr.Label(label="Result")
                confidence = gr.Textbox(label="Confidence")
                output_display = gr.HTML()
        
        submit_btn.click(
            fn=predict_message,
            inputs=input_text,
            outputs=[output_label, confidence, output_display]
        )
    
    demo.launch()
    if __name__ == "__main__":
        main()

* Running on local URL:  http://127.0.0.1:7862
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://d2ad261e7d0760130a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Epoch 1/10




[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 249ms/step - Precision: 0.4902 - Recall: 0.1728 - accuracy: 0.8620 - loss: 0.3677 - val_Precision: 0.8874 - val_Recall: 0.8933 - val_accuracy: 0.9704 - val_loss: 0.0950
Epoch 2/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 205ms/step - Precision: 0.9626 - Recall: 0.8942 - accuracy: 0.9814 - loss: 0.0752 - val_Precision: 0.9716 - val_Recall: 0.9133 - val_accuracy: 0.9848 - val_loss: 0.0514
Epoch 3/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 211ms/step - Precision: 0.9701 - Recall: 0.9502 - accuracy: 0.9891 - loss: 0.0413 - val_Precision: 0.9658 - val_Recall: 0.9400 - val_accuracy: 0.9874 - val_loss: 0.0458
Epoch 4/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 197ms/step - Precision: 0.9992 - Recall: 0.9746 - accuracy: 0.9965 - loss: 0.0116 - val_Precision: 0.9714 - val_Recall: 0.9067 - val_accuracy: 0.9839 - val_loss: 0.0633
Epoch 5/10
[1m70/70[0m [32m━