In [12]:
# import libraries
try:
  # %tensorflow_version only exists in Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

2.20.0-dev20250516


In [13]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-05-16 18:17:24--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv.1’


2025-05-16 18:17:24 (56.5 MB/s) - ‘train-data.tsv.1’ saved [358233/358233]

--2025-05-16 18:17:24--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv.1’


2025-05-16 18:17:24 (34.5 MB/s) - ‘valid-data.tsv.1’ saved [118774/118774]



In [14]:
import re

# Preprocess text to remove punctuation and lowercase
def preprocess_text(texts):
    """Lowercase each string and strip everything but letters, digits and whitespace.

    Args:
        texts: iterable of strings.

    Returns:
        A new list with one cleaned string per input, in the same order.
        Removed characters are deleted, not replaced by a space, so
        "up!had" becomes "uphad".
    """
    cleaned = []
    for text in texts:
        lowered = text.lower()
        cleaned.append(re.sub(r"[^a-zA-Z0-9\s]", "", lowered))
    return cleaned

# Read both TSV splits into DataFrames; header=None treats the first row as
# data, so the two column names are supplied here.
column_names = ['label', 'message']
train_data = pd.read_csv(train_file_path, sep='\t', header=None, names=column_names)
test_data = pd.read_csv(test_file_path, sep='\t', header=None, names=column_names)

# Replace the string labels with integers: 'ham' → 0, 'spam' → 1
label_to_id = {'ham': 0, 'spam': 1}
for split in (train_data, test_data):
    split['label'] = split['label'].map(label_to_id)

# Clean the raw message strings with preprocess_text
train_messages = preprocess_text(train_data['message'].values)
test_messages = preprocess_text(test_data['message'].values)

# Text vectorization
from tensorflow.keras.layers import TextVectorization

max_vocab_size = 1000
max_sequence_length = 100

vectorizer = TextVectorization(
    max_tokens=max_vocab_size,
    output_mode='int',
    output_sequence_length=max_sequence_length,
)
# The vocabulary is learned from the training messages only.
vectorizer.adapt(train_messages)

# Model inputs (integer token ids) and targets (0/1 labels) for each split
X_train, X_test = vectorizer(train_messages), vectorizer(test_messages)
y_train, y_test = train_data['label'].values, test_data['label'].values

In [15]:
from tensorflow import keras

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling repeat across Restart & Run All (some GPU ops may still
# be non-deterministic).
keras.utils.set_random_seed(42)

# Token ids -> 64-d embeddings -> bidirectional LSTM -> max over the
# sequence axis -> small dense head with one sigmoid output (spam score).
model = keras.Sequential([
    # input_dim matches the vectorizer's max_tokens. `input_length` is
    # deprecated in Keras 3 and was dropped: the sequence length is inferred
    # from the data on first call.
    keras.layers.Embedding(input_dim=max_vocab_size, output_dim=64),
    # return_sequences=True keeps one output per time step for the pooling layer.
    keras.layers.Bidirectional(keras.layers.LSTM(32, return_sequences=True)),
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model; X_test/y_test are only used to report val_* metrics per epoch.
model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

# Define prediction function
def predict_message(pred_text):
    """Classify a single SMS string as spam or ham.

    The text is cleaned with preprocess_text, turned into token ids by the
    fitted vectorizer, and scored with model.predict.

    Args:
        pred_text: the raw message string.

    Returns:
        [score, label]: score is the model output as a Python float; label
        is "spam" when score > 0.5, otherwise "ham".
    """
    token_ids = vectorizer(preprocess_text([pred_text]))
    # predict returns one row per input with a single column; take that scalar.
    spam_score = model.predict(token_ids)[0][0]
    if spam_score > 0.5:
        verdict = "spam"
    else:
        verdict = "ham"
    return [float(spam_score), verdict]



Epoch 1/5




[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8729 - loss: 0.4172 - val_accuracy: 0.9684 - val_loss: 0.0988
Epoch 2/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9775 - loss: 0.0772 - val_accuracy: 0.9799 - val_loss: 0.0620
Epoch 3/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9868 - loss: 0.0443 - val_accuracy: 0.9828 - val_loss: 0.0568
Epoch 4/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.9919 - loss: 0.0306 - val_accuracy: 0.9813 - val_loss: 0.0641
Epoch 5/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9926 - loss: 0.0245 - val_accuracy: 0.9828 - val_loss: 0.0610


In [16]:
# Smoke-test predict_message (defined in the training cell) on one message.
# It returns a list containing prediction and label, ex. [0.008318834938108921, 'ham']

# Test a single message

pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step
[0.0008196687558665872, 'ham']


In [17]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven hand-labelled messages.

  Prints a pass message only if the label (element 1 of the returned list)
  matches the expected answer for every message; otherwise prints a
  keep-trying message. Returns nothing.
  """
  # Local list; it does not touch the notebook-level test_messages variable.
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # No early exit: every message is predicted even after a mismatch.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
You passed the challenge. Great job!


In [18]:
# Score the trained model on X_test/y_test. This is the same split that was
# passed as validation_data during fit, so it is not an untouched hold-out set.
eval_metrics = model.evaluate(X_test, y_test)
loss, accuracy = eval_metrics
print("Test Accuracy:", accuracy)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9820 - loss: 0.0642
Test Accuracy: 0.982758641242981


In [19]:

# Score every message in the TRAINING DATA with the fitted model
train_predictions = model.predict(X_train)

# Index 0 -> "ham", index 1 -> "spam"; a score above 0.5 selects "spam"
label_names = ("ham", "spam")
train_pred_labels = [label_names[int(score > 0.5)] for score in train_predictions.flatten()]

# Ground-truth labels as strings, for side-by-side comparison
actual_labels = [label_names[int(flag == 1)] for flag in y_train]

# Show the first 10 messages with their true label, predicted label and score
for row in range(10):
    message = train_data['message'].iloc[row]
    print(f"Message: {message}")
    print(f"Actual: {actual_labels[row]}, Predicted: {train_pred_labels[row]}")
    print(f"Prediction Score: {train_predictions[row][0]:.4f}")
    print()


[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Message: ahhhh...just woken up!had a bad dream about u tho,so i dont like u right now :) i didnt know anything about comedy night but i guess im up for it.
Actual: ham, Predicted: ham
Prediction Score: 0.0009

Message: you can never do nothing
Actual: ham, Predicted: ham
Prediction Score: 0.0009

Message: now u sound like manky scouse boy steve,like! i is travelling on da bus home.wot has u inmind 4 recreation dis eve?
Actual: ham, Predicted: ham
Prediction Score: 0.0024

Message: mum say we wan to go then go... then she can shun bian watch da glass exhibition...
Actual: ham, Predicted: ham
Prediction Score: 0.0001

Message: never y lei... i v lazy... got wat? dat day ü send me da url cant work one...
Actual: ham, Predicted: ham
Prediction Score: 0.0001

Message: in xam hall boy asked girl tell me the starting term for dis answer i can den manage on my own after lot of hesitation n lookin around silently she sai

In [20]:
# Score every message in the TESTING DATA with the fitted model.
# NOTE(review): this rebinds the name test_predictions, which the grading cell
# also uses for its test_predictions() function. After this cell runs, that
# function is no longer reachable until the grading cell is re-run.
test_predictions = model.predict(X_test)

# Index 0 -> "ham", index 1 -> "spam"; a score above 0.5 selects "spam"
label_names = ("ham", "spam")
test_pred_labels = [label_names[int(score > 0.5)] for score in test_predictions.flatten()]

# Ground-truth labels as 'ham' or 'spam'
actual_test_labels = [label_names[int(flag == 1)] for flag in y_test]

# Show the first 10 messages with their true label, predicted label and score
for row in range(10):
    message = test_data['message'].iloc[row]
    print(f"Message: {message}")
    print(f"Actual: {actual_test_labels[row]}, Predicted: {test_pred_labels[row]}")
    print(f"Prediction Score: {test_predictions[row][0]:.4f}")
    print()


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Message: i am in hospital da. . i will return home in evening
Actual: ham, Predicted: ham
Prediction Score: 0.0001

Message: not much, just some textin'. how bout you?
Actual: ham, Predicted: ham
Prediction Score: 0.0010

Message: i probably won't eat at all today. i think i'm gonna pop. how was your weekend? did u miss me?
Actual: ham, Predicted: ham
Prediction Score: 0.0001

Message: don‘t give a flying monkeys wot they think and i certainly don‘t mind. any friend of mine and all that!
Actual: ham, Predicted: ham
Prediction Score: 0.0046

Message: who are you seeing?
Actual: ham, Predicted: ham
Prediction Score: 0.0151

Message: your opinion about me? 1. over 2. jada 3. kusruthi 4. lovable 5. silent 6. spl character 7. not matured 8. stylish 9. simple pls reply..
Actual: ham, Predicted: ham
Prediction Score: 0.0037

Message: yesterday its with me only . now am going home.
Actual: ham, Predicted: ham
Prediction S