In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
print(tf.__version__)  # should show 2.15.x


In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Read both tab-separated files into DataFrames. Because `names=` is supplied,
# pandas treats the first line as data (assumes the files ship without a
# header row -- TODO confirm).
train_data = pd.read_csv(train_file_path, sep="\t", names=["label", "message"])
test_data = pd.read_csv(test_file_path, sep="\t", names=["label", "message"])

# A notebook cell only renders its *last* expression automatically, so the
# training preview has to be shown explicitly or it is silently discarded.
display(train_data.head())
test_data.head()

In [None]:
# Encode the string labels as integers: spam -> 1, ham -> 0.
# The "label" column is overwritten in place, so only map while it still holds
# the raw strings: mapping an already-encoded column would find no "spam"/"ham"
# keys and turn every label into NaN when this cell is run a second time.
label_ids = {"spam": 1, "ham": 0}
for frame in (train_data, test_data):
    if not pd.api.types.is_numeric_dtype(frame["label"]):
        frame["label"] = frame["label"].map(label_ids)

# Plain NumPy arrays of message texts and encoded labels for the later cells.
train_text = train_data["message"].values
train_label = train_data["label"].values
test_text = test_data["message"].values
test_label = test_data["label"].values

In [None]:
# Tokenizer settings: words the tokenizer does not keep are replaced by the
# out-of-vocabulary token, and every sequence is padded/truncated to max_length.
oov_tok = '<OOV>'
max_length = 1000

# The word index is fitted on the training messages only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=10000)
tokenizer.fit_on_texts(train_text)

# Messages -> lists of integer word ids, for both splits.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_text, test_text)
)

# Integer sequences -> fixed-width arrays (pad_sequences default padding/truncation).
padded_train_sequences, padded_test_sequences = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (train_sequences, test_sequences)
)

In [None]:
# Build the classifier layer by layer:
# embedding -> bidirectional LSTM -> L2-regularised dense layer -> sigmoid output.
model = tf.keras.Sequential()

# mask_zero=True makes id 0 (the padding value) a masked position for later layers.
model.add(
    tf.keras.layers.Embedding(
        input_dim=10000,
        output_dim=128,
        input_length=max_length,
        mask_zero=True,
    )
)
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)
    )
)
model.add(
    tf.keras.layers.Dense(
        units=32,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )
)
# Single sigmoid unit: one value in [0, 1] per message.
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is reported per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=["accuracy"],
)

# Train the model
# The validation file is evaluated after every epoch via validation_data.
model.fit(
    x=padded_train_sequences,
    y=train_label,
    epochs=5,
    batch_size=32,
    validation_data=(padded_test_sequences, test_label),
)

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(message):
    """Classify a single message as spam or ham with the trained model.

    Args:
        message: The raw message text (one string).

    Returns:
        A two-element list ``[probability, label]``: ``probability`` is the
        model's sigmoid output converted to a Python float, and ``label`` is
        ``"spam"`` when that value is at least 0.5, otherwise ``"ham"``.
    """
    # texts_to_sequences expects a list of texts, hence the one-element list.
    seq = tokenizer.texts_to_sequences([message])
    # Pad/truncate exactly like the training data (pad_sequences defaults, i.e.
    # 'pre'), so the inference input has the same layout the model was fitted on
    # and over-long messages are cut from the same end.
    padded = pad_sequences(seq, maxlen=max_length)
    # predict returns one row per input with a single sigmoid value in it.
    pred = model.predict(padded)[0][0]
    label = "spam" if pred >= 0.5 else "ham"
    return [float(pred), label]



# Try the predictor on one everyday message and print what it returns.
pred_text = "how are you doing today?"
print(prediction := predict_message(pred_text))

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Only the label (second element of each prediction) is compared with the
  expected answer; the probability is ignored. Prints a success message when
  every label matches and a "keep trying" message otherwise. Returns None.
  """
  # NOTE(review): the "Â£" below looks like a mis-decoded "£"; it is left
  # untouched because it is runtime text and this cell must not be modified.
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won Â£1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single mismatch fails the whole check; the loop still runs every message.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

# Run the check as soon as the cell executes.
test_predictions()
