In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build; the try/except keeps this step best-effort.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Load both tab-separated splits into DataFrames. Column names are passed
# explicitly, so the first line of each file is read as data, not as a header.
# The 'oldlabel' strings are later encoded as 0: ham message, 1: spam message.
_tsv_columns = ["oldlabel", "text"]
df_train, df_test = (
    pd.read_csv(tsv_path, sep="\t", names=_tsv_columns)
    for tsv_path in (train_file_path, test_file_path)
)
def ham_to_numeric(text):
  """Encode a label string as an integer: 'ham' -> 0, 'spam' -> 1.

  Any other value falls through and yields None.
  """
  # The position in the tuple is the numeric code of each label.
  for code, label_name in enumerate(('ham', 'spam')):
    if text == label_name:
      return code
  return None
# Replace the textual 'oldlabel' column with a numeric 'label' column in both
# frames: pop() removes the old column and returns it, map() encodes it.
for _frame in (df_train, df_test):
  _frame['label'] = _frame.pop('oldlabel').map(ham_to_numeric)
# Last expression of the cell: preview the first ten training rows.
df_train.head(10)


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# TOKENIZE
# Batching / buffering constants read by the dataset and training code below.
BATCH_SIZE, BUFFER_SIZE = 64, 5

#Change dataset to numpy format
def _text_and_labels(frame):
  """Return (text array, flattened label array) taken from one DataFrame."""
  return frame['text'].to_numpy(), frame['label'].to_numpy().flatten()

train_text, train_labels = _text_and_labels(df_train)
valid_text, valid_labels = _text_and_labels(df_test)

#Set tokenizer
# The vocabulary is fitted on the training text only; '<unk>' is the
# out-of-vocabulary token.
tok = Tokenizer(oov_token='<unk>')
tok.fit_on_texts(train_text)
# Register index 0 as an explicit '<pad>' entry in both lookup tables.
tok.word_index.update({'<pad>': 0})
tok.index_word.update({0: '<pad>'})

#Tokenize
def _to_padded_ids(texts):
  """Turn texts into integer id sequences, padded at the end ('post').

  No maxlen is given, so the padded width is decided per call; the training
  and validation arrays are therefore padded independently of each other.
  """
  id_sequences = tok.texts_to_sequences(texts)
  return tf.keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')

train_seqs = _to_padded_ids(train_text)

valid_seqs = _to_padded_ids(valid_text)

# Pair each padded id sequence with its label as tf.data pipelines.
train_ds = tf.data.Dataset.from_tensor_slices((train_seqs, train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_seqs, valid_labels))

# A shuffle buffer only mixes the elements it currently holds, so a 5-element
# buffer leaves the training order almost identical to the file order. A buffer
# as large as the dataset gives a uniform shuffle; the data is small enough to
# hold in memory (it already is, as train_seqs).
train_ds = train_ds.shuffle(len(train_seqs)).batch(BATCH_SIZE)
# Validation data is only batched; its order does not affect the metrics.
valid_ds = valid_ds.batch(BATCH_SIZE)

In [None]:
#Define and fit model
from tensorflow.keras import layers

embedding_dim = 256
# The embedding table needs one row per token id, i.e. (largest id + 1).
# Deriving it from the fitted tokenizer avoids out-of-range lookups that a
# hard-coded size would cause if the vocabulary ever exceeded it.
VOCAB_SIZE = max(tok.word_index.values()) + 1

# Token ids -> embeddings -> per-timestep LSTM outputs -> mean over time ->
# single sigmoid unit giving the probability of label 1 (spam).
model = keras.Sequential([
    layers.Embedding(VOCAB_SIZE, embedding_dim),
    layers.LSTM(64, return_sequences=True),
    layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# No steps_per_epoch / validation_steps: both datasets are finite, so every
# epoch runs over all training batches and validates on all validation
# batches. (Previously an epoch was capped at 5 training batches and
# validation used a single batch.)
history = model.fit(
    train_ds,
    epochs=10,
    validation_data=valid_ds)

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])


def predict_message(pred_text):
  """Classify one SMS message with the trained model.

  Args:
    pred_text: the message text (a single string).

  Returns:
    A two-element list [probability, label]: probability is the model's
    sigmoid output as a Python float, and label is 'ham' when that value is
    below 0.5, otherwise 'spam'.
  """
  token_ids = tok.texts_to_sequences([pred_text])
  # Pad to the width of the training matrix so inference sees the same input
  # layout the model was trained on (the model averages over every timestep,
  # pad positions included). pad_sequences also returns a NumPy array, which
  # model.predict accepts where a nested Python list may not, and it turns a
  # message with no tokens into an all-pad row instead of a zero-length input.
  # Messages longer than the training width keep their first tokens.
  model_input = tf.keras.preprocessing.sequence.pad_sequences(
      token_ids, maxlen=train_seqs.shape[1], padding='post', truncating='post')
  spam_probability = float(model.predict(model_input)[0][0])
  label = 'ham' if spam_probability < 0.5 else 'spam'
  return [spam_probability, label]

pred_text = "how are you doing today"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages with known labels.

  Only the label (element 1 of the list returned by predict_message) is
  compared; the numeric score is ignored. Prints a success message when all
  seven labels match and a keep-trying message otherwise. Returns nothing.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # There is no early exit: every message is evaluated even after a mismatch.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
