In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build; the "!" shell escape only works in IPython/Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
print(tf.__version__)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

Data Preprocessing: First, you need to preprocess the data to prepare it for training the model. Load the data from the TSV files using pandas and split it into input features (messages) and target labels (ham/spam).

In [None]:
# The SMS TSV files carry no header row: each line is "<label>\t<message>".
# Without header=None pandas would consume the first sample of each file as
# column names and silently drop it from the data.
column_names = ["label", "message"]
train_data = pd.read_csv(train_file_path, sep='\t', header=None, names=column_names)
test_data = pd.read_csv(test_file_path, sep='\t', header=None, names=column_names)

# Fail early if the files are not in the expected two-class layout; the model
# below has a single sigmoid output and cannot represent any other label.
for split_name, frame in (("train", train_data), ("test", test_data)):
    unexpected = set(frame["label"]) - {"ham", "spam"}
    assert not unexpected, f"unexpected labels in {split_name} split: {unexpected}"

# Input features (raw message text) and targets ('ham' / 'spam') per split.
train_messages = train_data["message"]
train_labels = train_data["label"]

test_messages = test_data["message"]
test_labels = test_data["label"]


Text Encoding: Convert the text messages into numerical representations that can be processed by the machine learning model. You can use the Tokenizer class from Keras to tokenize the messages and create word-to-index mappings.

In [None]:
# Vocabulary size limit handed to the Tokenizer (also reused as the Embedding
# layer's input_dim further down); tune it to the data set.
num_words = 10_000

# The vocabulary is learned from the training messages only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_messages)

# Map every message of both splits to its list of integer word indices.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_messages, test_messages)
)

Padding Sequences: Since messages vary in length, the integer sequences must be padded (or truncated) to a common length. This gives every example the same shape, which is required for feeding the data into the model in batches.

In [None]:
# Common sequence length for both splits; choose a value that suits the data.
max_length = 100

# Bring the training and test sequences to max_length in one pass.
train_sequences, test_sequences = [
    pad_sequences(split_sequences, maxlen=max_length)
    for split_sequences in (train_sequences, test_sequences)
]

Model Architecture: Now, define the architecture of your machine learning model. You can use a combination of layers like Embedding, LSTM, and Dense to build a sequential model.

In [None]:
# Size of the vector each word index is embedded into.
embedding_dim = 100

# Embedding -> LSTM -> single sigmoid unit, declared as one layer list.
# The sigmoid output is the score that predict_message thresholds at 0.5.
model = Sequential([
    Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_length),
    LSTM(64),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Materialise the training labels as a numpy array for Keras/scikit-learn.
train_labels = np.array(train_labels)

# Pad the training sequences to max_length (repeats the padding step above).
train_sequences = pad_sequences(sequences=train_sequences, maxlen=max_length)

Compile and Train the Model: Compile the model by specifying the loss function, optimizer, and evaluation metrics. Then, train the model on the training data.

In [None]:
# Turn the string labels into integer class ids.
# NOTE(review): predict_message treats scores >= 0.5 as 'spam', which presumes
# the encoder maps 'ham' -> 0 and 'spam' -> 1 - confirm via label_encoder.classes_.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the padded training sequences.
model.fit(x=train_sequences, y=train_labels, batch_size=64, epochs=10)

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(message):
    """Classify a single SMS text as 'ham' or 'spam' with the trained model.

    Args:
        message: The raw message text to classify.

    Returns:
        A two-element list ``[probability, label]`` where ``probability`` is
        the model's output for the message as a plain Python float and
        ``label`` is 'ham' when that value is below 0.5, otherwise 'spam'.
    """
    # Apply the same preprocessing as for the training data: word indices from
    # the fitted tokenizer, then padding to the fixed max_length.
    sequence = tokenizer.texts_to_sequences([message])
    sequence = pad_sequences(sequence, maxlen=max_length)

    # verbose=0 keeps Keras from printing a progress bar for every single call.
    # float() turns the numpy scalar into a built-in float so the returned list
    # matches the documented example and prints/serialises cleanly.
    probability = float(model.predict(sequence, verbose=0)[0][0])
    label = 'ham' if probability < 0.5 else 'spam'

    return [probability, label]

In [None]:
# Quick manual check: classify one sample message and show [probability, label].
message = "Hello, you've won a free vacation! Claim your prize now."
print(prediction := predict_message(message))

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages with known labels.

  Each message is passed to predict_message and the second element of the
  returned list is compared with the expected 'ham'/'spam' answer. All
  messages are evaluated even after a mismatch; a single pass/fail line is
  printed at the end. Nothing is returned.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label (index 1) is checked; the probability value is ignored.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
