In [None]:
# import libraries
try:
  # %tensorflow_version only exists in Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download("popular")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# pre-process data
# Each file is read as two tab-separated columns: the label ('ham'/'spam')
# and the message text.
# header=None + names=... keeps the first line of each file as data. Without
# it pandas uses that line as column labels, and renaming the columns
# afterwards silently drops one message per file.
# NOTE(review): this assumes the TSVs ship without a header row - confirm
# with `!head -n 1 train-data.tsv`.
COLUMN_NAMES = ['type', 'message']

train_dataset = pd.read_csv(train_file_path, sep='\t', header=None, names=COLUMN_NAMES)
train_data, train_labels = train_dataset['message'], train_dataset['type']

test_dataset = pd.read_csv(test_file_path, sep='\t', header=None, names=COLUMN_NAMES)
test_data, test_labels = test_dataset['message'], test_dataset['type']


In [None]:
# split words and create dictionary
# Per-class lookup tables mapping token -> number of occurrences.
# Both start empty; two distinct dict objects are created.
spam_dict, ham_dict = {}, {}

def split_words(text):
  """Tokenize ``text`` with ``word_tokenize`` and drop punctuation-only tokens.

  Args:
    text: The message to tokenize.

  Returns:
    list: The tokens in their original order, without any token that is
    exactly one of the punctuation marks listed in ``chars``.
  """
  chars = {'.', ',', '/', '?', '(', ')', '&', '*', '#', '..', '...', '-'}
  # Build a new list instead of deleting while iterating: removing an item
  # mid-loop shifts the remaining tokens left, so the token right after each
  # removed one was never examined (e.g. the '?' following '...' survived).
  return [token for token in word_tokenize(text) if token not in chars]

# Count how often every token occurs in spam and in ham training messages.
# zip() visits every (message, label) pair; the previous
# range(len(train_data)-1) loop stopped one row early and ignored the last
# training message.
for text, label in zip(train_data, train_labels):
  # Pick the table once per message instead of duplicating the counting loop.
  word_counts = spam_dict if label == 'spam' else ham_dict
  for word in split_words(text):
    word_counts[word] = word_counts.get(word, 0) + 1

# for word, freq in spam_dict.items():
#     print(f"{word}: {freq}")

# for word, freq in ham_dict.items():
#     print(f"{word}: {freq}")


In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])

def predict_message(pred_text, spam_threshold=0.30, spam_only_bonus=2):
  """Label a message 'ham' or 'spam' using the training word counts.

  Every token's count in ``spam_dict`` and ``ham_dict`` is added to a spam
  total and a ham total. A token that does not occur in ``ham_dict`` has its
  spam count added ``spam_only_bonus`` additional times. The message is
  labelled 'spam' when the spam share of the combined total is at least
  ``spam_threshold``.

  Args:
    pred_text: Message text to classify.
    spam_threshold: Minimum spam share (between 0 and 1) for a 'spam' label.
    spam_only_bonus: Extra multiples of the spam count added for tokens that
      are absent from ``ham_dict``.

  Returns:
    list: ``[share, label]``. ``share`` is the fraction of the combined total
    that belongs to the predicted label (the ham share for 'ham', the spam
    share for 'spam'). If no token is known from training, or the text is
    empty, the result is ``[0.0, 'ham']``.

  Side effects:
    Prints the prediction before returning it.
  """
  spam_freq = 0
  ham_freq = 0

  for token in split_words(pred_text):
    spam_count = spam_dict.get(token, 0)
    spam_freq += spam_count
    if token not in ham_dict:
      # Spam-only vocabulary is weighted more heavily.
      spam_freq += spam_count * spam_only_bonus
    else:
      ham_freq += ham_dict[token]

  total_freq = ham_freq + spam_freq
  if total_freq == 0:
    # Nothing in the message was seen during training: there is no evidence
    # either way, so fall back to 'ham' rather than dividing by zero.
    prediction = [0.0, 'ham']
  else:
    spam_share = spam_freq / total_freq
    if spam_share < spam_threshold:
      prediction = [ham_freq / total_freq, 'ham']
    else:
      prediction = [spam_share, 'spam']

  print(prediction)
  return prediction

# Try the classifier on one sample message; predict_message prints the
# [share, label] result and it is also kept in `prediction`.
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Run predict_message on seven fixed messages and print a pass/fail line.

  Only the label (second element of each prediction) is compared with the
  expected answer; the numeric first element is not checked.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # Every message is evaluated even after a mismatch; a single wrong label
  # fails the whole challenge.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
