In [None]:
"""Copy of fcc_sms_text_classification.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1bAMI4OKk6OSALvzq2Z5cJ1Rv1Vw66VQ_
"""

In [3]:
# import libraries
import pandas as pd
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline



In [2]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 1.7 MB/s eta 0:00:05
   --- ------------------------------------ 0.8/8.7 MB 1.5 MB/s eta 0:00:06
   ---- ----------------------------------- 1.0/8.7 MB 1.5 MB/s eta 0:00:06
   ------ --------------------------------- 1.3/8.7 MB 1.4

In [5]:
import urllib.request

In [6]:
# Download the training and validation TSV files into the working directory.
urllib.request.urlretrieve(
    url="https://cdn.freecodecamp.org/project-data/sms/train-data.tsv",
    filename="train-data.tsv",
)
urllib.request.urlretrieve(
    url="https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv",
    filename="valid-data.tsv",
)

('valid-data.tsv', <http.client.HTTPMessage at 0x1dbcd2d7950>)

In [7]:
# Local TSV filenames; the "test" split is read from the validation file.
train_file_path, test_file_path = "train-data.tsv", "valid-data.tsv"

In [8]:
# Read both tab-separated files; they carry no header row, so the two
# columns are named explicitly (label first, then the message text).
column_names = ["label", "message"]
train_data = pd.read_table(train_file_path, names=column_names)
test_data = pd.read_table(test_file_path, names=column_names)

In [9]:
# Training split: message text is the feature, label is the target.
X_train, y_train = train_data["message"], train_data["label"]

In [10]:
# Held-out split: same feature/target columns as the training split.
X_test, y_test = test_data["message"], test_data["label"]

In [11]:
# Text classifier: TF-IDF features fed into a multinomial Naive Bayes model.
model = Pipeline(
    steps=[
        (
            "vectorizer",
            TfidfVectorizer(
                ngram_range=(1, 2),    # single words and two-word phrases
                max_features=5000,     # cap the vocabulary size
                stop_words="english",  # drop common English filler words
            ),
        ),
        # alpha is the additive smoothing strength for unseen terms.
        ("classifier", MultinomialNB(alpha=0.3)),
    ]
)

In [12]:
# Fit every pipeline step on the training messages and labels; as the last
# expression of the cell, the fitted pipeline is also displayed.
model.fit(X_train, y_train)

0,1,2
,steps,"[('vectorizer', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,0.3
,force_alpha,True
,fit_prior,True
,class_prior,


In [13]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    """Classify a single SMS message as spam or ham.

    Args:
        pred_text: The raw message text to classify.

    Returns:
        A two-element list ``[spam_probability, label]``. The first item is
        a plain Python float holding the model's estimated probability that
        the message is spam; the second is ``"spam"`` when that probability
        is greater than 0.5 and ``"ham"`` otherwise.

    Raises:
        ValueError: If the fitted model has no ``"spam"`` class.
    """
    # predict_proba columns follow the order of model.classes_, so locate the
    # spam column by name instead of assuming it always sits at position 1.
    spam_index = list(model.classes_).index("spam")
    probs = model.predict_proba([pred_text])[0]
    # Convert to a builtin float so the result prints as a bare number
    # rather than as np.float64(...).
    spam_prob = float(probs[spam_index])
    label = "spam" if spam_prob > 0.5 else "ham"
    return [spam_prob, label]

In [14]:
# Sanity check: an everyday conversational message, expected to be labelled "ham".
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)

[np.float64(0.00468952552538409), 'ham']


In [15]:
# Sanity check: a prize-style promotional message, expected to be labelled "spam".
pred_text = "WINNER!! You have won a $1000 Walmart gift card. Click here."
prediction = predict_message(pred_text)
print(prediction)

[np.float64(0.7929608092871687), 'spam']


In [16]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Each message is passed to predict_message and the returned label (the
  second element of its result) is compared with the expected "ham"/"spam"
  answer. A success message is printed only if every label matches;
  otherwise a retry message is printed. Nothing is returned.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, positionally aligned with test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label is checked; the probability in prediction[0] is ignored.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

In [17]:
# Run the grader; it prints whether all seven expected labels were matched.
test_predictions()

You passed the challenge. Great job!
