# Import libraries, initialise variables

In [25]:
import os
import re
import pickle
from collections import Counter

import nltk
import requests
import urlextract
# import libraries
# try:
#   # %tensorflow_version only exists in Colab.
#   !pip install tf-nightly
# except Exception:
#   pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
# !pip install tensorflow-datasets
# import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score

print(tf.__version__)

2.16.0-dev20231022


In [26]:
DATE = "asdf"

def create_dir_if_not_exists(path):
    if not os.path.exists(path):
        os.makedirs(path)

DATADIR = 'data'
create_dir_if_not_exists(DATADIR)

MODELDIR = 'models'
create_dir_if_not_exists(MODELDIR)

stemmer = nltk.PorterStemmer()

# Files download and read

In [17]:
urls = [
    "https://cdn.freecodecamp.org/project-data/sms/train-data.tsv",
    "https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv"
]

for i in urls:
    filename = os.path.basename(i)
    filename_dir = os.path.join( DATADIR, filename )
    if os.path.exists(filename_dir):
        print(f'[INFO] Path {filename_dir} already exists! Skipping download.')
    else:
        r = requests.get(i)
        with open(filename_dir, 'wb') as f:
            f.write(r.content)
        print(f'[INFO] Downloaded from web to path {filename_dir}')

train_file_path = "data/train-data.tsv"
test_file_path = "data/valid-data.tsv"

[INFO] Path data\train-data.tsv already exists! Skipping download.
[INFO] Path data\valid-data.tsv already exists! Skipping download.


In [18]:
def read_tsv(path):
    """
    # component 1
    
    reads tsv from the path
    """
    df = pd.read_csv(
        path, 
        sep = '\t', 
        # header=None
        names = ['label', 'text']
    )
    print(f"[INFO] Data points = {len(df)}")
    return df

df_train = read_tsv(train_file_path)
df_train


[INFO] Data points = 4179


Unnamed: 0,label,text
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...
...,...,...
4174,ham,just woke up. yeesh its late. but i didn't fal...
4175,ham,what do u reckon as need 2 arrange transport i...
4176,spam,free entry into our £250 weekly competition ju...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...


In [19]:
train_labels, train_features = list(df_train['label']), list(df_train['text'])
print(len(train_labels), train_labels[:5])
print(len(train_features), train_features[:5])

4179 ['ham', 'ham', 'ham', 'ham', 'ham']
4179 ['ahhhh...just woken up!had a bad dream about u tho,so i dont like u right now :) i didnt know anything about comedy night but i guess im up for it.', 'you can never do nothing', 'now u sound like manky scouse boy steve,like! i is travelling on da bus home.wot has u inmind 4 recreation dis eve?', 'mum say we wan to go then go... then she can shun bian watch da glass exhibition...', 'never y lei... i v lazy... got wat? dat day ü send me da url cant work one...']


# Data clean

In [20]:
df_train.isna().sum()
# no missing data points, so let's move on

label    0
text     0
dtype: int64

# Data preprocessing

In [22]:
def preprocess(input_data: list) -> list:
    """
    Pipeline component 1
    
    process list of texts to a list of lists with stem counts.
    """
    data_processed = []
    ### initialise 'url_extractor' object
    url_extractor = urlextract.URLExtract()
    for i in input_data:
        ### Replace links with "__url__"
        url_list = url_extractor.find_urls(i)
        for j in url_list:
            i = i.replace(j, '__url__')
        ### Replace all punctuation marks with whitespace
        i = re.sub(r'[^\w\s]', ' ', i)
        ### Replace all newlines and tabs with whitespace
        i = re.sub(r'[\n]+', ' ', i)
        i = re.sub(r'[\t]+', ' ', i)
        ### Convert to lowercase
        i = i.lower()
        ### Remove numbers
        # i = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', '_number_', i)
        i = re.sub(r' ([0-9]+) ', '__number__', i)
        ### Stemming
        i2 = i.split(' ')
        i2 = [stemmer.stem(j) for j in i2]
        ### Word counter
        word_counter = Counter(i2)
        word_counter = word_counter.most_common()
        ### Append the result
        data_processed.append(word_counter)
    return data_processed

# input = ['line1\nline2\n\n\tline3.,69,']
train_features2 = preprocess(train_features)
train_features2


[[('', 6),
  ('i', 3),
  ('up', 2),
  ('about', 2),
  ('u', 2),
  ('ahhhh', 1),
  ('just', 1),
  ('woken', 1),
  ('had', 1),
  ('a', 1),
  ('bad', 1),
  ('dream', 1),
  ('tho', 1),
  ('so', 1),
  ('dont', 1),
  ('like', 1),
  ('right', 1),
  ('now', 1),
  ('didnt', 1),
  ('know', 1),
  ('anyth', 1),
  ('comedi', 1),
  ('night', 1),
  ('but', 1),
  ('guess', 1),
  ('im', 1),
  ('for', 1),
  ('it', 1)],
 [('you', 1), ('can', 1), ('never', 1), ('do', 1), ('noth', 1)],
 [('u', 2),
  ('like', 2),
  ('', 2),
  ('now', 1),
  ('sound', 1),
  ('manki', 1),
  ('scous', 1),
  ('boy', 1),
  ('steve', 1),
  ('i', 1),
  ('is', 1),
  ('travel', 1),
  ('on', 1),
  ('da', 1),
  ('bu', 1),
  ('home', 1),
  ('wot', 1),
  ('ha', 1),
  ('inmind__number__recr', 1),
  ('di', 1),
  ('eve', 1)],
 [('', 6),
  ('go', 2),
  ('then', 2),
  ('mum', 1),
  ('say', 1),
  ('we', 1),
  ('wan', 1),
  ('to', 1),
  ('she', 1),
  ('can', 1),
  ('shun', 1),
  ('bian', 1),
  ('watch', 1),
  ('da', 1),
  ('glass', 1),
  ('exhi

In [27]:
def construct_vocabulary_1000(data_processed: list) -> dict:
    vocabulary = {}

    for i in data_processed:
        for j in i:
            if j[0] not in vocabulary:
                vocabulary[j[0]] = j[1]
            else:
                vocabulary[j[0]] += j[1]

    vocabulary = dict(sorted(vocabulary.items(), key=lambda item: item[1], reverse=True))
    print(len(vocabulary))

    def get_max_min(vocabulary):
        minKey, maxKey = min(vocabulary, key=vocabulary.get), max(vocabulary, key=vocabulary.get)
        print(f"""Max: "{maxKey}" = {vocabulary[maxKey]}; Min: "{minKey}" = {vocabulary[minKey]}""")
        print('-'*100)
    get_max_min(vocabulary)


    vocabulary_1000_keys = list(vocabulary.keys())[:1000]
    vocabulary_1000 = {key:value for key, value in vocabulary.items() if key in vocabulary_1000_keys}
    print(len(vocabulary_1000))
    get_max_min(vocabulary_1000)
    vocabulary_1000
    return vocabulary_1000

vocabulary = construct_vocabulary_1000(train_features2)
vocabulary
with open(f'{MODELDIR}/manual-bow_vocabulary-dict.pkl', 'wb') as file:
    pickle.dump(vocabulary, file)



6664
Max: "" = 14645; Min: "ahhhh" = 1
----------------------------------------------------------------------------------------------------
1000
Max: "" = 14645; Min: "silent" = 7
----------------------------------------------------------------------------------------------------


In [33]:
def process_data_per_vocab(vocabulary_path: str, data: list) -> np.array:
    """
    Pipeline component 2
    
    Creates multi-dimensional vector BoW as per specified vocabulary from a list of lists
    """
    with open(vocabulary_path, 'rb') as file:
        vocabulary = pickle.load(file)
    output = []
    for i in data:
        int_vocab = { i:0 for i in list(vocabulary.keys()) }
        for j in i:
            if j[0] in int_vocab:
                int_vocab[j[0]] += j[1]
        output.append(list(int_vocab.values()))
    return np.array(output)

train_features3 = process_data_per_vocab('models/manual-bow_vocabulary-dict.pkl', train_features2)
train_features3.shape


(4179, 1000)

In [37]:
def pipeline(vocabulary_path: str, input_data: list[str]):
    """
    Full pipeline = component 1 + component 2
    """
    input_data2 = preprocess(input_data)
    output_data = process_data_per_vocab(vocabulary_path, input_data2)
    return output_data

df_train = read_tsv(train_file_path)
train_labels, train_features_raw = list(df_train['label']), list(df_train['text'])
train_features = pipeline('models/manual-bow_vocabulary-dict.pkl', train_features_raw)


[INFO] Data points = 4179


# Train

In [38]:
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, train_features, train_labels, cv=3, verbose=3)
print(score.mean())

log_clf.fit(train_features, train_labels)


[CV] END ................................ score: (test=0.978) total time=   0.3s
[CV] END ................................ score: (test=0.978) total time=   0.3s
[CV] END ................................ score: (test=0.982) total time=   0.3s
0.9791816223977028


# Test (test set)

In [59]:
df_test = read_tsv(test_file_path)
test_labels, test_features_raw = list(df_test['label']), list(df_test['text'])
test_features = pipeline('models/manual-bow_vocabulary-dict.pkl', test_features_raw)


[INFO] Data points = 1392


In [80]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

predictions = list(log_clf.predict(test_features))
print( accuracy_score(test_labels, predictions) )

predictions = list(log_clf.predict(test_features))
print( f1_score(predictions, test_labels, pos_label='spam') )


0.9827586206896551
0.9333333333333333


# Test (freecodecamp)

In [84]:
def predict_message(text):
    """
    function to predict messages based on model

    (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
    """
    prediction = []
    features = pipeline('models/manual-bow_vocabulary-dict.pkl', [text])
    ### Get class probability
    model = log_clf
    probabilities = model.predict_proba(features)
    max_index = np.argmax(probabilities)
    prediction.append(probabilities[0][max_index])
    ### Get predicted class
    pred_class = model.predict(features)
    prediction.append(pred_class[0])
    return prediction

pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)

pred_text = "You have just won a lottery!!! Collect your prize now"
prediction = predict_message(pred_text)
print(prediction)


[0.9864917926891434, 'ham']
[0.8670216570022804, 'spam']


In [85]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


You passed the challenge. Great job!
