# Import libraries, initialise variables

In [89]:
import os
import re

from collections import Counter

import nltk
import requests
import urlextract
# import libraries
# try:
#   # %tensorflow_version only exists in Colab.
#   !pip install tf-nightly
# except Exception:
#   pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
# !pip install tensorflow-datasets
# import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score

print(tf.__version__)

2.16.0-dev20231022


In [76]:
DATE = "asdf"

DATADIR = 'data'
if not os.path.exists(DATADIR):
    os.makedirs(DATADIR)

stemmer = nltk.PorterStemmer()

# Files download and read

In [66]:
urls = [
    "https://cdn.freecodecamp.org/project-data/sms/train-data.tsv",
    "https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv"
]

for i in urls:
    filename = os.path.basename(i)
    filename_dir = os.path.join( DATADIR, filename )
    if os.path.exists(filename_dir):
        print(f'[INFO] Path {filename_dir} already exists! Skipping download.')
    else:
        r = requests.get(i)
        with open(filename_dir, 'wb') as f:
            f.write(r.content)
        print(f'[INFO] Downloaded from web to path {filename_dir}')

train_file_path = "data/train-data.tsv"
test_file_path = "data/valid-data.tsv"

[INFO] Path data\train-data.tsv already exists! Skipping download.
[INFO] Path data\valid-data.tsv already exists! Skipping download.


In [67]:
def read_tsv(path):
    """
    # component 1
    
    reads tsv from the path
    """
    df = pd.read_csv(
        path, 
        sep = '\t', 
        # header=None
        names = ['label', 'text']
    )
    print(f"[INFO] Data points = {len(df)}")
    return df

df_train = read_tsv(train_file_path)
df_train


[INFO] Data points = 4179


Unnamed: 0,label,text
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...
...,...,...
4174,ham,just woke up. yeesh its late. but i didn't fal...
4175,ham,what do u reckon as need 2 arrange transport i...
4176,spam,free entry into our £250 weekly competition ju...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...


# Data clean

In [68]:
df_train.isna().sum()
# no missing data points, so let's move on

label    0
text     0
dtype: int64

# Data preprocessing

In [69]:
df_train['text-proc'] = df_train['text']

In [70]:
### Replace multiple newline characters with one
df_train['text-proc'] = df_train['text-proc'].str.replace(r'\n+', '\n', regex=True)

df_train

Unnamed: 0,label,text,text-proc
0,ham,ahhhh...just woken up!had a bad dream about u ...,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ...","now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...,never y lei... i v lazy... got wat? dat day ü ...
...,...,...,...
4174,ham,just woke up. yeesh its late. but i didn't fal...,just woke up. yeesh its late. but i didn't fal...
4175,ham,what do u reckon as need 2 arrange transport i...,what do u reckon as need 2 arrange transport i...
4176,spam,free entry into our £250 weekly competition ju...,free entry into our £250 weekly competition ju...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...,-pls stop bootydelious (32/f) is inviting you ...


In [71]:
### Substitute URLs with "__URL__"
def replace_urls(string1):
    url_extractor = urlextract.URLExtract()
    url_list = url_extractor.find_urls(string1)
    for i in url_list:
        string1 = string1.replace(i, '__URL__')
    return string1

df_train['text-proc'] = df_train['text-proc'].apply( lambda x: replace_urls(x) )
### Replace all punctuation marks with whitespace
df_train['text-proc'] = df_train['text-proc'].apply( lambda x: re.sub(r'[^\w\s]', ' ', x) )

df_train

Unnamed: 0,label,text,text-proc
0,ham,ahhhh...just woken up!had a bad dream about u ...,ahhhh just woken up had a bad dream about u ...
1,ham,you can never do nothing,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ...",now u sound like manky scouse boy steve like ...
3,ham,mum say we wan to go then go... then she can s...,mum say we wan to go then go then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...,never y lei i v lazy got wat dat day ü ...
...,...,...,...
4174,ham,just woke up. yeesh its late. but i didn't fal...,just woke up yeesh its late but i didn t fal...
4175,ham,what do u reckon as need 2 arrange transport i...,what do u reckon as need 2 arrange transport i...
4176,spam,free entry into our £250 weekly competition ju...,free entry into our 250 weekly competition ju...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...,pls stop bootydelious 32 f is inviting you ...


In [72]:
### Remove numbers
df_train['text-proc'] = df_train['text-proc'].apply( lambda x: re.sub(r'[0-9]+', '__NUMBER__', x) )
### Replace multiple whitespace characters with just one
df_train['text-proc'] = df_train['text-proc'].str.replace(r' +', ' ', regex=True)
### Convert to lowercase
df_train['text-proc'] = df_train['text-proc'].str.lower()
df_train

Unnamed: 0,label,text,text-proc
0,ham,ahhhh...just woken up!had a bad dream about u ...,ahhhh just woken up had a bad dream about u th...
1,ham,you can never do nothing,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ...",now u sound like manky scouse boy steve like i...
3,ham,mum say we wan to go then go... then she can s...,mum say we wan to go then go then she can shun...
4,ham,never y lei... i v lazy... got wat? dat day ü ...,never y lei i v lazy got wat dat day ü send me...
...,...,...,...
4174,ham,just woke up. yeesh its late. but i didn't fal...,just woke up yeesh its late but i didn t fall ...
4175,ham,what do u reckon as need 2 arrange transport i...,what do u reckon as need __number__ arrange tr...
4176,spam,free entry into our £250 weekly competition ju...,free entry into our __number__ weekly competit...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...,pls stop bootydelious __number__ f is invitin...


In [84]:
train_labels = list(df_train['label'])

# Stemming
df_train_text = list(df_train['text-proc'])

data_processed = []
for i in df_train_text:
    i3 = i.split(' ')
    i3 = [stemmer.stem(i) for i in i3]
    word_counter = Counter(i3)
    word_counter = word_counter.most_common()
    data_processed.append(word_counter)

data_processed


[[('i', 3),
  ('up', 2),
  ('about', 2),
  ('u', 2),
  ('ahhhh', 1),
  ('just', 1),
  ('woken', 1),
  ('had', 1),
  ('a', 1),
  ('bad', 1),
  ('dream', 1),
  ('tho', 1),
  ('so', 1),
  ('dont', 1),
  ('like', 1),
  ('right', 1),
  ('now', 1),
  ('didnt', 1),
  ('know', 1),
  ('anyth', 1),
  ('comedi', 1),
  ('night', 1),
  ('but', 1),
  ('guess', 1),
  ('im', 1),
  ('for', 1),
  ('it', 1),
  ('', 1)],
 [('you', 1), ('can', 1), ('never', 1), ('do', 1), ('noth', 1)],
 [('u', 2),
  ('like', 2),
  ('now', 1),
  ('sound', 1),
  ('manki', 1),
  ('scous', 1),
  ('boy', 1),
  ('steve', 1),
  ('i', 1),
  ('is', 1),
  ('travel', 1),
  ('on', 1),
  ('da', 1),
  ('bu', 1),
  ('home', 1),
  ('wot', 1),
  ('ha', 1),
  ('inmind', 1),
  ('__number__', 1),
  ('recreat', 1),
  ('di', 1),
  ('eve', 1),
  ('', 1)],
 [('go', 2),
  ('then', 2),
  ('mum', 1),
  ('say', 1),
  ('we', 1),
  ('wan', 1),
  ('to', 1),
  ('she', 1),
  ('can', 1),
  ('shun', 1),
  ('bian', 1),
  ('watch', 1),
  ('da', 1),
  ('glass'

In [85]:
vocabulary = {}

for i in data_processed:
    for j in i:
        if j[0] not in vocabulary:
            vocabulary[j[0]] = j[1]
        else:
            vocabulary[j[0]] += j[1]

vocabulary = dict(sorted(vocabulary.items(), key=lambda item: item[1], reverse=True))
print(len(vocabulary))

def get_max_min(vocabulary):
    minKey, maxKey = min(vocabulary, key=vocabulary.get), max(vocabulary, key=vocabulary.get)
    print(f"""Max: "{maxKey}" = {vocabulary[maxKey]}; Min: "{minKey}" = {vocabulary[minKey]}""")
    print('-'*100)
get_max_min(vocabulary)


vocabulary_1000_keys = list(vocabulary.keys())[:1000]
vocabulary_1000 = {key:value for key, value in vocabulary.items() if key in vocabulary_1000_keys}
print(len(vocabulary_1000))
get_max_min(vocabulary_1000)
vocabulary_1000

5686
Max: "" = 2518; Min: "ahhhh" = 1
----------------------------------------------------------------------------------------------------
1000
Max: "" = 2518; Min: "manag" = 7
----------------------------------------------------------------------------------------------------


{'': 2518,
 '__number__': 2266,
 'i': 2233,
 'you': 1686,
 'to': 1681,
 'a': 1061,
 'the': 1007,
 'u': 865,
 'it': 742,
 'and': 722,
 'in': 684,
 'is': 663,
 'me': 595,
 'my': 575,
 'for': 542,
 'your': 525,
 'call': 500,
 'of': 475,
 'that': 458,
 's': 445,
 'have': 441,
 'on': 412,
 'do': 384,
 'now': 377,
 'are': 366,
 'go': 362,
 't': 359,
 'can': 357,
 'but': 348,
 'so': 343,
 'not': 339,
 'm': 335,
 'get': 333,
 'be': 326,
 'or': 322,
 'at': 319,
 'we': 306,
 'will': 303,
 'with': 294,
 'if': 293,
 'just': 289,
 'ur': 287,
 'no': 275,
 'thi': 269,
 'how': 243,
 'gt': 243,
 'lt': 242,
 'up': 235,
 'come': 235,
 'ok': 232,
 'what': 230,
 'free': 220,
 'when': 219,
 'out': 214,
 'day': 209,
 'all': 206,
 'know': 205,
 'from': 202,
 'like': 201,
 'time': 199,
 'll': 196,
 'love': 192,
 'then': 186,
 'good': 185,
 'got': 183,
 'he': 182,
 'want': 178,
 'there': 171,
 'text': 166,
 'onli': 163,
 'am': 163,
 'wa': 161,
 'send': 159,
 'hi': 154,
 '__url__': 152,
 'need': 149,
 'as': 143,

In [86]:
def process_data_per_vocab(data):
    output = []
    for i in data:
        int_vocab = { i:0 for i in list(vocabulary_1000.keys()) }
        for j in i:
            if j[0] in int_vocab:
                int_vocab[j[0]] += j[1]
        output.append(list(int_vocab.values()))
    return np.array(output)

data_processed2 = process_data_per_vocab(data_processed)
data_processed2.shape

(4179, 1000)

# Train

In [92]:
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, data_processed2, train_labels, cv=3, verbose=3)
print(score.mean())

log_clf.fit(data_processed2, train_labels)

[CV] END ................................ score: (test=0.985) total time=   0.1s
[CV] END ................................ score: (test=0.980) total time=   0.1s
[CV] END ................................ score: (test=0.983) total time=   0.1s
0.9825317061497967
