In [1]:
import tensorflow as tf
import os
import io

In [2]:
tf.__version__

'2.7.0'

In [3]:
# Download the zip file
path_to_zip_file = tf.keras.utils.get_file("smsspamcollection.zip",
origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",
extract=True)

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip


In [4]:
# Unzip the file into a folder
import zipfile
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall("sms_spam_dataset")

In [5]:
os.listdir("sms_spam_dataset")

['readme', 'SMSSpamCollection']

In [6]:
# Let's see if we read the data correctly
lines = io.open('sms_spam_dataset/SMSSpamCollection').read().strip().split('\n')
lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [7]:
### seperate with tab  \t  and split into two columns label and text

In [8]:
spam_data = []
for line in lines :
    label,text = line.split("\t")
    if label.strip() == "spam":
        spam_data.append((1,text.strip()))
    else:
        spam_data.append((0,text.strip()))

In [9]:
## Check length
len(spam_data)

5574

In [10]:
## Text Normalization

In [11]:
import pandas as pd
df = pd.DataFrame(spam_data, columns=['spam', 'message'])

In [12]:
df.head()

Unnamed: 0,spam,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
import re
def message_length(x):
# returns total number of characters
    return len(x)

In [14]:
def num_capitals(x):
    ##function: substitutions are performed for the capital letters in
    ##English. The count of these substitutions provides the count of capital letters.
    # only works in english
    _, count = re.subn(r'[A-Z]', '', x)
    return count

In [15]:
def num_punctuation(x):
    _, count = re.subn(r'\W', '', x)
    return count

In [16]:
df['capitals'] = df['message'].apply(num_capitals)
df['punctuation'] = df['message'].apply(num_punctuation)
df['length'] = df['message'].apply(message_length)
df.describe()

Unnamed: 0,spam,capitals,punctuation,length
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488
std,0.340699,11.683233,14.825994,59.841746
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [17]:
train=df.sample(frac=0.8,random_state=42)
test=df.drop(train.index)

In [18]:
train.shape

(4459, 5)

In [19]:
x_train = train[['length', 'capitals', 'punctuation']]
y_train = train[['spam']]
x_test = test[['length', 'capitals', 'punctuation']]
y_test = test[['spam']]

In [20]:
## Modeling normalized data

In [21]:
def make_model(input_dims = 3,
              num_units=12):
    model = tf.keras.Sequential()
    
    ## Adding a densely connected layer with 12 units
    model.add(tf.keras.layers.Dense(num_units,
        input_dim=input_dims,activation="relu"))
    # Add a sigmoid layer with a binary output unit:
    model.add(tf.keras.layers.Dense(1,activation = "sigmoid"))
    model.compile(loss="binary_crossentropy",optimizer = "adam",
                 metrics = ["accuracy"])
    return model

In [22]:
model = make_model()

In [23]:
model.fit(x_train,y_train,epochs = 5,batch_size = 16)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8240e613d0>

In [24]:
model.evaluate(x_test,y_test)



[0.34710294008255005, 0.8726457357406616]

In [27]:
import numpy as np

In [33]:
predict_x=model.predict(x_train) 
y_train_pred=np.argmax(predict_x,axis=1)

In [35]:
tf.math.confusion_matrix(tf.constant(np.array(y_train.spam)),y_train_pred)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[3867,    0],
       [ 592,    0]], dtype=int32)>

In [36]:
!pip install stanfordnlp

Collecting stanfordnlp
  Downloading stanfordnlp-0.2.0-py3-none-any.whl (158 kB)
[?25l[K     |██                              | 10 kB 16.1 MB/s eta 0:00:01[K     |████▏                           | 20 kB 9.7 MB/s eta 0:00:01[K     |██████▏                         | 30 kB 8.6 MB/s eta 0:00:01[K     |████████▎                       | 40 kB 7.6 MB/s eta 0:00:01[K     |██████████▎                     | 51 kB 5.7 MB/s eta 0:00:01[K     |████████████▍                   | 61 kB 5.7 MB/s eta 0:00:01[K     |██████████████▌                 | 71 kB 5.7 MB/s eta 0:00:01[K     |████████████████▌               | 81 kB 6.3 MB/s eta 0:00:01[K     |██████████████████▋             | 92 kB 5.1 MB/s eta 0:00:01[K     |████████████████████▋           | 102 kB 4.4 MB/s eta 0:00:01[K     |██████████████████████▊         | 112 kB 4.4 MB/s eta 0:00:01[K     |████████████████████████▊       | 122 kB 4.4 MB/s eta 0:00:01[K     |██████████████████████████▉     | 133 kB 4.4 MB/s eta 0:00:

In [37]:
import stanfordnlp as snlp
en = snlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
Y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.
Y

Downloading models for: en_ewt
Download location: Y/en_ewt_models.zip


100%|██████████| 235M/235M [00:41<00:00, 5.63MB/s]



Download complete.  Models saved to: Y/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


In [39]:
sentence = 'Go until Jurong point, crazy.. Available only in bugis n great world'
sentence.split()

['Go',
 'until',
 'Jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world']