# Perceptron SMS Spam Filter

Clark - Whitehead




In [340]:
# These are bash commands (not python) that download and unzip the data.
!wget archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip

--2020-11-11 09:55:38--  http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 203415 (199K) [application/x-httpd-php]
Saving to: ‘smsspamcollection.zip.1’


2020-11-11 09:55:38 (1.20 MB/s) - ‘smsspamcollection.zip.1’ saved [203415/203415]

Archive:  smsspamcollection.zip
replace SMSSpamCollection? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace readme? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [341]:
import numpy as np
import sys
import re  # Regular Expressions library
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

def sanitize(sms: str):
  '''
  Sanitize a sms string by removing weird characters and changing to lowercase.
  Input: string
  Return: sanitized string
  '''
  sms = re.sub('[^A-Za-z ]', '', sms) # Substitute weird characters with empty str.
  sms = sms.lower() # Force lowercase.
  return sms

def load_data():
  ''' 
  Reads the SMS dataset, converts to two-class bag-of-words representation.
  Returns:
    X: Numpy matrix entry i,j is the number of times word j appears in sms i.
    Y: Numpy vector entry i is 1 if sms i is spam, otherwise 0.
  '''
  # Read csv file.
  filename = 'SMSSpamCollection' # This file must exist in local directory.
  data = []
  for line in open(filename, 'r').readlines():
    line = line.split('\t') # Split two columns separated by tab.
    label = 1 if line[0]=='spam' else 0 # First column is 'ham' or 'spam'
    sms = line[1] # Second column is sms.
    sms = sanitize(sms)
    data.append((label, sms))
    
  # Supplement with some custom sms examples.
  data.append((0, sanitize('call me maybe? -carly')))
  data.append((1, sanitize('please call to you claim your prize')))

  # print(data)

  # Tokenize data.
  tokenizer = CountVectorizer(max_features=8603) # Set max number of tokens.
  Y = np.array([x[0] for x in data])
  corpus = [x[1] for x in data] # corpus is list of sms strings.
  X = tokenizer.fit_transform(corpus).toarray()
  vocabulary = tokenizer.vocabulary_
  return X,Y,vocabulary

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

# Split train/test
X,y,vocabulary = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 1)
#X_test last index = 1840


## Class

1. Implements a perceptron class with a *train* method that takes four arguments: X, y, learning rate, and max_epochs.
2. Trains and tests a perceptron spam classifier on the dataset above. Reports the results in terms of accuracy and F1 score on the test set.




In [342]:
class perceptron:

  def train(self, x, y, lr, maxE):

    #x.shape[0] = 3735
    #x.shape[1] = 8603


    training_outputs = y

    synaptic_weights = np.random.random((x.shape[1], 1))

    b = 0

    for iteration in range(maxE):

      input_layer = x

      outputs = sigmoid(np.dot(input_layer, synaptic_weights) - b)

      error = outputs - training_outputs.reshape((3735, 1))

      # adjustments = error * sigmoid_derivative(outputs) Can use Sigmoid_deriv as learning rate

      adjustments = np.dot(input_layer.T, error * lr)

      synaptic_weights -= adjustments

      b -= sum(error * lr) / len(error)

    if outputs[outputs >= .5].shape == y[y != 0].shape:

      print("convereged")
      return synaptic_weights, b

    return synaptic_weights, b

  def test(self, x, y, w, b):


    tp, fp, tn, fn = 0, 0, 0, 0

    outputs = sigmoid(np.dot(x, w) - b)

    tp = np.sum(np.logical_and(outputs >= .5, y.reshape(1841, 1) == 1))
    fp = np.sum(np.logical_and(outputs >= .5, y.reshape(1841, 1) == 0))
    tn = np.sum(np.logical_and(outputs < .5, y.reshape(1841, 1) == 0))
    fn = np.sum(np.logical_and(outputs < .5, y.reshape(1841, 1) == 1))



    return tp, fp, tn, fn


aPerceptron = perceptron()

learning_rate = .005
max_ephocs = 100

w, b = aPerceptron.train(X_train, y_train, learning_rate, max_ephocs)

tp, fp, tn, fn = aPerceptron.test(X_test, y_test, w, b)

accuracy = (tp + tn) / (tp + tn + fp + fn)

fScore = tp / (tp + ((1/2)*(fp + fn)))

print("accuracy = ", accuracy)
print("fscore = ", fScore)


accuracy =  0.9369907658881043
fscore =  0.7867647058823529
