<a href="https://colab.research.google.com/github/amita-kapoor/Invited_Talks/blob/master/UO/Artificial-Intelligence-Cloud-and-Edge-Implementations/Spam_classifier_code_explanation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spam Classifier

The Spam classifier classifies if a given text is spam or not.  

We use the [UCI ML SMSSpam dataset](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection).

In [1]:
# The modules needed to run the code
import argparse  # To read commandline argument and parse it
import gensim.downloader as api
import numpy as np
import os  # For file and directory handling
import shutil  # For file and directory handling
import tensorflow as tf

from sklearn.metrics import accuracy_score, confusion_matrix  #For measuring performance

In [2]:
# Some parameters
DATA_DIR = "data"   # Data directory to save embedding
EMBEDDING_NUMPY_FILE = os.path.join(DATA_DIR, "E.npy")  # Numpy file containing word embeddings
DATASET_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"  # Dataset URL from where data is downloaded
EMBEDDING_MODEL = "glove-wiki-gigaword-300"  # The gensim embedding model we will use
EMBEDDING_DIM = 300  # The embedding dimensions
NUM_CLASSES = 2  # The number of classes in output-- Spam or Ham
BATCH_SIZE = 128  # The batch size
NUM_EPOCHS = 3  # number of epochs for which model is to be trained


# data distribution is 4827 ham and 747 spam (total 5574), which 
# works out to approx 87% ham and 13% spam, so we take reciprocals
# and this works out to being each spam (1) item as being approximately
# 8 times as important as each ham (0) message.
CLASS_WEIGHTS = { 0: 1, 1: 8 }  # To take care of imbalance in classes

tf.random.set_seed(42)  # Set the seed for random number generation to be able to reproduce results. 

## Data processing

### Function download_and_read()
This function takes a url as an argument using TF `get_file` function dwnloads the data from the given url, extracts it from the zip file and place it in folder `datasets`.
#### tf.keras.utils.get_file()
```
tf.keras.utils.get_file(
    fname, origin, untar=False, md5_hash=None, file_hash=None,
    cache_subdir='datasets', hash_algorithm='auto', extract=False,
    archive_format='auto', cache_dir=None
)

```
You can learn more from the [docs](https://www.tensorflow.org/api_docs/python/tf/keras/utils/get_file).

In [3]:
## Some utility functions-- should be ideally placed in a different python file --> can be util.py

# Data downloading and data Processing



def download_and_read(url):
    """
    The function downloads the data from given url, splits it into Text and Labels
    Uses tf.keras.utils.get_file() function to download the data from url--> function 
    downloads the data from the given url, extracts it from the zip file and place it in folder "datasets" 
    with the name specified in the first argument.
    tf.keras.utils.get_file(
    fname, origin, untar=False, md5_hash=None, file_hash=None,
    cache_subdir='datasets', hash_algorithm='auto', extract=False,
    archive_format='auto', cache_dir=None)

    Arguments:
    url: The url link of the dataset in zip format

    Returns:
    Two lists containing texts and respective labels

    """
    local_file = url.split('/')[-1]  # split the file name (last string after '/') from url
    p = tf.keras.utils.get_file(local_file, url, 
        extract=True, cache_dir=".")  #function to download the data from url to folder datasets with name given in local_file
    labels, texts = [], []
    local_file = os.path.join("datasets", "SMSSpamCollection")  # define the path of the file from which to read data: datasets/SMSSpamCollection
    with open(local_file, "r") as fin:
        for line in fin:
            label, text = line.strip().split('\t')  # The labels and text are in one line separated by tab space.
            labels.append(1 if label == "spam" else 0)
            texts.append(text)
    return texts, labels

## Embeddings
* Train your own embeddings - `scratch`
* Use existing pre-trained embeddings
 * Word2Vec
 * GloVe- Global vectors for word representation - `vectorizer`
* Fine Tune the pre-trained embeddings for your corpus

In [4]:
# We want to only consider embeddings for words that exist in our vocabulary. 
# So we make a smaller embedding matrix for each word in the vocabulary.
# Each row in the matrix corresponds to a word, and the row itself is the vector corresponding 
# to the embedding for the word.
# The function uses Gensim api to Download (if needed) dataset/model and load it to memory.

def build_embedding_matrix(sequences, word2idx, embedding_dim, 
        embedding_file):
    """
    The function reads the dict word2idx (word --> number) and written the corresponding
    word vector for each word as defined by the Embedding model

    Arguments:
    sequences: not needed, not used-- just there because to suport back support for TF1 book
    word2idx: Dictionary  containing words in the text and their respective idx as given by tokenizer.
    embedding_dim: The number of units for the embedding layer
    embedding_file: The data file in which embeddings will be store for future use.

    """
    if os.path.exists(embedding_file):  # Checks if the embedding file already exists- then it justs loads it in the memory
        E = np.load(embedding_file)
    else:  # Else it creates the embedding file using the model specified in EMBEDDING_MODEL
        vocab_size = len(word2idx)  # The vocabulary size is number of unique words in the text
        E = np.zeros((vocab_size, embedding_dim)) # Creates a variable to store embeddings
        word_vectors = api.load(EMBEDDING_MODEL)  # Get the embeddings from Gensim
        for word, idx in word2idx.items():
            try:
                E[idx] = word_vectors.word_vec(word)  # For each word it converts it to respective word vector and store in Embedding file
            except KeyError:   # word not in embedding
                pass
            # except IndexError: # UNKs are mapped to seq over VOCAB_SIZE as well as 1
            #     pass
        np.save(embedding_file, E)  # The embeddings are saved in a file for future reference
    return E

## Build Model

Embedding Layer --> Dropout  --> Convolutional Layer 1D --> Pooling Layer --> Dense (Classification)

In [6]:
# Next we build the model-- ideally model definition should also be in separate file --> model.py


# We define a class SpamClassifierModel which builds and uses a 1D CNN to classify SMS texts as SPAM or HAM
# Depending upon the mode selected- we either build the model from scratch, or use the word vectors given by Gensim API
# or finetune the word vectors given by Gensim API

class SpamClassifierModel(tf.keras.Model):  # The model is build using model API of Keras with tf.Keras.Model as the parent class. 
# The class inherits train, predict methods of the parent class.
    def __init__(self, vocab_sz, embed_sz, input_length,
            num_filters, kernel_sz, output_sz, 
            run_mode, embedding_weights, 
            **kwargs):
        super(SpamClassifierModel, self).__init__(**kwargs)
        if run_mode == "scratch":  # Choose the embedding layer scratch means the weights wil be traned from scratch
            self.embedding = tf.keras.layers.Embedding(vocab_sz, 
                embed_sz,
                input_length=input_length,
                trainable=True)
        elif run_mode == "vectorizer":  # Vectorizer means we use the pre-trained weights--> Transfer Learning
            self.embedding = tf.keras.layers.Embedding(vocab_sz, 
                embed_sz,
                input_length=input_length,
                weights=[embedding_weights],
                trainable=False)
        else:  # This is the fine tuning mode- we use pre-trained weights for the embedding layer and fine tune them. 
            self.embedding = tf.keras.layers.Embedding(vocab_sz, 
                embed_sz,
                input_length=input_length,
                weights=[embedding_weights],
                trainable=True)
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)  # Add droput layer to avoid overfotting. 
        self.conv = tf.keras.layers.Conv1D(filters=num_filters,  # Define the 1D convolutional layer 
            kernel_size=kernel_sz,
            activation="relu")
        self.pool = tf.keras.layers.GlobalMaxPooling1D()  # The pooling layer
        self.dense = tf.keras.layers.Dense(output_sz, 
            activation="softmax")  # And the last classifying layer consists of a fully connected Dense layer

    def call(self, x):  # This function performs forward pass in the model. 
        x = self.embedding(x)
        x = self.dropout(x)
        x = self.conv(x)
        x = self.pool(x)
        x = self.dense(x)
        return x

In [7]:
# The code below requires a folder to be created
!mkdir data

In [8]:
## Now we will use the functions and model defined above --> ideally they should be done in a separate file-- main.py

# read data
texts, labels = download_and_read(DATASET_URL)

# tokenize and pad text so that each text is of same size
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
text_sequences = tokenizer.texts_to_sequences(texts)
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences)
num_records = len(text_sequences)
max_seqlen = len(text_sequences[0])
print("{:d} sentences, max length: {:d}".format(num_records, max_seqlen))

# labels --> convert labels to categorical labels (one hot encoded)
cat_labels = tf.keras.utils.to_categorical(labels, num_classes=NUM_CLASSES)

# vocabulary --> Create word mapping and its inverse
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
word2idx["PAD"] = 0
idx2word[0] = "PAD"
vocab_size = len(word2idx)
print("vocab size: {:d}".format(vocab_size))

# load the dataset as tensors, split it into test, train and validation set
dataset = tf.data.Dataset.from_tensor_slices((text_sequences, cat_labels))
dataset = dataset.shuffle(10000)
test_size = num_records // 4
val_size = (num_records - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

# Build the embedding
E = build_embedding_matrix(text_sequences, word2idx, EMBEDDING_DIM,
    EMBEDDING_NUMPY_FILE)
print("Embedding matrix:", E.shape)


Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
5574 sentences, max length: 189
vocab size: 9010


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Embedding matrix: (9010, 300)


In [9]:
#Since we are not passing the mode by command line in this file we need to give a value to run_mode
run_mode = 'scratch'

In [10]:
# Now we use the SpamClassifierModel class to create a model
conv_num_filters = 256
conv_kernel_size = 3
model = SpamClassifierModel(
    vocab_size, EMBEDDING_DIM, max_seqlen, 
    conv_num_filters, conv_kernel_size, NUM_CLASSES,
    run_mode, E)
model.build(input_shape=(None, max_seqlen))
model.summary()

Model: "spam_classifier_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  2703000   
_________________________________________________________________
spatial_dropout1d (SpatialDr multiple                  0         
_________________________________________________________________
conv1d (Conv1D)              multiple                  230656    
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  514       
Total params: 2,934,170
Trainable params: 2,934,170
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Define  compile and train
model.compile(optimizer="adam", loss="categorical_crossentropy",
    metrics=["accuracy"])

In [12]:
# Now we train the model
model.fit(train_dataset, epochs=NUM_EPOCHS, 
    validation_data=val_dataset,
    class_weight=CLASS_WEIGHTS)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f09a7b26c18>

In [13]:
# Lastly we evaluate the trained model against test set
labels, predictions = [], []
for Xtest, Ytest in test_dataset:  
    Ytest_ = model.predict_on_batch(Xtest)  # for each test test predict the label
    ytest = np.argmax(Ytest, axis=1)  # Get the label with highest probabilty from actual test output
    ytest_ = np.argmax(Ytest_, axis=1) # Get the label with highest probabilty from predictted test output
    labels.extend(ytest.tolist())  # add to list
    predictions.extend(ytest.tolist())  # add to list

print("test accuracy: {:.3f}".format(accuracy_score(labels, predictions)))  # Calculate accuracy score
print("confusion matrix")
print(confusion_matrix(labels, predictions))  # Calculate confusion matrix.

test accuracy: 1.000
confusion matrix
[[1091    0]
 [   0  189]]
