# Import Libraries

In [None]:
import pandas as pd

from spacy.util import minibatch
from spacy.training.example import Example

import random

# Load data

This notebook uses spam detection as its example, but the same approach can be adapted to other text classification tasks such as sentiment analysis, tagging customer queries, etc.

In [None]:
# Read the spam dataset into a DataFrame.
# Messages labelled "ham" are the non-spam ones.

spam = pd.read_csv(
    '/content/spam.csv',
    encoding="ISO-8859-1",
)

In [None]:
# Keep only the label and message columns and give them descriptive names.
df = (
    spam.loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
df.head(5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Bag of Words
We need to convert the text to a numeric representation before training the model. One way to do this is similar to one-hot encoding, but uses word counts instead of 0/1 indicators.

Represent each document as a vector of term frequencies for each term in the vocabulary. The vocabulary is built from all the words in the collection of texts.

For each document, count up how many times a term occurs, and place that count in the appropriate element of a vector.

This is called the **bag of words** representation. You can see that documents with similar terms will have similar vectors. Vocabularies frequently have tens of thousands of terms, so these vectors can be very large.


# Building a Bag of Words model

In [None]:
import spacy

# Start from a blank English pipeline
nlp = spacy.blank(name="en")

# Attach a text categorizer component and keep a handle to it
textcat = nlp.add_pipe(factory_name="textcat")

The label "ham" marks real (non-spam) messages; the label "spam" marks spam messages.

In [None]:
# Add labels to text classifier
# Both target classes are registered here; the value returned by the final
# call is what the cell displays as its output.
textcat.add_label("ham")
textcat.add_label("spam")

1

# Training a Text Categorizer Model

Convert labels in the data to the form TextCategorizer requires. For each document, create a dictionary of boolean values for each class.

For example, if a text is "ham", we need the dictionary `{'ham': True, 'spam': False}`. The model looks for these labels inside another dictionary under the key `'cats'`.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split (and every metric computed from
# it) is reproducible across kernel restarts; stratify keeps the ham/spam
# ratio the same in both partitions, which matters because spam is the
# minority class.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
    stratify=df['label'],
)

In [None]:
# Raw training messages, plus one {'cats': {...}} annotation per message with
# a boolean flag for each class.
train_texts = train['text'].values
train_labels = [
    {'cats': dict(ham=(label == 'ham'), spam=(label == 'spam'))}
    for label in train['label']
]

In [None]:
# Raw held-out messages, plus one {'cats': {...}} annotation per message with
# a boolean flag for each class.
test_texts = test['text'].values
test_labels = [
    {'cats': dict(ham=(label == 'ham'), spam=(label == 'spam'))}
    for label in test['label']
]

Combine the texts and labels into a single list of `(text, labels)` pairs.

In [None]:
# Pair every message with its annotation dict: a list of (text, labels) tuples
train_data = [(text, cats) for text, cats in zip(train_texts, train_labels)]
test_data = [(text, cats) for text, cats in zip(test_texts, test_labels)]
train_data[:3]

[('I am back. Bit long cos of accident on a30. Had to divert via wadebridge.I had a brilliant weekend thanks. Speak soon. Lots of love',
  {'cats': {'ham': True, 'spam': False}}),
 ('Send his number and give reply tomorrow morning for why you said that to him like that ok',
  {'cats': {'ham': True, 'spam': False}}),
 ('I plane to give on this month end.',
  {'cats': {'ham': True, 'spam': False}})]

In [None]:
# Seed Python's and spaCy's random number generators so runs are repeatable
random.seed(1)
spacy.util.fix_random_seed(1)

# nlp.initialize() is the spaCy v3 replacement for the deprecated
# nlp.begin_training(); it initializes the pipeline and returns the optimizer
optimizer = nlp.initialize()

for epoch in range(10):
    # Reset per epoch: nlp.update() adds to this dict, so sharing one dict
    # across epochs prints a running total that grows every epoch instead of
    # the loss for the epoch that just finished
    losses = {}
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Convert the whole minibatch to Example objects and update once per
        # batch, so the batch size actually controls the update granularity
        examples = [
            Example.from_dict(nlp.make_doc(text), labels)
            for text, labels in batch
        ]
        nlp.update(examples, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 112.99413429593315}
{'textcat': 158.52474269200502}
{'textcat': 193.5281401422562}
{'textcat': 212.13300989462863}
{'textcat': 221.92547100907927}
{'textcat': 239.61541233722835}
{'textcat': 246.24175934331896}
{'textcat': 257.8329995143858}
{'textcat': 265.7874583459801}
{'textcat': 269.3901511161076}


# Making Predictions

To make predictions, the input text first needs to be tokenized with `nlp.tokenizer`. Pass the resulting docs to the text categorizer's `predict` method. The output is a score for each label ('ham' and 'spam') for every text.

In [None]:
# Two hand-written messages to sanity-check the trained classifier
texts = [
    "Are you ready for the tea party????? It's gonna be wild",
    "URGENT Reply to this message for GUARANTEED FREE TEA",
]
# Tokenize each message before handing it to the categorizer
docs = list(map(nlp.tokenizer, texts))

# Fetch the categorizer from the pipeline and score every doc
textcat = nlp.get_pipe('textcat')
scores = textcat.predict(docs)

print(scores)

[[9.9988163e-01 1.1837344e-04]
 [3.0185128e-04 9.9969816e-01]]


In [None]:
# The column with the largest score in each row is the predicted class index;
# look that index up in the categorizer's label list to get the label name
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[class_idx] for class_idx in predicted_labels])

['ham', 'spam']


In [None]:
# Testing set predictions
# The docs are built from test_texts in their original order so that the
# predictions stay aligned row-for-row with test['label'] in the evaluation
# cells. (The previous random.shuffle(test_data) had no effect on these docs
# and would have broken that alignment had test_data been used instead.)
docs = [nlp.tokenizer(text) for text in test_texts]

# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
predictions = textcat.predict(docs)

In [None]:
# Index of the highest-scoring class for every test document
predicted_labels = predictions.argmax(1)

In [None]:
# Translate each predicted class index into its label string
test_preds = [textcat.labels[class_idx] for class_idx in predicted_labels]

In [None]:
# Baseline: share of the test set that belongs to the majority class
print(f"Majority class 'ham' makes up {test['label'].eq('ham').sum() / len(test):.2%} of the dataset")

Majority class 'ham' makes up 85.29% of the dataset


In [None]:
# Fraction of test messages whose predicted label matches the true label
print(f"Total accuracy on test set: {test['label'].eq(test_preds).sum() / len(test):.2%}")

Total accuracy on test set: 98.12%


In [None]:
# Of all true spam messages in the test set, the fraction predicted as spam
print(f"Accuracy on minority class 'spam': {(test['label'].eq(test_preds) & test['label'].eq('spam')).sum() / test['label'].eq('spam').sum():.2%}")

Accuracy on minority class 'spam': 89.02%


# Next Steps

Sentiment Analysis

Transaction Analysis