In [17]:
import pandas as pd
import spacy
from spacy.util import minibatch
from spacy.training.example import Example
import random
import pickle
import os
import numpy as np

In [18]:
# from github
df_emails = pd.read_csv('https://raw.githubusercontent.com/Enrique1987/data/master/02_ML/03_spam.csv',sep=",", encoding='cp1252', usecols=["v1", "v2"])
df_emails = df_emails.rename(columns={'v1': 'label', 'v2': 'text'})

In [19]:
df_emails.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Create an empty model
nlp = spacy.blank("en")

# Add the TextCategorizer to the empty model
textcat = nlp.add_pipe("textcat")

In [21]:
values_label = list(set(df_emails["label"]))
print(values_label)

# Add labels to text classifier
for label in values_label:
    textcat.add_label(label)

['spam', 'ham']


In [22]:
train_texts = df_emails['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} for label in df_emails['label']]

Then we combine the texts and labels into a single list.

In [23]:
train_data = list(zip(train_texts, train_labels))

##### 4 - Train the Model

Now we are ready to train the model. First, create an optimizer using nlp.begin_training(). spaCy uses this optimizer to update the model. In general it's more efficient to train models in small batches. spaCy provides the minibatch function that returns a generator yielding minibatches for training. Finally, the minibatches are split into texts and labels, then used with nlp.update to update the model's parameters.

In [24]:
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training() # create optimizer

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) 
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd=optimizer)

This is just one training loop (or epoch) through the data. The model will typically need multiple epochs. Use another loop for more epochs, and optionally re-shuffle the training data at the begining of each loop.

##### 5 - Make Predictions

In [25]:
texts = ["Hi remember to click her if you want to earn a lamborghini and to have 4000$ to enjoy your life ",
         "Hi Enrique, we are planing to have a meeting the people from the School when do you have time ?",
         "Do you want to be rich?, just clik here and see what happend, remember its for free, it would not cost you anything"]
docs = [nlp.tokenizer(text) for text in texts]
    
scores = textcat.predict(docs)
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['spam', 'ham', 'ham']


##### Own conclusions

The third message is clearly a spam message that you have not sorted properly. This is because the predictions are based on entirely new data that you have not learned from a similar example. 

This new test would have to be added, correctly labelled an traing agian.

In [29]:
print(type(train_texts))
print(type(train_labels))

<class 'numpy.ndarray'>
<class 'list'>


In [30]:
print(train_labels[0])

{'cats': {'ham': True, 'spam': False}}


In [32]:
new_labeled_Data = ["spam", "ham", "spam"]
for label in new_labeled_Data:
    train_labels.append({'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}})

In [28]:

for text in texts:
    np.append(train_texts, text)

##### 6 - Re training

In [33]:

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training() # create optimizer

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) 
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd=optimizer)

##### 7 - Second try prediccionts text Classification

In [34]:
new_texts = ["Click the following link if you want to recive 8000$ cash now ",
             "Hi Juan, how are you ? do you kno something from Erika ?",
             "if you want to be rich, send be rich at the 999 0000"]
docs = [nlp.tokenizer(text) for text in new_texts]
    
scores = textcat.predict(docs)
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['spam', 'ham', 'spam']


##### 8 - Conclusions:
As we can see it has classified correctly after the process of relabelling new data with similar content.