In [1]:
import pandas as pd
data = pd.read_csv('E:/DataScience-Software/nlp-datasets/sms_spam.csv')
data.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
import spacy
#create an empty model
mynlp = spacy.blank("en")
#create a text classifier
textcat = mynlp.create_pipe("textcat", config ={"exclusive_classes": True, "architecture": "bow"})
#add the classifier to the empty model
mynlp.add_pipe(textcat)

textcat.add_label("ham")
textcat.add_label("spam")

#preparing the train data
train_texts = data['text'].values
train_labels = [{'cats':{'ham': t == 'ham', 'spam': t =='spam'}} for t in data['type']]

#combine the samples and lables in one list
train_data = list(zip(train_texts,train_labels))
train_data[:4]



[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}}),
 ('U dun say so early hor... U c already then say...',
  {'cats': {'ham': True, 'spam': False}})]

In [11]:
import random
from spacy.util import minibatch
random.seed(1)
optimiser = mynlp.begin_training()  # this is the trianer

losses ={}
for epoch in range(10):
    random.shuffle(train_data)
    batches = minibatch(train_data, 8)
    #iterate through minibatches
    for batch in batches:
        texts,labels = zip(*batch)
        mynlp.update(texts, labels, sgd = optimiser, losses = losses)
    print (losses)




{'textcat': 0.4162386108300211}
{'textcat': 0.624253188097498}
{'textcat': 0.7509113090248354}
{'textcat': 0.8367707935785589}
{'textcat': 0.8908664886556461}
{'textcat': 0.9246990838800522}
{'textcat': 0.9489346552956408}
{'textcat': 0.9657462924332322}
{'textcat': 0.9785745474880203}
{'textcat': 0.988229886503772}


In [18]:
#predict the labels for each new instance
#example = "Are you ready fro the tea party????? it's gonna be wild"
example = "URGENT REPLY to this message for GUARANTEED FREE GIFT"
tokenised_example = mynlp.tokenizer(example)
docs = [tokenised_example]
model = mynlp.get_pipe("textcat")
scores, _ = model.predict(docs)
print (scores)

[[0.03451201 0.965488  ]]


In [19]:
# from scores to labels
predicted_index = scores.argmax(axis=1)
print ([textcat.labels[ind] for ind in predicted_index])

['spam']
