 Training the Spacy text classifier

In [4]:
#adding single-label text-categorizer to pipeline
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
import spacy

nlp = spacy.load("en_core_web_md")

config = {
    "threshold": 0.5,
    "model": DEFAULT_SINGLE_TEXTCAT_MODEL
}
textcat = nlp.add_pipe("textcat", config=config)
textcat

<spacy.pipeline.textcat.TextCategorizer at 0x20031b2d720>

In [7]:
#adding multi-labeltext categorizer to pipeline

from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
import spacy

nlp = spacy.load('en_core_web_md')

config = {
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL
}

textcat = nlp.add_pipe("textcat_multilabel", config=config)
textcat

<spacy.pipeline.textcat_multilabel.MultiLabel_TextCategorizer at 0x20040acf4a0>

 Formatting training data for the text categorizer

In [35]:
train_set = [
    ("I love this product, very easy to use.", 
     {"cats": {"semtiment": 1}}),
    ("I'll definately purchase again. I recommand this product.",
    {"cats": {"sentiment": 1}}),
    ("This is the best product ever. I loved the scent and the feel. Will buy again.",
    {"cats": {"sentiment": 1}}),
    ("Disappointed. This product didn't work for me at all.",
    {"cats": {"sentiment": 0}}),
    ("I hate the scent. Won't buy again",
    {"cats": {"sentiment": 0}}),
    ("Truly horrible product. Very few amount of product for a high price. Don't recommand.",
    {"cats": {"sentiment": 0}})
]

In [36]:
#import necessary
import random
import spacy
from spacy.training import Example
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL

nlp = spacy.load("en_core_web_md")
config = {
    "threshold": 0.5,
    "model": DEFAULT_SINGLE_TEXTCAT_MODEL
}

textcat = nlp.add_pipe('textcat', config=config)

In [37]:
#initialize 
textcat.add_label("sentiment")
train_examples = [Example.from_dict(nlp.make_doc(text), label)
                 for text, label in train_set]
textcat.initialize(lambda: train_examples, nlp = nlp)

 Defining training loop

In [38]:
epochs = 20
with nlp.select_pipes(enable="textcat"):
    optimizer = nlp.resume_training()
    for i in range(epochs):
        random.shuffle(train_examples)
        for example in train_examples:
            print(nlp.update([example], sgd=optimizer))

{'textcat': 0.25}
{'textcat': 0.23991741240024567}
{'textcat': 0.27470338344573975}
{'textcat': 0.2639513313770294}
{'textcat': 0.23380617797374725}
{'textcat': 0.24180001020431519}
{'textcat': 0.23025275766849518}
{'textcat': 0.2358265221118927}
{'textcat': 0.2511146068572998}
{'textcat': 0.18645083904266357}
{'textcat': 0.20686204731464386}
{'textcat': 0.26627978682518005}
{'textcat': 0.22527912259101868}
{'textcat': 0.16168811917304993}
{'textcat': 0.2226589322090149}
{'textcat': 0.09856107085943222}
{'textcat': 0.1643703579902649}
{'textcat': 0.1222798153758049}
{'textcat': 0.10860157012939453}
{'textcat': 0.13397707045078278}
{'textcat': 0.039275266230106354}
{'textcat': 0.06343962252140045}
{'textcat': 0.12224934995174408}
{'textcat': 0.09284649789333344}
{'textcat': 0.04253553971648216}
{'textcat': 0.053513091057538986}
{'textcat': 0.021047065034508705}
{'textcat': 0.038666825741529465}
{'textcat': 0.08046813309192657}
{'textcat': 0.004520265851169825}
{'textcat': 0.014356446452

 Testing the new component

In [39]:
doc2 = nlp("this product sucks")    
doc2.cats

{'sentiment': 0.6802538633346558, 'semtiment': 0.31974607706069946}

In [27]:
doc3 = nlp("this product is great")
doc3.cats

{'semtiment': 0.09758125990629196, 'sentiment': 0.9024186730384827}

In [28]:
doc4 = nlp("I hate this scent.")
doc4.cats

{'semtiment': 0.9808443784713745, 'sentiment': 0.01915563829243183}

In [29]:
doc5 = nlp("I love this product.")
doc5.cats

{'semtiment': 0.2958841323852539, 'sentiment': 0.7041159272193909}

 Training TextCategorizer for multilabel Classification

In [30]:
train_data = [
    ("It's the perfect movie for a Sunday evening.",
    {"cats": {"SUNDAY_EVENING": True}}),
    ("Very good thriller",
    {"cats": {"THRILLER": True}}),
    ("A great movie for the kids and all the family",
    {"cats": {"FAMILY": True, }}),
    ("An ideal movie for Sunday night with all the family. My kids loved the movie.",
    {"cats": {"FAMILY": True, "SUNDAY_EVENING": True}}),
    ("A perfect thriller for all the family. No violence, no drugs, pure action.",
    {"cats": {"FAMILY": True, "THRILLER": True}})
]

In [31]:
import random
import spacy
from spacy.training import Example
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

nlp = spacy.load("en_core_web_md")

config = {
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL
}
textcat = nlp.add_pipe("textcat_multilabel", config=config)

labels = ["FAMILY", "THRILLER", "SUNDAY_EVENING"]
for label in labels:
    textcat.add_label(label)

train_examples = [Example.from_dict(nlp.make_doc(text), label) 
                 for text, label in train_data]
textcat.initialize(lambda: train_examples, nlp=nlp)

In [32]:
epochs = 20
with nlp.select_pipes(enable="textcat_multilabel"):
    optimizer = nlp.resume_training()
    
    for i in range(epochs):
        random.shuffle(train_data)
        for text, label in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, label)
            print(nlp.update([example], sgd=optimizer))

{'textcat_multilabel': 0.6712846159934998}
{'textcat_multilabel': 0.019092481583356857}
{'textcat_multilabel': 0.6086833477020264}
{'textcat_multilabel': 0.45422717928886414}
{'textcat_multilabel': 0.008088719099760056}
{'textcat_multilabel': 2.6057587092509493e-05}
{'textcat_multilabel': 6.207443448147387e-10}
{'textcat_multilabel': 6.87805368215777e-12}
{'textcat_multilabel': 0.00021799537353217602}
{'textcat_multilabel': 1.618150963622611e-05}
{'textcat_multilabel': 2.4016344468691386e-10}
{'textcat_multilabel': 6.310978642432019e-05}
{'textcat_multilabel': 1.3793173820886295e-05}
{'textcat_multilabel': 1.1095360150648048e-06}
{'textcat_multilabel': 4.604316927725449e-12}
{'textcat_multilabel': 4.604316927725449e-12}
{'textcat_multilabel': 1.9218769011786208e-05}
{'textcat_multilabel': 6.128281029305072e-07}
{'textcat_multilabel': 1.6575540939811617e-10}
{'textcat_multilabel': 1.0602448128338438e-05}
{'textcat_multilabel': 4.811535063709016e-07}
{'textcat_multilabel': 9.851549293671

In [34]:
doc2 = nlp("Definately in my Sunday movie night list")
doc2.cats

{'FAMILY': 0.9949304461479187,
 'THRILLER': 0.4520221948623657,
 'SUNDAY_EVENING': 0.9664487838745117}