In [5]:
import spacy
from spacy.util import minibatch, compounding
import random

# Start from an empty English pipeline.
nlp = spacy.blank("en")

# Append a multi-label text categorizer as the last pipeline component.
textcat = nlp.add_pipe("textcat_multilabel", last=True)

# Register each category the classifier should score.
for sentiment_label in ("POSITIVE", "NEGATIVE"):
    textcat.add_label(sentiment_label)


1

Training data

Each example pairs a review text with a value for both labels (POSITIVE and NEGATIVE).

"cats" is short for "categories". It is the annotation key under which spaCy stores the category labels for each text in the training data for the textcat component.

A value of 1 means the label applies to the text (True); a value of 0 means it does not (False).

In [6]:
# Labelled reviews as (text, annotations) pairs.
# annotations["cats"] maps each label to 1 (applies) or 0 (does not apply).
train_data = [
    (
        "I absolutely loved this movie! The acting was brilliant and the story was emotional.",
        {"cats": {"POSITIVE": 1, "NEGATIVE": 0}},
    ),
    (
        "This film was boring and predictable, I didn’t enjoy it at all.",
        {"cats": {"POSITIVE": 0, "NEGATIVE": 1}},
    ),
    (
        "Amazing soundtrack and cinematography, a must-watch!",
        {"cats": {"POSITIVE": 1, "NEGATIVE": 0}},
    ),
    (
        "Terrible script and bad direction, complete waste of time.",
        {"cats": {"POSITIVE": 0, "NEGATIVE": 1}},
    ),
    (
        # Neutral review: neither label applies.
        "It was okay, not great but not terrible either.",
        {"cats": {"POSITIVE": 0, "NEGATIVE": 0}},
    ),
]


Train the Model

We train with a simple loop that uses spaCy’s minibatch helper to split the training data into batches.

In [8]:
import spacy
from spacy.util import minibatch, compounding
from spacy.training.example import Example
import random

# Tunable settings for this cell.
SEED = 42       # fixed seed so shuffling and weight initialisation repeat on re-run
N_EPOCHS = 10   # number of passes over the training data

# Seed the random number generators spaCy relies on, so a fresh
# "Restart & Run All" reproduces the same losses and predictions.
spacy.util.fix_random_seed(SEED)

# Create a blank English pipeline with a multi-label text categorizer.
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat_multilabel", last=True)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")


def _make_examples(pairs):
    """Turn (text, annotations) pairs into a list of spaCy Example objects.

    Each text is tokenized with nlp.make_doc() and combined with its
    annotations dict via Example.from_dict().
    """
    return [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in pairs
    ]


# Initialize the pipeline and obtain the optimizer.
# nlp.initialize() is the spaCy v3 replacement for the deprecated
# nlp.begin_training(); it is given a callable that returns training examples.
optimizer = nlp.initialize(lambda: _make_examples(train_data))

# Training loop
for epoch in range(N_EPOCHS):
    # Shuffle a copy: train_data was defined in an earlier cell and must stay
    # unchanged so that re-running this cell starts from the same input.
    epoch_data = random.sample(train_data, k=len(train_data))
    losses = {}
    batches = minibatch(epoch_data, size=compounding(2.0, 4.0, 1.5))

    for batch in batches:
        # The model learns from the batch via nlp.update(); the loss is
        # accumulated into `losses` and shows how well it is learning
        # (lower = better).
        nlp.update(_make_examples(batch), sgd=optimizer, losses=losses)

    print(f"Epoch {epoch+1} - Loss: {losses['textcat_multilabel']:.4f}")


Epoch 1 - Loss: 0.6759
Epoch 2 - Loss: 0.0190
Epoch 3 - Loss: 0.0044
Epoch 4 - Loss: 0.0003
Epoch 5 - Loss: 0.0003
Epoch 6 - Loss: 0.0004
Epoch 7 - Loss: 0.0003
Epoch 8 - Loss: 0.0002
Epoch 9 - Loss: 0.0002
Epoch 10 - Loss: 0.0001


Test the Model

In [9]:
# Unseen reviews used to spot-check the trained classifier.
test_reviews = [
    "What an incredible movie! The acting and direction were fantastic.",
    "The plot was weak and the characters were annoying.",
    "Visually stunning but emotionally flat.",
]

# Run each review through the pipeline and show the per-label scores in doc.cats.
for review in test_reviews:
    doc = nlp(review)
    print(
        f"Review: {review}",
        f"Prediction: {doc.cats}",
        "-" * 50,
        sep="\n",
    )


Review: What an incredible movie! The acting and direction were fantastic.
Prediction: {'POSITIVE': 0.9969745874404907, 'NEGATIVE': 0.031578607857227325}
--------------------------------------------------
Review: The plot was weak and the characters were annoying.
Prediction: {'POSITIVE': 0.12783946096897125, 'NEGATIVE': 0.44591423869132996}
--------------------------------------------------
Review: Visually stunning but emotionally flat.
Prediction: {'POSITIVE': 0.4305601119995117, 'NEGATIVE': 0.9349267482757568}
--------------------------------------------------
