In [1]:
import pandas as pd
import random

# Load CSV. The code below reads the "review" and "sentiment" columns.
df = pd.read_csv("Data/IMDB Dataset.csv")

# Only a subset for faster training. The size is capped at len(df) because
# DataFrame.sample raises when asked for more rows than exist (replace=False).
df = df.sample(min(5000, len(df)), random_state=42)

# Normalise label case once, column-wise, instead of per row.
sentiments = df["sentiment"].str.upper()  # POSITIVE or NEGATIVE

# Fail loudly on anything else (including missing values): an unknown label
# would otherwise become an all-zero "cats" dict and quietly skew training.
unexpected_labels = set(sentiments.unique()) - {"POSITIVE", "NEGATIVE"}
if unexpected_labels:
    raise ValueError(f"Unexpected sentiment labels: {unexpected_labels!r}")

# Convert to spaCy format: a list of (text, {"cats": {label: 0 or 1}}) tuples.
# zip over the two columns avoids the per-row Series that iterrows() builds.
train_data = [
    (
        review,
        {
            "cats": {
                "POSITIVE": 1 if sentiment == "POSITIVE" else 0,
                "NEGATIVE": 1 if sentiment == "NEGATIVE" else 0,
            }
        },
    )
    for review, sentiment in zip(df["review"], sentiments)
]


In [2]:
from spacy.util import minibatch, compounding
from spacy.training.example import Example
import spacy, random

# Seed Python's `random`, NumPy and any available GPU backend before the model
# is built, so weight initialisation and the per-epoch shuffles repeat between
# runs of this cell (GPU kernels may still introduce small differences).
spacy.util.fix_random_seed(42)

# Blank English pipeline with a single text-classification component.
nlp = spacy.blank("en")
# NOTE(review): the training labels look mutually exclusive (exactly one of
# POSITIVE/NEGATIVE is 1 per review); the exclusive "textcat" component may be
# a better fit than "textcat_multilabel" — confirm before switching, since the
# loss key printed below would change with it.
textcat = nlp.add_pipe("textcat_multilabel", last=True)
for label in ("POSITIVE", "NEGATIVE"):
    textcat.add_label(label)

# spaCy v3 renamed begin_training() to initialize(); the old name is kept only
# as a deprecated alias. It returns the optimizer used by nlp.update below.
optimizer = nlp.initialize()

for epoch in range(5):
    # Shuffle a copy instead of train_data itself: the list belongs to the
    # data-prep cell, and mutating it would make re-runs of this cell start
    # from a different order even with the seed fixed above.
    epoch_data = random.sample(train_data, k=len(train_data))
    losses = {}
    # Batch size follows a compounding schedule from 16 up to 64 (factor 1.5),
    # restarted at the beginning of every epoch.
    batches = minibatch(epoch_data, size=compounding(16.0, 64.0, 1.5))
    for batch in batches:
        examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in batch]
        nlp.update(examples, sgd=optimizer, losses=losses)
    print(f"Epoch {epoch+1} – Loss: {losses['textcat_multilabel']:.4f}")


Epoch 1 – Loss: 15.7947
Epoch 2 – Loss: 6.7640
Epoch 3 – Loss: 2.8964
Epoch 4 – Loss: 1.6835
Epoch 5 – Loss: 1.4595


In [5]:
# Two hand-picked, free-form movie reviews used to spot-check the trained
# classifier: the first is largely critical, the second is favourable.
# The strings are passed to the pipeline verbatim (including the indentation
# inside the first multi-line literal).
texts = [
    """I saw 'One Battle After Another' last night, and I'm still not sure I get the hype. On paper, the film has everything: social commentary, action, radical politics, immigration, parenthood, even extremism. It tries to juggle so many weighty ideas at once - and I do applaud that - it's ambitious, topical, clearly trying to stir the pot. But ambition alone doesn't make for satisfying cinema, and this one ends up messy.
    At nearly three hours long, it's a slog when you still don't really know the people you're meant to care about. Not a single character feels grounded - Benicio del Toro aside (and even then, his screen time is frustratingly short). The rest make decisions that have zero logic in the absence of backstory, leaving them feeling exaggerated and hollow. So many talented actors, yet I wasn't rooting for anyone. 
    The "big message" moments - racism, political division, systemic power, identity - land far too on the nose, skimming the surface without ever cutting deep.
    If I'm being fair, one thing I did love was the soundtrack - tense, pulsing, and full of nervous energy, it injects more urgency than the script ever does. But pace and tone overall are uneven. PTA's usual offbeat humour is basically MIA, and without it, long stretches drag and feel oddly dull. The film indulges its own grandeur, with sprawling chases and spectacle, but without clarity or cohesion. Honestly? It's his weakest film yet. It wants to be grand, relevant, provocative - and in parts it succeeds - but just doesn't hold it together. In the end, there's no clear overall message, just a lot of battles, one after another, that feel thrown at you without being stitched into something coherent.""",
    """Given the cast & director I was expecting good things from this movie. Happy to report that my expectations were well and truly met. A movie with great performances throughout, containing the right level of humour. I've always rated Sean Penn as an actor and sometimes I think people forget how great he has been throughout his career. Leo has truly moved away from playing the "pretty boy" many movies ago and do I need to say how great Benicio outside time and time again. I'm now curious if the Academy will notice this movie next year. Definitely far more deserving than The Smashing Machine in my opinion."""
]
# Run every sample review through the trained pipeline, one at a time, and
# print its category scores (a dict mapping each label to a score).
for doc in map(nlp, texts):
    print(doc.cats)


{'POSITIVE': 0.08038540184497833, 'NEGATIVE': 0.9601055383682251}
{'POSITIVE': 0.9933358430862427, 'NEGATIVE': 0.002513032639399171}
