In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from collections import Counter
import re

In [None]:
# Load the "stanfordnlp/imdb" dataset through the Hugging Face `datasets`
# library (presumably fetched from the Hub on first use and cached afterwards).
dataset = load_dataset("stanfordnlp/imdb")

# Pull the raw review strings and their integer labels out of each split.
train_texts, train_labels = dataset["train"]["text"], dataset["train"]["label"]
test_texts, test_labels = dataset["test"]["text"], dataset["test"]["label"]

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
def tokenize(text):
    """Split a string into lowercase word tokens.

    The text is lowercased first; a token is any maximal run of the ASCII
    letters a-z and apostrophes. Every other character (digits, punctuation,
    whitespace, accented letters) acts as a separator and is dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z']+", lowered)]

In [None]:
# Count how often every token occurs in the training reviews.
counter = Counter()
for txt in train_texts:
    counter.update(tokenize(txt))

# Total number of ids handed to the embedding table (special ids included).
vocab_size = 10000

# Reserved ids. Padding and unknown words get *different* ids so that an
# out-of-vocabulary token is never confused with the zero padding that
# batches are filled with.
PAD = 0
UNK = 1

# The most frequent words receive ids 2 .. vocab_size-1, most common first.
vocab = {
    w: i
    for i, (w, _) in enumerate(counter.most_common(vocab_size - 2), start=2)
}

In [None]:
def encode(text, max_len=300):
    """Convert a raw string into a 1-D tensor of vocabulary ids.

    The text is tokenized with `tokenize`; each token is looked up in `vocab`
    and tokens that are not present map to `UNK`. Only the first `max_len`
    tokens are kept.

    Args:
        text: The string to encode.
        max_len: Maximum number of token ids to keep (default 300).

    Returns:
        A `torch.long` tensor of shape `(min(num_tokens, max_len),)`. The
        dtype is forced to long so that a text without any tokens yields an
        empty integer tensor instead of PyTorch's default empty float tensor.
    """
    tokens = tokenize(text)
    ids = [vocab.get(t, UNK) for t in tokens[:max_len]]
    return torch.tensor(ids, dtype=torch.long)

In [None]:
# Encode every review into a tensor of token ids; the results stay in a plain
# Python list because the sequences have different lengths.
X_train = list(map(encode, train_texts))
X_test = list(map(encode, test_texts))

# Labels become one tensor per split.
y_train = torch.tensor(train_labels)
y_test = torch.tensor(test_labels)

In [None]:
def pad_batch(batch, pad_value=0):
    """Stack variable-length 1-D id tensors into one right-padded matrix.

    Args:
        batch: A sequence of 1-D tensors of token ids.
        pad_value: Integer written into the positions after the end of each
            sequence (default 0).

    Returns:
        A `torch.long` tensor of shape `(len(batch), longest_sequence)`.
        An empty `batch` yields a tensor of shape `(0, 0)` instead of raising.
    """
    lengths = [len(seq) for seq in batch]
    # `default=0` keeps an empty batch from raising ValueError in max().
    max_len = max(lengths, default=0)
    padded = torch.full((len(batch), max_len), pad_value, dtype=torch.long)
    for row, (seq, length) in enumerate(zip(batch, lengths)):
        if length:  # nothing to copy for an empty sequence
            padded[row, :length] = seq
    return padded

In [None]:
class NeuralLR(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average them, apply a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        num_classes: Number of output logits.
        pad_idx: Token id treated as padding. Positions holding this id are
            left out of the average, so the output for a sequence does not
            depend on how much padding its batch happened to need. Any other
            token that shares this id (for example an unknown-word id with the
            same value) is left out as well. Pass `None` to average over every
            position, padding included.
    """

    def __init__(self, vocab_size, embed_dim=100, num_classes=2, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.linear = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of padded id sequences.

        Args:
            x: Integer tensor indexed as (batch, position).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised logits.
        """
        emb = self.embed(x)
        if self.pad_idx is None:
            pooled = emb.mean(dim=1)
        else:
            # 1.0 where a real token sits, 0.0 on padding.
            mask = (x != self.pad_idx).unsqueeze(-1).to(emb.dtype)
            # clamp avoids 0/0 (NaN) for a row that consists only of padding.
            counts = mask.sum(dim=1).clamp(min=1.0)
            pooled = (emb * mask).sum(dim=1) / counts
        logits = self.linear(pooled)
        return logits

In [None]:
# Fix the PyTorch RNG before any weights are created so that parameter
# initialisation (and any later torch-based shuffling) is repeatable when the
# notebook is restarted and run from the top.
SEED = 42
torch.manual_seed(SEED)

model = NeuralLR(vocab_size=vocab_size, embed_dim=100)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Cross-entropy over the two output logits; labels are integer class ids.
criterion = nn.CrossEntropyLoss()

In [None]:
batch_size = 32


def iterate_batches(X, y):
    """Yield shuffled mini-batches of padded inputs with their labels.

    A fresh random permutation of the example indices is drawn on every call
    and consumed in consecutive chunks of the module-level `batch_size`; the
    last chunk may be smaller.

    Args:
        X: Indexable collection of 1-D id tensors.
        y: Tensor of labels, aligned with `X`.

    Yields:
        `(padded_inputs, labels)` where `padded_inputs` is the result of
        `pad_batch` on the selected sequences and `labels` is `y` indexed by
        the same positions.
    """
    total = len(X)
    step = batch_size
    order = torch.randperm(total)
    start = 0
    while start < total:
        chosen = order[start:start + step]
        sequences = [X[k] for k in chosen.tolist()]
        yield pad_batch(sequences), y[chosen]
        start += step

In [None]:
# Train for three passes over the training set.
for epoch in range(3):
    model.train()  # switch the module to training mode
    # Running sum of the per-batch loss values for this epoch.
    total_loss = 0
    for batch_x, batch_y in iterate_batches(X_train, y_train):
        logits = model(batch_x)
        loss = criterion(logits, batch_y)

        # Standard update: clear old gradients, backpropagate, take a step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # NOTE: this prints the *sum* of the batch losses, not their mean, so the
    # value scales with the number of batches.
    print(f"Epoch {epoch+1} | Loss = {total_loss:.4f}")

Epoch 1 | Loss = 494.5332
Epoch 2 | Loss = 349.5374
Epoch 3 | Loss = 268.6618


In [None]:
# Measure accuracy on the test split, walking through it in order in
# chunks of `batch_size`.
model.eval()
total = 0
correct = 0

with torch.no_grad():  # no gradients are needed for scoring
    for i in range(0, len(X_test), batch_size):
        batch = X_test[i:i+batch_size]
        labels = y_test[i:i+batch_size]
        padded = pad_batch(batch)
        logits = model(padded)
        # Predicted class = index of the largest logit in each row.
        preds = torch.argmax(logits, dim=1)
        correct += int((preds == labels).sum())
        total += labels.shape[0]

accuracy = correct / total
print(f"\nTest Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8557


In [None]:
# Show the model's prediction for the first five test reviews.
model.eval()

# Only this many characters of each review are echoed, so the trailing "..."
# really marks a truncated preview instead of following the full text.
PREVIEW_CHARS = 300

with torch.no_grad():  # inference only: skip building the autograd graph
    for i in range(5):
        text = test_texts[i]
        encoded = encode(text)
        # A batch of one sequence; pad_batch adds the leading batch dimension.
        padded = pad_batch([encoded])

        logits = model(padded)
        probs = torch.softmax(logits, dim=-1)
        pred = probs.argmax(dim=-1).item()

        print("TEXT:", text.replace("\n"," ")[:PREVIEW_CHARS], "...")
        print("LOGITS:", logits.numpy())
        print("PROBS:", probs.numpy())
        print("PRED:", pred, "(positive)" if pred==1 else "(negative)")

TEXT: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to