In [1]:
import pandas as pd

splits = {
    "train": "plain_text/train-00000-of-00001.parquet",
    "test": "plain_text/test-00000-of-00001.parquet",
    "unsupervised": "plain_text/unsupervised-00000-of-00001.parquet",
}
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = df.dropna(subset=["text", "label"])
print(df.shape)

from sklearn.model_selection import train_test_split


train_text, val_text, train_labels, val_labels = train_test_split(
    df["text"].to_list(), df["label"].to_list(), test_size=0.2, random_state=42
)

len(train_text), len(val_text)

(25000, 2)


(20000, 5000)

In [3]:
model_name = "distilbert-base-uncased"
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(model_name)

token = tokenizer(
    "This movie is perfect",
    truncation=True,
    padding="max_length",
    max_length=128,
    return_tensors="pt",
)
print(token)

{'input_ids': tensor([[ 101, 2023, 3185, 2003, 3819,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [4]:
import torch

from torch.utils.data import Dataset, DataLoader


class IMDBDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt",
        )
        item = {key: val.squeeze() for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item


train_dataset = IMDBDataset(train_text, train_labels, tokenizer)
val_dataset = IMDBDataset(val_text, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

len(train_dataset), len(val_dataset)


(20000, 5000)

In [None]:
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm.auto import tqdm

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)


num_epochs = 2
num_training_step = len(train_loader) * num_epochs
optimizer = AdamW(model.parameters(), lr=5e-5)


device = torch.device("cpu")

model.to(device)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_step,
)


progress_bar = tqdm(range(num_training_step))


def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items}
            outputs = model(**batch)
            preds = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]

            correct += (preds == labels).sum().item()
            total += labels.size(0)

        return correct / total if total > 0 else 0.0


for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    val_acc = evaluate(model, val_loader)

    print (f"Epoch {epoch + 1} / {num_epochs} - -- - Val_Accuracy: {val_acc}")


    

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/2500 [00:41<?, ?it/s]


In [None]:
def predict_text(text):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt",
        )
        encoding = {k: v.to(device) for k, v in encoding.items()}
        outputs = model(**encoding)
        preds_id = outputs.logits.argmax(dim=-1)
        return preds_id


example = "I hate this movie"

predict_text(text=example)


TypeError: 'method' object is not iterable