In [2]:
import os

# Disable tokenizers' internal parallelism; set before the tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from datasets import load_dataset
from transformers import AutoTokenizer

# IMDB dataset, loaded by its hub identifier.
raw_datasets = load_dataset("imdb")

# Tokenizer matching the ELECTRA-small discriminator checkpoint used below.
# `return_dict` / `strict` are model/config options, not tokenizer options,
# so they are no longer passed here.
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")

def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` entry of ``examples``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split, feeding examples to `tokenize_function` in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Expose the columns needed for supervised training as torch tensors.
# The target column ("label" in IMDB) must be listed as well; otherwise the
# batches contain only inputs and no loss can be computed from them.
tokenized_datasets.set_format(
    "torch", columns=["input_ids", "attention_mask", "label"]
)

# Small, reproducibly shuffled subsets (100 examples each) for quick experiments.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

# Full splits, kept for complete runs.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

from torch.utils.data import DataLoader

# Batches of 8; only the training loader is shuffled.
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

Reusing dataset imdb (/home/adamw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/adamw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-0943cefd6a1cd5ad.arrow
Loading cached processed dataset at /home/adamw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-44ea5832b03319f6.arrow
Loading cached processed dataset at /home/adamw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-ec506dbd2460bd97.arrow
Loading cached shuffled indices for dataset at /home/adamw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-e8b1ab41c9d4aa41.arrow
Loading cached shuffled indices for dataset at /home/adamw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-2b1b375c74fd60b3.arrow


In [12]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model built from the ELECTRA-small discriminator
# checkpoint with two output labels. `return_dict=False` is forwarded to the
# model configuration (the captured config dump shows "return_dict": false).
model = AutoModelForSequenceClassification.from_pretrained(
    "google/electra-small-discriminator",
    return_dict=False,
    num_labels=2,
)

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/adamw/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "return_dict": false,
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.11.3",
  "type_

In [13]:
from transformers import AdamW

# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [14]:
from transformers import get_scheduler

# One scheduler step per training batch, over three passes of the train loader.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warm-up steps, attached to the optimizer above.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [15]:
import poptorch
import torch

# Begin a new PopTorch block on IPU 1 at the position embeddings.
model.electra.embeddings.position_embeddings = poptorch.BeginBlock(
    layer_to_call=model.electra.embeddings.position_embeddings,
    ipu_id=1
)

# Begin a new block at every encoder layer. The result of BeginBlock is stored
# back into the ModuleList: merely rebinding the loop variable would discard
# it whenever BeginBlock returns a wrapper instead of modifying in place.
# NOTE(review): with the 12 hidden layers reported in the model config this
# requests IPU ids 1..12 — confirm the target system exposes that many IPUs.
for index, layer in enumerate(model.electra.encoder.layer):
    model.electra.encoder.layer[index] = poptorch.BeginBlock(
        layer_to_call=layer, ipu_id=index + 1
    )

# PopTorch options with deviceIterations set to 8.
opts = poptorch.Options().deviceIterations(8)

In [16]:
class WrappedModel(torch.nn.Module):
    """Wrap a model so that the loss is computed inside ``forward``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``. It may
            return a tensor of logits, a tuple/list whose first element is
            assumed to be the logits (as with ``return_dict=False``), or an
            object exposing a ``logits`` attribute.

    Attributes that are not found on the wrapper are looked up on the wrapped
    model.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, loss_inputs=None):
        """Run the wrapped model and, when targets are given, compute the loss.

        Args:
            input_ids: Forwarded as the first positional argument of the model.
            attention_mask: Forwarded as the second positional argument.
            loss_inputs: Optional targets for ``CrossEntropyLoss``.

        Returns:
            The raw model output when ``loss_inputs`` is ``None``; otherwise a
            ``(output, loss)`` tuple.
        """
        output = self.model(input_ids, attention_mask)

        if loss_inputs is None:
            return output

        # The wrapped model may return a tuple rather than a bare tensor;
        # CrossEntropyLoss needs the logits tensor itself.
        if isinstance(output, (tuple, list)):
            logits = output[0]
        else:
            logits = getattr(output, "logits", output)

        loss = self.loss(logits, loss_inputs)
        return output, loss

    def __getattr__(self, attr):
        """Fall back to the wrapped model for attributes missing on the wrapper."""
        try:
            return super().__getattr__(attr)
        except AttributeError:
            # If `model` itself is not registered (e.g. before __init__ has
            # run), delegating through `self.model` would recurse forever.
            if attr == "model":
                raise
            return getattr(self.model, attr)


In [17]:
# Build the PopTorch training executor around the loss-computing wrapper.
# NOTE(review): no `optimizer` is passed here, so the AdamW instance created
# in an earlier cell is presumably not the one PopTorch uses — confirm.
tm = poptorch.trainingModel(
    model=WrappedModel(model),
    options=opts,
)

In [18]:
from transformers import Trainer, TrainingArguments

# Default training arguments; "test_trainer" is the output directory.
training_args = TrainingArguments("test_trainer")

# NOTE(review): the recorded run of this cell aborts with PopTorch's
# "Couldn't find a loss in graph!". The log reports that only the columns
# matching the executor's forward arguments are kept, so `loss_inputs` is
# presumably never supplied and `WrappedModel.forward` returns no loss —
# confirm, and consider driving `tm` directly with (inputs, labels) batches.
trainer = Trainer(
    model=tm,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `PoplarExecutor.forward` and have been ignored: text, token_type_ids.
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39


Error: In poptorch/source/RemoveSurplusIdentityLosses.cpp:103: 'poptorch_cpp_error': Couldn't find a loss in graph!