In [None]:
!pip install datasets transformers[sentencepiece]

In [None]:
!pip install sacremoses

In [1]:
import torch.cuda
import torch
from accelerate import Accelerator
from torch import nn

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialise the Accelerator with fp16 mixed precision.
# Only `mixed_precision` is passed: the separate `fp16=True` flag merely duplicated
# it, and it is a deprecated alias that newer accelerate releases reject with a
# TypeError.
accelerator = Accelerator(device_placement=True, mixed_precision='fp16')
# device = accelerator.device

# Training hyperparameters.
# NOTE(review): EPOCHS is not referenced by the TrainingArguments cell, which
# hard-codes num_train_epochs=3 — confirm which value is intended.
EPOCHS = 20
BATCH_SIZE = 4  # per-device batch size handed to TrainingArguments
EFFECTIVE_BATCH_SIZE = 32  # examples accumulated (per device) before an optimizer step
# Floor division would silently shrink the effective batch if BATCH_SIZE did not
# divide it evenly, so fail loudly instead.
assert EFFECTIVE_BATCH_SIZE % BATCH_SIZE == 0, "EFFECTIVE_BATCH_SIZE must be a multiple of BATCH_SIZE"
GRAD_ACCUM_STEPS = EFFECTIVE_BATCH_SIZE // BATCH_SIZE

In [2]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers.models.herbert.tokenization_herbert_fast import HerbertTokenizerFast
# Tokenizer matching the HerBERT checkpoint that is fine-tuned later on.
tokenizer = HerbertTokenizerFast.from_pretrained("allegro/herbert-base-cased")

# Dataset with 'sentence' and 'target' columns (see the cell output below).
raw_datasets = load_dataset("allegro/klej-polemo2-in")

# Map every distinct training target to an integer id, numbered from 0 in the
# order returned by `unique`.
target_mapper = {
    label: index
    for index, label in enumerate(raw_datasets['train'].unique('target'))
}

def tokenize_function(example):
    """Tokenize a batch of sentences and attach integer labels.

    Args:
        example: mapping with a 'sentence' entry (passed to the tokenizer) and a
            'target' entry (an iterable of raw targets). It is applied with
            ``batched=True``, so both entries hold one item per row.

    Returns:
        The tokenizer output with an added 'labels' entry holding the
        ``target_mapper`` id of each target.

    Raises:
        KeyError: if a target is not present in ``target_mapper``.
    """
    encoding = tokenizer(example['sentence'], truncation=True)
    # Translate each raw target into its integer class id.
    encoding['labels'] = list(map(target_mapper.__getitem__, example['target']))
    return encoding

# Tokenize every split, drop the columns that are not fed to the model, and
# return torch tensors when rows are accessed.
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['sentence', 'token_type_ids', 'target'])
    .with_format('torch')
)

from transformers import DataCollatorWithPadding
# Collator built from the same tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Display the untouched dataset (splits, columns and row counts).
raw_datasets

Using custom data configuration allegro--klej-polemo2-in-1c4a0b2602b47937
Reusing dataset csv (/home/bartekkrzepkowski/.cache/huggingface/datasets/csv/allegro--klej-polemo2-in-1c4a0b2602b47937/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'target'],
        num_rows: 5783
    })
    test: Dataset({
        features: ['sentence', 'target'],
        num_rows: 722
    })
    validation: Dataset({
        features: ['sentence', 'target'],
        num_rows: 723
    })
})

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the HerBERT checkpoint for sequence classification, with one output
# label per entry in target_mapper.
polemo_herbert = AutoModelForSequenceClassification.from_pretrained(
    "allegro/herbert-base-cased",
    num_labels=len(target_mapper),
)
# polemo_herbert

In [None]:
from transformers import TrainingArguments

# Trainer configuration: evaluate once per epoch, accumulate gradients over
# GRAD_ACCUM_STEPS batches of BATCH_SIZE, and train with fp16.
# NOTE(review): num_train_epochs is hard-coded to 3 although EPOCHS is defined
# in the first cell — confirm which value is intended.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    fp16=True,
)

In [None]:
def get_acc(eval_preds):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_preds: a pair that unpacks into (predictions, labels). The
            predictions must support ``argmax(axis=1)`` and the comparison with
            the labels must yield something with ``.mean()``.

    Returns:
        dict with a single 'accuracy' entry: the fraction of rows whose argmax
        along axis 1 equals the corresponding label.
    """
    scores, labels = eval_preds
    predicted_classes = scores.argmax(axis=1)
    is_correct = predicted_classes == labels
    return {'accuracy': is_correct.mean()}

In [None]:
from transformers import Trainer

# Assemble the Trainer: model, arguments, train/validation splits, the padding
# collator, and the accuracy metric computed at each evaluation.
trainer = Trainer(
    model=polemo_herbert,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=get_acc,
)

In [None]:
# Keep a handle on the collator the Trainer currently uses.
old_collator = trainer.data_collator


def _collate_as_dict(data):
    """Run the original collator and convert its result to a plain dict."""
    # NOTE(review): presumably a workaround for code that expects a built-in
    # dict rather than the collator's own return type — confirm.
    return dict(old_collator(data))


trainer.data_collator = _collate_as_dict

In [None]:
# Fine-tune the model with the Trainer configured in the cells above.
trainer.train()

In [None]:
# Display the device of the Accelerator created in the first code cell.
# NOTE(review): the visible cells never pass this Accelerator to the Trainer.
accelerator.device