In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, set_seed

In [2]:
# Name of the pretrained checkpoint; reused below to load the tokenizer.
checkpoint = 'bert-base-cased'
# Load the 'mrpc' configuration of the 'glue' dataset.
raw_datasets = load_dataset('glue', 'mrpc')
# Tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Reusing dataset glue (/home/jnavio/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [3]:
# Display the loaded dataset object to inspect its splits and columns.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 408
    })
    test: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 1725
    })
})

In [4]:
def tokenize_function(examples):
    """Tokenize the two sentence columns of ``examples`` as sentence pairs.

    Args:
        examples: Mapping that provides the 'sentence1' and 'sentence2'
            entries to tokenize (a batch of rows when this function is
            applied through ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, called
        with truncation enabled. No padding is requested here.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [5]:
# Apply tokenize_function across raw_datasets; batched=True hands it several
# rows per call instead of one row at a time.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)

Loading cached processed dataset at /home/jnavio/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b2a91f7030334a11.arrow
100%|██████████| 1/1 [00:00<00:00,  8.14ba/s]
Loading cached processed dataset at /home/jnavio/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-309c2d85c8921379.arrow


In [6]:
# Collator built from the tokenizer; it is handed to the Trainer below to pad
# the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# The Trainer automatically removes columns and renames columns by analyzing the model's signature.

In [8]:
# Last step before creating the trainer: define our model and some hyperparameters.
from transformers import AutoModelForSequenceClassification
# The sequence-classification head is not part of the pretrained checkpoint and
# is randomly initialized, so seed the RNGs first; otherwise the metrics of the
# untrained model change on every kernel restart. 42 mirrors the default `seed`
# of TrainingArguments.
set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [9]:
from transformers import TrainingArguments

# Hyperparameters for the run; 'test-trainer' is the output directory.
training_args = TrainingArguments(
    output_dir='test-trainer',
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [10]:
# We can pass everything to the Trainer class and start training.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)
# Training is left disabled in this cell:
# trainer.train()


In [11]:
# Run inference on the validation split, then print the shape of the model
# outputs followed by the shape of the reference labels.
predictions = trainer.predict(test_dataset=tokenized_datasets['validation'])
print(
    predictions.predictions.shape,
    predictions.label_ids.shape,
)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2.
***** Running Prediction *****
  Num examples = 408
  Batch size = 16
100%|██████████| 26/26 [01:24<00:00,  3.38s/it](408, 2) (408,)


In [14]:
import numpy as np
from datasets import load_metric

# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favor of the separate `evaluate` library -- confirm against the installed version.
metric = load_metric('glue', 'mrpc')
# Predicted class = index of the largest value along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)
# Last expression of the cell, so the resulting metric dict is displayed.
metric.compute(references=predictions.label_ids, predictions=preds)


{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

In [15]:
def compute_metrics(eval_preds):
    """Score a set of evaluation outputs with the module-level ``metric``.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        the labels.
    """
    scores, gold_labels = eval_preds
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

In [16]:
# Evaluate at the end of every epoch; 'test-trainer' is the output directory.
# NOTE(review): unlike the earlier TrainingArguments, no batch size, epoch count,
# learning rate or weight decay is passed here, so the library defaults apply --
# confirm this is intended.
training_args = TrainingArguments(
    'test-trainer',
    evaluation_strategy='epoch')
# Start again from a fresh model. Its classification head is randomly
# initialized, so seed the RNGs with the seed configured in training_args
# before instantiating it; otherwise every kernel restart trains a differently
# initialized model.
set_seed(training_args.seed)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /home/jnavio/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
 

In [17]:
# Build the Trainer with compute_metrics attached so that evaluation reports
# the metric values in addition to the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

In [18]:
# Launch fine-tuning with the trainer configured above.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2.
***** Running training *****
  Num examples = 3668
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1377

  0%|          | 0/1377 [00:00<?, ?it/s][A
  0%|          | 1/1377 [00:06<2:23:58,  6.28s/it][A
  0%|          | 2/1377 [00:11<2:18:11,  6.03s/it][A
  0%|          | 3/1377 [00:16<2:07:41,  5.58s/it][A
  0%|          | 4/1377 [00:21<2:08:26,  5.61s/it][A
  0%|          | 5/1377 [00:27<2:08:08,  5.60s/it][A
  0%|          | 6/1377 [00:32<2:01:05,  5.30s/it][A
  1%|          | 7/1377 [00:37<2:04:22,  5.45s/it][A
  1%|          | 8/1377 [00:43<2:05:15,  5.49s/it][A
  1%|          | 9/1377 [00:49<2:08:42,  5.65s/it][A
  1%|          | 10/1377 [00:56<2:17:29,  6.03s/

KeyboardInterrupt: 