In [5]:
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
import torch


In [6]:
# The tokenizer and the sequence-classification model are loaded from the same
# pretrained checkpoint; the model is created with a single output
# (num_labels=1).
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=1,
)

# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
# outputs = model(**inputs, labels=labels)
# loss = outputs.loss
# logits = outputs.logits

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# One local JSON-lines file per split.
split_files = {'train': 'train.jsonl', 'test': 'test.jsonl'}
dataset = load_dataset('json', data_files=split_files)


Using custom data configuration default-cdd89e05d2cdb522


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-cdd89e05d2cdb522/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-cdd89e05d2cdb522/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
def preprocess_data(examples):
    """Tokenize one example (or a batch of examples) and attach float labels.

    Works both with ``dataset.map(preprocess_data)``, where ``examples`` is a
    single row, and with ``dataset.map(preprocess_data, batched=True)``, where
    every value in ``examples`` is a list.

    Args:
        examples: Mapping with a ``"sentence1"`` text field, a ``"label"``
            field and, optionally, a ``"sentence2"`` text field.

    Returns:
        The encoding produced by the module-level ``tokenizer`` with an extra
        ``"labels"`` entry: a float for a single row, a list of floats for a
        batch.
    """
    # The dataset rows also carry a `sentence2` column; encoding only
    # `sentence1` hides half of each example from the model. When the column
    # is present the two sentences are encoded together as a pair.
    # NOTE(review): the column set looks like a sentence-pair task — confirm
    # that pair encoding is the intended model input.
    second_sentence = examples["sentence2"] if "sentence2" in examples else None
    encoding = tokenizer(
        examples["sentence1"],
        text_pair=second_sentence,
        padding="max_length",
        truncation=True,
    )

    # Labels are stored as floats (the model is created with a single output).
    # A batched call delivers a list of labels, which float() cannot convert
    # directly, so each element is converted on its own.
    raw_label = examples["label"]
    if isinstance(raw_label, (list, tuple)):
        encoding["labels"] = [float(value) for value in raw_label]
    else:
        encoding["labels"] = float(raw_label)

    return encoding
data_collator = DataCollatorWithPadding(tokenizer)

# Run the preprocessing over every split, then discard the raw columns so only
# the tokenizer outputs and the labels remain.
raw_columns = [
    'word', 'sentence1', 'sentence2', 'idx', 'label',
    'start1', 'start2', 'end1', 'end2', 'version',
]
encoded_dataset = dataset.map(preprocess_data).remove_columns(raw_columns)

# Report the shape of each split.
for split_name in encoded_dataset:
    print(split_name, encoded_dataset[split_name].shape)

# Switch the output format of the splits to "torch".
encoded_dataset.set_format("torch")

# Shuffled 500-row subsets of the train and test splits.
subset_rows = range(500)
small_train_dataset = encoded_dataset["train"].shuffle(seed=42).select(subset_rows)
small_eval_dataset = encoded_dataset["test"].shuffle(seed=42).select(subset_rows)


  0%|          | 0/5000 [00:00<?, ?ex/s]

  0%|          | 0/638 [00:00<?, ?ex/s]

train (5000, 4)
test (638, 4)


In [11]:
# Show the dataset summary first, then its `input_ids` column.
for shown in (small_train_dataset, small_train_dataset['input_ids']):
    print(shown)

# for i in small_train_dataset['labels']: 
#     print(i)




Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 500
})
tensor([[  101,  2260,  2132,  ...,     0,     0,     0],
        [  101,  1996, 15114,  ...,     0,     0,     0],
        [  101,  7374,  1037,  ...,     0,     0,     0],
        ...,
        [  101,  2002,  2985,  ...,     0,     0,     0],
        [  101,  1037,  4187,  ...,     0,     0,     0],
        [  101,  1037,  6293,  ...,     0,     0,     0]])


In [8]:
# NOTE(review): this cell has execution count 8 although it sits below the
# cells numbered 9-11, so it was last run out of order. Presumably it is here
# to free cached GPU memory before training; confirm it is still needed on a
# clean top-to-bottom run.
torch.cuda.empty_cache()


In [None]:

# Save a checkpoint and evaluate once per epoch; outputs go to the current
# directory.
training_options = dict(
    output_dir=".",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    per_device_train_batch_size=13,
    per_device_eval_batch_size=13,
    num_train_epochs=50,
)
training_args = TrainingArguments(**training_options)

# Train on the full train split and evaluate on the full test split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    data_collator=data_collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 5000
  Num Epochs = 50
  Instantaneous batch size per device = 13
  Total train batch size (w. parallel, distributed & accumulation) = 13
  Gradient Accumulation steps = 1
  Total optimization steps = 19250


Epoch,Training Loss,Validation Loss


In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair handed over by Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has one row per
            example and either a single column (single-output head) or one
            column per class.

    Returns:
        The result of ``metric.compute`` for the derived class predictions.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)

    if logits.ndim == 1 or logits.shape[-1] == 1:
        # Single-output head: argmax over one column is always 0, which would
        # make the reported accuracy meaningless. Threshold the score instead.
        # NOTE(review): 0.5 assumes the float labels are 0.0 / 1.0 — confirm.
        predictions = (logits.reshape(-1) > 0.5).astype(np.int64)
    else:
        predictions = np.argmax(logits, axis=-1)

    # The labels may arrive as floats; the accuracy metric compares class ids,
    # so they are rounded and cast to integers.
    references = np.rint(np.asarray(labels).reshape(-1)).astype(np.int64)
    return metric.compute(predictions=predictions, references=references)

In [None]:
# Rebuild the trainer on the 500-row subsets with `compute_metrics` attached,
# then run one evaluation pass.
evaluation_setup = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**evaluation_setup)
trainer.evaluate()