In [None]:
import transformers
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, Trainer
from transformers import TrainingArguments
from datasets import ClassLabel, Value
from transformers import DataCollatorWithPadding
import numpy as np
from datasets import load_metric
from datasets import load_dataset
from tqdm import tqdm

In [None]:
# Pretrained checkpoint shared by the tokenizer and the model so that the
# vocabulary and the encoder weights match.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Two output labels: the task is binary (duplicate / not duplicate). The
# library warning printed below this cell reports that some weights were not
# initialized from the checkpoint, i.e. the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, and move
# the model's parameters onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
# Load the Quora question-pairs dataset. As the output below shows, it comes
# back as a DatasetDict holding a single `train` split with the columns
# `questions` and `is_duplicate`.
raw_datasets = load_dataset("quora")
raw_datasets

Using custom data configuration default
Reusing dataset quora (/home/tafseer/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['questions', 'is_duplicate'],
        num_rows: 404290
    })
})

In [None]:
def tokenize_function(example):
    """Tokenize a batch of question pairs as paired sequences.

    Args:
        example: A batch mapping whose ``'questions'`` entry is a sequence of
            records; each record's ``'text'`` field holds the two questions
            of a pair at positions 0 and 1.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on the list of
        first questions and the list of second questions with
        ``truncation=True``.
    """
    question_pairs = [record['text'] for record in example['questions']]
    first_questions = [pair[0] for pair in question_pairs]
    second_questions = [pair[1] for pair in question_pairs]
    return tokenizer(first_questions, second_questions, truncation=True)

In [None]:
# Tokenize the whole `train` split. `batched=True` hands `tokenize_function`
# a batch of rows per call instead of a single row. The output below shows the
# added columns: `input_ids`, `token_type_ids` and `attention_mask`.
tokenized_datasets = raw_datasets['train'].map(tokenize_function, batched=True)
tokenized_datasets



  0%|          | 0/405 [00:00<?, ?ba/s]

Dataset({
    features: ['questions', 'is_duplicate', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 404290
})

In [None]:
# Fixed seed so the 80/20 partition (and therefore every reported metric) is
# the same on each Restart & Run All; without it the split is re-drawn at
# random on every run.
SPLIT_SEED = 42

# Declare `is_duplicate` as a two-class label: 0 = not_duplicate, 1 = duplicate.
new_features = tokenized_datasets.features.copy()
new_features["is_duplicate"] = ClassLabel(num_classes=2, names=['not_duplicate', 'duplicate'])

# NOTE: from here on `tokenized_datasets` is rebound, ending up as a
# DatasetDict with `train`/`test` splits instead of the single Dataset produced
# by the `.map(...)` cell. Re-run that cell before re-running this one.
tokenized_datasets = tokenized_datasets.cast(new_features)
# Drop the raw text column (it is already tokenized) and use the target
# column name `labels`.
tokenized_datasets = tokenized_datasets.remove_columns('questions').rename_column('is_duplicate', 'labels')
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=SPLIT_SEED)
tokenized_datasets

Casting the dataset:   0%|          | 0/41 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 323432
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 80858
    })
})

In [None]:
# Collator that pads the examples of each batch with the tokenizer's padding
# settings; the tokenization step above truncates but does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Sanity check: collate the first eight training rows into one batch on the
# target device and show the shape of every tensor in it.
samples = tokenized_datasets['train'][:8]
batch = data_collator(samples).to(device)
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The logits are reduced with
            ``argmax`` over the last axis to obtain predicted class ids.

    Returns:
        The value returned by ``metric.compute`` for these predictions and
        reference labels.
    """
    # Load the metric once and cache it on the function object. The function
    # is called on every evaluation pass, and reloading the metric each time
    # is wasted work. Re-running the defining cell resets the cache.
    metric = getattr(compute_metrics, "_metric", None)
    if metric is None:
        metric = load_metric("glue", "mrpc")
        compute_metrics._metric = metric

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, three epochs,
# batches of 32 per device for both training and evaluation, and no external
# experiment-tracking integration.
training_args = TrainingArguments(
    output_dir="./quora-saved-model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)

# Wire the model, the data splits, the padding collator and the metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning. With the arguments configured above, the model is evaluated
# and a checkpoint is saved at the end of each of the three epochs.
# In the recorded run below, validation loss rises in epoch 3 while accuracy
# and F1 still improve slightly.
trainer.train()

***** Running training *****
  Num examples = 323432
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 30324


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2656,0.248419,0.89677,0.862567
2,0.1774,0.245728,0.909817,0.877351
3,0.103,0.300438,0.912043,0.880495


***** Running Evaluation *****
  Num examples = 80858
  Batch size = 32
Saving model checkpoint to ./quora-saved-model/checkpoint-10108
Configuration saved in ./quora-saved-model/checkpoint-10108/config.json
Model weights saved in ./quora-saved-model/checkpoint-10108/pytorch_model.bin
tokenizer config file saved in ./quora-saved-model/checkpoint-10108/tokenizer_config.json
Special tokens file saved in ./quora-saved-model/checkpoint-10108/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 80858
  Batch size = 32
Saving model checkpoint to ./quora-saved-model/checkpoint-20216
Configuration saved in ./quora-saved-model/checkpoint-20216/config.json
Model weights saved in ./quora-saved-model/checkpoint-20216/pytorch_model.bin
tokenizer config file saved in ./quora-saved-model/checkpoint-20216/tokenizer_config.json
Special tokens file saved in ./quora-saved-model/checkpoint-20216/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 80858
  Batch size =

TrainOutput(global_step=30324, training_loss=0.19583375603963038, metrics={'train_runtime': 2564.0055, 'train_samples_per_second': 378.43, 'train_steps_per_second': 11.827, 'total_flos': 3.400379857264704e+16, 'train_loss': 0.19583375603963038, 'epoch': 3.0})