### This notebook contains a pipeline for solving an NLP classification task (GLUE CoLA) with transformer-based methods.

                                                                                          Created by: Tsvigun Akim

In [1]:
import os

# Must run before any CUDA-aware library is imported: an empty device list
# hides every GPU from this process, so everything below runs on CPU.
os.environ.update({"CUDA_VISIBLE_DEVICES": ""})

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import plotly.express as px

import transformers as ts
import datasets
import torch
from torch import nn

In [3]:
# Load the CoLA task of the GLUE benchmark (downloaded on first use, then
# reused from the local `tmp` cache directory).
data = datasets.load_dataset(path='glue', name='cola', cache_dir='tmp')
data

Downloading:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/cola (download: 368.14 KiB, generated: 596.73 KiB, post-processed: Unknown size, total: 964.86 KiB) to tmp/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading:   0%|          | 0.00/377k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset glue downloaded and prepared to tmp/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [10]:
# Load the pretrained tokenizer of the `distilbert-base-uncased` checkpoint.
tokenizer = ts.AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased',
    cache_dir='tmp',
)
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [26]:
def tokenizing_fn(instance):
    """Tokenize the text stored under the ``'sentence'`` key of ``instance``.

    Uses the module-level ``tokenizer`` with ``truncation=True``; no padding
    argument is passed here.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    sentence = instance['sentence']
    return tokenizer(sentence, truncation=True)

# Tokenize the train and validation splits. `batched=True` hands the
# tokenizer whole batches of sentences instead of one example per call,
# which is considerably faster with a fast tokenizer and produces the same
# `input_ids` / `attention_mask` columns.
tokenized_train = data['train'].map(tokenizing_fn, batched=True)
tokenized_val = data['validation'].map(tokenizing_fn, batched=True)
tokenized_train

  0%|          | 0/8551 [00:00<?, ?ex/s]

  0%|          | 0/1043 [00:00<?, ?ex/s]

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 1043
})

In [27]:
def _drop_present_columns(dataset, columns):
    """Return `dataset` without those of `columns` that it still contains.

    Columns that are already absent are skipped, so calling this again on an
    already-stripped dataset returns it unchanged instead of raising.
    """
    present = [col for col in columns if col in dataset.column_names]
    return dataset.remove_columns(present) if present else dataset


# Drop the raw text and index columns, keeping `label`, `input_ids` and
# `attention_mask`. Going through the helper keeps this cell safe to re-run.
tokenized_train = _drop_present_columns(tokenized_train, ['sentence', 'idx'])
tokenized_val = _drop_present_columns(tokenized_val, ['sentence', 'idx'])
tokenized_train

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 8551
})

In [28]:
# Collator that pads the examples of each batch using the tokenizer's padding
# token, so the tokenized datasets themselves stay unpadded.
data_collator = ts.DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [32]:
# Metric objects consumed by `compute_metrics`: plain accuracy and the GLUE
# CoLA metric (shown as "Matthews Correlation" in the training log).
# NOTE(review): newer `datasets` releases appear to deprecate `load_metric`
# in favour of the separate `evaluate` package — confirm before upgrading.
accuracy_metric = datasets.load_metric(
    path='accuracy', cache_dir='tmp/acc_metrics')
cola_metric = datasets.load_metric(
    path='glue', config_name='cola', cache_dir='tmp/cola_metrics')

In [33]:
def compute_metrics(outputs):
    """Score one evaluation pass.

    Args:
        outputs: An object that unpacks into ``(logits, labels)``. The
            predicted class of each row is the argmax of the logits along
            axis 1.

    Returns:
        dict: The result of ``accuracy_metric.compute`` merged with the
        result of ``cola_metric.compute`` (both are module-level metric
        objects); entries of the latter win on a key clash.
    """
    scores, gold = outputs
    predicted = scores.argmax(1)
    shared_kwargs = dict(references=gold, predictions=predicted)

    return {
        **accuracy_metric.compute(**shared_kwargs),
        **cola_metric.compute(**shared_kwargs),
    }

In [36]:
# Pretrained DistilBERT checkpoint wrapped with a sequence-classification
# head for two labels.
model = ts.AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased',
    num_labels=2,
    cache_dir='tmp/model',
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

In [39]:
training_args = ts.TrainingArguments(
    output_dir="tmp/model_output",
    # Training length and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=150,
    # Optimizer
    learning_rate=3e-5,
    weight_decay=1e-2,
    max_grad_norm=1.0,
    # Learning-rate scheduler
    warmup_ratio=0.1,
    # Evaluate, log and checkpoint once per epoch; keep at most 3 checkpoints
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # Model selection: reload the checkpoint with the best validation accuracy
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    # Weights & Biases logging
    report_to="wandb",
    run_name="run_name",  # placeholder name of the W&B run (optional)
)
# Early stopping with a patience of 3 evaluations.
callbacks = [ts.EarlyStoppingCallback(early_stopping_patience=3)]

In [40]:
# Wire the model, the datasets, the batching logic and the metric code into
# a single Trainer instance.
trainer = ts.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=callbacks,
)

In [41]:
# Fine-tune the model; the returned training summary is kept under a name
# and shown as the cell output.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 8551
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1340


Epoch,Training Loss,Validation Loss,Accuracy,Matthews Correlation
1,0.555,0.5049,0.767018,0.403801
2,0.3621,0.488262,0.801534,0.504599
3,0.2139,0.565243,0.807287,0.521952
4,0.1374,0.637477,0.813998,0.542906
5,0.0884,0.777554,0.817833,0.551796


***** Running Evaluation *****
  Num examples = 1043
  Batch size = 150
Saving model checkpoint to tmp/model_output/checkpoint-268
Configuration saved in tmp/model_output/checkpoint-268/config.json
Model weights saved in tmp/model_output/checkpoint-268/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 150
Saving model checkpoint to tmp/model_output/checkpoint-536
Configuration saved in tmp/model_output/checkpoint-536/config.json
Model weights saved in tmp/model_output/checkpoint-536/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 150
Saving model checkpoint to tmp/model_output/checkpoint-804
Configuration saved in tmp/model_output/checkpoint-804/config.json
Model weights saved in tmp/model_output/checkpoint-804/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 150
Saving model checkpoint to tmp/model_output/checkpoint-1072
Configuration saved in tmp/model_output/checkpoint-1072

TrainOutput(global_step=1340, training_loss=0.2713597311902402, metrics={'train_runtime': 242.8359, 'train_samples_per_second': 176.065, 'train_steps_per_second': 5.518, 'total_flos': 256606427072280.0, 'train_loss': 0.2713597311902402, 'epoch': 5.0})

In [42]:
# Run inference on the validation split. The returned object bundles the
# logits, the gold labels and the metrics; note that the metric keys carry a
# `test_` prefix even though the data is the validation set.
predictions = trainer.predict(test_dataset=tokenized_val)
predictions

***** Running Prediction *****
  Num examples = 1043
  Batch size = 150


PredictionOutput(predictions=array([[-2.99295  ,  2.7972248],
       [-3.0615141,  2.922417 ],
       [-1.0658823,  1.0423502],
       ...,
       [-3.1941745,  2.996454 ],
       [-3.1349125,  2.9470425],
       [ 1.5396605, -1.673574 ]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.7775542736053467, 'test_accuracy': 0.8178331735378715, 'test_matthews_correlation': 0.5517964161621091, 'test_runtime': 1.802, 'test_samples_per_second': 578.788, 'test_steps_per_second': 3.884})