In [1]:
!pip -q install transformers
!pip -q install datasets

# For hyperparam tuning
!pip -q install optuna
!pip -q install ray[tune]

[K     |████████████████████████████████| 2.3MB 7.8MB/s 
[K     |████████████████████████████████| 3.3MB 51.8MB/s 
[K     |████████████████████████████████| 901kB 47.5MB/s 
[K     |████████████████████████████████| 225kB 9.1MB/s 
[K     |████████████████████████████████| 112kB 52.8MB/s 
[K     |████████████████████████████████| 245kB 46.1MB/s 
[K     |████████████████████████████████| 296kB 7.5MB/s 
[K     |████████████████████████████████| 1.2MB 11.6MB/s 
[K     |████████████████████████████████| 81kB 8.9MB/s 
[K     |████████████████████████████████| 81kB 9.9MB/s 
[K     |████████████████████████████████| 51kB 8.4MB/s 
[K     |████████████████████████████████| 112kB 26.0MB/s 
[K     |████████████████████████████████| 143kB 21.7MB/s 
[?25h  Building wheel for alembic (setup.py) ... [?25l[?25hdone
  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 49.7MB 62kB/s 
[K     |████████████████████████████████| 1.3MB 40.4M

# Intro

CoLA (Corpus of Linguistic Acceptability) : One of GLUE (General Language Understanding Evaluation) Benchmarking dataset, aimed to classify if a sentence is grammatically correct. Dataset consists of labels for supervised deep learning.

*More about GLUE Dataset : [link](https://gluebenchmark.com/tasks)*

# Config

In [2]:
DATASET_NAME = 'cola'   
NUM_LABELS = 2
METRIC_NAME = "matthews_correlation"
BATCH_SIZE = 16
PRETRAINED_MODEL_NAME = "distilbert-base-uncased"

# Import Modules

In [3]:
import random
import numpy as np
import pandas as pd
from collections import Counter

# from datasets import list_datasets, list_metrics      # to explore HuggingFace built-in datasets & metrics
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from IPython.display import display, HTML

# Load Dataset

In [4]:
dataset = load_dataset("glue", DATASET_NAME)
# metric = load_metric("glue", DATASET_NAME)
metric = load_metric(METRIC_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7777.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4473.0, style=ProgressStyle(description…


Downloading and preparing dataset glue/cola (download: 368.14 KiB, generated: 596.73 KiB, post-processed: Unknown size, total: 964.86 KiB) to /root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=376971.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1499.0, style=ProgressStyle(description…

Couldn't find file locally at matthews_correlation/matthews_correlation.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.6.2/metrics/matthews_correlation/matthews_correlation.py.
The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/metrics/matthews_correlation/matthews_correlation.py.





# Pre-Process Data

In [5]:
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, use_fast=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [6]:
# Test tokenizer
def test_tokenizer(s):
    tokens = tokenizer(s)
    print(tokens)

    decoded = []
    for t in tokens['input_ids']:
        decoded.append(tokenizer.decode(t))
    print(" ".join(decoded))


test_tokenizer("I am Alvin and I love Sashimi!")

{'input_ids': [101, 1045, 2572, 17348, 1998, 1045, 2293, 24511, 27605, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] i am alvin and i love sash ##imi ! [SEP]


In [7]:
for k in dataset.keys():
    print(dataset[k])
    print(Counter(dataset[k]['label']))

# Data is skewed towards "grammatically-correct" class (2.2x more than another)
# Stratifying classes might help with the result so that potential bias is cleared

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})
Counter({1: 6023, 0: 2528})
Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 1043
})
Counter({1: 721, 0: 322})
Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 1063
})
Counter({-1: 1063})


In [8]:
# use dataset (datasets.dataset_dict.DatasetDict) built-in method, i.e. map to add encoded tokens to each sentence
encoded_dataset = dataset.map(lambda s : tokenizer(s["sentence"], max_length=512, truncation=True), batched=True)       # add more s['...'] into tokenizer if sample has >1 sentence

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [9]:
# before-and-after encoding
print(dataset['train'][0].keys())
print(encoded_dataset['train'][0].keys())

dict_keys(['idx', 'label', 'sentence'])
dict_keys(['attention_mask', 'idx', 'input_ids', 'label', 'sentence'])


# Train Model

In [10]:
# make `model` callable to be initialized in Trainer for reproducibility
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

In [11]:
# Initialize Model; 
# IMPORTANT : re-initialize `model` before `trainer.train()` is called every time
# model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

args = TrainingArguments(
            "glue-model",                           # folder name to save model checkpoints
            evaluation_strategy = "epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=3,
            weight_decay=0.01,
            load_best_model_at_end=True,
            seed = 42,                              # fix random state
            metric_for_best_model=METRIC_NAME,      # evaluation to be done at the end of each epoch to choose the best model
        )


def compute_metrics(preds):
    predictions, labels = preds
    predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
            # model,
            model_init = model_init,        # initialize model in Trainer for reproducibility
            args=args,
            train_dataset=encoded_dataset["train"],
            eval_dataset=encoded_dataset["validation"],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5195,0.462983,0.21657
2,0.341,0.49934,0.047403
3,0.2355,0.582495,-0.083408


TrainOutput(global_step=1605, training_loss=0.3550419697509005, metrics={'train_runtime': 155.4495, 'train_samples_per_second': 10.325, 'total_flos': 0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 3358851072, 'init_mem_gpu_alloc_delta': 268953088, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 13754368, 'train_mem_gpu_alloc_delta': 811548160, 'train_mem_cpu_peaked_delta': 1814528, 'train_mem_gpu_peaked_delta': 290513408})

# Evaluate

In [12]:
# Check which model (by epoch) was selected by the trainer
trainer.evaluate()

# trainer picked first epoch, which holds the highest Matthews Corr. (higher is better)
# Further epochs have less training loss but higher validation loss, which indicates over-fitting

{'epoch': 3.0,
 'eval_loss': 0.46298307180404663,
 'eval_matthews_correlation': 0.21656997961500624,
 'eval_mem_cpu_alloc_delta': -143360,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 20276736,
 'eval_runtime': 1.3312,
 'eval_samples_per_second': 783.522}

In [13]:
type(encoded_dataset["test"])

datasets.arrow_dataset.Dataset

In [14]:
# trainer.predict(encoded_dataset["test"])

# Hyperparam Tuning

In [15]:
# Chop train set to smaller sample size to speed up hyperparam searching process
train_dataset = encoded_dataset["train"].shard(index=1, num_shards=3)

In [16]:
# REMARK : re-initialize Trainer with `model_init` if `model` arg is used instead so that searches don't intefere one another
trainer_optimal = Trainer(
                        model_init = model_init,        # initialize model in Trainer for reproducibility
                        args=args,
                        train_dataset=train_dataset,
                        eval_dataset=encoded_dataset["validation"],
                        tokenizer=tokenizer,
                        compute_metrics=compute_metrics
                    )

# run 5 searches, aimed to maximise metric
best_run = trainer_optimal.hyperparameter_search(n_trials=8, direction="maximize")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.549873,0.036049


[32m[I 2021-05-15 13:38:50,557][0m Trial 0 finished with value: 0.03604886522537921 and parameters: {'learning_rate': 9.240497164069721e-05, 'num_train_epochs': 1, 'seed': 22, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.03604886522537921.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassifi

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.614943,0.0
2,No log,0.601004,0.0
3,0.607000,0.590663,0.0
4,0.607000,0.586719,0.0



invalid value encountered in double_scalars


invalid value encountered in double_scalars


invalid value encountered in double_scalars


invalid value encountered in double_scalars

[32m[I 2021-05-15 13:40:13,905][0m Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 2.0220383062329964e-06, 'num_train_epochs': 4, 'seed': 29, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.03604886522537921.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected i

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.637732,0.0
2,No log,0.614547,0.0
3,No log,0.609851,0.0
4,No log,0.608846,0.0



invalid value encountered in double_scalars


invalid value encountered in double_scalars


invalid value encountered in double_scalars


invalid value encountered in double_scalars

[32m[I 2021-05-15 13:41:16,766][0m Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 1.0917406623793932e-06, 'num_train_epochs': 4, 'seed': 17, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.03604886522537921.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected i

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.605875,0.0



invalid value encountered in double_scalars

[32m[I 2021-05-15 13:41:38,939][0m Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 5.728502656683331e-06, 'num_train_epochs': 1, 'seed': 10, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.03604886522537921.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initiali

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.6387,0.614062,0.0
2,0.6052,0.612606,0.0



invalid value encountered in double_scalars


invalid value encountered in double_scalars

[32m[I 2021-05-15 13:43:51,681][0m Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 1.0113962526847744e-06, 'num_train_epochs': 2, 'seed': 38, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.03604886522537921.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model tha

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.582,0.537586,0.190967
2,0.481,0.71529,-0.065036
3,0.3639,0.840129,-0.071423


[32m[I 2021-05-15 13:47:08,833][0m Trial 5 finished with value: -0.07142313337282481 and parameters: {'learning_rate': 9.546098232452535e-06, 'num_train_epochs': 3, 'seed': 23, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.03604886522537921.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassifi

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.535023,0.166543
2,No log,0.548226,0.207783
3,No log,0.592669,0.20654
4,No log,0.623665,0.130128


[32m[I 2021-05-15 13:48:12,769][0m Trial 6 finished with value: 0.13012803274337575 and parameters: {'learning_rate': 3.481493150022609e-05, 'num_train_epochs': 4, 'seed': 23, 'per_device_train_batch_size': 64}. Best is trial 6 with value: 0.13012803274337575.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassifi

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.623567,0.0



invalid value encountered in double_scalars

[32m[I 2021-05-15 13:48:50,086][0m Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 1.1630234343396767e-06, 'num_train_epochs': 1, 'seed': 23, 'per_device_train_batch_size': 8}. Best is trial 6 with value: 0.13012803274337575.[0m


In [17]:
best_run

BestRun(run_id='6', objective=0.13012803274337575, hyperparameters={'learning_rate': 3.481493150022609e-05, 'num_train_epochs': 4, 'seed': 23, 'per_device_train_batch_size': 64})

# Retrain Model w/ Optimal Hyperparam

In [20]:
for param, v in best_run.hyperparameters.items():
    setattr(trainer_optimal.args, param, v)     # Python built-in "set attribute" function; (obj, attr, val)

trainer_optimal.train()

# result from best model is reproduced
# it is worse than our original model due to smaller training sample size.

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.535023,0.166543
2,No log,0.548226,0.207783
3,No log,0.592669,0.20654
4,No log,0.623665,0.130128


TrainOutput(global_step=180, training_loss=0.3871885935465495, metrics={'train_runtime': 60.4578, 'train_samples_per_second': 2.977, 'total_flos': 0, 'epoch': 4.0, 'train_mem_cpu_alloc_delta': -177569792, 'train_mem_gpu_alloc_delta': 3292160, 'train_mem_cpu_peaked_delta': 179171328, 'train_mem_gpu_peaked_delta': 724139520})