## Environment Setup


In [2]:
try:
  import os
  import json
  from pathlib import Path
  import torch
  import numpy as np
  from google.colab import files

  import evaluate
  from transformers import (
      Trainer,
      TrainingArguments,
      DataCollatorForTokenClassification,
      EarlyStoppingCallback,
  )
  import optuna
  import wandb

  from nlpcw.utils import get_dataset, load_model, find_file, load_tokenizer, show_random_elements, tokenize_dataset
except:
  # TODO: temporary hack till colab upgrades pyarrow version
  %pip install -q pyarrow
  %pip install -q git+https://github.com/cogniveon/nlpcw.git

In [3]:
%env WANDB_PROJECT=COMM061-NLP-CW
%env WANDB_LOG_MODEL=end
%env WANDB_SILENT=True
%env TOKENIZERS_PARALLELISM true
wandb.login()

env: WANDB_PROJECT=COMM061-NLP-CW
env: WANDB_LOG_MODEL=end
env: WANDB_SILENT=True
env: TOKENIZERS_PARALLELISM=true


<IPython.core.display.Javascript object>

True

In [4]:
# @title Config
MODEL_NAME = "romainlhardy/roberta-large-finetuned-ner" # @param ["romainlhardy/roberta-large-finetuned-ner","google-bert/bert-base-uncased","romainlhardy/roberta-base-finetuned-ner","google-bert/bert-large-uncased", "romainlhardy/finetuned-ner", "romainlhardy/bert-finetuned-ner", "pucpr/biobertpt-all"] {"allow-input":true}
CHECKPOINT_PATH = None # @param {type:"raw"}
BATCH_SIZE = 8 # @param {type:"integer"}
NUM_EPOCHS = 20 # @param {type:"integer"}
LEARNING_RATE = 1e-6 # @param {type:"number"}
WEIGHT_DECAY = 0.001 # @param {type:"number"}
HPARAM_SEARCH = False # @param {type:"boolean"}

## Train

In [5]:
dataset, id2label, label2id, num_labels = get_dataset()
label_list = dataset["train"].features["ner_tags"].feature.names  # type: ignore
dataset

Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/1072 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1072 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 153
    })
})

In [6]:
tokenizer = load_tokenizer(
    exp_or_model_name=MODEL_NAME if CHECKPOINT_PATH == None else CHECKPOINT_PATH,
)

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [7]:
tokenized_dataset = tokenize_dataset(dataset, tokenizer)
tokenized_dataset

Tokenizing dataset:   0%|          | 0/1072 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 153
    })
})

In [8]:
metric = evaluate.load("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    assert results != None
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

args = TrainingArguments(
    output_dir=MODEL_NAME.replace("/", "-") if CHECKPOINT_PATH == None else CHECKPOINT_PATH.replace("/", "-"),
    seed=42,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=25,
    save_total_limit=2,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    report_to="wandb",
)


def model_init(trial):
    model, model_config = load_model(
        exp_or_model_name=MODEL_NAME if CHECKPOINT_PATH == None else CHECKPOINT_PATH,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    return model


class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = torch.nn.CrossEntropyLoss()


    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss = self.loss_fn(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    args=args,
    model_init=model_init,
    train_dataset=tokenized_dataset["train"],  # type: ignore
    eval_dataset=tokenized_dataset["validation"],  # type: ignore
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
if HPARAM_SEARCH:
  def optuna_hp_space(trial):
      return {
          "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
          "weight_decay": trial.suggest_float("weight_decay", 0.0001, 0.01),
          "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 12]),
          "num_train_epochs": trial.suggest_categorical("num_train_epochs", [10, 15, 20]),
      }

  best_trials = trainer.hyperparameter_search(
      backend="optuna",
      direction="minimize",
      hp_space=optuna_hp_space,
      n_trials=10,
  )
else:
  trainer.train()

trainer.evaluate()
wandb.finish()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4792,0.353242,0.850807,0.885608,0.867859,0.857413
2,0.3211,0.248332,0.922018,0.909029,0.915477,0.911867
3,0.3012,0.220597,0.931319,0.92057,0.925913,0.922254
4,0.2651,0.210965,0.937962,0.926341,0.932115,0.927762
5,0.2491,0.21045,0.941744,0.927359,0.934496,0.928864
6,0.2255,0.205006,0.942347,0.932111,0.937201,0.932641
7,0.2185,0.203423,0.942921,0.933639,0.938257,0.933743
8,0.2306,0.200241,0.946493,0.936694,0.941568,0.936261
9,0.226,0.199685,0.945784,0.938561,0.942159,0.93689
10,0.1965,0.201078,0.946117,0.93873,0.942409,0.93752


{'eval_loss': 0.20984238386154175,
 'eval_precision': 0.948358413132695,
 'eval_recall': 0.9412763068567549,
 'eval_f1': 0.9448040885860306,
 'eval_accuracy': 0.9392508655964746,
 'eval_runtime': 1.4797,
 'eval_samples_per_second': 85.151,
 'eval_steps_per_second': 10.813,
 'epoch': 20.0}

In [None]:
file_path = find_file(f"results/run-{best_trials.run_id}", "trainer_state.json")
if file_path:
  metrics = json.load(open(file_path, 'r'))
else:
  print("Metrics not found.")

In [None]:
print(f"Best trial hyperparameters: {json.dumps(best_trials.hyperparameters, indent=2)}")
print(f"Best trial final F1: {metrics['best_metric']:.3}")

Best trial hyperparameters: {
  "learning_rate": 3.1704250757279784e-06,
  "per_device_train_batch_size": 8,
  "num_train_epochs": 10
}
Best trial final F1: 0.932


In [None]:
# @title Download optuna results
zipfile_name = "google-bert_bert-large-uncased_optuna_runs" # @param {"type":"string"}
!zip -rq {zipfile_name}.zip results -x "*.pt" "*.safetensors" "*.bin" "*.pth"
files.download(f"{zipfile_name}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>