## Environment Setup


In [3]:
try:
  import os
  import json
  from pathlib import Path
  import torch
  import numpy as np
  from google.colab import files

  import evaluate
  from transformers import (
      Trainer,
      TrainingArguments,
      DataCollatorForTokenClassification,
      EarlyStoppingCallback,
  )
  import optuna
  import wandb

  from nlpcw.utils import get_dataset, load_model, find_file, load_tokenizer, show_random_elements, tokenize_dataset
except:
  # TODO: temporary hack till colab upgrades pyarrow version
  %pip install -q pyarrow
  %pip install -q git+https://github.com/cogniveon/nlpcw.git

In [4]:
%env WANDB_PROJECT=COMM061-NLP-CW
%env WANDB_LOG_MODEL=end
%env WANDB_SILENT=True
%env TOKENIZERS_PARALLELISM true
wandb.login()

env: WANDB_PROJECT=COMM061-NLP-CW
env: WANDB_LOG_MODEL=end
env: WANDB_SILENT=True
env: TOKENIZERS_PARALLELISM=true


<IPython.core.display.Javascript object>

wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


True

In [5]:
# @title Config
MODEL_NAME = "romainlhardy/roberta-large-finetuned-ner" # @param ["romainlhardy/roberta-large-finetuned-ner","google-bert/bert-base-uncased", "google-bert/bert-large-uncased", "romainlhardy/finetuned-ner", "romainlhardy/bert-finetuned-ner", "pucpr/biobertpt-all"] {"allow-input":true}
CHECKPOINT_PATH = None # @param {type:"raw"}
BATCH_SIZE = 8 # @param {type:"integer"}
NUM_EPOCHS = 20 # @param {type:"integer"}
LEARNING_RATE = 1e-6 # @param {type:"number"}
WEIGHT_DECAY = 0.001 # @param {type:"number"}
HPARAM_SEARCH = True # @param {type:"boolean"}

## Train

In [6]:
dataset, id2label, label2id, num_labels = get_dataset()
label_list = dataset["train"].features["ner_tags"].feature.names  # type: ignore
dataset

Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/1072 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1072 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 153
    })
})

In [7]:
tokenizer = load_tokenizer(
    exp_or_model_name=MODEL_NAME if CHECKPOINT_PATH == None else CHECKPOINT_PATH,
)

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [8]:
tokenized_dataset = tokenize_dataset(dataset, tokenizer)
tokenized_dataset

Tokenizing dataset:   0%|          | 0/1072 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 153
    })
})

In [9]:
metric = evaluate.load("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    assert results != None
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

args = TrainingArguments(
    output_dir=MODEL_NAME.replace("/", "-") if CHECKPOINT_PATH == None else CHECKPOINT_PATH.replace("/", "-"),
    seed=42,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=25,
    save_total_limit=2,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    report_to="wandb",
)


def model_init(trial):
    model, model_config = load_model(
        exp_or_model_name=MODEL_NAME if CHECKPOINT_PATH == None else CHECKPOINT_PATH,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    return model


class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = torch.nn.CrossEntropyLoss()


    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss = self.loss_fn(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    args=args,
    model_init=model_init,
    train_dataset=tokenized_dataset["train"],  # type: ignore
    eval_dataset=tokenized_dataset["validation"],  # type: ignore
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
if HPARAM_SEARCH:
  def optuna_hp_space(trial):
      return {
          "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
          "weight_decay": trial.suggest_float("weight_decay", 0.0001, 0.01),
          "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 12]),
          "num_train_epochs": trial.suggest_categorical("num_train_epochs", [10, 15, 20]),
      }

  best_trials = trainer.hyperparameter_search(
      backend="optuna",
      direction="minimize",
      hp_space=optuna_hp_space,
      n_trials=10,
  )
else:
  trainer.train()

trainer.evaluate()
wandb.finish()

[I 2024-08-11 21:39:42,943] A new study created in memory with name: no-name-0022aa82-c071-40ad-a696-6e3c31c29b76
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2969,0.208048,0.932085,0.929396,0.930739,0.92729
2,0.2169,0.194331,0.949699,0.935675,0.942635,0.938464
3,0.2146,0.198522,0.946945,0.93907,0.942991,0.937834
4,0.1598,0.196777,0.951315,0.94518,0.948238,0.942398
5,0.1487,0.230524,0.953728,0.944501,0.949092,0.942713
6,0.1213,0.222926,0.952291,0.94518,0.948722,0.941926
7,0.1226,0.222748,0.950163,0.941616,0.94587,0.940195
8,0.1148,0.234263,0.950977,0.941616,0.946273,0.94051


[I 2024-08-11 21:44:24,393] Trial 0 finished with value: 3.7793759353364864 and parameters: {'learning_rate': 3.805984355680514e-06, 'weight_decay': 0.0031306772261814587, 'per_device_train_batch_size': 8, 'num_train_epochs': 20}. Best is trial 0 with value: 3.7793759353364864.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2904,0.220885,0.944694,0.927699,0.936119,0.929965
2,0.1884,0.189076,0.956779,0.931772,0.94411,0.938307
3,0.1298,0.164312,0.955533,0.948235,0.95187,0.947435
4,0.1149,0.185724,0.957087,0.950102,0.953581,0.946805
5,0.0637,0.202291,0.960069,0.950781,0.955402,0.949953
6,0.0539,0.209984,0.958605,0.947217,0.952877,0.947592
7,0.038,0.22513,0.963001,0.954175,0.958568,0.952471
8,0.0359,0.258309,0.953298,0.949253,0.951271,0.944917
9,0.0245,0.26703,0.957857,0.952817,0.955331,0.948536
10,0.0113,0.323089,0.96103,0.950102,0.955535,0.948694


[I 2024-08-11 21:51:12,561] Trial 1 finished with value: 3.8153603053223404 and parameters: {'learning_rate': 3.468094632528973e-05, 'weight_decay': 0.005490005039977476, 'per_device_train_batch_size': 12, 'num_train_epochs': 15}. Best is trial 0 with value: 3.7793759353364864.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2644,0.206,0.940902,0.924134,0.932443,0.92666
2,0.181,0.197845,0.951308,0.931772,0.941439,0.935159
3,0.1452,0.205599,0.941126,0.925153,0.933071,0.928549
4,0.1003,0.175776,0.956417,0.946029,0.951195,0.94712
5,0.0719,0.246652,0.955168,0.947386,0.951261,0.944917
6,0.0482,0.245347,0.955986,0.947386,0.951667,0.945389
7,0.0436,0.284038,0.953783,0.945689,0.949719,0.944602
8,0.0269,0.276754,0.959993,0.948914,0.954421,0.948536
9,0.0106,0.290993,0.958832,0.952648,0.95573,0.95011
10,0.0144,0.3151,0.959411,0.950781,0.955076,0.948851


[I 2024-08-11 21:58:14,600] Trial 2 finished with value: 3.8141189886309257 and parameters: {'learning_rate': 3.0610561053222713e-05, 'weight_decay': 0.007522214891963697, 'per_device_train_batch_size': 8, 'num_train_epochs': 10}. Best is trial 0 with value: 3.7793759353364864.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4679,0.346346,0.859257,0.886965,0.872891,0.864967
2,0.3147,0.243527,0.924698,0.90869,0.916624,0.912653
3,0.3003,0.218804,0.932029,0.921589,0.926779,0.923041
4,0.2679,0.20773,0.937468,0.926171,0.931785,0.92729
5,0.2515,0.208986,0.942483,0.928887,0.935636,0.930595
6,0.2192,0.20302,0.944349,0.93313,0.938706,0.933585
7,0.2145,0.20355,0.944102,0.934487,0.93927,0.935002
8,0.2301,0.199362,0.9469,0.938221,0.94254,0.937205
9,0.2166,0.201209,0.94699,0.939919,0.943441,0.937992
10,0.1967,0.202353,0.94946,0.940597,0.945008,0.939251


[I 2024-08-11 22:09:27,920] Trial 3 finished with value: 3.773132301056416 and parameters: {'learning_rate': 1.0509220562206556e-06, 'weight_decay': 0.004573106626696881, 'per_device_train_batch_size': 8, 'num_train_epochs': 20}. Best is trial 3 with value: 3.773132301056416.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2659,0.194086,0.946842,0.931093,0.938901,0.933585
2,0.1903,0.189416,0.955138,0.932281,0.943571,0.936733
3,0.1383,0.204106,0.936858,0.929226,0.933027,0.93028
4,0.1046,0.200528,0.958449,0.939579,0.94892,0.942084
5,0.0799,0.250823,0.949711,0.948744,0.949227,0.942398
6,0.0504,0.262928,0.948221,0.954175,0.951189,0.943028
7,0.0492,0.252358,0.95191,0.947386,0.949643,0.943658
8,0.0365,0.257005,0.957341,0.948405,0.952852,0.946648
9,0.0233,0.301064,0.961538,0.950441,0.955958,0.95011
10,0.0192,0.333133,0.955267,0.949593,0.952421,0.945703


[I 2024-08-11 22:17:16,629] Trial 4 finished with value: 3.8146304609392487 and parameters: {'learning_rate': 3.0320615555386866e-05, 'weight_decay': 0.00038417408314955954, 'per_device_train_batch_size': 8, 'num_train_epochs': 20}. Best is trial 3 with value: 3.773132301056416.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.256,0.193605,0.950551,0.936354,0.943399,0.937834


[I 2024-08-11 22:19:20,266] Trial 5 pruned. 
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4051,0.260404,0.908956,0.93873,0.923604,0.915329
2,0.3222,0.298377,0.898409,0.920061,0.909106,0.903368
3,0.2783,0.221685,0.935913,0.931942,0.933923,0.928549
4,0.2109,0.217361,0.938132,0.929056,0.933572,0.927447
5,0.1483,0.268192,0.939782,0.934997,0.937383,0.930123
6,0.142,0.214753,0.940415,0.937542,0.938977,0.931854
7,0.1112,0.29634,0.939533,0.941446,0.940488,0.934372
8,0.1416,0.312821,0.934684,0.939919,0.937294,0.929651
9,0.1149,0.30466,0.943396,0.941955,0.942675,0.93689
10,0.0576,0.319808,0.940122,0.943313,0.941715,0.935474


[I 2024-08-11 22:32:18,724] Trial 6 finished with value: 3.808147919246153 and parameters: {'learning_rate': 8.44157689792801e-05, 'weight_decay': 0.003370130491629562, 'per_device_train_batch_size': 4, 'num_train_epochs': 20}. Best is trial 3 with value: 3.773132301056416.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2914,0.219099,0.932578,0.922607,0.927566,0.92367
2,0.2444,0.206236,0.944263,0.931602,0.93789,0.93327
3,0.2795,0.195967,0.944216,0.936524,0.940354,0.935316
4,0.1941,0.197992,0.946255,0.941276,0.943759,0.938307
5,0.14,0.213521,0.949254,0.939749,0.944478,0.938464
6,0.1321,0.213771,0.949581,0.942974,0.946266,0.940195
7,0.1143,0.223523,0.953556,0.944331,0.948921,0.942398
8,0.1528,0.228847,0.952234,0.943992,0.948095,0.941769
9,0.1456,0.235019,0.950342,0.941955,0.94613,0.940038
10,0.114,0.255295,0.951378,0.943143,0.947243,0.941139


[I 2024-08-11 22:40:29,635] Trial 7 finished with value: 3.782903692865684 and parameters: {'learning_rate': 1.8624733487891571e-06, 'weight_decay': 0.009389605627120396, 'per_device_train_batch_size': 4, 'num_train_epochs': 20}. Best is trial 3 with value: 3.773132301056416.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2987,0.217858,0.925299,0.918703,0.921989,0.918634
2,0.2189,0.182022,0.956877,0.933978,0.945289,0.938464
3,0.1669,0.221659,0.93924,0.93924,0.93924,0.933743
4,0.1329,0.173048,0.951244,0.947047,0.949141,0.94413
5,0.0826,0.228011,0.942128,0.939409,0.940767,0.935631
6,0.0777,0.202802,0.954904,0.94518,0.950017,0.945231
7,0.0593,0.288153,0.951712,0.943313,0.947494,0.941297
8,0.0493,0.305264,0.952332,0.942634,0.947458,0.941926
9,0.037,0.312933,0.950471,0.941276,0.945851,0.940667


[I 2024-08-11 22:46:44,395] Trial 8 finished with value: 3.778266350873935 and parameters: {'learning_rate': 6.588702962738333e-05, 'weight_decay': 0.0056942564533528335, 'per_device_train_batch_size': 12, 'num_train_epochs': 15}. Best is trial 3 with value: 3.773132301056416.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at romainlhardy/roberta-large-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2542,0.214181,0.949549,0.929566,0.939451,0.932169


[I 2024-08-11 22:49:00,484] Trial 9 pruned. 


In [11]:
file_path = find_file(f"romainlhardy-roberta-large-finetuned-ner/run-{best_trials.run_id}", "trainer_state.json")
if file_path:
  metrics = json.load(open(file_path, 'r'))
else:
  print("Metrics not found.")

In [13]:
print(f"Best trial run: {best_trials.run_id}")
print(f"Best trial hyperparameters: {json.dumps(best_trials.hyperparameters, indent=2)}")
print(f"Best trial final F1: {metrics['best_metric']:.3}")

Best trial run: 3
Best trial hyperparameters: {
  "learning_rate": 1.0509220562206556e-06,
  "weight_decay": 0.004573106626696881,
  "per_device_train_batch_size": 8,
  "num_train_epochs": 20
}
Best trial final F1: 0.945


In [None]:
# @title Download optuna results
zipfile_name = "google-bert_bert-large-uncased_optuna_runs" # @param {"type":"string"}
!zip -rq {zipfile_name}.zip results -x "*.pt" "*.safetensors" "*.bin" "*.pth"
files.download(f"{zipfile_name}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>