## Environment Setup


In [38]:
try:
  import os
  import json
  from pathlib import Path
  import torch
  import numpy as np
  from google.colab import files

  import evaluate
  from transformers import (
      Trainer,
      TrainingArguments,
      DataCollatorForTokenClassification,
      EarlyStoppingCallback,
  )
  import optuna
  import wandb

  from nlpcw.utils import get_dataset, load_model, find_file, load_tokenizer, show_random_elements, tokenize_dataset
except:
  # TODO: temporary hack till colab upgrades pyarrow version
  %pip install -q pyarrow
  %pip install -q git+https://github.com/cogniveon/nlpcw.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for nlpcw (pyproject.toml) ... [?25l[?25hdone


In [None]:
%env WANDB_PROJECT=COMM061-NLP-CW
%env WANDB_LOG_MODEL=end
%env WANDB_SILENT=True
%env TOKENIZERS_PARALLELISM true
wandb.login()

env: WANDB_PROJECT=COMM061-NLP-CW
env: WANDB_LOG_MODEL=end
env: WANDB_SILENT=True
env: TOKENIZERS_PARALLELISM=true


True

In [None]:
# @title Config
MODEL_NAME = "google-bert/bert-large-uncased" # @param ["romainlhardy/roberta-large-finetuned-ner","google-bert/bert-base-uncased","romainlhardy/roberta-base-finetuned-ner","google-bert/bert-large-uncased", "romainlhardy/finetuned-ner", "romainlhardy/bert-finetuned-ner", "pucpr/biobertpt-all"] {"allow-input":true}
CHECKPOINT_PATH = None # @param {type:"raw"}
BATCH_SIZE = 8 # @param {type:"integer"}
NUM_EPOCHS = 20 # @param {type:"integer"}

## Train

In [None]:
dataset, id2label, label2id, num_labels = get_dataset()
label_list = dataset["train"].features["ner_tags"].feature.names  # type: ignore
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 153
    })
})

In [None]:
tokenizer = load_tokenizer(
    exp_or_model_name=MODEL_NAME if CHECKPOINT_PATH == None else CHECKPOINT_PATH,
)

In [None]:
tokenized_dataset = tokenize_dataset(dataset, tokenizer)
tokenized_dataset

Tokenizing dataset:   0%|          | 0/1072 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 153
    })
})

In [None]:
metric = evaluate.load("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    assert results != None
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=25,
    save_total_limit=2,
    learning_rate=1e-6,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.001,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    report_to="wandb",
)


def model_init(trial):
    model, model_config = load_model(
        exp_or_model_name=MODEL_NAME if CHECKPOINT_PATH == None else CHECKPOINT_PATH,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    return model


class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = torch.nn.CrossEntropyLoss()


    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss = self.loss_fn(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 12]),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs", [10, 15, 20]),
    }

trainer = CustomTrainer(
    args=args,
    model_init=model_init,
    train_dataset=tokenized_dataset["train"],  # type: ignore
    eval_dataset=tokenized_dataset["validation"],  # type: ignore
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# trainer.train()
best_trials = trainer.hyperparameter_search(
    backend="optuna",
    direction="minimize",
    hp_space=optuna_hp_space,
    n_trials=10,
)

[I 2024-08-10 15:19:30,829] A new study created in memory with name: no-name-70dee30c-e35f-4787-836f-21779c6c1d10
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4327,0.344707,0.890784,0.895926,0.893348,0.87876
2,0.3144,0.269798,0.916736,0.906152,0.911413,0.907467
3,0.3206,0.258001,0.914044,0.910275,0.912156,0.908994
4,0.2314,0.237235,0.928418,0.92413,0.926269,0.922126
5,0.1902,0.235386,0.93096,0.927429,0.929191,0.924874
6,0.167,0.245212,0.933876,0.927099,0.930475,0.925943
7,0.1642,0.239801,0.936142,0.930892,0.93351,0.928539
8,0.1735,0.243045,0.93804,0.931387,0.934702,0.929302
9,0.2129,0.2468,0.934992,0.929903,0.93244,0.927928
10,0.1631,0.254884,0.935071,0.928748,0.931899,0.927012


[I 2024-08-10 15:33:07,249] Trial 0 finished with value: 3.729565968458194 and parameters: {'learning_rate': 1.9722280617393704e-06, 'per_device_train_batch_size': 4, 'num_train_epochs': 20}. Best is trial 0 with value: 3.729565968458194.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3154,0.242785,0.927577,0.92314,0.925353,0.923194
2,0.2118,0.228391,0.938074,0.926934,0.932471,0.929149
3,0.1932,0.236185,0.940239,0.934191,0.937205,0.932356
4,0.1441,0.262645,0.945221,0.936335,0.940757,0.935105
5,0.0777,0.295025,0.941,0.933861,0.937417,0.93144
6,0.0345,0.355802,0.94511,0.93716,0.941118,0.936021
7,0.0318,0.39208,0.942241,0.936335,0.939279,0.933883
8,0.0432,0.442664,0.942359,0.935675,0.939005,0.933578
9,0.0232,0.468956,0.942104,0.936665,0.939376,0.933425


[I 2024-08-10 15:50:05,352] Trial 1 finished with value: 3.7515698802794955 and parameters: {'learning_rate': 1.1162417975298528e-05, 'per_device_train_batch_size': 4, 'num_train_epochs': 15}. Best is trial 0 with value: 3.729565968458194.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5035,0.329835,0.887175,0.901369,0.894216,0.882883
2,0.3237,0.260862,0.916209,0.908956,0.912568,0.908688
3,0.2428,0.255265,0.917507,0.915388,0.916446,0.913575
4,0.2346,0.230603,0.92922,0.928913,0.929066,0.925943
5,0.1791,0.23569,0.927737,0.929573,0.928654,0.925638
6,0.1689,0.237845,0.933853,0.929078,0.931459,0.928386
7,0.1459,0.240176,0.936542,0.927429,0.931963,0.928081
8,0.1384,0.242443,0.93395,0.932872,0.93341,0.929149
9,0.1119,0.25901,0.936191,0.929243,0.932704,0.928233
10,0.0945,0.27016,0.934635,0.931552,0.933091,0.929302


[I 2024-08-10 16:09:25,887] Trial 2 finished with value: 3.745886718202045 and parameters: {'learning_rate': 5.947006596423404e-06, 'per_device_train_batch_size': 12, 'num_train_epochs': 20}. Best is trial 0 with value: 3.729565968458194.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4545,0.359438,0.875101,0.895596,0.88523,0.872652
2,0.3284,0.275702,0.9124,0.905327,0.90885,0.903039
3,0.2808,0.264229,0.912784,0.907966,0.910369,0.906551
4,0.2517,0.237935,0.927159,0.927923,0.927541,0.923805
5,0.2331,0.232469,0.932616,0.929078,0.930844,0.927012
6,0.1948,0.231565,0.93435,0.929573,0.931955,0.928081
7,0.1888,0.232611,0.93581,0.930562,0.933179,0.928691
8,0.1974,0.231132,0.934603,0.931057,0.932827,0.928539
9,0.186,0.231682,0.936889,0.932872,0.934876,0.930218
10,0.1665,0.234366,0.935793,0.932707,0.934247,0.930066


[I 2024-08-10 16:20:48,036] Trial 3 finished with value: 3.7328132004274734 and parameters: {'learning_rate': 3.599156883343923e-06, 'per_device_train_batch_size': 8, 'num_train_epochs': 10}. Best is trial 0 with value: 3.729565968458194.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3142,0.241757,0.919928,0.92281,0.921367,0.916934
2,0.2394,0.233861,0.93616,0.921491,0.928767,0.924263
3,0.1939,0.237237,0.929388,0.92479,0.927083,0.924111
4,0.1585,0.240086,0.940863,0.931552,0.936184,0.932203
5,0.1285,0.263743,0.941714,0.935346,0.938519,0.933883
6,0.0851,0.29301,0.93413,0.928583,0.931348,0.927317
7,0.0757,0.319828,0.938433,0.932707,0.935561,0.931135
8,0.0649,0.330381,0.937956,0.932542,0.935241,0.930524


[I 2024-08-10 16:30:03,989] Trial 4 finished with value: 3.736262682925308 and parameters: {'learning_rate': 1.0435007120745892e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 15}. Best is trial 0 with value: 3.729565968458194.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4126,0.269362,0.91528,0.89807,0.906593,0.902581


[I 2024-08-10 16:32:22,233] Trial 5 pruned. 
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4695,0.360751,0.878812,0.893452,0.886072,0.871431
2,0.3463,0.278516,0.911352,0.900379,0.905833,0.90029
3,0.293,0.26512,0.915268,0.908626,0.911935,0.907314
4,0.2661,0.241546,0.923773,0.925449,0.924611,0.920751
5,0.2511,0.235317,0.929459,0.925779,0.927615,0.923194
6,0.2082,0.233872,0.933621,0.927923,0.930764,0.926554
7,0.1964,0.233511,0.933986,0.928748,0.93136,0.926706
8,0.2177,0.229383,0.933267,0.929573,0.931416,0.926554
9,0.2042,0.229727,0.933289,0.929903,0.931593,0.926706
10,0.1796,0.232013,0.934095,0.930397,0.932243,0.927164


[I 2024-08-10 16:43:43,931] Trial 6 finished with value: 3.7238995989557746 and parameters: {'learning_rate': 3.1704250757279784e-06, 'per_device_train_batch_size': 8, 'num_train_epochs': 10}. Best is trial 6 with value: 3.7238995989557746.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3655,0.292143,0.90758,0.900544,0.904048,0.895709
2,0.2871,0.252846,0.926899,0.913904,0.920355,0.914643
3,0.2896,0.246049,0.921715,0.918522,0.920116,0.916476
4,0.205,0.234406,0.932229,0.927923,0.930071,0.92579
5,0.1578,0.234963,0.935596,0.932047,0.933818,0.928539
6,0.1412,0.248212,0.938966,0.931222,0.935078,0.929608
7,0.1351,0.247679,0.93803,0.931222,0.934613,0.929149
8,0.1471,0.253085,0.939203,0.932542,0.93586,0.930371
9,0.1788,0.25963,0.938725,0.932377,0.93554,0.930218
10,0.1456,0.262507,0.938891,0.932542,0.935705,0.930371


[I 2024-08-10 16:57:40,056] Trial 7 finished with value: 3.737508848982428 and parameters: {'learning_rate': 3.0323316361543135e-06, 'per_device_train_batch_size': 4, 'num_train_epochs': 10}. Best is trial 6 with value: 3.7238995989557746.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3512,0.250901,0.921728,0.918687,0.920205,0.916781


[I 2024-08-10 17:00:01,782] Trial 8 pruned. 
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4391,0.287955,0.903575,0.908791,0.906175,0.897847


[I 2024-08-10 17:02:20,777] Trial 9 pruned. 


In [18]:
best_trials

BestRun(run_id='6', objective=3.7238995989557746, hyperparameters={'learning_rate': 3.1704250757279784e-06, 'per_device_train_batch_size': 8, 'num_train_epochs': 10}, run_summary=None)

In [26]:
file_path = find_file(f"results/run-{best_trials.run_id}", "trainer_state.json")
if file_path:
  metrics = json.load(open(file_path, 'r'))
else:
  print("Metrics not found.")

In [32]:
print(f"Best trial hyperparameters: {json.dumps(best_trials.hyperparameters, indent=2)}")
print(f"Best trial final F1: {metrics['best_metric']:.3}")

Best trial hyperparameters: {
  "learning_rate": 3.1704250757279784e-06,
  "per_device_train_batch_size": 8,
  "num_train_epochs": 10
}
Best trial final F1: 0.932


In [41]:
# @title Download results
zipfile_name = "google-bert_bert-large-uncased_optuna_runs" # @param {"type":"string"}
!zip -rq {zipfile_name}.zip results -x "*.pt" "*.safetensors" "*.bin" "*.pth"
files.download(f"{zipfile_name}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>