## Environment Setup


In [3]:
try:
  import os
  import torch
  import evaluate
  from nlpcw.utils import get_dataset, load_model, show_random_elements, tokenize_dataset
  from transformers import (
      Trainer,
      TrainingArguments,
      DataCollatorForTokenClassification,
      EarlyStoppingCallback,
  )
  import numpy as np
  import wandb
  from pathlib import Path
  import optuna
except:
  %pip install -q pyarrow
  %pip install -q git+https://github.com/cogniveon/nlpcw.git

In [4]:
%env WANDB_PROJECT=COMM061-NLP-CW
%env WANDB_LOG_MODEL=end
%env WANDB_SILENT=True
%env TOKENIZERS_PARALLELISM true
wandb.login()

env: WANDB_PROJECT=COMM061-NLP-CW
env: WANDB_LOG_MODEL=end
env: WANDB_SILENT=True
env: TOKENIZERS_PARALLELISM=true


<IPython.core.display.Javascript object>

wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


True

In [5]:
# @title Config
MODEL_NAME = "google-bert/bert-large-uncased" # @param ["romainlhardy/roberta-large-finetuned-ner","google-bert/bert-base-uncased","romainlhardy/roberta-base-finetuned-ner","google-bert/bert-large-uncased", "romainlhardy/finetuned-ner", "romainlhardy/bert-finetuned-ner", "pucpr/biobertpt-all"] {"allow-input":true}
CHECKPOINT_PATH = None # @param {type:"raw"}
BATCH_SIZE = 8 # @param {type:"integer"}
NUM_EPOCHS = 20 # @param {type:"integer"}

## Train

In [6]:
dataset, id2label, label2id, num_labels = get_dataset()
label_list = dataset["train"].features["ner_tags"].feature.names  # type: ignore
dataset

Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/1072 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1072 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 153
    })
})

In [7]:
tokenizer, config_model, model, save_path = load_model(
    exp_or_model_name=MODEL_NAME if CHECKPOINT_PATH == None else CHECKPOINT_PATH,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
print(f"{model.name_or_path=}")
print(f"{str(save_path)=}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.name_or_path='google-bert/bert-large-uncased'
str(save_path)='experiments/vigorous-strategist-Itt3t'


In [8]:
tokenized_dataset = tokenize_dataset(dataset, tokenizer)
tokenized_dataset

Tokenizing dataset:   0%|          | 0/1072 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 153
    })
})

In [9]:
metric = evaluate.load("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    assert results != None
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


args = TrainingArguments(
    output_dir=str(save_path),
    run_name=Path(save_path).name,
    overwrite_output_dir=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=25,
    save_total_limit=1,
    learning_rate=1e-6,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.001,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    report_to="wandb",
)

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = torch.nn.CrossEntropyLoss()


    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss = self.loss_fn(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 12]),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs", [10, 15, 20]),
    }


trainer = CustomTrainer(
    model,
    args,
    model_init=lambda _: load_model(
        exp_or_model_name=MODEL_NAME if CHECKPOINT_PATH == None else CHECKPOINT_PATH,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )[2],
    train_dataset=tokenized_dataset["train"],  # type: ignore
    eval_dataset=tokenized_dataset["validation"],  # type: ignore
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]



In [10]:
# trainer.train()
best_trials = trainer.hyperparameter_search(
    backend="optuna",
    direction="minimize",
    hp_space=optuna_hp_space,
    n_trials=10,
)

[I 2024-08-10 13:11:05,545] A new study created in memory with name: no-name-909366d6-4e12-4b1c-9ec6-ff46cbd441dc
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5204,0.402855,0.858861,0.895266,0.876686,0.861506
2,0.3546,0.29515,0.908102,0.894772,0.901387,0.891281
3,0.3147,0.276684,0.909784,0.904833,0.907302,0.902122
4,0.2813,0.250519,0.918492,0.920007,0.919249,0.914949
5,0.2711,0.240699,0.926086,0.921656,0.923865,0.919377
6,0.2273,0.237092,0.929839,0.924625,0.927225,0.923194
7,0.2137,0.239338,0.928832,0.92347,0.926143,0.922278
8,0.2361,0.231898,0.932903,0.928748,0.930821,0.926706
9,0.2267,0.231136,0.932947,0.929408,0.931174,0.927317
10,0.2065,0.231827,0.932781,0.929243,0.931009,0.927012


[I 2024-08-10 13:21:18,532] Trial 0 finished with value: 3.7200450042922824 and parameters: {'learning_rate': 2.68562034979143e-06, 'per_device_train_batch_size': 8, 'num_train_epochs': 10}. Best is trial 0 with value: 3.7200450042922824.
[W 2024-08-10 13:21:18,537] Trial 1 failed with parameters: {'learning_rate': 3.2480454862904165e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 20} because of the following error: FileExistsError(17, 'File exists').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 211, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1891, in train
    self.model = self.call_model_init(trial)
  File "/usr/local/lib/python3.10/dist

FileExistsError: [Errno 17] File exists: 'experiments/dynamic-explorer-TpigT'