## Environment Setup


In [1]:
try:
  import os
  import torch
  import evaluate
  from nlpcw.utils import get_dataset, load_model, show_random_elements, tokenize_dataset
  from transformers import (
      Trainer,
      TrainingArguments,
      DataCollatorForTokenClassification,
      EarlyStoppingCallback,
  )
  import numpy as np
  import wandb
  from pathlib import Path
except:
  %pip install -q git+https://github.com/cogniveon/nlpcw.git

2024-08-10 12:28:06.867483: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-10 12:28:06.885640: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-10 12:28:06.907507: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-10 12:28:06.914279: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-10 12:28:06.931294: I tensorflow/core/platform/cpu_feature_guar

In [2]:
%env WANDB_PROJECT=COMM061-NLP-CW
%env WANDB_LOG_MODEL=end
%env WANDB_SILENT=True
%env TOKENIZERS_PARALLELISM true
wandb.login()

env: WANDB_PROJECT=COMM061-NLP-CW
env: WANDB_LOG_MODEL=end
env: WANDB_SILENT=True
env: TOKENIZERS_PARALLELISM=true


True

In [3]:
# @title Config
# Model identifier handed to `load_model` when no checkpoint is given. The
# current value is not one of the listed choices; it is entered through the
# form's free-text input ("allow-input").
MODEL_NAME = "dslim/bert-base-NER" # @param ["romainlhardy/roberta-large-finetuned-ner","google-bert/bert-base-uncased","romainlhardy/roberta-base-finetuned-ner","google-bert/bert-large-uncased", "romainlhardy/finetuned-ner", "romainlhardy/bert-finetuned-ner", "pucpr/biobertpt-all"] {"allow-input":true}
# When not None, this is passed to `load_model` instead of MODEL_NAME.
CHECKPOINT_PATH = None # @param {type:"raw"}
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 8 # @param {type:"integer"}
# Upper bound on training epochs; the early-stopping callback may end training sooner.
NUM_EPOCHS = 20 # @param {type:"integer"}

## Train

In [4]:
# Fetch the dataset splits plus (judging by the names) the id<->label mappings
# and the number of labels.
dataset, id2label, label2id, num_labels = get_dataset()

# Tag names of the `ner_tags` feature, taken from the train split; indexed by
# label id when metrics are computed.
ner_tags_feature = dataset["train"].features["ner_tags"]  # type: ignore
label_list = ner_tags_feature.feature.names  # type: ignore
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 153
    })
})

In [5]:
# Load tokenizer, config, model and the experiment save path. An explicit
# checkpoint path, when configured, takes precedence over the model name.
# The None test uses identity (`is not None`) rather than `==`.
tokenizer, config_model, model, save_path = load_model(
    exp_or_model_name=CHECKPOINT_PATH if CHECKPOINT_PATH is not None else MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Echo what was loaded and where this run's outputs will be written.
print(f"{model.name_or_path=}")
print(f"{str(save_path)=}")

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) 

model.name_or_path='dslim/bert-base-NER'
str(save_path)='experiments/energetic-architect-XweUt'


In [6]:
# Tokenize every split with the tokenizer returned by `load_model`. Judging by
# the displayed result, this adds `input_ids`, `token_type_ids`,
# `attention_mask` and `labels` columns next to the original `tokens` /
# `ner_tags` columns.
tokenized_dataset = tokenize_dataset(dataset, tokenizer)
tokenized_dataset

Tokenizing dataset:   0%|          | 0/1072 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 153
    })
})

In [7]:
# Batch collator for token classification, built around the loaded tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# seqeval metric object; `compute_metrics` calls its `.compute()`.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Convert raw evaluation predictions into overall seqeval scores.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with ``argmax`` over axis 2; ``labels`` holds
            the gold label ids, with ``-100`` marking positions to ignore
            (special tokens).

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the metric's ``overall_*`` entries.

    Raises:
        AssertionError: if the metric returns ``None``.

    Relies on the module-level ``label_list`` (id -> tag name) and ``metric``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop ignored positions once, then map both sides to tag names so the
        # two sequences stay aligned.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    assert results is not None
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Training configuration, grouped by concern.
args = TrainingArguments(
    # Where the run is written and what it is called (directory name of save_path).
    run_name=Path(save_path).name,
    output_dir=str(save_path),
    overwrite_output_dir=True,
    # Optimisation settings.
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=1e-6,
    weight_decay=0.001,
    # Evaluate and checkpoint once per epoch; log every 25 steps.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=25,
    # Select the best model by f1 and load it when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

class CustomTrainer(Trainer):
    """Trainer that computes the loss with its own cross-entropy criterion."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``Trainer`` and create the loss function."""
        super().__init__(*args, **kwargs)
        # Default-constructed criterion; per the PyTorch documentation its
        # default ``ignore_index`` is -100, the same value the metric code in
        # this notebook treats as "ignored".
        self.loss_fn = torch.nn.CrossEntropyLoss()


    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy over all token positions of the batch.

        Args:
            model: The model to run; called as ``model(**inputs)``.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so the override keeps working with
                Transformers releases whose ``Trainer`` passes this keyword to
                ``compute_loss``; it is not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Flatten to (positions, num_labels) vs. (positions,) for the criterion.
        # num_labels is read from self.model rather than the `model` argument.
        loss = self.loss_fn(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# Wire model, arguments, data and metric computation into the custom trainer.
trainer = CustomTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],  # type: ignore
    eval_dataset=tokenized_dataset["validation"],  # type: ignore
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [8]:
%%wandb
# Run training under the W&B cell magic (it must stay on the first line of the
# cell), then explicitly close the W&B run once training has returned.
trainer.train()
wandb.finish()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.7541,0.6044,0.782239,0.835629,0.808053,0.781999
2,0.5375,0.437109,0.850858,0.868973,0.85982,0.845709
3,0.4517,0.368737,0.877423,0.878522,0.877972,0.869113
4,0.4247,0.335073,0.894312,0.883532,0.888889,0.881393
5,0.3717,0.31404,0.902965,0.891515,0.897204,0.890205
6,0.3352,0.300073,0.905577,0.894803,0.900157,0.895117
7,0.3128,0.289735,0.907369,0.900125,0.903733,0.899595
8,0.3299,0.282894,0.911188,0.90263,0.906889,0.902918
9,0.3439,0.277371,0.912543,0.906544,0.909534,0.904941
10,0.3146,0.273493,0.914475,0.908892,0.911675,0.906819
