# Environment Setup


In [1]:
try:
  import os
  import torch
  import evaluate
  from nlpcw.utils import get_dataset, load_model, show_random_elements, tokenize_dataset
  from transformers import (
      Trainer,
      TrainingArguments,
      DataCollatorForTokenClassification,
      EarlyStoppingCallback,
  )
  import numpy as np
  import wandb
  from pathlib import Path
except:
  !pip install -q git+https://github.com/cogniveon/nlpcw.git

2024-08-09 21:23:18.631226: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-09 21:23:18.649418: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-09 21:23:18.671751: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-09 21:23:18.678501: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-09 21:23:18.695741: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Configure Weights & Biases through environment variables, then authenticate.
# NOTE(review): the WANDB_* values are presumably picked up by wandb and the
# Trainer's W&B reporting during training; confirm against their documentation.
%env WANDB_PROJECT=COMM061-NLP-CW
%env WANDB_LOG_MODEL=end
%env WANDB_SILENT=True
# The saved run shows this call returning True.
wandb.login()

env: WANDB_PROJECT=COMM061-NLP-CW
env: WANDB_LOG_MODEL=end
env: WANDB_SILENT=True


True

## Config


In [3]:
# Model identifier handed to load_model() when no checkpoint path is set.
MODEL_NAME = "romainlhardy/roberta-large-finetuned-ner"
# MODEL_NAME = "google-bert/bert-base-uncased"
# NOTE(review): the saved outputs further down (the BertForTokenClassification
# warning, the '[CLS]' / '##' word pieces, the token_type_ids column) appear to
# come from the commented-out BERT model rather than the active MODEL_NAME;
# re-run the notebook before trusting them.
# When not None, this path is handed to load_model() instead of MODEL_NAME.
CHECKPOINT_PATH = None
# CHECKPOINT_PATH = "experiments/agile-navigator-qn9uu"
# Used as both the per-device train and eval batch size.
BATCH_SIZE = 8
# Passed as num_train_epochs; early stopping may end training sooner.
NUM_EPOCHS = 20

# Sets TOKENIZERS_PARALLELISM for this kernel (echoed in the cell output).
%env TOKENIZERS_PARALLELISM true

env: TOKENIZERS_PARALLELISM=true


## Dataset


In [4]:
# Load the dataset splits together with the label <-> id mappings and label count.
dataset, id2label, label2id, num_labels = get_dataset()
# Label names of the "ner_tags" feature; compute_metrics indexes this list by class id.
label_list = dataset["train"].features["ner_tags"].feature.names  # type: ignore
# Display a sample of training rows (tokens alongside their tags).
show_random_elements(dataset["train"])  # type: ignore

Unnamed: 0,tokens,ner_tags
0,"[BMI, ,, body, mass, index, ;, HR, ,, hazard, ratio, ;, SD, ,, standard, deviation, .]","[B-AC, B-O, B-LF, I-LF, I-LF, B-O, B-AC, B-O, B-LF, I-LF, B-O, B-AC, B-O, B-LF, I-LF, B-O]"
1,"[It, was, found, that, an, increase, in, the, optical, density, (, OD, ), of, the, formazan, dye, directly, correlates, with, the, metabolic, activity, of, the, cells, following, exposure, to, either, Pt, or, -DissPt, .]","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-AC, B-O, B-AC, B-O]"
2,"[Transcriptome, sequencing, analysis, detected, 106, differentially, expressed, genes, (, DEGs, ), related, to, oxidative, stress, .]","[B-O, B-O, B-O, B-O, B-O, B-LF, I-LF, I-LF, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O]"
3,"[As, shown, in, Fig, 1, ,, the, genetic, distance, estimates, between, the, Striga, resistant, lines, and, the, resistant, tester, were, broader, in, the, range, of, GD, values, than, those, between, the, Striga, -, resistant, and, susceptible, tester, ,, indicating, the, resistant, tester, was, considered, as, a, suitable, tester, .]","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O]"
4,"[Pregnant, mice, were, intraperitoneally, injected, with, CdCl2, daily, from, gestational, day, (, GD)13, to, GD17, .]","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF, B-O, B-AC, B-O, B-O, B-O]"
5,"[,, reference, ;, R1, ,, positive, ;, R0, ,, negative, resection, margins, Malondialdehyde, acetaldehyde, -, modified, LDL, (, MAA, -, LDL, ), and, copper, -, oxidized, LDL, (, CuOx-LDL, ), were, prepared, as, described, [, 21, ], .]","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF, B-O, B-O, B-AC, B-O, B-AC, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-AC, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O]"
6,"[Logistic, regression, analysis, showed, that, the, independent, factors, associated, with, mortality, were, postoperative, lactate, level, (, odds, ratio, [, OR, ], 1.926, ,, 95, %, confidence, interval, [, CI, ], 1.101–3.089, ,, p, =, 0.007, ), ,, postoperative, sequential, organ, failure, assessment, score, (, OR, 1.593, ,, 95, %, CI, 1.160–2.187, ,, p, =, 0.004, ), ,, and, DNI, on, POD1, (, OR, 1.118, ,, 95, %, CI, 1.028–1.215, ,, p, =, 0.009, ), .]","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-AC, B-O, B-O, B-O, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-AC, B-O, B-O, B-O, B-AC, B-O, B-O, B-O, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O]"
7,"[), Third, ,, the, cross, -, correlation, analysis, across, different, frequencies, (, Fig, 2A, ), indicated, that, relationships, were, specific, to, the, theta, band, .]","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O]"
8,"[Those, incident, cases, with, sequelae, (, or, diseased, individuals, ), were, assigned, years, of, life, lost, (, YLLs, ), if, fatal, or, years, lived, with, disability, (, YLDs, ), with, a, disability, weight, (, DW, ), that, depended, on, the, severity, of, the, disease, .]","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF, I-LF, I-LF, B-O, B-AC, B-O, B-O, B-O, B-O, B-LF, I-LF, I-LF, I-LF, B-O, B-AC, B-O, B-O, B-O, B-LF, I-LF, B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O]"
9,"[ADC, ,, Apparent, Diffusion, Coefficient, ;, IPca, ,, insignificant, prostate, cancer, ;, CSPca, ,, clinically, significant, prostate, cancer, .]","[B-AC, B-O, B-LF, I-LF, I-LF, B-O, B-AC, B-O, B-LF, I-LF, I-LF, B-O, B-AC, B-O, B-LF, I-LF, I-LF, I-LF, B-O]"


## Model


In [5]:
# Build the tokenizer, config and token-classification model. A configured
# checkpoint path takes precedence over the model name.
tokenizer, config_model, model, save_path = load_model(
    exp_or_model_name=CHECKPOINT_PATH if CHECKPOINT_PATH is not None else MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Optional: uncomment to persist the freshly loaded artefacts under save_path.
# tokenizer.save_pretrained(save_path)
# model.save_pretrained(save_path)
# config_model.save_pretrained(save_path)
print(f"{save_path=}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


save_path=PosixPath('experiments/keen-scholar-dpJoF')


## Dataset Exploration


In [6]:
# Inspect one training example: its word-level tokens and its integer tag ids.
example = dataset["train"][4]  # type: ignore
print(example["tokens"])
print(example["ner_tags"])

['Furthermore', ',', 'eNOS', '-', 'derived', 'NO', 'S', '-', 'nitrosylated', 'β', '-', 'actin', 'on', 'Cys374', 'and', 'impaired', 'actin', 'binding', 'to', 'profilin-1', '(', 'PFN1', ')', ',', 'as', 'confirmed', 'with', 'the', 'transnitrosylating', 'agent', 'S', '-', 'nitroso', '-', 'L', '-', 'cysteine', '(', 'Cys-NO', ')', '.']
[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 0, 1, 0, 0]


In [7]:
# Tokenize the example; is_split_into_words=True because the input is already
# a list of words rather than a raw string.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# Convert the ids back to token strings to see how the words were split.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])  # type: ignore
print(tokens)

['[CLS]', 'furthermore', ',', 'en', '##os', '-', 'derived', 'no', 's', '-', 'ni', '##tro', '##sy', '##lated', 'β', '-', 'act', '##in', 'on', 'cy', '##s', '##37', '##4', 'and', 'impaired', 'act', '##in', 'binding', 'to', 'prof', '##ili', '##n', '-', '1', '(', 'p', '##f', '##n', '##1', ')', ',', 'as', 'confirmed', 'with', 'the', 'trans', '##ni', '##tro', '##sy', '##lating', 'agent', 's', '-', 'ni', '##tro', '##so', '-', 'l', '-', 'cy', '##stein', '##e', '(', 'cy', '##s', '-', 'no', ')', '.', '[SEP]']


In [8]:
# Word-level tag count vs. tokenizer output length: tokenization produces more
# entries than there are tags, so the labels must be re-aligned before training.
len(example["ner_tags"]), len(tokenized_input["input_ids"])  # type: ignore

(41, 70)

In [9]:
# word_ids() maps every tokenizer output position to the index of its source
# word; positions with no source word come back as None.
word_ids = tokenized_input.word_ids()
# Positions without a source word get -100 (the ignored index); every other
# position inherits the tag of the word it belongs to.
aligned_labels = [
    -100 if word_idx is None else example["ner_tags"][word_idx]
    for word_idx in word_ids
]
print(len(aligned_labels), len(tokenized_input["input_ids"]))  # type: ignore

70 70


## Training


In [10]:
# Tokenize every split with the project helper; the saved output shows the
# result gaining input_ids, attention_mask and labels columns.
tokenized_dataset = tokenize_dataset(dataset, tokenizer)
tokenized_dataset

Tokenizing dataset:   0%|          | 0/126 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 153
    })
})

In [11]:
# seqeval metric, used by compute_metrics to score predictions.
metric = evaluate.load("seqeval")
# Collator passed to the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    """Compute seqeval precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-position
            class scores (the argmax is taken over axis 2) and ``labels`` holds
            the reference label ids, with ``-100`` marking positions to ignore.

    Returns:
        dict: ``precision``, ``recall``, ``f1`` and ``accuracy``, read from the
        ``overall_*`` entries of the seqeval result.

    Raises:
        AssertionError: If the metric returns ``None``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        # Drop positions carrying the ignored index (-100, e.g. special tokens)
        # once, then derive both label-name sequences from the same pairs.
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[label_id] for _, label_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    assert results is not None, "metric.compute() returned no results"
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Training configuration. Evaluation and checkpointing both run once per epoch
# and the checkpoint with the best "f1" is reloaded when training ends.
args = TrainingArguments(
    output_dir=str(save_path),
    run_name=Path(save_path).name,  # experiment directory name doubles as the run name
    overwrite_output_dir=True,
    eval_strategy="epoch",
    save_strategy="epoch",  # same cadence as eval_strategy
    logging_steps=50,
    save_total_limit=1,
    learning_rate=1e-6,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,  # gradients are accumulated over 4 batches per optimizer step
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.001,
    metric_for_best_model="f1",  # matches the "f1" key returned by compute_metrics
    load_best_model_at_end=True,
)


# Wire the model, arguments, tokenized splits, collator and metric function
# into a Trainer. The validation split is the one used for evaluation.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],  # type: ignore
    eval_dataset=tokenized_dataset["validation"],  # type: ignore
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [12]:
%%wandb
# Run training (per-epoch metrics appear in the table below), then close the
# W&B run explicitly.
trainer.train()
wandb.finish()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.8293,0.353305,0.880895,0.896586,0.888671,0.873721
2,0.3965,0.265327,0.92183,0.900544,0.911063,0.901206
3,0.2561,0.233448,0.926667,0.917038,0.921827,0.918155
4,0.229,0.220069,0.930903,0.926604,0.928749,0.924263
5,0.2092,0.227226,0.931115,0.922975,0.927027,0.922126
6,0.1708,0.221797,0.933223,0.928913,0.931063,0.927164
7,0.1641,0.224207,0.935259,0.929243,0.932241,0.92747
8,0.154,0.227959,0.936326,0.928913,0.932605,0.928386
9,0.1512,0.226928,0.936244,0.930068,0.933146,0.928539
10,0.1338,0.228919,0.936753,0.930727,0.93373,0.929455
