In [1]:
import numpy as np
import evaluate
import utils

# Load the "seqeval" metric through the `evaluate` library. The resulting object
# is later handed to `utils.compute_metrics_wrapper` when the Trainer is built.
metric = evaluate.load("seqeval")

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the module from C:\Users\Anacl\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--seqeval\541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Tue Apr 18 00:53:25 2023) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


In [2]:
from datasets import load_from_disk
from transformers import AutoModelForTokenClassification, Trainer, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, AutoConfig

# Load the dataset previously saved to disk and derive the label vocabulary
# from the `ner_tags` feature of the train split.
dataset = load_from_disk("./data/ncbi.hf")
features = dataset["train"].features
label_list = features["ner_tags"].feature.names

# Bidirectional mappings between integer class ids and label names; both are
# passed to the model config so predictions can be decoded back to tag names.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in enumerate(label_list)}

# For every label id, the id of its "I-" counterpart when the label is a "B-"
# tag and that counterpart exists; otherwise the label maps to itself.
# Passed to `utils.tokenize_and_align_labels` (presumably to tag the
# continuation sub-tokens of a word -- confirm against utils).
b_to_i_label = []
for idx, label in enumerate(label_list):
    target = idx
    if label.startswith("B-"):
        # Swap only the leading prefix: str.replace("B-", "I-") would also
        # rewrite any later "B-" occurring inside the entity name itself.
        target = label2id.get("I-" + label[len("B-"):], idx)
    b_to_i_label.append(target)


In [3]:
# Fine-tune the local BERT checkpoint for token classification on the train split.

# Single place to change the checkpoint used for both the model and the tokenizer.
MODEL_DIR = "./models/BERT"

# The classification head is sized from the dataset's label vocabulary, and the
# id<->label maps are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_DIR,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Training-only run: no evaluation and no checkpoint saving. The fixed seed
# together with `full_determinism` is meant to make the run repeatable.
trainer_args = TrainingArguments(
    optim="adamw_torch",
    num_train_epochs=5,
    output_dir="./output",
    evaluation_strategy="no",
    save_strategy="no",
    do_eval=False,
    seed=100,
    full_determinism=True,
)

# `padding` and `num_labels` are not tokenizer-loading options, so they are not
# passed here: batches are padded by the data collator below, and the number of
# labels belongs to the model (set above from `label_list`).
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)

data_collator = DataCollatorForTokenClassification(tokenizer)

# Tokenize the train split and align the word-level tags to the tokens, then
# drop the raw columns that are no longer needed.
train_dataset = dataset["train"].map(
    utils.tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on train dataset",
    fn_kwargs={"tokenizer": tokenizer, "b_to_i_label": b_to_i_label},
).remove_columns(["ner_tags", "id", "tokens"])

print(train_dataset)

# The model is passed as-is: Trainer moves it to the device selected by the
# training arguments, so the cell also runs on machines without "cuda:0".
# `compute_metrics` is wired up for completeness; evaluation is disabled in
# `trainer_args`, so it is not expected to be called during `train()`.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=utils.compute_metrics_wrapper(label_list, metric),
    args=trainer_args,
)

# `t` holds the training output; its `.metrics` dict is read by a later cell.
t = trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ./models/BERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Running tokenizer on train dataset: 100%|██████████| 5425/5425 [00:01<00:00, 3708.08 examples/s]


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 5425
})


 15%|█▍        | 500/3395 [01:54<10:06,  4.77it/s]

{'loss': 0.1006, 'learning_rate': 4.263622974963181e-05, 'epoch': 0.74}


 29%|██▉       | 1001/3395 [03:40<08:18,  4.80it/s]

{'loss': 0.041, 'learning_rate': 3.527245949926363e-05, 'epoch': 1.47}


 44%|████▍     | 1500/3395 [05:29<06:43,  4.70it/s]

{'loss': 0.0241, 'learning_rate': 2.7908689248895437e-05, 'epoch': 2.21}


 59%|█████▉    | 2000/3395 [07:17<04:57,  4.68it/s]

{'loss': 0.0126, 'learning_rate': 2.0544918998527246e-05, 'epoch': 2.95}


 74%|███████▎  | 2500/3395 [09:02<03:04,  4.86it/s]

{'loss': 0.0065, 'learning_rate': 1.3181148748159059e-05, 'epoch': 3.68}


 88%|████████▊ | 3000/3395 [10:48<01:23,  4.74it/s]

{'loss': 0.0035, 'learning_rate': 5.817378497790869e-06, 'epoch': 4.42}


100%|██████████| 3395/3395 [12:13<00:00,  4.63it/s]

{'train_runtime': 733.1539, 'train_samples_per_second': 36.998, 'train_steps_per_second': 4.631, 'train_loss': 0.027909896563360022, 'epoch': 5.0}





In [6]:
# Collect the metrics reported by `trainer.train()` and record the train-set size.
# NOTE: `metrics`, `t.metrics` and the entry stored in `train_metrics` are the
# same dict object, so the key added below is visible through all three.
metrics = t.metrics
train_metrics = [metrics]

# The whole train split is used (no sub-sampling in this notebook), so the
# sample count is simply the dataset length.
max_train_samples = len(train_dataset)
metrics["train_samples"] = max_train_samples


In [7]:
# Show the training metrics dict (including the added "train_samples" entry)
# as this cell's output.
metrics

{'train_runtime': 733.1539,
 'train_samples_per_second': 36.998,
 'train_steps_per_second': 4.631,
 'train_loss': 0.027909896563360022,
 'epoch': 5.0,
 'train_samples': 5425}