In [16]:
#Import the library used in this notebook, (some might not be used here)
import os
import re
import itertools
import shutil
import glob
import json
import pickle
import dill
import random

import pandas as pd
import numpy as np

import torch
torch.cuda.empty_cache()

from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import set_seed
set_seed(42) # THAT'S THE PARAMETER TO PLAY WITH FOR DIFFERENT WEIGHT START
import accelerate

from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

In [17]:
#Seting up the task name variable, it's aesthetic but will make it easier to follow the data
task = "ner"

#That will be the name of the column of the data we generate with the tokens.
label_column_name = f"{task}_tags"

#Here is the name of the class you want to identity. Here I am interested in extracting phenotype entity.


annotation_set = ["Pathway"] # CHANGE HERE FOR YOUR CATEGROY PATHWAY but I don't remember the capitalisation




#Here we generate the BIO2 tags
entity_types = ["O"] + [[f"B-{ele}", f"I-{ele}"] for ele in annotation_set]

#We change the list to become a dictionary. The key is the BIO2 tag and we assigne a numerical value to it, i.e. 'O' become 0
label_list = [x for xs in entity_types for x in xs]
label_encoding_dict = {key:idx for idx, key in enumerate(label_list)}
print(label_encoding_dict)

#Name of the model we want to use from HuggingFace
model_checkpoint = "allenai/scibert_scivocab_cased"  # NAME OF YOU HUGGING FACE MODEL 
#Number of document used by batch when training
batch_size = 32

{'O': 0, 'B-Pathway': 1, 'I-Pathway': 2}


In [18]:
#These functions will take 3 directories as input, where BIO2 files are stored as token /t label, and will return a Dataset instead.
def get_all_tokens_and_ner_tags(directory):
    return pd.concat([get_tokens_and_ner_tags(os.path.join(directory, filename)) for filename in os.listdir(directory)]).reset_index().drop('index', axis=1)

def get_tokens_and_ner_tags(filename):
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
        split_list = [list(y) for x, y in itertools.groupby(lines, lambda z: z == '\n') if not x]
        tokens = [[x.split('\t')[0] for x in y] for y in split_list]
        entities = [[x.split('\t')[1][:-1] for x in y] for y in split_list]
    return pd.DataFrame({'tokens': tokens, label_column_name: entities})

def get_un_token_dataset(train_directory, validation_directory):
    train_df = get_all_tokens_and_ner_tags(train_directory)
    #test_df = get_all_tokens_and_ner_tags(test_directory)
    validation_df = get_all_tokens_and_ner_tags(validation_directory)
    train_dataset = Dataset.from_pandas(train_df)
    #test_dataset = Dataset.from_pandas(test_df)
    validation_dataset = Dataset.from_pandas(validation_df)

    return (train_dataset,  validation_dataset)

In [13]:
#Generate the files PUT YOUR PATH HERE
train_dataset, validation_dataset = get_un_token_dataset('./Train_separated_data/train', './Train_separated_data/val1')

In [77]:
def get_all_tokens_and_ner_tags(directory, file_limit=None):
    # List files in the directory
    files = os.listdir(directory)
    
    # Apply the limit
    if file_limit is not None:
        #files = files[-file_limit:]
        files = random.sample(files, min(file_limit, len(files)))
    
    # Concatenate DataFrames from each file
    dfs = [get_tokens_and_ner_tags(os.path.join(directory, filename)) for filename in files]
    return pd.concat(dfs).reset_index(drop=True)

def get_tokens_and_ner_tags(filename):
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
        split_list = [list(y) for x, y in itertools.groupby(lines, lambda z: z == '\n') if not x]
        tokens = [[x.split('\t')[0] for x in y] for y in split_list]
        entities = [[x.split('\t')[1][:-1] for x in y] for y in split_list]
    return pd.DataFrame({'tokens': tokens, label_column_name: entities})

def get_un_token_dataset(train_directory, validation_directory, train_limit=None, validation_limit=None):
    train_df = get_all_tokens_and_ner_tags(train_directory, file_limit=train_limit)
    validation_df = get_all_tokens_and_ner_tags(validation_directory, file_limit=validation_limit)
    
    train_dataset = Dataset.from_pandas(train_df)
    validation_dataset = Dataset.from_pandas(validation_df)

    return train_dataset, validation_dataset

# Specify the limits
train_limit = 15
validation_limit = 5

train_dataset, validation_dataset = get_un_token_dataset('./Train_separated_data/train', './Train_separated_data/val1', train_limit=train_limit, validation_limit=validation_limit)

In [78]:
train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 7465
})

In [64]:
validation_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 70703
})

In [16]:
pickle.dump(train_dataset, open('./Train_separated_data/train/train_pickle/train.pickle', 'wb'))

In [17]:
pickle.dump(validation_dataset, open('./Train_separated_data/val1/val1_pickle/train_val1.pickle', 'wb'))

In [7]:
with open('./Train_separated_data/train/train_pickle/train.pickle', 'rb') as f:
    train_dataset = pickle.load(f)

In [8]:
with open('./Train_separated_data/val1/val1_pickle/train_val1.pickle', 'rb') as f:
    validation_dataset = pickle.load(f)

### Model training

In [79]:
max_length=512 # Should be less than the models value (Usually less than 512)

#Here we can cache the tokenizer so it will take less time to load next time
cache_dir = "./cache"
if not(os.path.exists(cache_dir)):
    os.makedirs(cache_dir)

In [81]:
#Here we are collecting the information from the tokenizer of the model we are plaining on retraining from HuggingFace
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    cache_dir=cache_dir, # Remove this if you do not have enough space in your disk
    use_fast=True,
    truncation=True, #we skip any token > max_model token length
    is_split_into_words=True,
    max_length=max_length
)

In [82]:
#This function will align the correct token to the correct label. Sometimes words need to be cut into tokens, this function will make sure that a labels are also assign to the tokens.
#This way you keep the same number of tokens and labels
def tokenize_and_align_labels(examples):
    label_all_tokens = False
    tokenized_inputs = tokenizer(
        list(examples["tokens"]),
        truncation=True,
        is_split_into_words=True,
        max_length=max_length
    )

    labels = []

    iter_examples = examples[label_column_name]

    for i, single_sample in enumerate(iter_examples):
        label = single_sample

        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

#Generate the files
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
validation_tokenized_datasets = validation_dataset.map(tokenize_and_align_labels, batched=True)
#test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 7465/7465 [00:00<00:00, 13154.56 examples/s]
Map: 100%|██████████| 1730/1730 [00:00<00:00, 11239.84 examples/s]


In [83]:
print("Example tokenized input:", train_tokenized_datasets[0])

Example tokenized input: {'tokens': ['Crosstalk', 'Mechanisms', 'Between', 'HGF', '/', 'c', '-', 'Met', 'Axis', 'and', 'ncRNAs', 'in', 'Malignancy'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'input_ids': [101, 26904, 2658, 473, 160, 26341, 1338, 116, 578, 422, 4943, 136, 10624, 19304, 140, 124, 15866, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, -100, 0, 0, 0, 0, 0, 0, 0, -100, -100, 0, 0, -100]}


In [15]:
# Save the tokenized datasets to pickle files
with open('train_tokenized_datasets.pickle', 'wb') as f:
    pickle.dump(train_tokenized_datasets, f)

with open('validation1_tokenized_datasets.pickle', 'wb') as f:
    pickle.dump(validation_tokenized_datasets, f)

  0%|          | 3846/5231615 [19:55<451:23:30,  3.22it/s]


KeyboardInterrupt: 

In [44]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
args = TrainingArguments(
    f"test-{task}",
    learning_rate=8e-5, #You need to change this parameter during training
    per_device_train_batch_size=batch_size, #You need to change this parameter during training
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5, #You need to change this parameter during training
    weight_decay=1e-5, #You need to change this parameter during training
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end=True,
    seed=0
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available.")


GPU is available!
Number of GPUs: 1
GPU Name: NVIDIA GeForce RTX 4090


In [2]:
import torch
print(torch.version.cuda)


11.8


In [59]:
#Here we import the model we want to retrain from HuggingFace
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

#We put here the arguments we want to use when we retraining the model
args = TrainingArguments(
    f"test-{task}",
    learning_rate=8e-5, #You need to change this parameter during training
    per_device_train_batch_size=batch_size, #You need to change this parameter during training
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1, #You need to change this parameter during training
    weight_decay=1e-5, #You need to change this parameter during training
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end=True,
    save_strategy="no",
    seed=0
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval", trust_remote_code=True)

#Create the function that will evaluate the performance of the model
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

from transformers import TrainerCallback

class ContiguousSaveCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        # Ensure tensors are contiguous before saving
        for name, param in model.named_parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()

# Create Trainer instance with manual save callback
manual_save_path = './pathway_model_seed_zero'
# Create Trainer instance with the custom callback
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=validation_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[ContiguousSaveCallback]
)

""" #Train the model
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=validation_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
) """

trainer.train()
trainer.evaluate()
#Saving the retrained model, please feel free to change the name to something more easy to know what the model is about
#trainer.save_model('./pathway_model_seed_zero.model')
model_path = './pathway_model_seed_zero.model'
torch.save(model.state_dict(), model_path)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SyntaxError: keyword argument repeated: save_strategy (4208114738.py, line 15)

In [84]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback
import torch
import numpy as np
from datasets import load_metric

# Initialize model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define training arguments with no automatic saving
args = TrainingArguments(
    output_dir=f"test-{task}",
    learning_rate=8e-5,  # You need to change this parameter during training
    per_device_train_batch_size=batch_size,  # You need to change this parameter during training
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,  # You need to change this parameter during training
    weight_decay=1e-5,  # You need to change this parameter during training
    evaluation_strategy="epoch",
    save_strategy="no",  # Disable automatic saving
    seed=0
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval", trust_remote_code=True)

# Create the function that will evaluate the performance of the model
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    print("Sample Predictions and Labels:")
    for i in range(5):
        print(f"Predictions: {true_predictions[i]}")
        print(f"Labels: {true_labels[i]}")

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

# Create a callback to ensure tensors are contiguous before saving
class ContiguousSaveCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        # Ensure tensors are contiguous before saving
        for name, param in model.named_parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()

# Create Trainer instance with the custom callback
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=validation_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[ContiguousSaveCallback]
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the trained model manually
model_path = './pathway_model15_seed_zero.model'
torch.save(model.state_dict(), model_path)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 32%|███▏      | 716/2210 [35:44<1:14:33,  2.99s/it]
100%|█████████▉| 233/234 [00:15<00:00, 15.28it/s]

Sample Predictions and Labels:
Predictions: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Pathway', 'O', 'O', 'O', 'O']
Predictions: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Labels: ['O', 'O', 'O', 'O', 'B-Pathway', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predictions: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Pathway', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Pathway', 'O', 'O', 'O', 'O']
Predictions: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 234/234 [00:17<00:00, 13.31it/s]


{'eval_loss': 0.016801366582512856, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9972952354532214, 'eval_runtime': 1.5852, 'eval_samples_per_second': 1091.347, 'eval_steps_per_second': 34.696, 'epoch': 1.0}
{'train_runtime': 17.5781, 'train_samples_per_second': 424.677, 'train_steps_per_second': 13.312, 'train_loss': 0.024195234999697432, 'epoch': 1.0}


 89%|████████▉ | 49/55 [00:00<00:00, 57.39it/s]

Sample Predictions and Labels:
Predictions: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Pathway', 'O', 'O', 'O', 'O']
Predictions: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Labels: ['O', 'O', 'O', 'O', 'B-Pathway', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predictions: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Pathway', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Pathway', 'O', 'O', 'O', 'O']
Predictions: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

100%|██████████| 55/55 [00:01<00:00, 35.24it/s]


In [52]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_metric

# Initialize the model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define training arguments
args = TrainingArguments(
    output_dir=f"test-{task}",
    learning_rate=8e-5,
    save_strategy="no", 
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=1e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    seed=0
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval", trust_remote_code=True)

# Define metrics computation function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

class ContiguousSaveCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        ensure_contiguous_tensors(model)

def ensure_contiguous_tensors(model):
    for name, param in model.named_parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()

# Create Trainer instance with custom callback
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=validation_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[ContiguousSaveCallback]
)

# Train and evaluate the model
trainer.train()
trainer.evaluate()

# Save the model manually
#ensure_contiguous_tensors(model)
#torch.save(model.state_dict(), './pathway_model_seed_zero.model')


Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1083/1083 [11:25<00:00,  1.58it/s]
100%|██████████| 1083/1083 [05:27<00:00,  3.31it/s]
 46%|████▋     | 503/1083 [00:38<00:42, 13.57it/s]

{'loss': 0.0076, 'grad_norm': 0.0009288450237363577, 'learning_rate': 4.306555863342567e-05, 'epoch': 0.46}


 92%|█████████▏| 1001/1083 [01:17<00:06, 12.71it/s]

{'loss': 0.0004, 'grad_norm': 0.0003641198854893446, 'learning_rate': 6.1311172668513395e-06, 'epoch': 0.92}


100%|██████████| 1083/1083 [01:23<00:00, 14.93it/s]

ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

100%|██████████| 1083/1083 [01:35<00:00, 14.93it/s]

In [53]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AutoTokenizer, AdamW, get_scheduler
from datasets import load_metric

# Initialize model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define metric
metric = load_metric("seqeval", trust_remote_code=True)

# Define DataLoader
train_dataloader = DataLoader(train_tokenized_datasets, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(validation_tokenized_datasets, batch_size=batch_size)

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            total_loss += outputs.loss.item()
            logits = outputs.logits
            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
            true_labels.extend(batch["labels"].cpu().numpy())
    return total_loss / len(dataloader), predictions, true_labels


# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
num_train_epochs = 1
# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=8e-5)
num_training_steps = len(train_dataloader) * num_train_epochs
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training and Evaluation
for epoch in range(num_train_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss}")
    
    eval_loss, predictions, true_labels = evaluate(model, eval_dataloader, device)
    print(f"Epoch {epoch+1}: Eval Loss = {eval_loss}")

    # Compute metrics
    true_predictions = [[label_list[p] for p in prediction] for prediction in predictions]
    true_labels = [[label_list[l] for l in label] for label in true_labels]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    print(f"Epoch {epoch+1}: Precision = {results['overall_precision']}, Recall = {results['overall_recall']}, F1 = {results['overall_f1']}, Accuracy = {results['overall_accuracy']}")

# Save the model
model.save_pretrained('./pathway_model_seed_zero')
tokenizer.save_pretrained('./pathway_model_seed_zero')


Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: each element in list of batch should be of equal size

In [None]:
#Here we import the model we want to retrain from HuggingFace
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

#We put here the arguments we want to use when we retraining the model
args = TrainingArguments(
    f"test-{task}",
    learning_rate=8e-5, #You need to change this parameter during training
    per_device_train_batch_size=batch_size, #You need to change this parameter during training
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5, #You need to change this parameter during training
    weight_decay=1e-5, #You need to change this parameter during training
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end=True,
    seed=0
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

#Create the function that will evaluate the performance of the model
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

#Here we import the model we want to retrain from HuggingFace
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

best_model = None
best_score = float('-inf')

for seed in range(5, 11):  # Using seeds from 5 to 10
    set_seed(seed)
    
    args = TrainingArguments(
        f"test-{task}-seed-{seed}",
        learning_rate=8e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=5,
        weight_decay=1e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        seed=seed  # Set the seed here
    )

    #Train the model
    trainer = Trainer(
        model,
        args,
        train_dataset=train_tokenized_datasets,
        eval_dataset=validation_tokenized_datasets,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    # Train the model
    trainer.train()
    
    # Evaluate the model
    eval_results = trainer.evaluate()
    score = eval_results["eval_accuracy"]  
    
    # Save the best model
    if score > best_score:
        best_score = score
        best_model = trainer.model
        best_seed = seed

print(f"Best model achieved with seed {best_seed} and accuracy {best_score}")

#Saving the retrained model, please feel free to change the name to something more easy to know what the model is about
#trainer.save_model('./track2_id2_en_scibert.model')