In [3]:
import os
import pandas as pd
from transformers import AutoTokenizer
import torch
from datasets import Dataset

## Prepare dataset

In [None]:
PROJECT_ROOT = os.path.expanduser('~/Documents/github/biomed_extractor')
# Data directory at top level
TRAIN_DIR = os.path.join(PROJECT_ROOT, 'PICO_merged_train.txt')
TEST_DIR = os.path.join(PROJECT_ROOT, 'PICO_merged_test.txt')
DEV_DIR = os.path.join(PROJECT_ROOT, 'PICO_merged_dev.txt')

df_train = pd.read_csv(TRAIN_DIR, sep='\t', header=None)
df_test = pd.read_csv(TEST_DIR, sep='\t', header=None)
df_dev = pd.read_csv(DEV_DIR, sep='\t', header=None, quoting=3)

# rename columns to "word", "tag" for all dataframes
df_train.columns = ["word", "tag"]
df_test.columns = ["word", "tag"]
df_dev.columns = ["word", "tag"]

df_train.head()

Unnamed: 0,word,tag
0,-DOCSTART-,O
1,Title,O
2,:,O
3,Music,B-I
4,therapy,I-I


In [5]:
# re-assemble words into sequences (sentences)
def group_by_sentence(df):
    sentences = []
    current_sentence_words = []
    current_sentence_tags = []

    for index, row in df.iterrows():
        current_sentence_words.append(row["word"])
        current_sentence_tags.append(row["tag"])

        if row["word"] == ".":
            sentences.append({"word": current_sentence_words, "tag": current_sentence_tags})
            current_sentence_words = []
            current_sentence_tags = []

    # Add any remaining words as the last sentence if the file doesn't end with a period
    if current_sentence_words:
        sentences.append({"word": current_sentence_words, "tag": current_sentence_tags})

    return pd.DataFrame(sentences)

# Apply the grouping function to your DataFrames
df_train_grouped = group_by_sentence(df_train)
df_test_grouped = group_by_sentence(df_test)
df_dev_grouped = group_by_sentence(df_dev)

# Create Datasets from the grouped DataFrames
from datasets import Dataset, DatasetDict, ClassLabel, Features, Sequence, Value

dataset_train_grouped = Dataset.from_pandas(df_train_grouped)
dataset_test_grouped = Dataset.from_pandas(df_test_grouped)
dataset_dev_grouped = Dataset.from_pandas(df_dev_grouped)

# Combine into a DatasetDict
dataset_grouped = DatasetDict({
    "train": dataset_train_grouped,
    "test": dataset_test_grouped,
    "dev": dataset_dev_grouped
})

# Get the unique tags from the training dataframe for ClassLabel names
unique_tags = df_train["tag"].unique().tolist()

# Create a ClassLabel feature for your tags
tag_feature = ClassLabel(names=unique_tags)

# Define the features for the grouped dataset, including the ClassLabel for tags
grouped_features = Features({
    "word": Sequence(Value(dtype='string', id=None)),
    "tag": Sequence(tag_feature, id=None) # Apply ClassLabel to the sequence of tags
})

# Cast the grouped dataset to the defined features to apply the ClassLabel mapping
# Apply casting to all splits in the DatasetDict
dataset_grouped = dataset_grouped.cast(grouped_features)


# Display the first example in the train split of the new dataset
print(dataset_grouped["train"][0])
# Display the features to confirm ClassLabel is applied to the tag sequence
print(dataset_grouped["train"].features["tag"])

Casting the dataset:   0%|          | 0/8775 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1547 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1539 [00:00<?, ? examples/s]

{'word': ['-DOCSTART-', 'Title', ':', 'Music', 'therapy', 'in', 'moderate', 'and', 'severe', 'dementia', 'of', 'Alzheimer', "'", 's', 'type', ':', 'a', 'case', '-', 'control', 'study', '.'], 'tag': [0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
List(ClassLabel(names=['O', 'B-I', 'I-I', 'B-P', 'I-P', 'B-O', 'I-O', 'B-C', 'I-C']))


In [6]:
# Get the unique tags from the original dataframe (assuming df_train is still available)
unique_tags = df_train["tag"].unique().tolist()

# Create a ClassLabel feature for your tags
tag_feature = ClassLabel(names=unique_tags)

# Define the features for the grouped dataset, including the ClassLabel for tags
# Note that the 'word' and 'tag' columns in dataset_grouped are lists
grouped_features = Features({
    "word": Sequence(Value(dtype='string', id=None)),
    "tag": Sequence(tag_feature, id=None) # Apply ClassLabel to the sequence of tags
})

# Cast the grouped dataset to the defined features to apply the ClassLabel mapping
dataset_grouped = dataset_grouped.cast(grouped_features)

# Display the first example in the modified dataset to verify the tags are now integers
print(dataset_grouped['train'][0])
# Display the features to confirm ClassLabel is applied to the tag sequence
print(dataset_grouped['train'].features["tag"])

Casting the dataset:   0%|          | 0/8775 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1547 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1539 [00:00<?, ? examples/s]

{'word': ['-DOCSTART-', 'Title', ':', 'Music', 'therapy', 'in', 'moderate', 'and', 'severe', 'dementia', 'of', 'Alzheimer', "'", 's', 'type', ':', 'a', 'case', '-', 'control', 'study', '.'], 'tag': [0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
List(ClassLabel(names=['O', 'B-I', 'I-I', 'B-P', 'I-P', 'B-O', 'I-O', 'B-C', 'I-C']))


In [7]:
# Get the ClassLabel feature object from the dataset
tags = dataset_grouped['train'].features["tag"].feature
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
print(tag2index)
def create_tag_names(df):
  return {"ner_tags_str": [tags.int2str(idx) for idx in df["tag"]]}
dataset_grouped = dataset_grouped.map(create_tag_names)

{'O': 0, 'B-I': 1, 'I-I': 2, 'B-P': 3, 'I-P': 4, 'B-O': 5, 'I-O': 6, 'B-C': 7, 'I-C': 8}


Map:   0%|          | 0/8775 [00:00<?, ? examples/s]

Map:   0%|          | 0/1547 [00:00<?, ? examples/s]

Map:   0%|          | 0/1539 [00:00<?, ? examples/s]

In [8]:
example = dataset_grouped['train'][0]
pd.DataFrame([example["word"], example['tag'], example["ner_tags_str"]],['Tokens', 'Tags', 'Tags_decode'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
Tokens,-DOCSTART-,Title,:,Music,therapy,in,moderate,and,severe,dementia,...,',s,type,:,a,case,-,control,study,.
Tags,0,0,0,1,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Tags_decode,O,O,O,B-I,I-I,O,O,O,O,O,...,O,O,O,O,O,O,O,O,O,O


In [9]:
#As a quick check that we don’t have any unusual imbalance in the tags, let’s calculate
#the frequencies of each entity across each split:
from collections import Counter
from collections import defaultdict
split2freqs = defaultdict(Counter)
for split, dataset in dataset_grouped.items():
  for row in dataset["ner_tags_str"]:
    for tag in row:
      if tag.startswith("B"):
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,I,P,O,C
train,5880,2150,5799,1002
test,1100,354,918,164
dev,1021,390,1044,179


## Get model

In [10]:
# using mobilebert as model.
from transformers import AutoTokenizer
model_name = "nlpie/bio-mobilebert"
model_tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
from transformers import AutoConfig
# When we load a model with AutoModel.from_pretrained(model_ckpt), the configuration file
# associated with that model is downloaded automatically. However, if we want to modify
# something like the number of classes or label names, then we can load the configuration
# first with the parameters we would like to customize
model_config = AutoConfig.from_pretrained(model_name,
 num_labels=tags.num_classes,
 id2label=index2tag, label2id=tag2index)

config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

In [12]:
import torch
from transformers import AutoModelForTokenClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use AutoModelForTokenClassification to load the model
my_model = (AutoModelForTokenClassification
 .from_pretrained(model_name, config=model_config)
 .to(device))

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForTokenClassification were not initialized from the model checkpoint at nlpie/bio-mobilebert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from datasets import Sequence, Value, Features # Import necessary classes
from transformers import AutoTokenizer # Ensure AutoTokenizer is imported if not already

# Assume model_tokenizer is already defined as AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples, tag2index, index2tag):
    # Process each example in the batch
    tokenized_inputs = model_tokenizer([
        " ".join([word for word in words if word is not None]) for words in examples["word"]
    ], truncation=True, padding='max_length', max_length=512) # Explicitly set padding and max_length

    all_labels = []
    for i in range(len(examples["word"])):
        example_tags = examples["tag"][i] # Get the list of tag IDs for the current example
        word_ids = tokenized_inputs.word_ids(batch_index=i) # Get word_ids for the current example
        previous_word_idx = None
        label_ids = []

        # Create a list of original tag IDs aligned with token IDs, padding with -100
        aligned_labels = [-100] * len(word_ids) # Initialize with -100

        for token_idx, word_idx in enumerate(word_ids):
            # Special tokens have a word ID that is None. We keep the label at -100.
            if word_idx is None:
                continue # Keep -100

            # Add check for out-of-bounds word_idx relative to original words/tags
            if word_idx >= len(example_tags):
                 continue # Keep -100 for tokens beyond original words

            # We set the label for the first token of each word.
            # Check if this token corresponds to a new word compared to the previous token
            if previous_word_idx != word_idx:
                 # If it's the first token of a word, use the original word's tag ID
                 aligned_labels[token_idx] = example_tags[word_idx]

            # For subsequent tokens of the same word, use the original word's tag ID
            # if it's an 'I-' tag, otherwise use -100.
            # This part is crucial for subword tokenization: subsequent tokens of a word
            # should get the same label as the first token *if* that label is an 'I-' tag.
            # If the original label is B-, O, or C-, subsequent tokens of the same word get -100.
            else:
                 # Use the tag ID of the current word_idx
                 tag_id = example_tags[word_idx]
                 # Get the string representation of the tag
                 tag_name = index2tag.get(tag_id)

                 # If the tag is an 'I-' tag, assign it to the subsequent token.
                 # Otherwise, the label remains -100 (from initialization).
                 if tag_name and tag_name.startswith("I-"):
                     aligned_labels[token_idx] = tag_id
                 # Else, the label remains -100 (correct for B-, O-, C- tags on subsequent tokens)


            previous_word_idx = word_idx
        all_labels.append(aligned_labels)

    tokenized_inputs["labels"] = all_labels # Return a list of label_ids for the batch
    return tokenized_inputs

# Define the features for the tokenized dataset explicitly
# Start with an empty Features object and add the expected output features
# The sequences will now be padded to max_length
tokenized_features = Features({
    "input_ids": Sequence(Value("int64"), length=512), # Specify length after padding
    "token_type_ids": Sequence(Value("int64"), length=512), # Specify length
    "attention_mask": Sequence(Value("int64"), length=512), # Specify length
    "labels": Sequence(Value("int64"), length=512), # Specify length
})


# Apply the tokenization and alignment function to all splits in the DatasetDict
# Remove original columns that are no longer needed after tokenization
tokenized_dataset = dataset_grouped.map(
    tokenize_and_align_labels,
    batched=True, # Set batched to True
    features=tokenized_features, # Apply features to ensure consistent shapes
    fn_kwargs={"tag2index": tag2index, "index2tag": index2tag},
    remove_columns=["word", "tag", "ner_tags_str"] # Specify columns to remove
)

# Display the first example of the tokenized training set
print(tokenized_dataset["train"][0])
# Print shapes to verify
print("Input IDs shape:", torch.tensor(tokenized_dataset["train"][0]['input_ids']).shape)
print("Labels shape:", torch.tensor(tokenized_dataset["train"][0]['labels']).shape)

Map:   0%|          | 0/8775 [00:00<?, ? examples/s]

Map:   0%|          | 0/1547 [00:00<?, ? examples/s]

Map:   0%|          | 0/1539 [00:00<?, ? examples/s]

{'input_ids': [101, 1011, 9986, 14117, 2102, 1011, 2516, 1024, 2189, 7242, 1999, 8777, 1998, 5729, 28767, 1997, 21901, 1005, 1055, 2828, 1024, 1037, 2553, 1011, 2491, 2817, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Performance
Now that we have a model and a dataset, we need to define a performance metric.

In [29]:
import numpy as np
def align_predictions(predictions, label_ids):
  preds = np.argmax(predictions, axis=2)
  batch_size, seq_len = preds.shape
  labels_list, preds_list = [], []
  for batch_idx in range(batch_size):
    example_labels, example_preds = [], []
    for seq_idx in range(seq_len):
      # Ignore label IDs = -100
      if label_ids[batch_idx, seq_idx] != -100:
        example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
        example_preds.append(index2tag[preds[batch_idx][seq_idx]])
    labels_list.append(example_labels)
    preds_list.append(example_preds)
  return preds_list, labels_list

## Train the model

In [30]:
#we’ll use the Transformers Trainer to handle our training loop, so first we need to define the training attributes using the TrainingArguments class
from transformers import TrainingArguments
num_epochs = 3
batch_size = 8 # Reduced batch size to mitigate OutOfMemoryError
logging_steps = len(tokenized_dataset["train"]) // batch_size
model_output = "./bio-mobilebert-finetuned-PICO" # Changed output_dir to a simpler local path
training_args = TrainingArguments(
 output_dir=model_output,
 log_level="error",
 num_train_epochs=num_epochs,
 per_device_train_batch_size=batch_size,
 per_device_eval_batch_size=batch_size,
 eval_strategy="epoch",
 save_steps=1e6,
 weight_decay=0.01,
 disable_tqdm=False,
 logging_steps=logging_steps,
 push_to_hub=False,
 report_to="none") # Added this argument to disable logging to external services)

In [31]:
!pip install seqeval



In [32]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
  y_pred, y_true = align_predictions(eval_pred.predictions,
  eval_pred.label_ids)
  return {"f1": f1_score(y_true, y_pred)}

In [33]:
#The final step is to define a data collator so we can pad each input sequence to the largest sequence length in a batch
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(model_tokenizer)

In [None]:
# We will train several models, so we’ll avoid initializing
# a new model for every Trainer by creating a model_init() method. This method loads
# an untrained model and is called at the beginning of the train() call:
def model_init():
  return (AutoModelForTokenClassification
  .from_pretrained(model_name, config=model_config)
  .to(device))

In [35]:
from transformers import Trainer
trainer = Trainer(model_init=model_init, args=training_args,
 data_collator=data_collator, compute_metrics=compute_metrics,
 train_dataset=tokenized_dataset["train"],
 eval_dataset=tokenized_dataset["dev"],
 tokenizer=model_tokenizer)

  trainer = Trainer(model_init=model_init, args=training_args,


In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.3678,0.160769,0.72429
2,0.1129,0.058811,0.870944
3,0.0479,0.032386,0.927505


TrainOutput(global_step=3291, training_loss=0.17610991222993902, metrics={'train_runtime': 1231.0524, 'train_samples_per_second': 21.384, 'train_steps_per_second': 2.673, 'total_flos': 1651092219878400.0, 'train_loss': 0.17610991222993902, 'epoch': 3.0})