In [1]:
# Installing necessary packages
!pip install transformers datasets tokenizers seqeval -q

In [2]:
# Importing necessary libraries
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

In [3]:
# Loading dataset from the Hugging Face datasets library
conllpp = datasets.load_dataset("conllpp")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Displaying the details of the dataset
conllpp

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
# Accessing and displaying the first example from the training split of the dataset
conllpp["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [6]:
# Accessing and displaying the features of the 'ner_tags' field from the training split
conllpp["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [7]:
# Initializing a BERT tokenizer using the bert-base-uncased pre-trained model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")



In [8]:
# Getting the first example from the training split of the 'conllpp' dataset and storing it in the 'text' variable
text = conllpp["train"][0]

# Tokenize the 'tokens' field of the 'text' example using the BERT tokenizer
tokenized_input = tokenizer(text['tokens'], is_split_into_words=True)
# Converting the token IDs in the 'input_ids' field of the 'tokenized_input' to tokens using the BERT tokenizer
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

# Getting the word IDs for each token in the 'tokenized_input'
word_ids = tokenized_input.word_ids()
print(word_ids)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


In [9]:
# Displaying the tokenized_input
tokenized_input

{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Converting the token IDs in the 'input_ids' field of the 'tokenized_input' to tokens using the BERT tokenizer
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

In [11]:
# Checking the lengths of tokens and NER tags for the first example in the training set
len(tokens), len(conllpp["train"][0]["ner_tags"])

(11, 9)

In [12]:
# Defining a function to tokenize input examples and align the labels with the tokens.
def tokenize_and_align_labels(examples, label_all_tokens=True):
    # Tokenize the input examples
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    # Iterate over each example
    for i, label in enumerate(examples["ner_tags"]):
        # Retrieve the word IDs for the tokens in the current example
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        # Initialize variables
        previous_word_idx = None
        label_ids = []

        # Iterate over each token's word ID and corresponding label
        for word_idx in word_ids:
            if word_idx is None:
                # Handle special tokens by setting the label to -100
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # If the word index is different from the previous one,
                # it's a new word, so assign the corresponding label
                label_ids.append(label[word_idx])
            else:
                # If the word index is the same as the previous one,
                # it's a sub-word, so handle it based on the label_all_tokens flag
                label_ids.append(label[word_idx] if label_all_tokens else -100)

            previous_word_idx = word_idx

        # Append the label IDs for the current example
        labels.append(label_ids)

    # Add the labels to the tokenized inputs
    tokenized_inputs["labels"] = labels

    return tokenized_inputs


In [13]:
# Accessing and display the fifth example from the training split of the 'conllpp' dataset.
conllpp["train"][4:5]

{'id': ['4'],
 'tokens': [['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
   'Werner',
   'Zwingmann',
   'said',
   'on',
   'Wednesday',
   'consumers',
   'should',
   'buy',
   'sheepmeat',
   'from',
   'countries',
   'other',
   'than',
   'Britain',
   'until',
   'the',
   'scientific',
   'advice',
   'was',
   'clearer',
   '.']],
 'pos_tags': [[22,
   27,
   21,
   35,
   12,
   22,
   22,
   27,
   16,
   21,
   22,
   22,
   38,
   15,
   22,
   24,
   20,
   37,
   21,
   15,
   24,
   16,
   15,
   22,
   15,
   12,
   16,
   21,
   38,
   17,
   7]],
 'chunk_tags': [[11,
   11,
   12,
   13,
   11,
   12,
   12,
   11,
   12,
   12,
   12,
   12,
   21,
   13,
   11,
   12,
   21,
   22,
   11,
   13,
   11,
   1,
   13,
   11,
   17,
   11,
   12,
   12,
   21,
   1,
   0]],
 'ner_tags': [[5,
   0,
   0,
   0,
   0,
   3,
   4,
   0,
   0,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,


In [14]:
data = tokenize_and_align_labels(conllpp["train"][4:5])
print(data)

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [15]:
# Now applying tokenize_and_align_labels on the entire data
tokenized_data = conllpp.map(tokenize_and_align_labels, batched=True)

In [16]:
# Displaying tokenized_data
tokenized_data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

In [17]:
# Loading a pre-trained BERT model for token classification
# The model is initialized with weights from the 'bert-base-uncased' checkpoint
# 'num_labels=9' specifies the number of output labels for token classification
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Installing necessary packages
!pip install accelerate -U




In [19]:
# Installing necessary packages
!pip install transformers[torch]




In [20]:
#Defining  training args
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
"NER",
evaluation_strategy = "epoch",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=5,
weight_decay=0.01,
)

In [21]:
# Applying Datacollator necessary in preparing data for training BERT model
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
# Load the Seqeval metric from the datasets library
metric = datasets.load_metric("seqeval")

  metric = datasets.load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [23]:
# Extract label list from the dataset
label_list = conllpp["train"].features["ner_tags"].feature.names

# Print the label list
print(label_list)


['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [24]:
# Unpack the evaluation predictions into predicted logits and true labels
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
       for prediction, label in zip(pred_logits, labels)
   ]
    results = metric.compute(predictions=predictions, references=true_labels)

    return {
          "precision": results["overall_precision"],
          "recall": results["overall_recall"],
          "f1": results["overall_f1"],
          "accuracy": results["overall_accuracy"],
  }


In [25]:
# The fine-tuned model to be trained
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [26]:
# Initiating the training process
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2213,0.05723,0.931615,0.934221,0.932916,0.98432
2,0.0452,0.055958,0.937979,0.944065,0.941012,0.985718
3,0.0251,0.056653,0.94068,0.94731,0.943983,0.986576
4,0.0142,0.061288,0.936977,0.951337,0.944102,0.986433
5,0.0097,0.06386,0.939954,0.950889,0.94539,0.986624


TrainOutput(global_step=4390, training_loss=0.050713736647082355, metrics={'train_runtime': 877.8515, 'train_samples_per_second': 79.974, 'train_steps_per_second': 5.001, 'total_flos': 1702317283240608.0, 'train_loss': 0.050713736647082355, 'epoch': 5.0})

In [27]:
# Saving model
model.save_pretrained("NER_MODEL")

In [28]:
# Saving tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [29]:
# Create id to label mapping
id2label = {
    str(idx): label for idx, label in enumerate(label_list)
}

# Create label to id mapping
label2id = {
    label: str(idx) for idx, label in enumerate(label_list)
}


In [30]:
# Displaying id2label and label2id
id2label, label2id

({'0': 'O',
  '1': 'B-PER',
  '2': 'I-PER',
  '3': 'B-ORG',
  '4': 'I-ORG',
  '5': 'B-LOC',
  '6': 'I-LOC',
  '7': 'B-MISC',
  '8': 'I-MISC'},
 {'O': '0',
  'B-PER': '1',
  'I-PER': '2',
  'B-ORG': '3',
  'I-ORG': '4',
  'B-LOC': '5',
  'I-LOC': '6',
  'B-MISC': '7',
  'I-MISC': '8'})

In [31]:
# Load model and Prediction
import json

# Load existing configuration from file
with open("NER_MODEL/config.json", "r") as config_file:
    config = json.load(config_file)

# Update configuration with id to label and label to id mappings
config["id2label"] = id2label
config["label2id"] = label2id

# Write updated configuration back to file
with open("NER_MODEL/config.json", "w") as config_file:
    json.dump(config, config_file)


In [32]:
# Load the fine-tuned token classification model from the "NER_MODEL"
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("NER_MODEL")


In [33]:
from transformers import pipeline

# Create a pipeline for Named Entity Recognition (NER)
ner_pipeline = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

# Test sentence
test_sentence = "Steve Jobs is the founder of Apple which is headquatered in Mountain view California and known for its innovative technology."

# Perform NER on the test sentence
ner_results = ner_pipeline(test_sentence)

print(ner_results)


[{'entity': 'B-PER', 'score': 0.99846554, 'index': 1, 'word': 'steve', 'start': 0, 'end': 5}, {'entity': 'I-PER', 'score': 0.9983109, 'index': 2, 'word': 'jobs', 'start': 6, 'end': 10}, {'entity': 'B-ORG', 'score': 0.9933189, 'index': 7, 'word': 'apple', 'start': 29, 'end': 34}, {'entity': 'B-LOC', 'score': 0.9975793, 'index': 14, 'word': 'mountain', 'start': 60, 'end': 68}, {'entity': 'I-LOC', 'score': 0.9520391, 'index': 15, 'word': 'view', 'start': 69, 'end': 73}, {'entity': 'I-LOC', 'score': 0.9785978, 'index': 16, 'word': 'california', 'start': 74, 'end': 84}]
