In [1]:
# Install required packages
!pip install transformers datasets tokenizers seqeval -q

In [2]:
# Import necessary libraries
import datasets
import numpy as np
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
import json
from transformers import pipeline

In [3]:
# Load the CoNLL 2003 NER dataset
conll2003 = datasets.load_dataset("conll2003")

In [4]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
conll2003.shape

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [6]:
conll2003["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [7]:
conll2003["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [8]:
conll2003['train'].description

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

In [9]:
# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [10]:
#Below cell are just for checking the output of some variables before applying `tokenize_and_align_labels()`
example_text = conll2003['train'][0]

tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)

tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

word_ids = tokenized_input.word_ids()

print(word_ids)

''' As we can see, it returns a list with the same number of elements as our processed input ids, mapping special tokens to None and all other tokens to their respective word. This way, we can align the labels with the processed input ids. '''

tokenized_input

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
#The  input ids returned by the tokenizer are longer than the lists of labels our dataset contain.
len(example_text['ner_tags']), len(tokenized_input["input_ids"])
# (9, 11)

(9, 11)

In [12]:
# Tokenize and align labels with tokens
def tokenize_and_align_labels(examples, label_all_tokens=True):
    # Function to tokenize and align labels with tokens
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Set special tokens' labels to -100
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # Set label of the first token of a word
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)  # Mask subwords
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [13]:
q = tokenize_and_align_labels(conll2003['train'][4:5]) 
print(q) 

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [14]:
# Before applying the `tokenize_and_align_labels()` the `tokenized_input` has 3 keys 
# - input_ids
# - token_type_ids
# - attention_mask

# But after applying `tokenize_and_align_labels()` we have an extra key - `'labels'`
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]): 
    print(f"{token:_<40} {label}") 

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [15]:
# Apply the tokenization and alignment to the dataset
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

In [16]:
# Load the pre-trained BERT model for token classification
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
pip install accelerate -U

Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [19]:
# Define training arguments
args = TrainingArguments(
    "test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [20]:
# Define data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

In [21]:
# Load metrics for evaluation
metric = datasets.load_metric("seqeval")

  metric = datasets.load_metric("seqeval")


In [22]:
example = conll2003['train'][0]

In [23]:
label_list = conll2003["train"].features["ner_tags"].feature.names 

label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [24]:
labels = [label_list[i] for i in example["ner_tags"]] 

metric.compute(predictions=[labels], references=[labels]) 

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [25]:
# Compute metrics function
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds
    pred_logits = np.argmax(pred_logits, axis=2)
    predictions = [
        [label_list[pred] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]
    true_labels = [
        [label_list[l] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]
    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [26]:
# Create a Trainer instance
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [27]:
# Train the model
trainer.train()


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0781,0.061235,0.915701,0.936906,0.926182,0.982811
2,0.039,0.057691,0.933828,0.944065,0.938919,0.985782
3,0.0214,0.060713,0.939736,0.948988,0.944339,0.986544


TrainOutput(global_step=5268, training_loss=0.06612263734175929, metrics={'train_runtime': 21936.0521, 'train_samples_per_second': 1.92, 'train_steps_per_second': 0.24, 'total_flos': 895125623975100.0, 'train_loss': 0.06612263734175929, 'epoch': 3.0})

In [28]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("ner_model")
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

In [29]:
# Save id-to-label and label-to-id mappings
id2label = {str(i): label for i, label in enumerate(label_list)}
label2id = {label: str(i) for i, label in enumerate(label_list)}

config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json", "w"))

In [30]:
# Load the fine-tuned model
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [31]:
# Create an NER pipeline using the fine-tuned model and tokenizer
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

No CUDA runtime is found, using CUDA_HOME='C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6'


In [32]:
# Example text for NER
example = "Bill Gates is the Founder of Microsoft"

In [33]:
# Perform NER
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.9946613, 'index': 1, 'word': 'bill', 'start': 0, 'end': 4}, {'entity': 'I-PER', 'score': 0.99673957, 'index': 2, 'word': 'gates', 'start': 5, 'end': 10}, {'entity': 'B-ORG', 'score': 0.9900035, 'index': 7, 'word': 'microsoft', 'start': 29, 'end': 38}]


O means the word doesn’t correspond to any entity. <br>
B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity. <br>
B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.<br>
B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.<br>
B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity.

### Launching Gradio interface

In [34]:
pip install gradio




In [35]:
pip install pydantic==1.8.2


Collecting pydantic==1.8.2
  Using cached pydantic-1.8.2-py3-none-any.whl (126 kB)
Installing collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 2.2.1
    Uninstalling pydantic-2.2.1:
      Successfully uninstalled pydantic-2.2.1
Successfully installed pydantic-1.8.2
Note: you may need to restart the kernel to use updated packages.


In [36]:
!pip install gradio -U
!pip install pydantic -U




Collecting pydantic
  Obtaining dependency information for pydantic from https://files.pythonhosted.org/packages/fd/35/86b1e7571e695587df0ddf2937100436dce0caa277d2f016d4e4f7d3791a/pydantic-2.2.1-py3-none-any.whl.metadata
  Using cached pydantic-2.2.1-py3-none-any.whl.metadata (145 kB)
Using cached pydantic-2.2.1-py3-none-any.whl (373 kB)
Installing collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.8.2
    Uninstalling pydantic-1.8.2:
      Successfully uninstalled pydantic-1.8.2
Successfully installed pydantic-2.2.1


In [37]:
from typing_extensions import TypeAlias

In [38]:
 pip install typing-extensions --upgrade




In [39]:
import gradio as gr

In [40]:
from transformers import AutoModelForTokenClassification, BertTokenizerFast, pipeline

In [41]:
# Load the fine-tuned model and tokenizer
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")
tokenizer = BertTokenizerFast.from_pretrained("tokenizer")

In [42]:
# Create an NER pipeline using the fine-tuned model and tokenizer
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

In [43]:
# Define the function that performs NER using the fine-tuned model
def named_entity_recognition(text_input):
    ner_results = nlp(text_input)
    return ner_results

In [44]:
# Create a Gradio interface
iface = gr.Interface(
    fn=named_entity_recognition,
    inputs=gr.inputs.Textbox(),  # Text input component
    outputs=gr.outputs.Textbox()  # Text output component
)

  inputs=gr.inputs.Textbox(),  # Text input component
  inputs=gr.inputs.Textbox(),  # Text input component
  inputs=gr.inputs.Textbox(),  # Text input component
  outputs=gr.outputs.Textbox()  # Text output component


In [47]:
# Launch the Gradio interface
iface.launch(share='true')

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Running on public URL: https://acb84f1afb5817dedd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


