## Imports

In [None]:
# This is a Named Entity Recognition (NER) project that fine-tunes a
# DistilBERT model from the Hugging Face transformers library.
# The code loads and preprocesses data, trains a NER model, evaluates its performance, and saves the trained model for future use.
# The dataset used is CoNLL-2003.

In [None]:
import numpy as np
import torch

from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback

  from .autonotebook import tqdm as notebook_tqdm


## Load and check Data

In [3]:
# Download CoNLL-2003 from the Hugging Face Hub, pinned to the parquet-converted revision.
raw_datasets = load_dataset(
    "eriktks/conll2003",
    revision="convert/parquet",
)

In [4]:
# Sanity-check the download: which splits exist, how large each one is,
# and what a single training record looks like.
print("Dataset keys:", raw_datasets.keys())
for split_name in ("train", "validation", "test"):
    print(f"{split_name.capitalize()} set size:", len(raw_datasets[split_name]))
print("\nFirst example from train set:")
print(raw_datasets["train"][0])

Dataset keys: dict_keys(['train', 'validation', 'test'])
Train set size: 14041
Validation set size: 3250
Test set size: 3453

First example from train set:
{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [5]:
# Tag names come from the dataset's own feature metadata; the two dictionaries
# translate between integer tag ids and tag names in either direction.
label_names = raw_datasets["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

## Load and define DistilBERT

In [6]:
# Checkpoint that is fine-tuned for NER below. This cell only loads the matching
# tokenizer; the model itself is created in a later cell from the same name.
# NOTE(review): this is the uncased variant (the inference output at the end shows
# lower-cased tokens). Capitalization is usually a useful NER cue — consider
# evaluating a cased checkpoint as well.

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def tokenize_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a "ner_tags" entry (one list of integer tag ids per
            sentence, parallel to the words).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. Each
        label row has one value per produced token: the word's tag id on the
        first sub-token of every word, and -100 on special tokens and on the
        remaining sub-tokens of a word.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_tags):
        # Only the first sub-token of each word carries the word's tag.
        aligned = []
        previous = None
        for word_id in word_ids:
            starts_new_word = word_id is not None and word_id != previous
            aligned.append(word_tags[word_id] if starts_new_word else -100)
            previous = word_id
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run tokenization + label alignment over every split, one batch of examples at a
# time; the returned datasets gain the tokenizer's outputs and a "labels" column.
tokenized_datasets = raw_datasets.map(tokenize_align_labels, batched=True)

Map: 100%|██████████| 3453/3453 [00:00<00:00, 4026.64 examples/s]


In [8]:
# Pretrained encoder with a fresh token-classification head sized to the label set
# (9 tags for CoNLL-2003). The id<->label maps are stored in the model config so
# predictions can later be decoded back to tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Model evaluation metrics

In [9]:
# Metric object used by compute_metrics to score predicted tag sequences
# against the reference tag sequences.
seqeval = load("seqeval")

def compute_metrics(p):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores (argmax is taken over axis 2); ``labels`` holds the reference
            tag ids, with -100 marking positions to ignore.

    Returns:
        Dict with "accuracy", "f1", "precision" and "recall", taken from the
        corresponding ``overall_*`` entries of the seqeval result.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Keep only positions whose reference label is real (not the -100 ignore marker)
    # and map the remaining ids to tag names.
    true_preds = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_preds.append(kept)

    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([label_names[gold] for gold in gold_row if gold != -100])

    results = seqeval.compute(predictions=true_preds, references=true_labels)

    reported = (
        ("accuracy", "overall_accuracy"),
        ("f1", "overall_f1"),
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
    )
    return {name: results[key] for name, key in reported}

## Training args and Run

In [10]:
# Hyper-parameters for fine-tuning. Evaluation and checkpointing both happen once
# per epoch so that the best checkpoint (by validation F1) can be restored at the end.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" entry returned by compute_metrics
    greater_is_better=True,
    # Mixed precision requires a CUDA device; requesting it unconditionally makes
    # the notebook fail on CPU-only machines, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
)

# Pads input ids and label rows of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Stop training when the tracked metric (F1) fails to improve by more than 0.01
# for 3 consecutive evaluations.
# NOTE(review): with one evaluation per epoch and num_train_epochs=3, a patience of 3
# does not appear reachable in this run; it only takes effect if the epoch count is raised.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

In [11]:
# Fine-tune the model. Because load_best_model_at_end=True, the trainer holds the
# best-scoring weights (by validation F1) once training finishes.
train_output = trainer.train()

# Persist the model, together with the tokenizer passed as processing_class, to the
# directory the inference section loads from. The Trainer's automatic checkpoints are
# written under step-numbered sub-directories of ./results, so "./results/checkpoint/"
# would not exist unless it is written explicitly here.
trainer.save_model("./results/checkpoint/")

# Keep the training summary as the cell's displayed output.
train_output



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0671,0.053235,0.984346,0.918444,0.913022,0.923931
2,0.0377,0.049996,0.986994,0.935441,0.929688,0.941266
3,0.0228,0.050653,0.987286,0.935665,0.930294,0.941097




TrainOutput(global_step=2634, training_loss=0.06830440229688889, metrics={'train_runtime': 2907.4145, 'train_samples_per_second': 14.488, 'train_steps_per_second': 0.906, 'total_flos': 510122266253334.0, 'train_loss': 0.06830440229688889, 'epoch': 3.0})

## Load the model and test

In [5]:
# Load the trained model and tokenizer for inference
# NOTE(review): model_path must contain both a saved model and a saved tokenizer.
# The training section uses output_dir="./results", and the Trainer's automatic
# checkpoints are presumably written to step-numbered sub-directories there, so
# confirm that "./results/checkpoint/" is actually populated before running this cell.
# These assignments rebind the `tokenizer` and `model` names used during training.
model_path = "./results/checkpoint/"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

In [6]:
# Inference function
def ner_inference(text):
    """Tag a piece of text and return the tokens predicted to be part of an entity.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Raw input string. Input longer than the model's maximum sequence
            length is truncated, matching the tokenization used for training.

    Returns:
        List of ``(token, label)`` tuples, in input order, for every sub-word
        token whose predicted label is not "O". Tokens are the tokenizer's
        word pieces (e.g. continuation pieces keep their "##" prefix).
        Special tokens added by the tokenizer are never returned, even if the
        model assigns them an entity label.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Single input text, so only batch row 0 exists.
    predicted_ids = logits.argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    # word_ids() yields None for tokens that do not belong to any input word,
    # i.e. the special tokens the tokenizer inserted.
    word_ids = inputs.word_ids(batch_index=0)

    id2label = model.config.id2label
    ner_results = []
    for token, label_id, word_id in zip(tokens, predicted_ids, word_ids):
        if word_id is None:
            continue
        label = id2label[label_id]
        if label != "O":
            ner_results.append((token, label))

    return ner_results

# Example usage: run the tagger on a short passage containing organisation and
# location names and keep the resulting (token, label) pairs for display.
sample_text = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge."
ner_results = ner_inference(sample_text)

In [7]:
# Display the (token, label) pairs found in the sample text.
ner_results

[('hugging', 'B-ORG'),
 ('face', 'I-ORG'),
 ('inc', 'I-ORG'),
 ('new', 'B-LOC'),
 ('york', 'I-LOC'),
 ('city', 'I-LOC'),
 ('dumb', 'B-LOC'),
 ('##o', 'I-LOC'),
 ('manhattan', 'B-LOC'),
 ('bridge', 'I-LOC')]