In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
from evaluate import load  # Replaces load_metric
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data

In [3]:
train_data = read_conll_file('resource/eng.train')
validation_data = read_conll_file('resource/eng.testa')
test_data = read_conll_file('resource/eng.testb')

In [4]:
train_data[:2]

[[['-DOCSTART-', '-X-', '-X-', 'O']],
 [['EU', 'NNP', 'B-NP', 'B-ORG'],
  ['rejects', 'VBZ', 'B-VP', 'O'],
  ['German', 'JJ', 'B-NP', 'B-MISC'],
  ['call', 'NN', 'I-NP', 'O'],
  ['to', 'TO', 'B-VP', 'O'],
  ['boycott', 'VB', 'I-VP', 'O'],
  ['British', 'JJ', 'B-NP', 'B-MISC'],
  ['lamb', 'NN', 'I-NP', 'O'],
  ['.', '.', 'O', 'O']]]

In [5]:
def convert_to_dataset(data, label_map):
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)

In [6]:
label_list = sorted(list(set([token_data[3] for sentence in train_data for token_data in sentence])))
label_map = {label: i for i, label in enumerate(label_list)}

In [10]:
train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

In [11]:
train_dataset.to_csv('train_dataset.csv', index=False)


Creating CSV from Arrow format: 100%|██████████| 15/15 [00:01<00:00, 14.37ba/s]


2034950

In [8]:
train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 14987
})

In [9]:
datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

In [10]:
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)


    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]


    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


In [12]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [13]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

Map: 100%|██████████| 14987/14987 [00:01<00:00, 11511.04 examples/s]
Map: 100%|██████████| 3466/3466 [00:00<00:00, 11082.44 examples/s]
Map: 100%|██████████| 3684/3684 [00:00<00:00, 11364.63 examples/s]


In [14]:
def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]


    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)


    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [16]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Classification Report
500,0.0786,0.062471,0.908697,0.917873,0.913262,precision recall f1-score support  LOC 0.93 0.96 0.95 1837  MISC 0.79 0.84 0.82 922  ORG 0.92 0.84 0.88 1341  PER 0.94 0.97 0.96 1842  micro avg 0.91 0.92 0.91 5942  macro avg 0.90 0.90 0.90 5942 weighted avg 0.91 0.92 0.91 5942
1000,0.0475,0.058259,0.922437,0.932683,0.927531,precision recall f1-score support  LOC 0.91 0.97 0.94 1837  MISC 0.88 0.82 0.85 922  ORG 0.90 0.90 0.90 1341  PER 0.96 0.97 0.97 1842  micro avg 0.92 0.93 0.93 5942  macro avg 0.92 0.92 0.92 5942 weighted avg 0.92 0.93 0.93 5942
1500,0.0384,0.046144,0.940998,0.942107,0.941552,precision recall f1-score support  LOC 0.97 0.96 0.96 1837  MISC 0.87 0.89 0.88 922  ORG 0.91 0.92 0.91 1341  PER 0.97 0.97 0.97 1842  micro avg 0.94 0.94 0.94 5942  macro avg 0.93 0.93 0.93 5942 weighted avg 0.94 0.94 0.94 5942


TrainOutput(global_step=1874, training_loss=0.07485742769984324, metrics={'train_runtime': 11669.4475, 'train_samples_per_second': 1.284, 'train_steps_per_second': 0.161, 'total_flos': 960565719981294.0, 'train_loss': 0.07485742769984324, 'epoch': 1.0})

In [17]:
trainer.save_model("ner_model")

In [4]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Define the model path (where you saved it using trainer.save_model)
model_path = "ner_model"

# Load the model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Force the model to run on CPU
device = torch.device("cpu")
model.to(device)
model.eval()  # Set to evaluation mode


  from .autonotebook import tqdm as notebook_tqdm


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data

train_data = read_conll_file('resource/eng.train')
validation_data = read_conll_file('resource/eng.testa')
test_data = read_conll_file('resource/eng.testb')

label_list = sorted(list(set([token_data[3] for sentence in train_data for token_data in sentence])))
label_map = {label: i for i, label in enumerate(label_list)}

In [20]:
sentence = "John Smith graduated from MIT in 2010."
tokenized_input = tokenizer(sentence, return_tensors="pt").to(model.device)
outputs = model(**tokenized_input)
predicted_labels = outputs.logits.argmax(-1)[0]
named_entities = [tokenizer.decode([token]) for token, label in zip(tokenized_input["input_ids"][0], predicted_labels) if label != 0 and label != label_map['O']]
print("Named Entities - Example 1:", named_entities)

Named Entities - Example 1: ['John', 'Smith', 'MIT']
