In [13]:
# !pip install -q transformers seqeval
# !pip install -U datasets

In [14]:
from datasets import load_dataset, Dataset

dataset = load_dataset("gtfintechlab/finer-ord")

In [15]:
## Looking at what sentences look like
from collections import defaultdict

def loadsentences(dataset_split):
  grouped = defaultdict(lambda: {"tokens": [], "labels": []})
  for item in dataset_split:
    key = (item["doc_idx"], item["sent_idx"])
    grouped[key]["tokens"].append(item["gold_token"])
  return grouped

test = loadsentences(dataset["train"])

for i in range(20, 25):
  print(f"\n\nSentence {i}:\n")
  print(
      ' '.join(test[(0,i)]['tokens'])
    )




Sentence 20:

Billionaire businessman Chris Kirubi said that he expected the summit to translate into increased dealings between Kenya and the US and capital inflows .


Sentence 21:

KAM said the recent renewal of the African Growth and Opportunity Act ( Agoa ) , which allows exports to the US duty free , would help balance Kenya 's global trade .


Sentence 22:

" We have in the past not utilised the Agoa opportunity to its full potential .


Sentence 23:

We have only been strong on the textile front .


Sentence 24:

It 's high time we leveraged on the remaining 6400 products that enjoy tax - free access to the US market , " said Ms Wakiaga adding that Kenya needs to invest in value addition to sharpen its competitiveness even in the absence of such preference treatment .


In [16]:
## Collect tokens
def group_tokens(dataset_split):
  grouped = defaultdict(lambda: {"tokens": [], "labels": []})
  for item in dataset_split:
    key = (item["doc_idx"], item["sent_idx"])
    grouped[key]["tokens"].append(item["gold_token"])
    grouped[key]["labels"].append(item["gold_label"])
  return [{"tokens": v["tokens"], "labels": v["labels"]} for v in grouped.values()]


In [17]:
grouped_train = group_tokens(dataset["train"])
train_dataset = Dataset.from_list(grouped_train)

grouped_test = group_tokens(dataset["test"])
test_dataset = Dataset.from_list(grouped_test)

grouped_val = group_tokens(dataset["validation"])
val_dataset = Dataset.from_list(grouped_val)

In [18]:
label_list = list({'O': 0, 'PER_B': 1, 'PER_I': 2, 'LOC_B': 3, 'LOC_I': 4, 'ORG_B': 5, 'ORG_I': 6}.keys())

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels = len(label_list), ignore_mismatched_sizes = True)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def tokenize_and_align_labels(input):
    tokenized_inputs = tokenizer(
        input["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(input["labels"][word_idx])
        else:
            # Mask subword tokens
            label_ids.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs


In [21]:
def is_valid(example):
    return all(isinstance(token, str) for token in example["tokens"])

train_dataset = train_dataset.filter(is_valid)
test_dataset = test_dataset.filter(is_valid)
val_dataset = val_dataset.filter(is_valid)
tokenized_train =  train_dataset.map(tokenize_and_align_labels, batched = False)
tokenized_test = test_dataset.map(tokenize_and_align_labels, batched = False)
tokenized_val = val_dataset.map(tokenize_and_align_labels, batched = False)

Filter:   0%|          | 0/3262 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1075 [00:00<?, ? examples/s]

Filter:   0%|          | 0/402 [00:00<?, ? examples/s]

Map:   0%|          | 0/3261 [00:00<?, ? examples/s]

Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

Map:   0%|          | 0/402 [00:00<?, ? examples/s]

In [22]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, classification_report

In [24]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis = 2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
    }

training_args = TrainingArguments(
    output_dir="ner-bert-base",
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    learning_rate=2e-5,
    logging_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_test,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1308,0.068065,0.979722,0.861279
2,0.0312,0.066255,0.981958,0.878493
3,0.0162,0.07204,0.98215,0.88101




TrainOutput(global_step=1224, training_loss=0.05939313983605578, metrics={'train_runtime': 168.2259, 'train_samples_per_second': 58.154, 'train_steps_per_second': 7.276, 'total_flos': 272334565285992.0, 'train_loss': 0.05939313983605578, 'epoch': 3.0})

In [25]:
trainer.evaluate()



{'eval_loss': 0.07204043865203857,
 'eval_accuracy': 0.9821504298546591,
 'eval_f1': 0.8810096153846153,
 'eval_runtime': 3.6301,
 'eval_samples_per_second': 296.133,
 'eval_steps_per_second': 37.189,
 'epoch': 3.0}

In [35]:
import torch

# Align device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

example = val_dataset[10]
tokens = example["tokens"]

encoding = tokenizer(tokens, return_tensors = "pt", is_split_into_words = True)
word_ids = encoding.word_ids()

inputs = {k: v.to(device) for k, v in encoding.items()}

with torch.no_grad():
  outputs = model(**inputs)

logits = outputs.logits
predictions = logits.argmax(-1)

## Convert predictions back into word-level labels
predicted_labels = []
previous_word_id = None

for idx, word_id in enumerate(word_ids):
  if word_id is None:
    continue
  if word_id != previous_word_id:
    label_id = predictions[0][idx].item()
    predicted_labels.append(label_list[label_id])
    previous_word_id = word_id

## Get ground truth labels
true_labels = [label_list[i] for i in example["labels"]]

## Print token-wise comparison
for token, pred, true in zip(tokens, predicted_labels, true_labels):
    print(f"{token:12s} | Pred: {pred:10s} | True: {true}")


Australian   | Pred: O          | True: O
stocks       | Pred: O          | True: O
(            | Pred: O          | True: O
.            | Pred: O          | True: ORG_B
AXJO         | Pred: O          | True: ORG_I
)            | Pred: O          | True: O
lost         | Pred: O          | True: O
0.4          | Pred: O          | True: O
percent      | Pred: O          | True: O
as           | Pred: O          | True: O
worries      | Pred: O          | True: O
about        | Pred: O          | True: O
China        | Pred: LOC_B      | True: LOC_B
weighed      | Pred: O          | True: O
on           | Pred: O          | True: O
mining       | Pred: O          | True: O
and          | Pred: O          | True: O
energy       | Pred: O          | True: O
stocks       | Pred: O          | True: O
following    | Pred: O          | True: O
Monday's     | Pred: O          | True: O
China        | Pred: LOC_B      | True: LOC_B
GDP          | Pred: O          | True: O
data         | Pre