In [30]:
!pip uninstall -y wandb
!pip install -U "transformers>=4.40.0" "datasets>=2.18.0" seqeval accelerate




In [31]:
import inspect

import numpy as np
from datasets import load_dataset
from seqeval.metrics import f1_score, precision_score, recall_score
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

In [32]:
# Fetch all splits of the RONEC corpus and show the split/column summary.
print(" Loading RONEC dataset...")
dataset = load_dataset(path="community-datasets/ronec")
print(dataset)

# Peek at one raw training record to see the column layout.
print("\n Example from train split:")
first_train_example = dataset["train"][0]
print(first_train_example)

üì• Loading RONEC dataset...
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_ids', 'space_after', 'ner_tags'],
        num_rows: 9000
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_ids', 'space_after', 'ner_tags'],
        num_rows: 1330
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_ids', 'space_after', 'ner_tags'],
        num_rows: 2000
    })
})

üîé Example from train split:
{'id': 4896, 'tokens': ['-', 'Iohannis', ',', 'Klaus', 'Vacan»õƒÉ', 'Iohannis', '-', 'cƒÉruia', 'pu»õin', '√Æi', 'pasƒÉ', 'cƒÉ', '40%', 'din', 'popula»õia', '»õƒÉrii', 'trƒÉie»ôte', 'sub', 'limita', 'sƒÉrƒÉciei', 'altfel', 'nu', 'ar', 'putea', 'sƒÉ', 'sus»õinƒÉ', 'sifonarea', 'profiturilor', 'de', 'cƒÉtre', 'multina»õionalele', 'strƒÉine', '»ôi', 'ar', 'sus»õine', 'impozitul', 'pe', 'venit', '-', 'BƒÉsescu', ',', 'Traian', 'BƒÉsescu', 'care', 'a', 'spus', 'cƒÉ', 'aplicarea', 'unui', 'impozit', 'pe', 'venit', '»õine', 'de', '‚Äû', 'Paleo

In [33]:
# Tag names are read from the `ner_tags` column metadata; a tag's list position
# is its integer id.
label_names = dataset["train"].features["ner_tags"].feature.names

# Lookup tables in both directions (id -> tag, tag -> id).
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

print("\n Labels:", label_names)
print("Number of labels:", len(label_names))


‚úÖ Labels: ['O', 'B-PERSON', 'I-PERSON', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-LOC', 'I-LOC', 'B-NAT_REL_POL', 'I-NAT_REL_POL', 'B-EVENT', 'I-EVENT', 'B-LANGUAGE', 'I-LANGUAGE', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-DATETIME', 'I-DATETIME', 'B-PERIOD', 'I-PERIOD', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-NUMERIC', 'I-NUMERIC', 'B-ORDINAL', 'I-ORDINAL', 'B-FACILITY', 'I-FACILITY']
Number of labels: 31


In [34]:
model_name = "bert-base-multilingual-cased"
print(f"\nüì• Loading tokenizer and model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the token-classification head to the dataset's tag set and pass both
# id <-> tag mappings along to the model.
label_config = {
    "num_labels": len(label_names),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForTokenClassification.from_pretrained(model_name, **label_config)


üì• Loading tokenizer and model: bert-base-multilingual-cased


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model and tokenizer loaded.


In [36]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level NER tags to sub-tokens.

    Intended to be used with ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of integer tag ids per
            sentence, parallel to ``"tokens"``).

    Returns:
        The tokenizer's batch encoding with an extra ``"labels"`` entry: one
        list per sentence, aligned with that sentence's sub-token sequence.
        Only the first sub-token of each word carries the word's tag id; special
        tokens and continuation sub-tokens are set to ``-100``.
    """
    tokenized = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        previous_word_id = None
        label_ids = []

        for word_id in tokenized.word_ids(batch_index=i):
            if word_id is None or word_id == previous_word_id:
                # Special token, or a continuation piece of the current word.
                # Repeating the word's tag here would turn one "B-X" word into
                # "B-X B-X ...", which entity-level scoring counts as several
                # entities. -100 is the value `compute_metrics` skips.
                label_ids.append(-100)
            else:
                # First sub-token of a new word: it alone carries the tag.
                label_ids.append(labels[word_id])
            previous_word_id = word_id

        all_labels.append(label_ids)

    tokenized["labels"] = all_labels
    return tokenized

# Apply the alignment function to the dataset in batched mode.
print("\n‚öôÔ∏è Tokenizing and aligning labels (this may take a bit)...")
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)
print("‚úÖ Tokenization done.")


‚öôÔ∏è Tokenizing and aligning labels (this may take a bit)...


Map:   0%|          | 0/1330 [00:00<?, ? examples/s]

‚úÖ Tokenization done.


In [37]:

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)

    true_labels = []
    true_preds = []

    for pred_ids, label_ids in zip(predictions, labels):
        cur_labels = []
        cur_preds = []
        for p, l in zip(pred_ids, label_ids):
            if l == -100:
                continue
            cur_labels.append(label_names[l])
            cur_preds.append(label_names[p])
        true_labels.append(cur_labels)
        true_preds.append(cur_preds)

    precision = precision_score(true_labels, true_preds)
    recall = recall_score(true_labels, true_preds)
    f1 = f1_score(true_labels, true_preds)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Status line so the cell produces visible output once `compute_metrics` is defined.
print("\n‚úÖ Metrics function ready.")


‚úÖ Metrics function ready.


In [38]:

# Hyperparameters for the baseline run, grouped in one place for easy tuning.
baseline_hparams = dict(
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
)

training_args = TrainingArguments(output_dir="ner_baseline_ronec", **baseline_hparams)

print("‚úÖ TrainingArguments set.")


‚úÖ TrainingArguments set.


In [39]:
# Collator that assembles tokenized examples (inputs and labels) into batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate `tokenizer=`; older releases only know `tokenizer=`. The install
# cell does not pin a version, so pick whichever keyword the installed Trainer
# accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)



  trainer = Trainer(


‚úÖ Trainer created.


In [None]:
# Run fine-tuning with the configured trainer and print the object it returns.
train_output = trainer.train()
print(train_output)




Step,Training Loss


In [None]:
# Score the fine-tuned model on the held-out test split; the result is
# whatever `trainer.evaluate` returns for that dataset.
metrics_e1 = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(metrics_e1)
