In [1]:
from datasets import Dataset
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_conll_local(file_path):
    sentences = []
    labels = []
    tokens = []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence_tokens = []
        sentence_labels = []
        for line in f:
            line = line.strip()
            if not line:
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    labels.append(sentence_labels)
                    sentence_tokens = []
                    sentence_labels = []
            else:
                splits = line.split('\t')
                if len(splits) == 2:
                    token, label = splits
                    sentence_tokens.append(token)
                    sentence_labels.append(label)
        if sentence_tokens:
            tokens.append(sentence_tokens)
            labels.append(sentence_labels)
    df = pd.DataFrame({'tokens': tokens, 'ner_tags': labels})
    return Dataset.from_pandas(df)

dataset = load_conll_local('../data/labeled_subset.conll')

In [4]:
# STEP 3: Tokenize and Align Labels
from transformers import AutoTokenizer

model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

label_list = ["O", "B-Product", "I-Product", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"]
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for i, l in enumerate(label_list)}

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                if label[word_idx].startswith("B-"):
                    label_ids.append(label_to_id[label[word_idx].replace("B-", "I-")])
                else:
                    label_ids.append(label_to_id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 50/50 [00:00<00:00, 5533.09 examples/s]


In [5]:
# STEP 4: Load Model for Token Classification
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list)
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/xlm-roberta-base/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/xlm-roberta-base/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/xlm-roberta-base/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate develop

In [20]:
# STEP 5: Define Metrics
import evaluate
metric = evaluate.load("seqeval")

def compute_metrics(p):
    import numpy as np
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [24]:
# STEP 6: Set Training Arguments and Use Trainer API
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# STEP 7: Save the Model
trainer.save_model("./fine_tuned_ner_model")


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [19]:
pip install --upgrade transformers datasets evaluate

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
