# Exemplo de Treinamento NER

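Este notebook mostra como carregar o dataset CoNLL-2003 com a biblioteca `datasets` da Hugging Face e fazer o ajuste fino (fine-tuning) do `bert-base-cased` como um modelo simples de reconhecimento de entidades nomeadas (NER).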

In [None]:
!pip install -q transformers datasets

In [None]:
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer)

In [None]:
# Load the CoNLL-2003 dataset by its Hugging Face identifier.
dataset = load_dataset('conll2003')
# Tag names are read from the feature metadata of the train split's `ner_tags` column.
# NOTE(review): presumably label_list[i] is the name of integer tag id i — confirm.
label_list = dataset['train'].features['ner_tags'].feature.names

In [None]:
# Tokenizer for the same 'bert-base-cased' checkpoint that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
def tokenize_and_align(example):
    """Tokenize one pre-split example and align its NER tags to the sub-tokens.

    Only the first sub-token of every word keeps the word's tag. Special
    tokens and continuation sub-tokens are labelled -100 so that they are
    excluded from the loss; copying the word's tag onto continuation pieces
    would duplicate ``B-`` tags and turn one entity start into several.

    Args:
        example: A single dataset row with a ``tokens`` list (the words) and
            a parallel ``ner_tags`` list (one integer tag id per word).

    Returns:
        The tokenizer output for the example, with an added ``labels`` list
        holding one entry per produced token.
    """
    tokenized = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
    ner_tags = example['ner_tags']
    labels = []
    prev_word_id = None
    for word_id in tokenized.word_ids():
        if word_id is None or word_id == prev_word_id:
            # Special token (no source word) or a later piece of the word
            # that was already labelled: mask it out.
            labels.append(-100)
        else:
            # First sub-token of a new word carries that word's tag.
            labels.append(ner_tags[word_id])
        prev_word_id = word_id
    tokenized['labels'] = labels
    return tokenized

In [None]:
# Apply tokenize_and_align to the dataset one example at a time
# (batched=False: the function receives a single row, not a batch of rows).
tokenized_datasets = dataset.map(tokenize_and_align, batched=False)

In [None]:
# Build the token-classification model with one output per tag and store the
# id <-> tag-name mapping in its config, so predictions and saved checkpoints
# report the dataset's tag names instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)
# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Configuration for a short demo run; values are grouped by purpose.
training_args = TrainingArguments(
    output_dir='./models/ner',
    # Schedule: one epoch, evaluate per epoch, checkpoint saving set to 'no'.
    num_train_epochs=1,
    evaluation_strategy='epoch',
    save_strategy='no',
    logging_steps=10,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimizer settings.
    learning_rate=5e-5,
    weight_decay=0.01,
)

In [None]:
# Wire model, arguments and data together. Only the first 200 rows of the
# train and validation splits are used.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'].select(range(200)),
    eval_dataset=tokenized_datasets['validation'].select(range(200)),
)
# Run the training loop.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer and keep the result.
metrics = trainer.evaluate()
# Bare expression as the last line of the cell so the notebook displays it.
metrics