# Exemplo de Fine-tuning de QA

Este notebook mostra como carregar o dataset, filtrar por tópico e idioma e fazer o fine-tuning de um modelo pré-treinado de perguntas e respostas (QA) utilizando a biblioteca Transformers da Hugging Face.

In [None]:
!pip install -q transformers datasets

In [None]:
from datasets import load_from_disk, Dataset
from transformers import (AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer)

# Load the dataset that was previously saved to disk in Hugging Face format.
dataset_path = 'datasets_wikipedia_pro/huggingface'
dataset = load_from_disk(dataset_path)

In [None]:
# Filter by topic and language.
def is_english_technology(example):
    """Return True when the record's topic is 'Technology' and its language is 'en'."""
    return example['topic'] == 'Technology' and example['language'] == 'en'


filtered = dataset.filter(is_english_technology)

In [None]:
# Turn each record into question/context/answer examples, one per
# (question, answer) pair, using the SQuAD-style 'answers' layout
# ({'text': [...], 'answer_start': [...]}).
records = []
for rec in filtered:
    context = rec['content']
    for q, a in zip(rec['questions'], rec['answers']):
        # An empty or missing answer has no span to locate in the context.
        if not a:
            continue
        # str.find returns -1 when the answer text does not occur verbatim in
        # the context. Storing -1 as 'answer_start' would yield an invalid
        # span label, so pairs without an exact match are skipped.
        start = context.find(a)
        if start == -1:
            continue
        records.append({
            'question': q,
            'context': context,
            'answers': {'text': [a], 'answer_start': [start]},
        })
qa_ds = Dataset.from_list(records)

In [None]:
# The tokenizer and the question-answering model are loaded from the same
# pretrained checkpoint, so the name is bound once and reused.
checkpoint = 'distilbert-base-cased-distilled-squad'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [None]:
def preprocess(batch):
    """Tokenize question/context pairs and attach answer-span labels.

    Args:
        batch: A batched slice of ``qa_ds`` with the columns 'question',
            'context' and 'answers'; each 'answers' entry is a dict with
            'text' and 'answer_start' lists (the first element of each is used).

    Returns:
        The tokenizer output extended with 'start_positions' and
        'end_positions', the token indices of the answer span. Index 0 is
        used for both when the answer has no valid character offset or was
        cut off by truncation.
    """
    features = tokenizer(
        batch['question'],
        batch['context'],
        truncation='only_second',  # shorten only the context, never the question
        padding='max_length',
        return_offsets_mapping=True,  # needs a fast tokenizer
    )
    # The character offsets are only needed to compute the labels below.
    offset_mapping = features.pop('offset_mapping')

    start_positions = []
    end_positions = []
    for i, (offsets, answer) in enumerate(zip(offset_mapping, batch['answers'])):
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])

        # Sequence id 1 marks context tokens; the question is 0 and
        # special/padding tokens are None.
        context_tokens = [
            idx for idx, seq_id in enumerate(features.sequence_ids(i))
            if seq_id == 1
        ]

        # No usable span: unknown offset (-1), no context tokens, or the
        # answer lies outside the part of the context kept after truncation.
        if (
            start_char < 0
            or not context_tokens
            or offsets[context_tokens[0]][0] > start_char
            or offsets[context_tokens[-1]][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
            continue

        first, last = context_tokens[0], context_tokens[-1]

        # Last context token that starts at or before the answer's first char.
        idx = first
        while idx <= last and offsets[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (scanning from the end) that ends at or after
        # the answer's last char.
        idx = last
        while idx >= first and offsets[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    # Without these labels the QA model returns no loss and training cannot run.
    features['start_positions'] = start_positions
    features['end_positions'] = end_positions
    return features

encoded = qa_ds.map(preprocess, batched=True)

In [None]:
# Training configuration: outputs go to 'qa_model'; one epoch with a
# per-device batch size of 2.
args = TrainingArguments(
    output_dir='qa_model',
    num_train_epochs=1,
    per_device_train_batch_size=2,
)
trainer = Trainer(model=model, train_dataset=encoded, args=args)
trainer.train()