# Modelado de BERT base en español

In [1]:
# Install libraries
!pip install transformers datasets evaluate tokenizers accelerate
!pip install torch # or torch with GPU support if you have CUDA
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


## Paso 1: Cargar las librerías

In [2]:
import numpy as np
import evaluate
import requests
import json

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login

In [3]:
# Interactive Hugging Face Hub login (renders a widget in the notebook).
# NOTE(review): presumably needed for the Hub upload performed later in this
# notebook — confirm the token has write access.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Paso 2: Cargar los datos

In [7]:
# URLs of the files (local filename -> raw GitHub URL)
urls = {
    "eval_colombia_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/eval_colombia_mexico_dataset.json",
    "train_colombia_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/train_colombia_mexico_dataset.json"
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get can block forever on a stalled connection.
DOWNLOAD_TIMEOUT = 30

# Dictionary to store the loaded JSON data, keyed by local filename
datasets = {}

for filename, url in urls.items():
    # Download the file
    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    if response.status_code == 200:
        # Save the raw bytes locally so later cells can read the file from disk
        with open(filename, "wb") as f:
            f.write(response.content)
        # Load JSON into Python
        datasets[filename] = response.json()
        print(f"{filename} downloaded and loaded successfully!")
    else:
        # Best effort: report the failure and continue with the next file
        print(f"Failed to download {filename}. Status code: {response.status_code}")

eval_colombia_mexico_dataset.json downloaded and loaded successfully!
train_colombia_mexico_dataset.json downloaded and loaded successfully!


In [10]:
# 🔧 Flatten your dataset so each row has 'context' and 'qas'
def flatten_squad(dataset):
    """Collapse SQuAD-style articles into one record per paragraph.

    Args:
        dataset: Iterable of article dicts; each has a "paragraphs" list whose
            items carry "context" and "qas" keys.

    Returns:
        A list of ``{"context": ..., "qas": ...}`` dicts in article order, then
        paragraph order. The "qas" value is passed through as-is (it is not
        expanded into one row per question).
    """
    return [
        {"context": paragraph["context"], "qas": paragraph["qas"]}
        for entry in dataset
        for paragraph in entry["paragraphs"]
    ]

In [11]:
# Load the downloaded JSON files from disk.
# The encoding is given explicitly so the Spanish text is decoded the same way
# on every platform (the default depends on the OS locale).
with open("train_colombia_mexico_dataset.json", encoding="utf-8") as f:
    train_data = json.load(f)

with open("eval_colombia_mexico_dataset.json", encoding="utf-8") as f:
    eval_data = json.load(f)

# Apply flattening: one row per paragraph, taken from the top-level "data" list
train_flat = flatten_squad(train_data["data"])
eval_flat = flatten_squad(eval_data["data"])

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_list(train_flat)
eval_dataset = Dataset.from_list(eval_flat)

## Paso 3: Cargar el tokenizador y el modelo

In [None]:
# Hub checkpoint used for both the tokenizer and the QA model
model_name = "mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"

# Tokenizer and question-answering model are loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = AutoModelForQuestionAnswering.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

## Paso 4: Preprocesar el conjunto de datos

In [12]:
def preprocess_function(examples):
    """Tokenize question/context pairs and label each feature with answer token positions.

    Long contexts are split into overlapping windows (max_length=384,
    stride=128), so one example can produce several features. Each feature is
    labelled with the token indices of the first answer of the example it came
    from, or with (0, 0) when the example has no answer or the answer is not
    fully inside that window.

    Args:
        examples: Batch with "question", "context" and "answers" columns, where
            each answers entry has "text" and "answer_start" lists.

    Returns:
        The tokenizer output (including "offset_mapping" and
        "overflow_to_sample_mapping") with "start_positions" and
        "end_positions" added, one entry per feature.
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answers"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature i was produced from example sample_map[i]; the two indices differ
    # as soon as any context overflows into more than one window.
    sample_map = inputs["overflow_to_sample_mapping"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(inputs["offset_mapping"]):
        answer = answers[sample_map[i]]

        # No annotated answer: label the feature with index 0.
        if len(answer["text"]) == 0 or len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        answer_start = answer["answer_start"][0]
        answer_end = answer_start + len(answer["text"][0])

        # Only context tokens (sequence id 1) have offsets that refer to the
        # context string; question-token offsets refer to the question and
        # must not be compared against answer character positions.
        sequence_ids = inputs.sequence_ids(i)
        context_indices = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_indices:
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = context_indices[0]
        context_end = context_indices[-1]

        # Answer not fully contained in this window: label with index 0.
        if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start.
        idx = context_start
        while idx <= context_end and offsets[idx][0] <= answer_start:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (scanning from the right) that ends at or after
        # the answer end.
        idx = context_end
        while idx >= context_start and offsets[idx][1] >= answer_end:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

## Paso 5: Cargar las métricas de SQuAD

In [13]:
# Load the metric registered under the name "squad"; compute_metrics later
# calls metric.compute(predictions=..., references=...) on it.
metric = evaluate.load("squad")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

## Paso 6: Definir una función de posprocesamiento

In [14]:
def _has_text_span(offset):
    """Return True when a token offset covers at least one character of text."""
    return offset is not None and offset[0] < offset[1]


def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20,
                               max_answer_length=None):
    """Turn start/end logits into one predicted answer string per example.

    For every example the n_best_size highest-scoring start and end token
    indices are combined, and the highest-scoring valid (start, end) pair is
    mapped back to a substring of the context through the feature's
    "offset_mapping".

    Tokens whose offset is missing or empty (special and padding tokens are
    reported with a zero-width offset) are never used as span boundaries;
    otherwise they would produce empty or meaningless slices of the context.

    Args:
        examples: Sequence of dicts with "id" and "context".
        features: Sequence aligned index-by-index with examples; each item has
            an "offset_mapping" list of (start_char, end_char) pairs.
        raw_predictions: Pair (all_start_logits, all_end_logits), each indexable
            by example position.
        n_best_size: Number of top start and end indices to combine.
        max_answer_length: Optional maximum span length in tokens. None (the
            default) applies no limit.

    Returns:
        A list of {"id": ..., "prediction_text": ...} dicts, one per example.
        prediction_text is "" when no valid span exists.
    """
    all_start_logits, all_end_logits = raw_predictions
    predictions = []

    for i, example in enumerate(examples):
        # Look the offsets up once per example instead of once per candidate.
        offsets = features[i]["offset_mapping"]
        start_logits = all_start_logits[i]
        end_logits = all_end_logits[i]

        # Indices of the n_best_size largest logits (ascending order).
        start_indexes = np.argsort(start_logits)[-n_best_size:]
        end_indexes = np.argsort(end_logits)[-n_best_size:]

        context = example["context"]
        best_text = ""
        best_score = None

        for start_index in start_indexes:
            if not _has_text_span(offsets[start_index]):
                continue
            for end_index in end_indexes:
                if end_index < start_index or not _has_text_span(offsets[end_index]):
                    continue
                if (max_answer_length is not None
                        and end_index - start_index + 1 > max_answer_length):
                    continue
                score = start_logits[start_index] + end_logits[end_index]
                # Strict ">" keeps the first candidate on ties.
                if best_score is None or score > best_score:
                    best_score = score
                    best_text = context[offsets[start_index][0]:offsets[end_index][1]]

        predictions.append({"id": example["id"], "prediction_text": best_text})

    return predictions


## Paso 7: Definir la función compute_metrics para Trainer

In [15]:
def compute_metrics(p):
    """Score the Trainer's raw predictions with the loaded metric.

    Args:
        p: Two-element (predictions, labels) pair. Only the predictions are
            used; they are forwarded to postprocess_qa_predictions.

    Returns:
        Whatever metric.compute returns for the decoded predictions and the
        references built from eval_dataset.
    """
    raw_logits, _ = p

    # eval_dataset is passed both as the examples and as the features.
    decoded = postprocess_qa_predictions(eval_dataset, eval_dataset, raw_logits)

    gold = []
    for row in eval_dataset:
        gold.append({"id": row["id"], "answers": row["answers"]})

    return metric.compute(predictions=decoded, references=gold)

## Paso 8: Entrenamiento con el Trainer

In [None]:
training_args = TrainingArguments(
    output_dir="./bert-qa-spanish",        # main output directory
    overwrite_output_dir=True,             # overwrite old checkpoints

    # Training
    per_device_train_batch_size=8,         # matches previous 8
    per_device_eval_batch_size=8,          # matches previous 8
    gradient_accumulation_steps=1,         # can increase if VRAM limited
    learning_rate=2e-5,                    # matches previous 2e-5
    num_train_epochs=4,                     # matches previous 4
    weight_decay=0.01,                     # regularization
    warmup_steps=500,                      # optional for smoother training
    fp16=True,                             # mixed precision for faster training

    # Evaluation
    eval_strategy="steps",                 # evaluate every N steps
    eval_steps=500,                        # same as previous
    save_strategy="steps",                 # save every N steps
    save_steps=500,                        # save every 500 steps
    save_total_limit=2,                    # keep last 2 checkpoints

    # Best model
    load_best_model_at_end=True,           # load best model after training
    metric_for_best_model="f1",            # use F1 to select best
    greater_is_better=True,                # higher F1 is better
    # Optional: log metrics for monitoring
    logging_dir="./logs",
    logging_steps=100,

    # Hub upload. Without hub_model_id the repository is named after
    # output_dir ("bert-qa-spanish") instead of the intended model repo.
    push_to_hub=True,                       # upload to Hugging Face Hub
    hub_model_id="luigui/bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
)

# NOTE(review): train_dataset / eval_dataset are passed exactly as built from
# flatten_squad ("context" and "qas" columns); no cell visible here applies
# preprocess_function to them — confirm they are tokenized before training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Start fine-tuning with the Trainer built from training_args.
trainer.train()

## Paso 9: Subir al Hub de Hugging Face

In [None]:
# Trainer.push_to_hub takes the commit message as its first argument, not a
# repository id, and forwards extra keyword arguments to the model-card
# generator, which has no `description` parameter (passing one raises
# TypeError). The target repository comes from the TrainingArguments
# (hub_model_id, or the output_dir name when that is unset).
# Intended repository: luigui/bert-base-spanish-wwm-cased-news-qa-colombia-mexico
trainer.push_to_hub(
    commit_message="BERT-base Spanish WWM cased model fine-tuned for extractive QA on news articles from Colombia and Mexico.",
    tags=["spanish", "qa", "news", "colombia", "mexico", "bert-base", "wwm", "cased"]
)