# Modelado de BERT base en español (WWM, cased)

In [1]:
# Install libraries
!pip install transformers datasets evaluate tokenizers accelerate
!pip install torch # or torch with GPU support if you have CUDA
!pip install evaluate



## Paso 1: Cargar las librerías

In [2]:
import numpy as np
import evaluate
import requests
import json


from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login
from datasets import Dataset

In [3]:
import torch

# Report whether a CUDA device is visible to PyTorch, and which one.
gpu_ready = torch.cuda.is_available()
if not gpu_ready:
    print("❌ GPU is not available.")
else:
    print("✅ GPU is available!")
    print("GPU name:", torch.cuda.get_device_name(0))


✅ GPU is available!
GPU name: NVIDIA A100-SXM4-80GB


In [4]:
# Authenticate with the Hugging Face Hub through the interactive login widget;
# the training configuration below uploads the model with push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Paso 2: Cargar los datos

In [5]:
# URLs of the files
urls = {
    "eval_colombia_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/eval_colombia_mexico_dataset.json",
    "train_colombia_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/train_colombia_mexico_dataset.json"
}

# Seconds to wait on the server before giving up, so a stalled connection
# cannot hang the notebook indefinitely.
DOWNLOAD_TIMEOUT_S = 60

# Dictionary to store the loaded JSON data, keyed by local filename
datasets = {}

for filename, url in urls.items():
    # Download the file
    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_S)
    if response.status_code == 200:
        # Parse first: a body that is not valid JSON raises here, before
        # anything is written to disk.
        payload = response.json()
        # Save locally
        with open(filename, "wb") as f:
            f.write(response.content)
        datasets[filename] = payload
        print(f"{filename} downloaded and loaded successfully!")
    else:
        print(f"Failed to download {filename}. Status code: {response.status_code}")

eval_colombia_mexico_dataset.json downloaded and loaded successfully!
train_colombia_mexico_dataset.json downloaded and loaded successfully!


In [6]:
def flatten_squad_json(json_data):
    """Flatten a nested SQuAD-style document into column-oriented lists.

    Walks ``json_data["data"] -> "paragraphs" -> "qas"`` and emits one entry
    per question, repeating the paragraph's context for each of its questions.

    Args:
        json_data: Mapping with a ``"data"`` list of articles; each article
            has ``"paragraphs"``, each paragraph a ``"context"`` and ``"qas"``.

    Returns:
        Dict with the keys ``"context"``, ``"question"``, ``"answers"``,
        ``"id"`` and ``"is_impossible"``, each mapping to a list with one
        element per question. ``"is_impossible"`` defaults to ``False`` when
        the question does not carry that field.
    """
    columns = {
        key: [] for key in ("context", "question", "answers", "id", "is_impossible")
    }
    for entry in json_data["data"]:
        for paragraph in entry["paragraphs"]:
            passage = paragraph["context"]
            for item in paragraph["qas"]:
                row = {
                    "context": passage,
                    "question": item["question"],
                    "answers": item["answers"],
                    "id": item["id"],
                    "is_impossible": item.get("is_impossible", False),
                }
                for key, value in row.items():
                    columns[key].append(value)
    return columns

In [7]:
# Example: load local dataset
import json

# Load local JSON files.
# The datasets hold Spanish text, so decode explicitly as UTF-8 instead of
# relying on the platform's default text encoding.
with open("train_colombia_mexico_dataset.json", encoding="utf-8") as f:
    train_data = json.load(f)

with open("eval_colombia_mexico_dataset.json", encoding="utf-8") as f:
    eval_data = json.load(f)

# Apply flattening
train_flat = flatten_squad_json(train_data)
eval_flat = flatten_squad_json(eval_data)

# Wrap the column-oriented dicts as datasets, one row per question.
train_dataset = Dataset.from_dict(train_flat)
eval_dataset = Dataset.from_dict(eval_flat)


In [8]:
# Display one flattened training row to sanity-check the schema.
train_dataset[1]

{'context': '. Este espíritu se inclinó por el camino de la luz difundida en el metauniverso sonoro. Tras pasar los primeros años de su adolescencia en la capital de Illinois, se mudó a San Francisco para perfeccionar sus estudios musicales. Ahí conoció a Blaine L. Reininger con quien creó Tuxedomoon, conjunto de experimentadores que en su tiempo fueron considerados como vanguardistas. Dejó Estados Unidos a principios de la década de 1980. Viró a Europa y se estableció en Bruselas (Bélgica) por 12 años. Luego de un viaje a México y tras degustar la naturaleza de nuestro país, pareció recordar su raíz real y decidió no dejarla. Era 1993. Primero estuvo en la Ciudad de México y luego se trasladó a Oaxaca (de donde sabía, sólo era tierra de artistas plásticos), pero la conoció y decidió ser parte de ella. Esteban Café o, Steven Brown es un hombre con la capacidad de ser invisible a través de la acústica (en sentido figurado), que hizo de estos terruños un laboratorio de creación. Ha forma

In [9]:
# Print a short summary of the first three training rows.
separator = "-" * 50
for position in range(3):
    row = train_dataset[position]
    print("ID:", row["id"])
    print("Question:", row["question"])
    print("Context snippet:", row["context"][:200], "...")
    print("Answers:", row["answers"])
    print("Is impossible:", row["is_impossible"])
    print(separator)

ID: qa-15742
Question: ¿Qué película musicalizó Steven Brown junto con la Banda Regional Mixe de Oaxaca?
Context snippet: . Este espíritu se inclinó por el camino de la luz difundida en el metauniverso sonoro. Tras pasar los primeros años de su adolescencia en la capital de Illinois, se mudó a San Francisco para perfecci ...
Answers: [{'answer_start': 2502, 'text': 'Qué Viva México'}]
Is impossible: False
--------------------------------------------------
ID: qa-15743
Question: ¿En qué año llegó Steven Brown a México?
Context snippet: . Este espíritu se inclinó por el camino de la luz difundida en el metauniverso sonoro. Tras pasar los primeros años de su adolescencia en la capital de Illinois, se mudó a San Francisco para perfecci ...
Answers: [{'answer_start': 626, 'text': '1993'}]
Is impossible: False
--------------------------------------------------
ID: qa-15741
Question: ¿Cuál es el nombre del conjunto que Steven Brown creó junto a Blaine L. Reininger?
Context snippet: . Este esp

## Paso 3: Cargar el tokenizador y el modelo

In [10]:
# Hub checkpoint used as the starting point for fine-tuning.
model_name = "mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"

# Load the checkpoint's tokenizer and its weights into a question-answering model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Paso 4: Preprocesar el conjunto de datos

In [15]:
def preprocess_function(examples):
    """Tokenize one question/context pair and label its answer span.

    Intended for ``Dataset.map(..., batched=False)``: ``examples`` is a single
    row with ``"question"``, ``"context"`` and ``"answers"`` fields. It relies
    on the module-level ``tokenizer``, which must be a fast tokenizer because
    character offsets and sequence ids are required.

    Args:
        examples: One dataset row. ``"answers"`` may be a list of
            ``{"text", "answer_start"}`` dicts (only the first is used) or a
            single dict whose values are scalars or lists.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions``, each a one-element list
        holding a token index. Rows without an answer, or whose answer does
        not lie entirely inside the truncated context window, are labelled
        ``(0, 0)``, i.e. the first token of the sequence.
    """
    tokenized_inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=384,
        truncation=True,
        padding="max_length",
        # Without this the tokenizer returns no offsets and no answer span
        # can be located.
        return_offsets_mapping=True,
    )

    # Normalise the supported "answers" layouts to a single answer dict.
    answers = examples["answers"]
    if isinstance(answers, dict):
        answer = answers
    elif answers:
        answer = answers[0]
    else:
        answer = {"text": [], "answer_start": 0}

    answer_text = answer["text"]
    if isinstance(answer_text, list):
        answer_text = answer_text[0] if answer_text else ""
    answer_start = answer["answer_start"]
    if isinstance(answer_start, list):
        answer_start = answer_start[0] if answer_start else 0

    # Default label: position 0, used for "no answer in this window".
    token_start_index = 0
    token_end_index = 0

    if answer_text:
        answer_end = answer_start + len(answer_text)
        offsets = tokenized_inputs["offset_mapping"]

        # Offsets of question tokens are relative to the question string, so
        # only tokens of the second sequence (the context) may be matched.
        sequence_ids = tokenized_inputs.sequence_ids()
        context_tokens = [i for i, sid in enumerate(sequence_ids) if sid == 1]

        if context_tokens:
            first_ctx, last_ctx = context_tokens[0], context_tokens[-1]
            answer_in_window = (
                offsets[first_ctx][0] <= answer_start
                and offsets[last_ctx][1] >= answer_end
            )
            # NOTE: answers beyond the truncation window keep the (0, 0) label.
            # Recovering them needs a sliding window (stride with overflowing
            # tokens), which requires mapping with batched=True.
            if answer_in_window:
                # Last context token that starts at or before the answer.
                idx = first_ctx
                while idx <= last_ctx and offsets[idx][0] <= answer_start:
                    idx += 1
                token_start_index = idx - 1

                # First context token that ends at or after the answer.
                idx = last_ctx
                while idx >= first_ctx and offsets[idx][1] >= answer_end:
                    idx -= 1
                token_end_index = idx + 1

    # Kept as one-element lists, the shape this function has always returned.
    tokenized_inputs["start_positions"] = [token_start_index]
    tokenized_inputs["end_positions"] = [token_end_index]

    # Remove offsets to save memory
    tokenized_inputs.pop("offset_mapping", None)
    return tokenized_inputs


In [16]:
def _tokenize_split(split):
    """Apply preprocess_function row by row and drop the split's original columns."""
    # batched=False is required: preprocess_function handles one row per call.
    return split.map(
        preprocess_function,
        batched=False,
        remove_columns=split.column_names,
    )


tokenized_train = _tokenize_split(train_dataset)

tokenized_eval = _tokenize_split(eval_dataset)


Map:   0%|          | 0/14613 [00:00<?, ? examples/s]

Map:   0%|          | 0/6263 [00:00<?, ? examples/s]

## Paso 5: Cargar las métricas de SQuAD

In [17]:
# Load the "squad" metric from the evaluate library; its "f1" value is the one
# named as metric_for_best_model in the training arguments.
metric = evaluate.load("squad")

## Paso 6: Definir una función de posprocesamiento

In [18]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=None):
    """Turn start/end logits into one predicted answer string per example.

    Row ``i`` of ``examples``, ``features`` and both logit arrays must describe
    the same question (one feature per example).

    Args:
        examples: Iterable of rows exposing ``"id"`` and ``"context"``.
        features: Indexable of rows exposing ``"offset_mapping"``: one
            ``(start_char, end_char)`` pair per token, or ``None`` for tokens
            that must never be part of an answer.
        raw_predictions: ``(all_start_logits, all_end_logits)``, each indexable
            by example and then by token.
        n_best_size: Number of top-scoring start and end tokens to combine.
        max_answer_length: Optional cap on the answer length in tokens;
            ``None`` (the default) applies no cap.

    Returns:
        List of ``{"id", "prediction_text"}`` dicts, in the order of
        ``examples``. ``prediction_text`` is ``""`` when no candidate span is
        usable.
    """
    all_start_logits, all_end_logits = raw_predictions
    predictions = []

    for i, example in enumerate(examples):
        offsets = features[i]["offset_mapping"]
        start_logits = all_start_logits[i]
        end_logits = all_end_logits[i]
        context = example["context"]
        n_tokens = len(offsets)

        # Indexes of the n_best_size highest logits (ascending order).
        start_indexes = np.argsort(start_logits)[-n_best_size:]
        end_indexes = np.argsort(end_logits)[-n_best_size:]

        best_text = ""
        best_score = None

        for start_index in start_indexes:
            # Skip tokens without a usable context offset.
            if start_index >= n_tokens or offsets[start_index] is None:
                continue
            start_char = offsets[start_index][0]

            for end_index in end_indexes:
                if end_index < start_index:
                    continue
                if end_index >= n_tokens or offsets[end_index] is None:
                    continue
                if (
                    max_answer_length is not None
                    and end_index - start_index + 1 > max_answer_length
                ):
                    continue

                end_char = offsets[end_index][1]
                # An end before the start means the span mixes a context token
                # with a special/padding token; it is not a real answer.
                if end_char < start_char:
                    continue

                score = start_logits[start_index] + end_logits[end_index]
                # Strict ">" keeps the first best candidate on ties.
                if best_score is None or score > best_score:
                    best_score = score
                    best_text = context[start_char:end_char]

        predictions.append({"id": example["id"], "prediction_text": best_text})

    return predictions


## Paso 7: Definir la función compute_metrics para Trainer

In [19]:
# Cache for the evaluation set's token offsets, so the set is tokenized once
# rather than at every evaluation step.
_EVAL_FEATURES_CACHE = {}


def _get_eval_features():
    """Return ``(features, context_mask)`` for ``eval_dataset``, tokenizing once."""
    if _EVAL_FEATURES_CACHE.get("dataset") is not eval_dataset:
        # Tokenization settings must mirror those used to build the features
        # the model is evaluated on (384 tokens, padded to max length).
        encoded = tokenizer(
            list(eval_dataset["question"]),
            list(eval_dataset["context"]),
            max_length=384,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
        )
        n_rows = len(encoded["input_ids"])
        features = [
            {"offset_mapping": [tuple(pair) for pair in encoded["offset_mapping"][i]]}
            for i in range(n_rows)
        ]
        # True only for tokens of the second sequence (the context).
        context_mask = np.array(
            [[sid == 1 for sid in encoded.sequence_ids(i)] for i in range(n_rows)],
            dtype=bool,
        )
        _EVAL_FEATURES_CACHE.update(
            dataset=eval_dataset, features=features, context_mask=context_mask
        )
    return _EVAL_FEATURES_CACHE["features"], _EVAL_FEATURES_CACHE["context_mask"]


def _to_squad_answers(answers):
    """Convert a list of ``{"text", "answer_start"}`` dicts to the dict-of-lists layout."""
    if isinstance(answers, dict):
        return answers
    return {
        "text": [a["text"] for a in answers],
        "answer_start": [a["answer_start"] for a in answers],
    }


def compute_metrics(p):
    """Score the Trainer's evaluation logits with the module-level ``metric``.

    Args:
        p: ``(predictions, labels)`` pair as handed over by ``Trainer``;
            ``predictions`` holds the start logits and the end logits, with
            one row per row of ``eval_dataset``, in the same order.

    Returns:
        Whatever ``metric.compute`` returns for the decoded predictions.

    Raises:
        ValueError: If the logits do not line up with ``eval_dataset``.
    """
    predictions, labels = p
    start_logits = np.asarray(predictions[0], dtype=np.float32)
    end_logits = np.asarray(predictions[1], dtype=np.float32)

    # eval_dataset itself has no "offset_mapping" column, so the offsets are
    # rebuilt (and cached) from the raw question/context text.
    features, context_mask = _get_eval_features()
    if start_logits.shape != context_mask.shape or end_logits.shape != context_mask.shape:
        raise ValueError(
            f"Logits of shape {start_logits.shape}/{end_logits.shape} do not match "
            f"the evaluation features of shape {context_mask.shape}."
        )

    # Question, special and padding tokens can never be part of an answer:
    # push their logits far below any real score so they are not selected.
    masked_start = np.where(context_mask, start_logits, -1e9)
    masked_end = np.where(context_mask, end_logits, -1e9)

    predictions_text = postprocess_qa_predictions(
        eval_dataset, features, (masked_start, masked_end)
    )

    # NOTE(review): rows with an empty gold-answer list may not be scoreable by
    # the v1 "squad" metric; confirm whether the evaluation split has any.
    references = [
        {"id": ex["id"], "answers": _to_squad_answers(ex["answers"])}
        for ex in eval_dataset
    ]

    return metric.compute(predictions=predictions_text, references=references)

## Paso 8: Entrenamiento con el Trainer

In [20]:
# Fine-tuning configuration. Evaluation and checkpointing share the same
# 500-step cadence, which load_best_model_at_end requires.
training_args = TrainingArguments(
    output_dir="./bert-qa-spanish",        # main output directory
    overwrite_output_dir=True,             # overwrite old checkpoints

    # Training
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,         # can increase if VRAM limited
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,                     # regularization
    warmup_steps=500,                      # optional for smoother training
    fp16=True,                             # mixed precision for faster training

    # Evaluation
    eval_strategy="steps",                 # evaluate every N steps
    eval_steps=500,
    save_strategy="steps",                 # save every N steps
    save_steps=500,                        # save every 500 steps
    save_total_limit=2,                    # keep last 2 checkpoints

    # Best model
    load_best_model_at_end=True,           # load best model after training
    metric_for_best_model="f1",            # use F1 to select best
    greater_is_better=True,                # higher F1 is better

    # Logging
    logging_dir="./logs",
    logging_steps=100,

    # Hugging Face Hub: without hub_model_id the repository name is derived
    # from output_dir instead of the intended model repository.
    push_to_hub=True,
    hub_model_id="luigui/bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
)

# NOTE(review): this cell's output shows a warning on the Trainer call,
# presumably because `tokenizer=` is deprecated in recent transformers
# releases in favour of `processing_class=`. Confirm against the installed
# version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Run fine-tuning; per training_args, evaluation and checkpointing happen
# every 500 steps.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mguillermo-luigui-nieto[0m ([33mluigui[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


## Paso 9: Subir al Hub de Hugging Face

In [None]:
# Upload the final model and its model card.
# The first positional argument of Trainer.push_to_hub is the commit message,
# not a repository id: the target repository comes from
# TrainingArguments(hub_model_id=...) or, if unset, from output_dir. The
# remaining keyword arguments populate the model card; `description` is not
# one of the accepted ones.
trainer.push_to_hub(
    commit_message="BERT-base Spanish WWM cased model fine-tuned for extractive QA on news articles from Colombia and Mexico.",
    model_name="bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
    tags=["spanish", "qa", "news", "colombia", "mexico", "bert-base", "wwm", "cased"]
)