# Evaluación de Modelos con Conjunto de Datos Propio

## Paso 1: Cargar las Librerías

In [18]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import json

## Paso 2: Cargar los modelos

In [19]:
models = {
    "BERT (Base)": "mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es",
    "DistilBERT (Base)": "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es",
    "BERT (Fine-tuned News QA)": "BlueAutomata/bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
    "DistilBERT (Fine-tuned News QA)": "BlueAutomata/distill-bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
}

## Paso 3: Cargar los conjuntos de datos

In [20]:
GOLD_PATH   = "../datasets/exploration_datasets/gold/"

In [21]:
datasets = {
    "Colombia and Mexico Train": GOLD_PATH + "train_colombia_mexico_dataset.json",
    "Colombia and Mexico Eval": GOLD_PATH + "eval_colombia_mexico_dataset.json",
    "Colombia Train": GOLD_PATH + "train_colombia_dataset.json",
    "Mexico Train": GOLD_PATH + "train_mexico_dataset.json",
    "Colombia Eval": GOLD_PATH + "eval_colombia_dataset.json",
    "Mexico Eval": GOLD_PATH + "eval_mexico_dataset.json"
}

## Paso 4: Cargar las métricas

In [22]:
# 🧮 Metric (SQuAD v2)
metric = evaluate.load("squad_v2")

## Paso 5: Transformar el conjunto de datos a español: Aplanar a formato Contexto + Preguntas y Respuestas (QA

In [23]:
def flatten_squad(dataset):
    # Handle "data" key if present
    if isinstance(dataset, dict) and "data" in dataset:
        dataset = dataset["data"]

    new_data = []
    for article in dataset:
        for para in article["paragraphs"]:
            new_data.append({
                "title": article.get("title", ""),
                "context": para["context"],
                "qas": para["qas"]
            })
    return new_data

## Paso 5: Evaluar el Modelo

In [24]:
def evaluate_model(model_name, model_path, dataset_dict):
    print(f"\n🔹 Evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

    for ds_name, ds_path in dataset_dict.items():
        print(f"\n📘 Dataset: {ds_name}")
        dataset = json.load(open(ds_path, "r", encoding="utf-8"))
        flat_eval = flatten_squad(dataset)

        predictions, references = [], []

        for ex in tqdm(flat_eval):
            for qa in ex["qas"]:
                if qa.get("is_impossible", False):
                    predictions.append({
                        "id": qa["id"],
                        "prediction_text": "",
                        "no_answer_probability": 1.0
                    })
                    references.append({
                        "id": qa["id"],
                        "answers": {"text": [], "answer_start": []}
                    })
                else:
                    try:
                        pred = qa_pipeline(
                            question=qa["question"],
                            context=ex["context"],
                            handle_impossible_answer=True
                        )
                        predictions.append({
                            "id": qa["id"],
                            "prediction_text": pred["answer"],
                            "no_answer_probability": 1.0 - pred.get("score", 0.0)
                        })
                        references.append({
                            "id": qa["id"],
                            "answers": {
                                "text": [a["text"] for a in qa["answers"]],
                                "answer_start": [a["answer_start"] for a in qa["answers"]]
                            }
                        })
                    except Exception as e:
                        print(f"⚠️ Skipped QA {qa.get('id', 'unknown')}: {e}")

        results = metric.compute(predictions=predictions, references=references)
        print(f"\n✅ Results for {ds_name}:")
        print(results)  # 👈 useful to confirm the keys
        print(f"   • Exact Match (EM): {results.get('exact_match', results.get('exact', 0)):.2f}")
        print(f"   • F1 Score: {results.get('f1', 0):.2f}")

## Ejecutar para ambos modelos

In [25]:
# 🚀 Run for both models
for model_name, model_path in models.items():
    evaluate_model(model_name, model_path, datasets)


🔹 Evaluating BERT (Base)


Some weights of the model checkpoint at mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



📘 Dataset: Colombia and Mexico Train


100%|██████████| 3235/3235 [2:19:24<00:00,  2.59s/it]  



✅ Results for Colombia and Mexico Train:
{'exact': 71.22425237801957, 'f1': 81.26662254248514, 'total': 14613, 'HasAns_exact': 58.60813072152771, 'HasAns_f1': 73.05336698625419, 'HasAns_total': 10159, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 4454, 'best_exact': 71.22425237801957, 'best_exact_thresh': 0.9643369317054749, 'best_f1': 81.26662254248333, 'best_f1_thresh': 0.9983832836151123}
   • Exact Match (EM): 71.22
   • F1 Score: 81.27

📘 Dataset: Colombia and Mexico Eval


100%|██████████| 2845/2845 [1:01:13<00:00,  1.29s/it]



✅ Results for Colombia and Mexico Eval:
{'exact': 69.34376496886476, 'f1': 80.11290507369172, 'total': 6263, 'HasAns_exact': 56.09421449805625, 'HasAns_f1': 71.51775085216778, 'HasAns_total': 4373, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1890, 'best_exact': 69.34376496886476, 'best_exact_thresh': 0.9807323813438416, 'best_f1': 80.11290507369165, 'best_f1_thresh': 0.994681179523468}
   • Exact Match (EM): 69.34
   • F1 Score: 80.11

📘 Dataset: Colombia Train


100%|██████████| 2564/2564 [1:30:31<00:00,  2.12s/it]  



✅ Results for Colombia Train:
{'exact': 71.48709154996666, 'f1': 81.22203178633036, 'total': 10497, 'HasAns_exact': 58.75137816979052, 'HasAns_f1': 72.83457382319534, 'HasAns_total': 7256, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 3241, 'best_exact': 71.48709154996666, 'best_exact_thresh': 0.9643369317054749, 'best_f1': 81.22203178633085, 'best_f1_thresh': 0.9956186413764954}
   • Exact Match (EM): 71.49
   • F1 Score: 81.22

📘 Dataset: Mexico Train


100%|██████████| 1006/1006 [49:56<00:00,  2.98s/it] 



✅ Results for Mexico Train:
{'exact': 70.5539358600583, 'f1': 81.3803419709045, 'total': 4116, 'HasAns_exact': 58.25008611780916, 'HasAns_f1': 73.60023684197158, 'HasAns_total': 2903, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1213, 'best_exact': 70.5539358600583, 'best_exact_thresh': 0.9643369317054749, 'best_f1': 81.38034197090413, 'best_f1_thresh': 0.9983832836151123}
   • Exact Match (EM): 70.55
   • F1 Score: 81.38

📘 Dataset: Colombia Eval


100%|██████████| 2211/2211 [38:56<00:00,  1.06s/it]  



✅ Results for Colombia Eval:
{'exact': 69.6599244276506, 'f1': 80.0648323705853, 'total': 4499, 'HasAns_exact': 56.68041891463027, 'HasAns_f1': 71.53655374016638, 'HasAns_total': 3151, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1348, 'best_exact': 69.6599244276506, 'best_exact_thresh': 0.968170166015625, 'best_f1': 80.06483237058498, 'best_f1_thresh': 0.9943676590919495}
   • Exact Match (EM): 69.66
   • F1 Score: 80.06

📘 Dataset: Mexico Eval


100%|██████████| 880/880 [20:37<00:00,  1.41s/it]  



✅ Results for Mexico Eval:
{'exact': 68.5374149659864, 'f1': 80.23551226829183, 'total': 1764, 'HasAns_exact': 54.58265139116203, 'HasAns_f1': 71.46926648221498, 'HasAns_total': 1222, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 542, 'best_exact': 68.5374149659864, 'best_exact_thresh': 0.9807323813438416, 'best_f1': 80.23551226829194, 'best_f1_thresh': 0.994681179523468}
   • Exact Match (EM): 68.54
   • F1 Score: 80.24

🔹 Evaluating DistilBERT (Base)


Some weights of the model checkpoint at mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



📘 Dataset: Colombia and Mexico Train


100%|██████████| 3235/3235 [2:20:04<00:00,  2.60s/it]  



✅ Results for Colombia and Mexico Train:
{'exact': 69.63662492301376, 'f1': 80.65339573836589, 'total': 14613, 'HasAns_exact': 56.32444138202579, 'HasAns_f1': 72.1712837803682, 'HasAns_total': 10159, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 4454, 'best_exact': 69.63662492301376, 'best_exact_thresh': 0.9932949542999268, 'best_f1': 80.65339573836398, 'best_f1_thresh': 0.9993553161621094}
   • Exact Match (EM): 69.64
   • F1 Score: 80.65

📘 Dataset: Colombia and Mexico Eval


100%|██████████| 2845/2845 [1:04:16<00:00,  1.36s/it]



✅ Results for Colombia and Mexico Eval:
{'exact': 68.51349193677152, 'f1': 80.1113882073937, 'total': 6263, 'HasAns_exact': 54.90509947404528, 'HasAns_f1': 71.51557839993242, 'HasAns_total': 4373, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1890, 'best_exact': 68.51349193677152, 'best_exact_thresh': 0.9864137172698975, 'best_f1': 80.11138820739401, 'best_f1_thresh': 0.9993077516555786}
   • Exact Match (EM): 68.51
   • F1 Score: 80.11

📘 Dataset: Colombia Train


100%|██████████| 2564/2564 [1:33:15<00:00,  2.18s/it]  



✅ Results for Colombia Train:
{'exact': 70.02000571591883, 'f1': 80.49021422410055, 'total': 10497, 'HasAns_exact': 56.628996692392505, 'HasAns_f1': 71.77587909459476, 'HasAns_total': 7256, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 3241, 'best_exact': 70.02000571591883, 'best_exact_thresh': 0.9932949542999268, 'best_f1': 80.49021422410104, 'best_f1_thresh': 0.9992630481719971}
   • Exact Match (EM): 70.02
   • F1 Score: 80.49

📘 Dataset: Mexico Train


100%|██████████| 1006/1006 [50:13<00:00,  3.00s/it] 



✅ Results for Mexico Train:
{'exact': 68.65889212827989, 'f1': 81.06955617453178, 'total': 4116, 'HasAns_exact': 55.56321047192559, 'HasAns_f1': 73.15959118648775, 'HasAns_total': 2903, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1213, 'best_exact': 68.65889212827989, 'best_exact_thresh': 0.9733725786209106, 'best_f1': 81.06955617453147, 'best_f1_thresh': 0.9993553161621094}
   • Exact Match (EM): 68.66
   • F1 Score: 81.07

📘 Dataset: Colombia Eval


100%|██████████| 2211/2211 [39:32<00:00,  1.07s/it]  



✅ Results for Colombia Eval:
{'exact': 68.90420093354079, 'f1': 80.20775625268904, 'total': 4499, 'HasAns_exact': 55.60139638210092, 'HasAns_f1': 71.74062055882223, 'HasAns_total': 3151, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1348, 'best_exact': 68.90420093354079, 'best_exact_thresh': 0.971991777420044, 'best_f1': 80.20775625268875, 'best_f1_thresh': 0.9993077516555786}
   • Exact Match (EM): 68.90
   • F1 Score: 80.21

📘 Dataset: Mexico Eval


100%|██████████| 880/880 [21:25<00:00,  1.46s/it]  



✅ Results for Mexico Eval:
{'exact': 67.51700680272108, 'f1': 79.86560598756057, 'total': 1764, 'HasAns_exact': 53.10965630114566, 'HasAns_f1': 70.93529374963727, 'HasAns_total': 1222, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 542, 'best_exact': 67.51700680272108, 'best_exact_thresh': 0.9864137172698975, 'best_f1': 79.86560598756068, 'best_f1_thresh': 0.9991227984428406}
   • Exact Match (EM): 67.52
   • F1 Score: 79.87

🔹 Evaluating BERT (Fine-tuned News QA)


Device set to use cpu



📘 Dataset: Colombia and Mexico Train


100%|██████████| 3235/3235 [2:07:58<00:00,  2.37s/it]  



✅ Results for Colombia and Mexico Train:
{'exact': 88.35283651543146, 'f1': 92.23009713323718, 'total': 14613, 'HasAns_exact': 83.24638251796436, 'HasAns_f1': 88.82354655064495, 'HasAns_total': 10159, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 4454, 'best_exact': 88.35283651543146, 'best_exact_thresh': 0.953493058681488, 'best_f1': 92.23009713323636, 'best_f1_thresh': 0.9999802112579346}
   • Exact Match (EM): 88.35
   • F1 Score: 92.23

📘 Dataset: Colombia and Mexico Eval


100%|██████████| 2845/2845 [53:02<00:00,  1.12s/it]  



✅ Results for Colombia and Mexico Eval:
{'exact': 78.42886795465432, 'f1': 84.37425737867055, 'total': 6263, 'HasAns_exact': 69.10587697233021, 'HasAns_f1': 77.62084929398881, 'HasAns_total': 4373, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1890, 'best_exact': 78.42886795465432, 'best_exact_thresh': 0.9982497692108154, 'best_f1': 84.37425737867095, 'best_f1_thresh': 0.9999784231185913}
   • Exact Match (EM): 78.43
   • F1 Score: 84.37

📘 Dataset: Colombia Train


100%|██████████| 2564/2564 [1:23:17<00:00,  1.95s/it]  



✅ Results for Colombia Train:
{'exact': 88.39668476707631, 'f1': 92.220301347696, 'total': 10497, 'HasAns_exact': 83.21389195148842, 'HasAns_f1': 88.74538357866118, 'HasAns_total': 7256, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 3241, 'best_exact': 88.39668476707631, 'best_exact_thresh': 0.953493058681488, 'best_f1': 92.22030134769501, 'best_f1_thresh': 0.9999802112579346}
   • Exact Match (EM): 88.40
   • F1 Score: 92.22

📘 Dataset: Mexico Train


100%|██████████| 1006/1006 [45:16<00:00,  2.70s/it] 



✅ Results for Mexico Train:
{'exact': 88.24101068999028, 'f1': 92.25507924228305, 'total': 4116, 'HasAns_exact': 83.3275921460558, 'HasAns_f1': 89.01891359326116, 'HasAns_total': 2903, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1213, 'best_exact': 88.24101068999028, 'best_exact_thresh': 0.9047306776046753, 'best_f1': 92.25507924228287, 'best_f1_thresh': 0.9999642968177795}
   • Exact Match (EM): 88.24
   • F1 Score: 92.26

📘 Dataset: Colombia Eval


100%|██████████| 2211/2211 [35:52<00:00,  1.03it/s]  



✅ Results for Colombia Eval:
{'exact': 78.01733718604135, 'f1': 83.97446106885951, 'total': 4499, 'HasAns_exact': 68.61313868613139, 'HasAns_f1': 77.11872432522999, 'HasAns_total': 3151, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1348, 'best_exact': 78.01733718604135, 'best_exact_thresh': 0.9982497692108154, 'best_f1': 83.97446106885934, 'best_f1_thresh': 0.9999523162841797}
   • Exact Match (EM): 78.02
   • F1 Score: 83.97

📘 Dataset: Mexico Eval


100%|██████████| 880/880 [21:45<00:00,  1.48s/it]



✅ Results for Mexico Eval:
{'exact': 79.47845804988663, 'f1': 85.3939192822075, 'total': 1764, 'HasAns_exact': 70.37643207855974, 'HasAns_f1': 78.91560852194272, 'HasAns_total': 1222, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 542, 'best_exact': 79.47845804988663, 'best_exact_thresh': 0.9959526062011719, 'best_f1': 85.39391928220763, 'best_f1_thresh': 0.9999784231185913}
   • Exact Match (EM): 79.48
   • F1 Score: 85.39

🔹 Evaluating DistilBERT (Fine-tuned News QA)


Device set to use cpu



📘 Dataset: Colombia and Mexico Train


100%|██████████| 3235/3235 [2:12:25<00:00,  2.46s/it]  



✅ Results for Colombia and Mexico Train:
{'exact': 88.31177718469856, 'f1': 92.15736917269533, 'total': 14613, 'HasAns_exact': 83.18732158677035, 'HasAns_f1': 88.7189325446012, 'HasAns_total': 10159, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 4454, 'best_exact': 88.31177718469856, 'best_exact_thresh': 0.9998776316642761, 'best_f1': 92.15736917269462, 'best_f1_thresh': 0.9999001026153564}
   • Exact Match (EM): 88.31
   • F1 Score: 92.16

📘 Dataset: Colombia and Mexico Eval


100%|██████████| 2845/2845 [53:08<00:00,  1.12s/it]  



✅ Results for Colombia and Mexico Eval:
{'exact': 78.62046942359892, 'f1': 84.59307474959367, 'total': 6263, 'HasAns_exact': 69.38028813171735, 'HasAns_f1': 77.93423900221907, 'HasAns_total': 4373, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1890, 'best_exact': 78.62046942359892, 'best_exact_thresh': 0.9985507130622864, 'best_f1': 84.59307474959402, 'best_f1_thresh': 0.9998937249183655}
   • Exact Match (EM): 78.62
   • F1 Score: 84.59

📘 Dataset: Colombia Train


100%|██████████| 2564/2564 [1:19:40<00:00,  1.86s/it]



✅ Results for Colombia Train:
{'exact': 88.3014194531771, 'f1': 92.08420492157332, 'total': 10497, 'HasAns_exact': 83.0760749724366, 'HasAns_f1': 88.54849766562249, 'HasAns_total': 7256, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 3241, 'best_exact': 88.3014194531771, 'best_exact_thresh': 0.9998776316642761, 'best_f1': 92.0842049215725, 'best_f1_thresh': 0.9999001026153564}
   • Exact Match (EM): 88.30
   • F1 Score: 92.08

📘 Dataset: Mexico Train


100%|██████████| 1006/1006 [44:02<00:00,  2.63s/it] 



✅ Results for Mexico Train:
{'exact': 88.33819241982508, 'f1': 92.34395934374338, 'total': 4116, 'HasAns_exact': 83.4653806407165, 'HasAns_f1': 89.14493167717815, 'HasAns_total': 2903, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1213, 'best_exact': 88.33819241982508, 'best_exact_thresh': 0.9492854475975037, 'best_f1': 92.34395934374325, 'best_f1_thresh': 0.9998850226402283}
   • Exact Match (EM): 88.34
   • F1 Score: 92.34

📘 Dataset: Colombia Eval


100%|██████████| 2211/2211 [35:06<00:00,  1.05it/s]  



✅ Results for Colombia Eval:
{'exact': 78.0395643476328, 'f1': 84.12246541797725, 'total': 4499, 'HasAns_exact': 68.64487464297049, 'HasAns_f1': 77.33004503823568, 'HasAns_total': 3151, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1348, 'best_exact': 78.0395643476328, 'best_exact_thresh': 0.9985507130622864, 'best_f1': 84.12246541797711, 'best_f1_thresh': 0.9998937249183655}
   • Exact Match (EM): 78.04
   • F1 Score: 84.12

📘 Dataset: Mexico Eval


100%|██████████| 880/880 [19:13<00:00,  1.31s/it]



✅ Results for Mexico Eval:
{'exact': 80.10204081632654, 'f1': 85.79334197348315, 'total': 1764, 'HasAns_exact': 71.27659574468085, 'HasAns_f1': 79.49218923177104, 'HasAns_total': 1222, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 542, 'best_exact': 80.10204081632654, 'best_exact_thresh': 0.9905273914337158, 'best_f1': 85.79334197348324, 'best_f1_thresh': 0.9998733401298523}
   • Exact Match (EM): 80.10
   • F1 Score: 85.79
