# Evaluaci√≥n de Modelos con Conjunto de Datos Propio

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


## Paso 1: Cargar las Librer√≠as

In [3]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import json
import requests

In [4]:
# URLs of the files
urls = {
    "eval_colombia_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/eval_colombia_mexico_dataset.json",
    "train_colombia_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/train_colombia_mexico_dataset.json",
    "train_colombia_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/train_colombia_dataset.json",
    "train_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/train_mexico_dataset.json",
    "eval_colombia_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/eval_colombia_dataset.json",
    "eval_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/eval_mexico_dataset.json",
}

# Dictionary to store the loaded JSON data
datasets = {}

for filename, url in urls.items():
    # Download the file
    response = requests.get(url)
    if response.status_code == 200:
        # Save locally
        with open(filename, "wb") as f:
            f.write(response.content)
        # Load JSON into Python
        datasets[filename] = response.json()
        print(f"{filename} downloaded and loaded successfully!")
    else:
        print(f"Failed to download {filename}. Status code: {response.status_code}")

eval_colombia_mexico_dataset.json downloaded and loaded successfully!
train_colombia_mexico_dataset.json downloaded and loaded successfully!
train_colombia_dataset.json downloaded and loaded successfully!
train_mexico_dataset.json downloaded and loaded successfully!
eval_colombia_dataset.json downloaded and loaded successfully!
eval_mexico_dataset.json downloaded and loaded successfully!


## Paso 2: Cargar los modelos

In [5]:
models = {
    "BERT (Base)": "mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es",
    "DistilBERT (Base)": "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es",
    "BERT (Fine-tuned News QA)": "BlueAutomata/bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
    "DistilBERT (Fine-tuned News QA)": "BlueAutomata/distill-bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
}

## Paso 3: Cargar los conjuntos de datos

In [6]:
GOLD_PATH   = "" #"../datasets/exploration_datasets/gold/"

In [7]:
datasets = {
    "Colombia and Mexico Train": GOLD_PATH + "train_colombia_mexico_dataset.json",
    "Colombia and Mexico Eval": GOLD_PATH + "eval_colombia_mexico_dataset.json",
    "Colombia Train": GOLD_PATH + "train_colombia_dataset.json",
    "Mexico Train": GOLD_PATH + "train_mexico_dataset.json",
    "Colombia Eval": GOLD_PATH + "eval_colombia_dataset.json",
    "Mexico Eval": GOLD_PATH + "eval_mexico_dataset.json"
}

## Paso 4: Cargar las m√©tricas

In [8]:
# üßÆ Metric (SQuAD v2)
metric = evaluate.load("squad_v2")

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

## Paso 5: Transformar el conjunto de datos a espa√±ol: Aplanar a formato Contexto + Preguntas y Respuestas (QA

In [9]:
def flatten_squad(dataset):
    # Handle "data" key if present
    if isinstance(dataset, dict) and "data" in dataset:
        dataset = dataset["data"]

    new_data = []
    for article in dataset:
        for para in article["paragraphs"]:
            new_data.append({
                "title": article.get("title", ""),
                "context": para["context"],
                "qas": para["qas"]
            })
    return new_data

## Paso 5: Evaluar el Modelo

In [10]:
def evaluate_model(model_name, model_path, dataset_dict):
    print(f"\nüîπ Evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

    for ds_name, ds_path in dataset_dict.items():
        print(f"\nüìò Dataset: {ds_name}")
        dataset = json.load(open(ds_path, "r", encoding="utf-8"))
        flat_eval = flatten_squad(dataset)

        predictions, references = [], []

        for ex in tqdm(flat_eval):
            for qa in ex["qas"]:
                if qa.get("is_impossible", False):
                    predictions.append({
                        "id": qa["id"],
                        "prediction_text": "",
                        "no_answer_probability": 1.0
                    })
                    references.append({
                        "id": qa["id"],
                        "answers": {"text": [], "answer_start": []}
                    })
                else:
                    try:
                        pred = qa_pipeline(
                            question=qa["question"],
                            context=ex["context"],
                            handle_impossible_answer=True
                        )
                        predictions.append({
                            "id": qa["id"],
                            "prediction_text": pred["answer"],
                            "no_answer_probability": 1.0 - pred.get("score", 0.0)
                        })
                        references.append({
                            "id": qa["id"],
                            "answers": {
                                "text": [a["text"] for a in qa["answers"]],
                                "answer_start": [a["answer_start"] for a in qa["answers"]]
                            }
                        })
                    except Exception as e:
                        print(f"‚ö†Ô∏è Skipped QA {qa.get('id', 'unknown')}: {e}")

        results = metric.compute(predictions=predictions, references=references)
        print(f"\n‚úÖ Results for {ds_name}:")
        print(results)  # üëà useful to confirm the keys
        print(f"   ‚Ä¢ Exact Match (EM): {results.get('exact_match', results.get('exact', 0)):.2f}")
        print(f"   ‚Ä¢ F1 Score: {results.get('f1', 0):.2f}")

## Ejecutar para ambos modelos

In [11]:
# üöÄ Run for both models
for model_name, model_path in models.items():
    evaluate_model(model_name, model_path, datasets)


üîπ Evaluating BERT (Base)


tokenizer_config.json:   0%|          | 0.00/135 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



üìò Dataset: Colombia and Mexico Train


  0%|          | 0/4320 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

  0%|          | 2/4320 [00:01<39:23,  1.83it/s]  You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4320/4320 [19:43<00:00,  3.65it/s]



‚úÖ Results for Colombia and Mexico Train:
{'exact': 71.62723867193513, 'f1': 81.02083539694854, 'total': 19487, 'HasAns_exact': 59.14431389935713, 'HasAns_f1': 72.6707322382573, 'HasAns_total': 13533, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 5954, 'best_exact': 71.62723867193513, 'best_exact_thresh': 0.9799644947052002, 'best_f1': 81.02083539694496, 'best_f1_thresh': 0.9966019988059998}
   ‚Ä¢ Exact Match (EM): 71.63
   ‚Ä¢ F1 Score: 81.02

üìò Dataset: Colombia and Mexico Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3846/3846 [08:23<00:00,  7.63it/s]



‚úÖ Results for Colombia and Mexico Eval:
{'exact': 72.15038314176245, 'f1': 81.25316608777382, 'total': 8352, 'HasAns_exact': 59.69502685843008, 'HasAns_f1': 72.86890368481838, 'HasAns_total': 5771, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 2581, 'best_exact': 72.15038314176245, 'best_exact_thresh': 0.9657954573631287, 'best_f1': 81.25316608777472, 'best_f1_thresh': 0.9983832836151123}
   ‚Ä¢ Exact Match (EM): 72.15
   ‚Ä¢ F1 Score: 81.25

üìò Dataset: Colombia Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3023/3023 [12:23<00:00,  4.07it/s]



‚úÖ Results for Colombia Train:
{'exact': 71.36576787807738, 'f1': 80.57700221372909, 'total': 13648, 'HasAns_exact': 58.754617414248024, 'HasAns_f1': 72.02268350532712, 'HasAns_total': 9475, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 4173, 'best_exact': 71.36576787807738, 'best_exact_thresh': 0.9769387245178223, 'best_f1': 80.57700221372727, 'best_f1_thresh': 0.9966019988059998}
   ‚Ä¢ Exact Match (EM): 71.37
   ‚Ä¢ F1 Score: 80.58

üìò Dataset: Mexico Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1297/1297 [07:23<00:00,  2.92it/s]



‚úÖ Results for Mexico Train:
{'exact': 72.23839698578524, 'f1': 82.05824510487439, 'total': 5839, 'HasAns_exact': 60.054213898472156, 'HasAns_f1': 74.18385736011867, 'HasAns_total': 4058, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1781, 'best_exact': 72.23839698578524, 'best_exact_thresh': 0.9799644947052002, 'best_f1': 82.0582451048746, 'best_f1_thresh': 0.9936692118644714}
   ‚Ä¢ Exact Match (EM): 72.24
   ‚Ä¢ F1 Score: 82.06

üìò Dataset: Colombia Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2690/2690 [05:07<00:00,  8.76it/s]



‚úÖ Results for Colombia Eval:
{'exact': 71.63617712429475, 'f1': 80.80699729850706, 'total': 5849, 'HasAns_exact': 58.996539792387544, 'HasAns_f1': 72.25410954003158, 'HasAns_total': 4046, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1803, 'best_exact': 71.63617712429475, 'best_exact_thresh': 0.958970308303833, 'best_f1': 80.80699729850701, 'best_f1_thresh': 0.9975573420524597}
   ‚Ä¢ Exact Match (EM): 71.64
   ‚Ä¢ F1 Score: 80.81

üìò Dataset: Mexico Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1156/1156 [03:14<00:00,  5.95it/s]



‚úÖ Results for Mexico Eval:
{'exact': 73.35197762684778, 'f1': 82.2957714606948, 'total': 2503, 'HasAns_exact': 61.333333333333336, 'HasAns_f1': 74.31090780644585, 'HasAns_total': 1725, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 778, 'best_exact': 73.35197762684778, 'best_exact_thresh': 0.9657954573631287, 'best_f1': 82.29577146069505, 'best_f1_thresh': 0.9983832836151123}
   ‚Ä¢ Exact Match (EM): 73.35
   ‚Ä¢ F1 Score: 82.30

üîπ Evaluating DistilBERT (Base)


tokenizer_config.json:   0%|          | 0.00/135 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



üìò Dataset: Colombia and Mexico Train


  0%|          | 3/4320 [00:01<25:56,  2.77it/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4320/4320 [19:50<00:00,  3.63it/s]



‚úÖ Results for Colombia and Mexico Train:
{'exact': 70.00051316262123, 'f1': 80.32798677764663, 'total': 19487, 'HasAns_exact': 56.801891672208676, 'HasAns_f1': 71.6730568488879, 'HasAns_total': 13533, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 5954, 'best_exact': 70.00051316262123, 'best_exact_thresh': 0.9976862668991089, 'best_f1': 80.32798677764319, 'best_f1_thresh': 0.9992315769195557}
   ‚Ä¢ Exact Match (EM): 70.00
   ‚Ä¢ F1 Score: 80.33

üìò Dataset: Colombia and Mexico Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3846/3846 [08:26<00:00,  7.60it/s]



‚úÖ Results for Colombia and Mexico Eval:
{'exact': 70.3544061302682, 'f1': 80.25769982868067, 'total': 8352, 'HasAns_exact': 57.09582394732282, 'HasAns_f1': 71.42822889778911, 'HasAns_total': 5771, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 2581, 'best_exact': 70.3544061302682, 'best_exact_thresh': 0.9826064109802246, 'best_f1': 80.25769982868151, 'best_f1_thresh': 0.9993278980255127}
   ‚Ä¢ Exact Match (EM): 70.35
   ‚Ä¢ F1 Score: 80.26

üìò Dataset: Colombia Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3023/3023 [12:22<00:00,  4.07it/s]



‚úÖ Results for Colombia Train:
{'exact': 69.74648300117234, 'f1': 79.87376558105935, 'total': 13648, 'HasAns_exact': 56.422163588390504, 'HasAns_f1': 71.00972587338237, 'HasAns_total': 9475, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 4173, 'best_exact': 69.74648300117234, 'best_exact_thresh': 0.9969624280929565, 'best_f1': 79.87376558105744, 'best_f1_thresh': 0.9991493821144104}
   ‚Ä¢ Exact Match (EM): 69.75
   ‚Ä¢ F1 Score: 79.87

üìò Dataset: Mexico Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1297/1297 [07:23<00:00,  2.92it/s]



‚úÖ Results for Mexico Train:
{'exact': 70.59427984243878, 'f1': 81.38967728818325, 'total': 5839, 'HasAns_exact': 57.68851651059635, 'HasAns_f1': 73.22186438780236, 'HasAns_total': 4058, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1781, 'best_exact': 70.59427984243878, 'best_exact_thresh': 0.9976862668991089, 'best_f1': 81.38967728818345, 'best_f1_thresh': 0.9992315769195557}
   ‚Ä¢ Exact Match (EM): 70.59
   ‚Ä¢ F1 Score: 81.39

üìò Dataset: Colombia Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2690/2690 [05:07<00:00,  8.74it/s]



‚úÖ Results for Colombia Eval:
{'exact': 69.96067703881005, 'f1': 79.97170628583623, 'total': 5849, 'HasAns_exact': 56.57439446366782, 'HasAns_f1': 71.04659171177858, 'HasAns_total': 4046, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1803, 'best_exact': 69.96067703881005, 'best_exact_thresh': 0.9826064109802246, 'best_f1': 79.97170628583636, 'best_f1_thresh': 0.9993278980255127}
   ‚Ä¢ Exact Match (EM): 69.96
   ‚Ä¢ F1 Score: 79.97

üìò Dataset: Mexico Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1156/1156 [03:15<00:00,  5.92it/s]



‚úÖ Results for Mexico Eval:
{'exact': 71.27447063523772, 'f1': 80.92600835129238, 'total': 2503, 'HasAns_exact': 58.31884057971015, 'HasAns_f1': 72.32336168306367, 'HasAns_total': 1725, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 778, 'best_exact': 71.27447063523772, 'best_exact_thresh': 0.9657282829284668, 'best_f1': 80.92600835129264, 'best_f1_thresh': 0.9985851049423218}
   ‚Ä¢ Exact Match (EM): 71.27
   ‚Ä¢ F1 Score: 80.93

üîπ Evaluating BERT (Fine-tuned News QA)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/437M [00:00<?, ?B/s]

Device set to use cuda:0



üìò Dataset: Colombia and Mexico Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4320/4320 [18:27<00:00,  3.90it/s]



‚úÖ Results for Colombia and Mexico Train:
{'exact': 98.01919228203418, 'f1': 98.74657414181387, 'total': 19487, 'HasAns_exact': 97.14771299785708, 'HasAns_f1': 98.19511492658884, 'HasAns_total': 13533, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 5954, 'best_exact': 98.01919228203418, 'best_exact_thresh': 0.9999620914459229, 'best_f1': 98.74657414181435, 'best_f1_thresh': 0.9999993443489075}
   ‚Ä¢ Exact Match (EM): 98.02
   ‚Ä¢ F1 Score: 98.75

üìò Dataset: Colombia and Mexico Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3846/3846 [07:51<00:00,  8.16it/s]



‚úÖ Results for Colombia and Mexico Eval:
{'exact': 78.73563218390805, 'f1': 83.96143722179228, 'total': 8352, 'HasAns_exact': 69.22543753249003, 'HasAns_f1': 76.78841165766924, 'HasAns_total': 5771, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 2581, 'best_exact': 78.73563218390805, 'best_exact_thresh': 0.9996415972709656, 'best_f1': 83.96143722179285, 'best_f1_thresh': 0.9999989867210388}
   ‚Ä¢ Exact Match (EM): 78.74
   ‚Ä¢ F1 Score: 83.96

üìò Dataset: Colombia Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3023/3023 [11:36<00:00,  4.34it/s]



‚úÖ Results for Colombia Train:
{'exact': 97.94109026963658, 'f1': 98.68525662852048, 'total': 13648, 'HasAns_exact': 97.03430079155673, 'HasAns_f1': 98.10621450828997, 'HasAns_total': 9475, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 4173, 'best_exact': 97.94109026963658, 'best_exact_thresh': 0.9999620914459229, 'best_f1': 98.68525662852035, 'best_f1_thresh': 0.9999993443489075}
   ‚Ä¢ Exact Match (EM): 97.94
   ‚Ä¢ F1 Score: 98.69

üìò Dataset: Mexico Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1297/1297 [06:56<00:00,  3.12it/s]



‚úÖ Results for Mexico Train:
{'exact': 98.2017468744648, 'f1': 98.88989687197798, 'total': 5839, 'HasAns_exact': 97.41251848201084, 'HasAns_f1': 98.40268798311469, 'HasAns_total': 4058, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1781, 'best_exact': 98.2017468744648, 'best_exact_thresh': 0.9987741112709045, 'best_f1': 98.88989687197802, 'best_f1_thresh': 0.9999993443489075}
   ‚Ä¢ Exact Match (EM): 98.20
   ‚Ä¢ F1 Score: 98.89

üìò Dataset: Colombia Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2690/2690 [04:50<00:00,  9.27it/s]



‚úÖ Results for Colombia Eval:
{'exact': 78.59463156095059, 'f1': 84.03536126884119, 'total': 5849, 'HasAns_exact': 69.05585763717252, 'HasAns_f1': 76.92111420203959, 'HasAns_total': 4046, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1803, 'best_exact': 78.59463156095059, 'best_exact_thresh': 0.9996415972709656, 'best_f1': 84.03536126884164, 'best_f1_thresh': 0.9999988675117493}
   ‚Ä¢ Exact Match (EM): 78.59
   ‚Ä¢ F1 Score: 84.04

üìò Dataset: Mexico Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1156/1156 [03:02<00:00,  6.33it/s]



‚úÖ Results for Mexico Eval:
{'exact': 79.06512185377547, 'f1': 83.78869181580382, 'total': 2503, 'HasAns_exact': 69.6231884057971, 'HasAns_f1': 76.47715687823593, 'HasAns_total': 1725, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 778, 'best_exact': 79.06512185377547, 'best_exact_thresh': 0.972225546836853, 'best_f1': 83.78869181580389, 'best_f1_thresh': 0.9999989867210388}
   ‚Ä¢ Exact Match (EM): 79.07
   ‚Ä¢ F1 Score: 83.79

üîπ Evaluating DistilBERT (Fine-tuned News QA)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/437M [00:00<?, ?B/s]

Device set to use cuda:0



üìò Dataset: Colombia and Mexico Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4320/4320 [18:31<00:00,  3.89it/s]



‚úÖ Results for Colombia and Mexico Train:
{'exact': 97.97300764612305, 'f1': 98.61306295799112, 'total': 19487, 'HasAns_exact': 97.08120889677086, 'HasAns_f1': 98.00286395199683, 'HasAns_total': 13533, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 5954, 'best_exact': 97.97300764612305, 'best_exact_thresh': 0.9779223203659058, 'best_f1': 98.61306295799163, 'best_f1_thresh': 0.999991774559021}
   ‚Ä¢ Exact Match (EM): 97.97
   ‚Ä¢ F1 Score: 98.61

üìò Dataset: Colombia and Mexico Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3846/3846 [07:52<00:00,  8.14it/s]



‚úÖ Results for Colombia and Mexico Eval:
{'exact': 78.83141762452107, 'f1': 83.98727832634647, 'total': 8352, 'HasAns_exact': 69.36406168774909, 'HasAns_f1': 76.82580983913458, 'HasAns_total': 5771, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 2581, 'best_exact': 78.83141762452107, 'best_exact_thresh': 0.9999402165412903, 'best_f1': 83.98727832634698, 'best_f1_thresh': 0.9999915957450867}
   ‚Ä¢ Exact Match (EM): 78.83
   ‚Ä¢ F1 Score: 83.99

üìò Dataset: Colombia Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3023/3023 [11:36<00:00,  4.34it/s]



‚úÖ Results for Colombia Train:
{'exact': 97.91910902696365, 'f1': 98.56288859047314, 'total': 13648, 'HasAns_exact': 97.00263852242745, 'HasAns_f1': 97.92995287417176, 'HasAns_total': 9475, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 4173, 'best_exact': 97.91910902696365, 'best_exact_thresh': 0.8873447179794312, 'best_f1': 98.56288859047301, 'best_f1_thresh': 0.999991774559021}
   ‚Ä¢ Exact Match (EM): 97.92
   ‚Ä¢ F1 Score: 98.56

üìò Dataset: Mexico Train


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1297/1297 [06:56<00:00,  3.12it/s]



‚úÖ Results for Mexico Train:
{'exact': 98.09898955300565, 'f1': 98.73033984922, 'total': 5839, 'HasAns_exact': 97.2646623952686, 'HasAns_f1': 98.17310359280324, 'HasAns_total': 4058, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1781, 'best_exact': 98.09898955300565, 'best_exact_thresh': 0.9779223203659058, 'best_f1': 98.73033984922002, 'best_f1_thresh': 0.9999884366989136}
   ‚Ä¢ Exact Match (EM): 98.10
   ‚Ä¢ F1 Score: 98.73

üìò Dataset: Colombia Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2690/2690 [04:49<00:00,  9.29it/s]



‚úÖ Results for Colombia Eval:
{'exact': 78.71431013848522, 'f1': 83.9610608989761, 'total': 5849, 'HasAns_exact': 69.22886801779535, 'HasAns_f1': 76.8137037068985, 'HasAns_total': 4046, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1803, 'best_exact': 78.71431013848522, 'best_exact_thresh': 0.9999402165412903, 'best_f1': 83.96106089897656, 'best_f1_thresh': 0.9999915957450867}
   ‚Ä¢ Exact Match (EM): 78.71
   ‚Ä¢ F1 Score: 83.96

üìò Dataset: Mexico Eval


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1156/1156 [03:02<00:00,  6.34it/s]



‚úÖ Results for Mexico Eval:
{'exact': 79.10507391130643, 'f1': 84.04854310169175, 'total': 2503, 'HasAns_exact': 69.68115942028986, 'HasAns_f1': 76.85420486001995, 'HasAns_total': 1725, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 778, 'best_exact': 79.10507391130643, 'best_exact_thresh': 0.9972954988479614, 'best_f1': 84.04854310169178, 'best_f1_thresh': 0.9999886155128479}
   ‚Ä¢ Exact Match (EM): 79.11
   ‚Ä¢ F1 Score: 84.05
