# Entrenamiento con Simple Transformers del modelo Distill BERT

## Paso 1: Cargar datos en Google Colab

## Paso 2: Instalar librerías

In [None]:
!pip install transformers evaluate torch --quiet
!pip install simpletransformers transformers datasets huggingface_hub scikit-learn
!pip install evaluate --quiet

## Paso 3: Cargar librerías

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import evaluate
import json
import os
import shutil
import requests


from tqdm import tqdm
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
from sklearn.model_selection import train_test_split
from google.colab import files

In [None]:
import logging

# Raise the threshold of the "transformers.modeling_utils" logger to ERROR so
# that lower-severity records emitted through it are dropped.
_modeling_utils_logger = logging.getLogger("transformers.modeling_utils")
_modeling_utils_logger.setLevel(logging.ERROR)

In [None]:
import torch

# Report whether CUDA is usable and, if so, the name of device 0.
_cuda_ready = torch.cuda.is_available()
print("GPU available:", _cuda_ready)
if _cuda_ready:
    print("GPU name:", torch.cuda.get_device_name(0))

## Paso 4: Cargar datos

In [None]:
# Raw GitHub URLs of the gold train/eval splits, keyed by the local filename
# each one is saved under.
urls = {
    "eval_colombia_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/eval_colombia_mexico_dataset.json",
    "train_colombia_mexico_dataset.json": "https://github.com/BlueAutomata/tesis-optimizacion-de-modelos-de-question-answering/raw/refs/heads/master/src/datasets/exploration_datasets/gold/train_colombia_mexico_dataset.json"
}

# Parsed JSON payloads, keyed by the same filenames
datasets = {}

for filename, url in urls.items():
    # Download the file. An explicit timeout keeps the cell from hanging
    # forever on a stalled connection (requests applies none by default).
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        # Save the raw bytes locally so later cells can re-read them from disk
        with open(filename, "wb") as f:
            f.write(response.content)
        # Also keep the parsed JSON in memory
        datasets[filename] = response.json()
        print(f"{filename} downloaded and loaded successfully!")
    else:
        # Best effort: report the failure and continue with the next file
        print(f"Failed to download {filename}. Status code: {response.status_code}")

In [None]:
# Read the downloaded training split back from disk.
with open("train_colombia_mexico_dataset.json", "r", encoding="utf-8") as f:
    train_dataset = json.load(f)

# The file is a dict wrapping the records under "data"; len() of the dict
# itself would only count its top-level keys, so count the wrapped entries.
print(f"✅ Loaded {len(train_dataset['data'])} records successfully!")

In [None]:
# Read the downloaded evaluation split back from disk.
with open("eval_colombia_mexico_dataset.json", "r", encoding="utf-8") as f:
    eval_dataset = json.load(f)

# The file is a dict wrapping the records under "data"; len() of the dict
# itself would only count its top-level keys, so count the wrapped entries.
print(f"✅ Loaded {len(eval_dataset['data'])} records successfully!")

In [None]:
# Keep only the value stored under the top-level "data" key (the articles that
# flatten_squad iterates). NOTE: rebinding the same name means this cell should
# be run once per load; a second run indexes the already-unwrapped value.
train_dataset = train_dataset["data"]

In [None]:
# Keep only the value stored under the top-level "data" key (the articles that
# flatten_squad iterates). NOTE: rebinding the same name means this cell should
# be run once per load; a second run indexes the already-unwrapped value.
eval_dataset = eval_dataset["data"]

In [None]:
# 🔧 1️⃣ Flatten the SQuAD-style dataset so each row has 'context' and 'qas'
def flatten_squad(dataset):
    """Return one ``{"context", "qas"}`` dict per paragraph, in input order.

    ``dataset`` is an iterable of articles; each article holds a
    "paragraphs" list whose entries carry "context" and "qas".
    """
    return [
        {"context": paragraph["context"], "qas": paragraph["qas"]}
        for entry in dataset
        for paragraph in entry["paragraphs"]
    ]

In [None]:
# One {"context", "qas"} row per training paragraph
train_data = flatten_squad(train_dataset)

In [None]:
# One {"context", "qas"} row per evaluation paragraph
eval_data = flatten_squad(eval_dataset)

In [None]:
# These counts are flattened paragraphs (context + its qas list), not
# individual questions.
print(f"✅ Training samples: {len(train_data)}")
print(f"✅ Eval samples: {len(eval_data)}")

## Paso 5: Definir hiperparámetros

In [None]:
# Hyperparameters for Simple Transformers' question-answering model.
# Every option is assigned exactly once so there is a single place to edit it.
model_args = QuestionAnsweringArgs()

# Training behavior
model_args.train_batch_size = 8
model_args.eval_batch_size = 8
model_args.num_train_epochs = 2
model_args.learning_rate = 5e-6
model_args.gradient_accumulation_steps = 1
model_args.warmup_ratio = 0.1
model_args.manual_seed = 42

# Evaluation during training
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 500
model_args.evaluate_during_training_verbose = True

# Checkpointing / output locations
model_args.overwrite_output_dir = True
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False
model_args.save_steps = -1
model_args.best_model_dir = "./outputs/best_model/"
model_args.output_dir = "./outputs/"

# 🔹 Tokenization / sliding window parameters
model_args.max_seq_length = 384    # maximum total input sequence length after tokenization
model_args.doc_stride = 128        # stride taken between chunks when a long context is split
model_args.max_query_length = 64   # maximum length of the question
model_args.max_answer_length = 30  # maximum length of a predicted answer

# Logging
model_args.logging_steps = 100

# Resource handling
model_args.use_multiprocessing = False  # safer for notebooks
model_args.fp16 = torch.cuda.is_available()  # use mixed precision if CUDA available

## Paso 6: Cargar el modelo

In [None]:
# Baseline copy of the starting checkpoint. This notebook only evaluates it
# (it is never passed to train_model), so it serves as the "before" reference.
model_original = QuestionAnsweringModel(
    model_type="bert",
    model_name="mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es",  # distilled Spanish BERT; per the repo name, already fine-tuned on SQuAD2-es
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

In [None]:
# Same starting checkpoint as the baseline; this is the instance that gets
# fine-tuned by train_model.
model = QuestionAnsweringModel(
    model_type="bert",
    model_name="mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es",  # distilled Spanish BERT; per the repo name, already fine-tuned on SQuAD2-es
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

## Paso 7: Entrenamiento del modelo

In [None]:
# Fine-tune on the flattened training rows. eval_data is passed because
# model_args enables evaluate_during_training.
model.train_model(train_data, eval_data=eval_data)

## Paso 8: Evaluación de los resultados

In [None]:
# Baseline: score the untrained-in-this-notebook checkpoint on the eval rows.
# result_original is later read for its 'correct'/'similar'/'incorrect' counts.
result_original, texts_original = model_original.eval_model(eval_data)
print("📊 Evaluation results:")
print(result_original)

In [None]:
# Fine-tuned model: score it on the same eval rows for a like-for-like comparison.
# result is later read for its 'correct'/'similar'/'incorrect' counts.
result, texts = model.eval_model(eval_data)
print("📊 Evaluation results:")
print(result)

In [None]:
# Summary metrics for the baseline model, derived from eval_model()'s counts.
correct = result_original['correct']
similar = result_original['similar']
incorrect = result_original['incorrect']
total = correct + similar + incorrect

# 1️⃣ Exact Match Accuracy (0.0 when nothing was evaluated)
exact_match = correct / total if total else 0.0

# 2️⃣ Weighted Accuracy (partial credit for 'similar')
weighted_accuracy = (correct + 0.5 * similar) / total if total else 0.0

# 3️⃣ F1 Score approximation
TP = correct + 0.5 * similar
FN = 0.5 * similar + incorrect
# Assuming FP = 0 (as Simple Transformers counts predictions, not negatives),
# precision is 1 whenever any credit was earned. The original `TP / TP` raised
# ZeroDivisionError when TP was 0.
precision = 1.0 if TP > 0 else 0.0
# TP + FN equals `total`, so recall coincides with the weighted accuracy.
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
f1_score = (
    2 * (precision * recall) / (precision + recall)
    if (precision + recall) > 0
    else 0.0
)

# Print results
print(f"Exact Match (EM): {exact_match:.4f} → {exact_match*100:.2f}%")
print(f"Weighted Accuracy: {weighted_accuracy:.4f} → {weighted_accuracy*100:.2f}%")
print(f"F1 Score: {f1_score:.4f} → {f1_score*100:.2f}%")

In [None]:
# Summary metrics for the fine-tuned model, derived from eval_model()'s counts.
correct = result['correct']
similar = result['similar']
incorrect = result['incorrect']
total = correct + similar + incorrect

# 1️⃣ Exact Match Accuracy (0.0 when nothing was evaluated)
exact_match = correct / total if total else 0.0

# 2️⃣ Weighted Accuracy (partial credit for 'similar')
weighted_accuracy = (correct + 0.5 * similar) / total if total else 0.0

# 3️⃣ F1 Score approximation
TP = correct + 0.5 * similar
FN = 0.5 * similar + incorrect
# Assuming FP = 0 (as Simple Transformers counts predictions, not negatives),
# precision is 1 whenever any credit was earned. The original `TP / TP` raised
# ZeroDivisionError when TP was 0.
precision = 1.0 if TP > 0 else 0.0
# TP + FN equals `total`, so recall coincides with the weighted accuracy.
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
f1_score = (
    2 * (precision * recall) / (precision + recall)
    if (precision + recall) > 0
    else 0.0
)

# Print results
print(f"Exact Match (EM): {exact_match:.4f} → {exact_match*100:.2f}%")
print(f"Weighted Accuracy: {weighted_accuracy:.4f} → {weighted_accuracy*100:.2f}%")
print(f"F1 Score: {f1_score:.4f} → {f1_score*100:.2f}%")

## Paso 9: Guardar los resultados

In [None]:
# Folder to save
local_path = "./QA_model_bert"
os.makedirs(local_path, exist_ok=True)

# Save the Hugging Face model & tokenizer directly
model.model.save_pretrained(local_path)       # Saves weights + config
model.tokenizer.save_pretrained(local_path)   # Saves vocab + tokenizer config

# Check files
!ls -l ./QA_model

In [None]:
shutil.make_archive("QA_model_bert", 'zip', local_path)
print("✅ Zipped model")
!ls -lh QA_model.zip

In [None]:
# Send the zipped model from the Colab runtime to the browser as a download
files.download("QA_model_bert.zip")

In [None]:
# Mount Google Drive at /content/drive; later cells write under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Authenticate with the Hugging Face Hub; needed before the repo is created
# and the model is pushed further down.
from huggingface_hub import login

# This will open a prompt for your Hugging Face token
login()

In [None]:
# Print the account details for the stored token to confirm the login worked
from huggingface_hub import whoami
print(whoami())

In [None]:

from huggingface_hub import login, create_repo
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# 1️⃣ Define your paths and repo name
model_dir = "./outputs"
repo_id = "BlueAutomata/distill-bert-base-spanish-wwm-cased-news-qa-colombia-mexico"

# 2️⃣ Create the repo (won't fail if it already exists)
create_repo(repo_id, private=False, exist_ok=True)

# 3️⃣ Load your SimpleTransformers model as a standard HF model
hf_model = AutoModelForQuestionAnswering.from_pretrained(model_dir)
hf_tokenizer = AutoTokenizer.from_pretrained(model_dir)

# 4️⃣ Push to the Hugging Face Hub
# `description` is not a documented push_to_hub parameter; the documented way
# to attach free text to the upload is `commit_description`.
hf_model.push_to_hub(
    repo_id,
    commit_description="Distill-BERT-base Spanish WWM cased model fine-tuned for extractive QA on news articles from Colombia and Mexico.",
    tags=["spanish", "qa", "news", "colombia", "mexico", "bert-base", "wwm", "cased"]
)

hf_tokenizer.push_to_hub(repo_id)

In [None]:

# Same Drive folder the outputs archive is moved to later in this notebook
# (the original path was misspelled "Tahesis_QA_Optimization").
drive_model_dir = "/content/drive/MyDrive/Thesis_QA_Optimization/Model"
# Create the folder first: shutil.move only moves *into* an existing directory,
# otherwise it renames the file to that path or fails.
os.makedirs(drive_model_dir, exist_ok=True)
shutil.move("QA_model_bert.zip", drive_model_dir)

In [None]:
# Path to the folder containing the saved model
model_path = "./QA_model_bert"  # change if different

# Reload the model. Pick the device the same way the other model-loading cells
# do, so this cell also works on a CPU-only runtime.
my_model = QuestionAnsweringModel(
    "bert",
    model_path,
    use_cuda=torch.cuda.is_available()
)

In [None]:
# Single hand-written example used to smoke-test the reloaded model
context = "Ciudad de México. El capitán de la Secretaría de Marina, Abraham Jeremías Pérez Ramírez, fue hallado muerto en Tamaulipas."
question = "¿Quién fue hallado muerto en Tamaulipas?"

# One question entry in the same shape as the training data
_demo_qa = {
    "id": "0",
    "question": question,
    "answers": [{"text": " ", "answer_start": 0}],
    "is_impossible": False
}

# SimpleTransformers expects a list of {"context", "qas"} rows
to_predict = [{"context": context, "qas": [_demo_qa]}]

# Run prediction and show whatever predict() returns
answers = my_model.predict(to_predict)
print(answers)

In [None]:
# Load your fine-tuned model from the Hub
qa = pipeline(
    "question-answering",
    model="BlueAutomata/bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
    tokenizer="BlueAutomata/bert-base-spanish-wwm-cased-news-qa-colombia-mexico"
)

# Provide Spanish context
contexto = """
El presidente Gustavo Petro anunció nuevas medidas para impulsar el uso de energías renovables en Colombia,
especialmente en la región del Caribe, donde los proyectos solares y eólicos han ganado protagonismo.
El objetivo del gobierno es reducir las emisiones de carbono en un 30% para el año 2030.
"""

# Ask questions in Spanish
preguntas = [
    "¿Quién anunció nuevas medidas para energías renovables?",
    "¿En qué región se impulsarán los proyectos solares y eólicos?",
    "¿Cuál es el objetivo del gobierno para 2030?"
]

# Evaluate each question
for pregunta in preguntas:
    respuesta = qa(question=pregunta, context=contexto)
    print(f"❓ {pregunta}\n💬 {respuesta['answer']}\n")

In [None]:
def flatten_squad(dataset):
    """Return one ``{"title", "context", "qas"}`` dict per paragraph.

    ``dataset`` may be the article list itself or a dict wrapping that list
    under "data". A missing article title becomes an empty string.
    """
    articles = dataset
    if isinstance(dataset, dict) and "data" in dataset:
        # Unwrap the SQuAD-style top-level container
        articles = dataset["data"]

    return [
        {
            "title": article.get("title", ""),
            "context": paragraph["context"],
            "qas": paragraph["qas"],
        }
        for article in articles
        for paragraph in article["paragraphs"]
    ]

In [None]:
# Flattened eval rows (with titles) consumed by the pipeline-based evaluation
# loops; flatten_squad here accepts either the raw dict or the unwrapped list.
flat_eval = flatten_squad(eval_dataset)

In [None]:
from transformers import pipeline
import evaluate

# Fine-tuned model from the Hub. This notebook pushes its model to the
# "distill-bert-..." repo; the original cell loaded the "bert-base-..." repo,
# so the comparison against the distilled baseline used a different model.
qa_pipeline = pipeline(
    "question-answering",
    model="BlueAutomata/distill-bert-base-spanish-wwm-cased-news-qa-colombia-mexico",
    tokenizer="BlueAutomata/distill-bert-base-spanish-wwm-cased-news-qa-colombia-mexico"
)

# SQuAD v1 metric (reports 'exact_match' and 'f1')
metric = evaluate.load("squad")

In [None]:
from transformers import pipeline
import evaluate

# Baseline pipeline: the same Hub checkpoint the fine-tuning started from,
# used as the "before" side of the comparison.
qa_pipeline_original = pipeline(
    "question-answering",
    model="mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es",
    tokenizer="mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
)


In [None]:
# Baseline predictions/references in the format the SQuAD v1 metric expects.
predictions_original = []
references_original = []

for ex in flat_eval:
    context = ex["context"]
    # The loop variable is deliberately not called `qa`: that name holds the
    # demo pipeline created earlier and would be overwritten by a dict.
    for qa_item in ex["qas"]:
        # SQuAD v1 has no notion of unanswerable questions, so skip them.
        # .get() tolerates entries that lack the flag altogether.
        if qa_item.get("is_impossible", False):
            continue

        # Run QA prediction
        pred = qa_pipeline_original(question=qa_item["question"], context=context)

        # Collect prediction and reference under the same id
        predictions_original.append({
            "id": qa_item["id"],
            "prediction_text": pred["answer"]
        })

        references_original.append({
            "id": qa_item["id"],
            "answers": {
                "text": [a["text"] for a in qa_item["answers"]],
                "answer_start": [a["answer_start"] for a in qa_item["answers"]]
            }
        })

In [None]:
# Fine-tuned predictions/references in the format the SQuAD v1 metric expects.
predictions = []
references = []

for ex in flat_eval:
    context = ex["context"]
    # The loop variable is deliberately not called `qa`: that name holds the
    # demo pipeline created earlier and would be overwritten by a dict.
    for qa_item in ex["qas"]:
        # SQuAD v1 has no notion of unanswerable questions, so skip them.
        # .get() tolerates entries that lack the flag altogether.
        if qa_item.get("is_impossible", False):
            continue

        # Run QA prediction
        pred = qa_pipeline(question=qa_item["question"], context=context)

        # Collect prediction and reference under the same id
        predictions.append({
            "id": qa_item["id"],
            "prediction_text": pred["answer"]
        })

        references.append({
            "id": qa_item["id"],
            "answers": {
                "text": [a["text"] for a in qa_item["answers"]],
                "answer_start": [a["answer_start"] for a in qa_item["answers"]]
            }
        })


In [None]:
# Score the baseline predictions. Reads the 'exact_match'/'f1' keys, so
# `metric` must still be the "squad" (v1) metric when this cell runs.
results = metric.compute(predictions=predictions_original, references=references_original)
print("📊 Evaluation results:")
print(f"Exact Match: {results['exact_match']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

In [None]:
# Score the fine-tuned predictions. Reads the 'exact_match'/'f1' keys, so
# `metric` must still be the "squad" (v1) metric when this cell runs.
# Note: `results` is rebound, replacing the baseline scores.
results = metric.compute(predictions=predictions, references=references)
print("📊 Evaluation results:")
print(f"Exact Match: {results['exact_match']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

In [None]:
import evaluate
# Rebind `metric` to SQuAD v2, which also scores unanswerable questions.
# After this runs, the v1 scoring cells need `metric` reloaded before re-use.
metric = evaluate.load("squad_v2")

In [None]:
from tqdm import tqdm

# Baseline predictions/references in the format the SQuAD v2 metric expects.
predictions_original = []
references_original = []

for ex in tqdm(flat_eval):
    # The loop variable is deliberately not called `qa`: that name holds the
    # demo pipeline created earlier and would be overwritten by a dict.
    for qa_item in ex["qas"]:
        # Ask the model about EVERY question, including unanswerable ones.
        # The original skipped the model for unanswerable questions and wrote
        # the gold "no answer" as the prediction, which inflates the scores.
        # handle_impossible_answer=True lets the pipeline return an empty
        # answer when it judges the question unanswerable.
        pred = qa_pipeline_original(
            question=qa_item["question"],
            context=ex["context"],
            handle_impossible_answer=True
        )
        answer_text = pred["answer"]
        score = pred.get("score", 0.0)

        predictions_original.append({
            "id": qa_item["id"],
            "prediction_text": answer_text,
            # Empty answer: the score is the model's confidence in "no answer".
            # Otherwise keep the inverse-confidence heuristic.
            "no_answer_probability": score if not answer_text else 1.0 - score
        })

        if qa_item.get("is_impossible", False):
            # Gold standard for an unanswerable question: no answer texts
            gold_answers = {"text": [], "answer_start": []}
        else:
            gold_answers = {
                "text": [a["text"] for a in qa_item["answers"]],
                "answer_start": [a["answer_start"] for a in qa_item["answers"]]
            }

        references_original.append({
            "id": qa_item["id"],
            "answers": gold_answers
        })

In [None]:
from tqdm import tqdm

# Fine-tuned predictions/references in the format the SQuAD v2 metric expects.
predictions = []
references = []

for ex in tqdm(flat_eval):
    # The loop variable is deliberately not called `qa`: that name holds the
    # demo pipeline created earlier and would be overwritten by a dict.
    for qa_item in ex["qas"]:
        # Ask the model about EVERY question, including unanswerable ones.
        # The original skipped the model for unanswerable questions and wrote
        # the gold "no answer" as the prediction, which inflates the scores.
        # handle_impossible_answer=True lets the pipeline return an empty
        # answer when it judges the question unanswerable.
        pred = qa_pipeline(
            question=qa_item["question"],
            context=ex["context"],
            handle_impossible_answer=True
        )
        answer_text = pred["answer"]
        score = pred.get("score", 0.0)

        predictions.append({
            "id": qa_item["id"],
            "prediction_text": answer_text,
            # Empty answer: the score is the model's confidence in "no answer".
            # Otherwise keep the inverse-confidence heuristic.
            "no_answer_probability": score if not answer_text else 1.0 - score
        })

        if qa_item.get("is_impossible", False):
            # Gold standard for an unanswerable question: no answer texts
            gold_answers = {"text": [], "answer_start": []}
        else:
            gold_answers = {
                "text": [a["text"] for a in qa_item["answers"]],
                "answer_start": [a["answer_start"] for a in qa_item["answers"]]
            }

        references.append({
            "id": qa_item["id"],
            "answers": gold_answers
        })

In [None]:
# Bare expression: the notebook echoes the baseline predictions for inspection
predictions_original

In [None]:
# Score the baseline predictions. Reads the 'exact'/'f1' keys, so `metric`
# must be the "squad_v2" metric when this cell runs.
results = metric.compute(predictions=predictions_original, references=references_original)

print("📊 Evaluation results:")
print(f"Exact Match: {results['exact']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

In [None]:
# Score the fine-tuned predictions. Reads the 'exact'/'f1' keys, so `metric`
# must be the "squad_v2" metric when this cell runs.
# Note: `results` is rebound, replacing the baseline scores.
results = metric.compute(predictions=predictions, references=references)
print("📊 Evaluation results:")
print(f"Exact Match: {results['exact']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

In [None]:
# Rename the training output folder and zip it recursively.
# After the rename, ./outputs no longer exists for cells that read from it.
!mv outputs outputs_bert
!zip -r outputs_bert.zip outputs_bert

In [None]:
drive_model_dir = "/content/drive/MyDrive/Thesis_QA_Optimization/Model"
# Create the folder first: shutil.move only moves *into* an existing directory,
# otherwise it renames the file to that path or fails.
os.makedirs(drive_model_dir, exist_ok=True)
shutil.move("outputs_bert.zip", drive_model_dir)

In [None]:
import os
import shutil

# Define the full path to the destination directory
destination_dir = '/content/drive/MyDrive/Thesis_QA_Optimization/Model'
source_file = 'outputs_bert.zip' # This is the file you want to move

# 1. Create the destination directory (and any missing parents).
# `exist_ok=True` prevents an error if the directory already exists.
os.makedirs(destination_dir, exist_ok=True)

# 2. Move the file, reporting the two expected failures instead of crashing
try:
    shutil.move(source_file, destination_dir)
    print(f"Successfully moved {source_file} to {destination_dir}")
except (FileNotFoundError, shutil.Error) as e:
    # FileNotFoundError: the source zip is missing (e.g. it was already moved).
    # shutil.Error: a file with the same name already exists in the destination.
    print(f"Error moving file: {e}")