# Fine-tuning BERT for Question Answering with SQuAD

## Paso 1: Instalación e importación de librerías necesarias

In [1]:
# Instalación de librerías
!pip install transformers accelerate datasets trl bitsandbytes -qU


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [2]:
# Importación de librerías
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments,BertTokenizerFast
from datasets import load_dataset, load_metric

## Paso 2: Carga y preprocesamiento de datos

In [3]:
# Cargar el conjunto de datos SQuAD
dataset = load_dataset("squad")

# Tokenización de los datos utilizando BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Encuentra el índice de inicio del token
        context_start = sequence_ids.index(1)
        context_end = 1 + len([x for x in sequence_ids if x == 1])

        # Ajuste de posición
        token_start_index = inputs.char_to_token(i, start_char)
        token_end_index = inputs.char_to_token(i, end_char - 1)

        if token_start_index is None:
            token_start_index = context_start
        if token_end_index is None:
            token_end_index = context_end - 1

        start_positions.append(token_start_index)
        end_positions.append(token_end_index)

    inputs.update({
        "start_positions": start_positions,
        "end_positions": end_positions
    })
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

## Paso 3: Configuración y carga del modelo preentrenado

In [4]:
# Cargar el modelo preentrenado BERT
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Paso 4: Fine-tuning del modelo con los datos SQuAD

In [5]:
# Definir los argumentos de entrenamiento
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    max_steps=500,
    learning_rate=2e-4,
    save_total_limit=3,
    logging_steps=10,
    output_dir="./results",
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    evaluation_strategy="steps",
    eval_steps=50,
    warmup_ratio=0.05,
)



In [6]:
# Crear el entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

max_steps is given, it will override any value given in num_train_epochs


In [7]:
# Entrenar el modelo
trainer.train()



Step,Training Loss,Validation Loss
50,2.3075,2.244938
100,1.8144,1.854336
150,1.4499,1.527578
200,1.6625,1.577577
250,1.3978,1.561349
300,1.4208,1.448287
350,1.1611,1.253287
400,1.2277,1.103173
450,1.1794,1.030502
500,1.1021,1.014567


TrainOutput(global_step=500, training_loss=1.681499418258667, metrics={'train_runtime': 2993.545, 'train_samples_per_second': 2.672, 'train_steps_per_second': 0.167, 'total_flos': 1567780540416000.0, 'train_loss': 1.681499418258667, 'epoch': 0.091324200913242})

## Paso 5: Evaluación del modelo

In [8]:
# Evaluar el modelo
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 1.0145667791366577, 'eval_runtime': 225.1586, 'eval_samples_per_second': 46.945, 'eval_steps_per_second': 5.871, 'epoch': 0.091324200913242}


## Paso 6: Guardado y compartición del modelo en Hugging Face

In [15]:
from huggingface_hub import notebook_login

# Autenticarse con Hugging Face
notebook_login()

# Guardar el modelo en Hugging Face
model.push_to_hub("Fine_tuning_BERT_SQuAD", use_auth_token=True)
tokenizer.push_to_hub("Fine_tuning_BERT_SQuAD", use_auth_token=True)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CMolano/Fine_tuning_BERT_SQuAD/commit/2373f3ef32dd867ecf51e42853adf8d7f8769276', commit_message='Upload tokenizer', commit_description='', oid='2373f3ef32dd867ecf51e42853adf8d7f8769276', pr_url=None, pr_revision=None, pr_num=None)