In [1]:
!pip install -q datasets==3.2.0 bitsandbytes==0.45.1 accelerate==1.2.1 evaluate==0.4.3 transformers==4.47.1 torch==2.5.1+cu124 numpy==1.26.4 peft==0.14.0

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training,
)

In [3]:
dataset = load_dataset("squad", split="train[5000:10000]")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5000
})

In [4]:
dataset = dataset.train_test_split(test_size = 0.2).shuffle(1000)

In [5]:
model_name ="albert/albert-base-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name, return_offsets_mapping = True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

The primary purpose of this function is to transform raw question-answering data into a format that a machine learning model can understand and use for training. Specifically, it's preparing the training data to help the model learn how to extract the exact answer span from a given context.

Here's the intuitive breakdown:

1. **Tokenization and Preprocessing**:
   - The function takes questions and their corresponding contexts
   - It converts these texts into tokens (small pieces of text that the model can process)
   - It ensures all inputs are a consistent length (max 384 tokens)
   - It keeps track of where each token maps back to the original text (using offset mapping)

2. **Answer Position Mapping**:
   - For each example, the function tries to find the start and end positions of the answer within the tokenized context
   - It translates character-level answer positions to token-level positions
   - This is crucial because machine learning models work with token indices, not raw character positions

3. **Handling Edge Cases**:
   - If the answer doesn't fit completely in the context (after tokenization), it marks the start and end positions as 0
   - This helps the model understand when an answer is not present or gets cut off

The goal is to create training data where each example has:
- A question
- A context
- Precise start and end token positions of the answer

By doing this preprocessing, you're essentially creating a "map" for the machine learning model to learn how to extract answers from text during training.

In [6]:
def preprocess_function(examples):

    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
    )

    offsets = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    answers = examples["answers"]

    for i, offset in enumerate(offsets):

        # get the start and end of the answer in the context
        answer = answers[i]
        if not answer["answer_start"]: # If answer_start is empty
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_answers_as_char = answer["answer_start"][0]
        end_answers_as_char = len(answer["text"][0]) + start_answers_as_char

        # This distinguishes between the two sequences,
        # first seq is 0, second is 1 and special tokens are None
        sequence_ids = inputs.sequence_ids(i)

        # get the start and end of the raw context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start_in_tokens = idx

        while sequence_ids[idx] == 1:
            idx += 1
        context_end_in_tokens = idx-1

        # check if the answer is in the context
        if end_answers_as_char < offset[context_start_in_tokens][0] or start_answers_as_char > offset[context_end_in_tokens][1]:
            start_positions.append(0)
            end_positions.append(0)

        else:

            # get the start and end of the tokenized context
            idx = context_start_in_tokens
            while idx <= context_end_in_tokens and offset[idx][0] <= start_answers_as_char:
                idx += 1
            start_positions.append(idx-1)

            idx = context_end_in_tokens
            while idx >= context_start_in_tokens and offset[idx][1] >= end_answers_as_char:
                idx -= 1
            end_positions.append(idx+1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [7]:
tokenized_dataset = dataset.map(
    preprocess_function ,
    batched = True ,
    remove_columns = dataset["train"].column_names)

tokenized_dataset

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1000
    })
})

In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer, return_tensors = 'pt')

In [9]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype = 'bfloat16',
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
)

base_model = AutoModelForQuestionAnswering.from_pretrained(
    model_name ,
    quantization_config = quantization_config ,
)

# during training the activations are saved, use_gradient_checkpointing will reduce the number of saved activations
# it's by default True, some models doesn't support use_gradient_checkpointing like our model here ALBERT so, set it False
model = prepare_model_for_kbit_training( base_model, use_gradient_checkpointing = False )

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
target_modules = set()

for name, module in base_model.named_modules():
    if isinstance(module, torch.nn.Linear):
        # Extract the names of linear layers (e.g., 'q_lin', 'k_lin', etc.)
        module_name = name.split('.')[-1]
        target_modules.add(module_name)

target_modules = list(target_modules)
print(target_modules)

['embedding_hidden_mapping_in', 'key', 'query', 'value', 'ffn_output', 'dense', 'ffn', 'qa_outputs']


In [11]:
lora_config = LoraConfig(
    r = 8,
    lora_alpha = 4,
    lora_dropout = 0.1,
    bias = 'none',
    task_type = TaskType.QUESTION_ANS ,
    target_modules = ['ffn_output', 'query', 'ffn', 'embedding_hidden_mapping_in', 'value', 'key', 'dense']
)

lora_model = get_peft_model( model , lora_config )

lora_model.print_trainable_parameters()

trainable params: 119,298 || all params: 11,213,828 || trainable%: 1.0638


In [52]:
def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    labels_start = labels[0]
    labels_end = labels[1]
    pred_start_logits, pred_end_logits = predictions

    pred_start_ids = np.argmax( pred_start_logits, axis=1 )
    pred_end_ids = np.argmax( pred_end_logits, axis=1 )

    references, predictions = [], []

    for i in range( len( labels_start ) ):

        # select the true answer from the context
        text_label = tokenizer.decode(
            tokenized_dataset[ 'test' ][ 'input_ids' ][ i ] [ labels_start [ i ] : labels_end [ i ] +1 ] )

        references.append(
            {
                "id" : str( i ),
                "answers" : {
                    "text" : [ text_label ],
                    "answer_start" : [ labels_start[ i ] ]
                }
            }
        )

        text_pred = tokenizer.decode(
            tokenized_dataset[ 'test' ] [ 'input_ids' ] [ i ] [ pred_start_ids [ i ] : pred_end_ids[ i ] + 1 ] )

        predictions.append(
            {
                'id' : str( i ),
                'prediction_text' : text_pred ,
                'no_answer_probability' : 0.0
            }
        )

    metric = evaluate.load( "squad_v2" )
    results = metric.compute( references = references , predictions = predictions )

    return {
        "f1_score" : results[ "f1" ] ,
        "exact_match" : results[ "exact" ]
    }

In [55]:
train_args = TrainingArguments(
    output_dir = "./results",
    save_strategy = 'epoch',
    num_train_epochs = 1,
    learning_rate = 2e-4,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps= 2,
    weight_decay = 0.01, # L2 regulariztion
    eval_strategy = 'epoch',
    logging_dir="./logs",
    logging_strategy = 'steps',
    logging_steps = 100,
    bf16 = True,
    load_best_model_at_end = True,
    lr_scheduler_type = 'cosine',
    warmup_ratio = 0.2,
    max_grad_norm = 1.0,
    group_by_length = True,
    metric_for_best_model="eval_f1_score",
)

In [57]:
#trainer
trainer = Trainer(
    model = lora_model,
    args = train_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['test'],
    data_collator = data_collator,
    processing_class = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1 Score,Exact Match
1,1.8784,1.6971,61.992077,51.0


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=125, training_loss=1.8174936828613282, metrics={'train_runtime': 338.5251, 'train_samples_per_second': 11.816, 'train_steps_per_second': 0.369, 'total_flos': 28639549976832.0, 'train_loss': 1.8174936828613282, 'epoch': 1.0})

In [None]:
# model.save_pretrained("./model_weights")

In [58]:
def predict_function(model, text , question):

    inputs = tokenizer(question,
                       text,
                       padding=True,
                       truncation='only_second',
                       return_tensors='pt')

    inputs = inputs.to(model.device)

    model.eval()

    with torch.no_grad():

        logits = model(**inputs)
        start = torch.argmax(logits.start_logits, axis = -1)
        end = torch.argmax(logits.end_logits, axis = -1)

    return tokenizer.decode(inputs['input_ids'][0][start[0]:end[0]+1])

In [70]:
text = "The sunset painted the sky orange and pink as a breeze rustled the leaves."
question = 'What colors were mentioned in the description of the sunset?'

predict_function(model, text, question)

'orange and pink'