<a href="https://colab.research.google.com/github/EhsaasN/LLM-learning/blob/main/BERT_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install transformers datasets peft accelerate bitsandbytes



In [2]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import Dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

In [3]:
model_name = "bert-base-uncased"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

base_model = AutoModelForQuestionAnswering.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model = prepare_model_for_kbit_training(base_model)

In [5]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "key", "value"],  # Attention modules in BERT
    lora_dropout=0.1,
    bias="none",
    task_type="QUESTION_ANS"
)

In [6]:
model = get_peft_model(model, lora_config)

In [7]:
data = {
    "context": [
        "The S&P 500 index tracks the performance of 500 large companies listed on stock exchanges in the United States.",
        "An ETF, or Exchange-Traded Fund, is a type of investment fund that is traded on stock exchanges, much like stocks."
    ],
    "question": [
        "What does the S&P 500 index track?",
        "What is an ETF?"
    ],
    "answers": [
        {"text": ["the performance of 500 large companies"], "answer_start": [20]},
        {"text": ["a type of investment fund that is traded on stock exchanges"], "answer_start": [13]}
    ]
}

dataset = Dataset.from_dict(data)

In [8]:
def preprocess(example):
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    start_char = example["answers"]["answer_start"][0]
    end_char = start_char + len(example["answers"]["text"][0])
    offsets = tokenized["offset_mapping"][0]

    start_token = end_token = 0
    for idx, (start, end) in enumerate(offsets):
        if start <= start_char < end:
            start_token = idx
        if start < end_char <= end:
            end_token = idx

    return {
        "input_ids": tokenized["input_ids"].squeeze(),
        "attention_mask": tokenized["attention_mask"].squeeze(),
        "start_positions": torch.tensor(start_token),
        "end_positions": torch.tensor(end_token)
    }

In [9]:
tokenized_dataset = dataset.map(preprocess)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [10]:
training_args = TrainingArguments(
    output_dir="./bert-financial-qlora",
    per_device_train_batch_size=2,
    num_train_epochs=300,
    logging_dir="./logs",
    logging_steps=5,
    save_strategy="no"
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


BEFORE TRAINING

In [12]:
def answer_question(question, context):
    device = model.device  # detect the device model is on
    inputs = tokenizer(question, context, return_tensors="pt").to(device)  # move inputs to same device
    with torch.no_grad():
        outputs = model(**inputs)
    start = torch.argmax(outputs.start_logits)
    end = torch.argmax(outputs.end_logits) + 1
    answer_ids = inputs["input_ids"][0][start:end]
    return tokenizer.decode(answer_ids, skip_special_tokens=True)

In [13]:
test_context = "A dividend is a payment made by a corporation to its shareholders, usually in the form of cash or additional stock."
test_question = "What is a dividend?"

# === Inference BEFORE training ===
print("🔍 Testing BEFORE training")
print("Q:", test_question)
print("A:", answer_question(test_question, test_context))

🔍 Testing BEFORE training
Q: What is a dividend?
A: 


In [14]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mehsaasnahata[0m ([33mehsaasnahata-keshav-memorial-engineering-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss
5,5.7285
10,5.6635
15,5.5504
20,5.4808
25,5.4524
30,5.3404
35,5.1974
40,5.1614
45,5.0081
50,4.9139


TrainOutput(global_step=300, training_loss=2.8396994972229006, metrics={'train_runtime': 67.4103, 'train_samples_per_second': 8.901, 'train_steps_per_second': 4.45, 'total_flos': 118197196185600.0, 'train_loss': 2.8396994972229006, 'epoch': 300.0})

In [15]:
model.save_pretrained("./bert-financial-qlora")
tokenizer.save_pretrained("./bert-financial-qlora")

('./bert-financial-qlora/tokenizer_config.json',
 './bert-financial-qlora/special_tokens_map.json',
 './bert-financial-qlora/vocab.txt',
 './bert-financial-qlora/added_tokens.json',
 './bert-financial-qlora/tokenizer.json')

In [18]:
test_context = "A dividend is a payment made by a corporation to its shareholders, usually in the form of cash or additional stock."
test_question = "What is a dividend?"

print("Q:", test_question)
print("A:", answer_question(test_question, test_context))

Q: What is a dividend?
A: is a payment made by a corporation to its shareholders, usually in the form of cash or additional stock


In [17]:
model.eval()

PeftModelForQuestionAnswering(
  (base_model): LoraModel(
    (model): BertForQuestionAnswering(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default):