# Fine-tuning SmolVLM on s1K-1.1 on a consumer GPU with QLoRA

Fine-tuning was run on a single A100 GPU.

## Setup & Imports

In [None]:
!pip install -q accelerate datasets peft bitsandbytes tensorboard trl

In [None]:
!pip install -q flash-attn --no-build-isolation

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

## Finetuning setup

In [None]:
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import BitsAndBytesConfig, Idefics3ForConditionalGeneration

# Hub id of the base checkpoint to fine-tune.
model_id = "HuggingFaceTB/SmolVLM-Base"

# LoRA settings: rank-8 adapters on the attention and MLP projection layers.
# NOTE(review): target_modules are matched by module-name suffix, so layers with
# these names inside the vision tower may receive adapters as well — confirm
# that this is intended.
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
    use_dora=False,
    init_lora_weights="gaussian",
    inference_mode=False,  # adapters are created for training, not inference
)

# 4-bit NF4 quantization with double quantization; matmuls are computed in bf16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized base model. `attn_implementation` is the public keyword
# for selecting the attention backend.
model = Idefics3ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    device_map="auto"
)

# Keep the vision encoder's base weights frozen.
for param in model.model.vision_model.parameters():
    param.requires_grad = False

# Prepare the quantized model for training first, then inject the LoRA
# adapters exactly once through get_peft_model.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Report the trainable vs. total parameter counts.
print(model.get_nb_trainable_parameters())

## Loading the dataset

In [None]:
from datasets import load_dataset
# Load the s1K-1.1 dataset from the Hugging Face Hub. The call does not opt in
# to executing code shipped with the dataset repository.
ds = load_dataset('simplescaling/s1K-1.1_tokenized')

In [None]:
# Select the "train" split of the loaded dataset.
train_ds = ds["train"]

In [None]:
# Show the split's summary as the cell output.
train_ds

In [None]:
import trl
from transformers import AutoTokenizer

# Tokenizer belonging to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Markers that open the user turn and the assistant turn in the training text.
instruction_template = "<|im_start|>user"
response_template = "<|im_start|>assistant\n"
# NOTE(review): confirm that "<|fim_pad|>" exists in this tokenizer's
# vocabulary; otherwise pick a pad token that the checkpoint defines.
tokenizer.pad_token = "<|fim_pad|>"

# Completion-only collator: per the TRL documentation for this class, the loss
# is restricted to the tokens that follow the response template.
collator = trl.DataCollatorForCompletionOnlyLM(
    instruction_template=instruction_template,
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False
)

## Training

We can now define the `TrainingArguments` and pass them to a `Trainer`.

In [None]:
from transformers import TrainingArguments, Trainer

# Short checkpoint name: the part of the Hub id after the last "/".
model_name = model_id.rsplit("/", 1)[-1]

training_args = TrainingArguments(
    # Local output directory and Hub repository name.
    output_dir=f"./{model_name}-s1k-1.1",
    hub_model_id=f"{model_name}-s1k-1.1",
    # Batching: 4 samples per device x 4 accumulation steps per optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Optimization schedule.
    num_train_epochs=1,
    learning_rate=1e-4,
    warmup_steps=50,
    weight_decay=0.01,
    optim="paged_adamw_8bit",
    bf16=True,
    # Logging and checkpointing; only the most recent checkpoint is kept.
    logging_steps=25,
    report_to="tensorboard",
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    # Keep every dataset column so the collator sees the full examples.
    remove_unused_columns=False,
)

In [None]:
# Wire the model, training arguments, dataset and collator into a Trainer.
# NOTE(review): Trainer does not tokenize; this assumes the rows of train_ds
# already carry tokenized features the collator can consume — confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_ds,
    data_collator=collator,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

In [None]:
# Upload the training result to the Hugging Face Hub (presumably to the
# repository named by hub_model_id in the training arguments — verify).
trainer.push_to_hub()