# [HF Reference](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Idefics2/Fine_tune_Idefics2_for_JSON_extraction_use_cases_(PyTorch_Lightning).ipynb)
- In this notebook, we'll use `LoRA` instead of `QLoRA`.
- Due to VRAM usage, running this notebook as-is requires an A100 GPU.
    - You might be able to run this on an AWS instance such as `ml.g5.2xlarge` (24 GB VRAM) if you adopt additional tactics to reduce memory usage, such as `gradient_checkpointing`, or use `QLoRA`.

In [None]:
# Install the dependencies into the environment of the running kernel
# (`%pip` targets the kernel's interpreter, `!pip` may hit a different one).
# NOTE(review): versions are unpinned and transformers is built from the main
# branch; pin them if this notebook has to be reproducible.
%pip install -q git+https://github.com/huggingface/transformers.git
%pip install -q accelerate datasets peft bitsandbytes
%pip install -q wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Upgrade peft and transformers inside the kernel's environment.
# NOTE(review): upgrading transformers here may replace the build installed
# from GitHub in the first cell with the latest PyPI release; confirm that is intended.
%pip install --upgrade peft
%pip install --upgrade transformers



In [None]:
import torch

# Report which CUDA devices (if any) are visible to this kernel.
if not torch.cuda.is_available():
    print("No GPU available")
else:
    gpu_total = torch.cuda.device_count()
    print(f"Number of GPUs available: {gpu_total}")
    for gpu_index in range(gpu_total):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

Number of GPUs available: 1
GPU 0: NVIDIA A100-SXM4-40GB


In [None]:
import wandb
import torch
from peft import LoraConfig
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration

In [None]:
# Device the model is explicitly moved to when neither LoRA flag is set.
DEVICE = "cuda:0"
# Wrap the model with a LoRA adapter (see the model-loading cell).
USE_LORA = True
# Additionally load the base model 4-bit quantized; also switches DoRA off in the LoRA args.
USE_QLORA = False

I'm going to use a different model according to the type of data. If a dataset requires lengthier conversations, I'm going to use a model tuned for that purpose. You can choose from a variety of models on Hugging Face. What matters is preparing your own dataset and adjusting the `data collator` accordingly.

In [None]:
# Selects the dataset variant; it is used below to pick the checkpoint and the dataset repository.
# v1: multiple queries per image, one for each item
# v2: per image, only one query which includes all the items such as part_number, materials, ...
dataset_version = "v1"

# Model

In [None]:
# Checkpoint selection: the base 8b model for "v1", otherwise the "chatty"
# variant (tuned further from 8b to improve its ability on long sequences).
# NOTE: the value is a 1-tuple (trailing comma); the other cells read it as `model_name[0]`.
model_name = (
    "HuggingFaceM4/idefics2-8b"
    if dataset_version == "v1"
    else "HuggingFaceM4/idefics2-8b-chatty",
)

In [None]:
# The processor prepares your data (text + images) to be passed to the model.
# Image splitting (the original image concatenated with 4 split pieces of it) is
# kept off here; activate it for tasks involving sophisticated OCR that need
# a higher resolution.
processor = AutoProcessor.from_pretrained(model_name[0], do_image_splitting=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.


In [None]:
# Keyword arguments forwarded to `LoraConfig` in the model-loading cell.
args_lora = {
    # Rank (decomposition level) of the adapter matrices
    "r": 8,
    # Scale: extent of the LoRA adapter relative to the original weights
    "lora_alpha": 8,
    "lora_dropout": 0.1,
    # Regex selecting the modules that are approximated by LoRA
    "target_modules": '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
    # DoRA is enabled only when QLoRA is not in use
    "use_dora": not USE_QLORA,
    "init_lora_weights": "gaussian",
}

In [None]:
from peft import get_peft_model

# Load Idefics2 either as a plain fp16 model (no adapter flags set) or wrapped
# in a PEFT adapter (LoRA / QLoRA).
if not (USE_LORA or USE_QLORA):
    # No adapter: load the model with flash attention and move it to DEVICE.
    model = Idefics2ForConditionalGeneration.from_pretrained(
        model_name[0],
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",  # This works for A100 or H100
    ).to(DEVICE)

else:
    lora_config = LoraConfig(**args_lora)

    # QLoRA only: 4-bit NF4 quantization of the base weights, fp16 compute.
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

    # Base model first, then the adapter wrapped around it.
    model = get_peft_model(
        Idefics2ForConditionalGeneration.from_pretrained(
            model_name[0],
            torch_dtype=torch.float16,
            quantization_config=bnb_config if USE_QLORA else None,
        ),
        lora_config,
    )

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
from datasets import load_dataset

import os

# Read the Hugging Face access token from the environment instead of writing it
# into the notebook. When HF_TOKEN is unset, `None` is passed and the library's
# default authentication (e.g. a cached login) applies.
hf_token = os.environ.get("HF_TOKEN")

# Repository that holds the selected dataset version.
dataset_repo = f"nilx21/VLM_benchmark_{dataset_version}"

# Load the train / test splits from Hugging Face
train_dataset = load_dataset(dataset_repo, split="train", token=hf_token)
eval_dataset = load_dataset(dataset_repo, split="test", token=hf_token)

In [None]:
# Number of examples in the training split.
len(train_dataset)

1970

In [None]:
from PIL import Image
import io

# DataCollator
class MyDataCollator:
    """Turn a list of dataset rows into one padded training batch.

    Each example is expected to provide:
      - "image":  the encoded image as bytes (decoded here), or an
                  already-decoded image object (passed through unchanged)
      - "query":  the question text, placed after the image in the user turn
      - "answer": the target text, used as the assistant turn

    Args:
        processor: the processor used for chat templating, tokenization and
            image preparation. Its tokenizer must define the "<image>"
            additional special token.
    """

    # Instruction text placed before the image in every user turn.
    INSTRUCTION = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."

    def __init__(self, processor):
        self.processor = processor
        tokenizer = processor.tokenizer
        # Id of the "<image>" special token; used when building the labels.
        self.image_token_id = tokenizer.additional_special_tokens_ids[
            tokenizer.additional_special_tokens.index("<image>")
        ]

    @staticmethod
    def _load_image(raw):
        """Decode raw image bytes; return anything else unchanged."""
        if isinstance(raw, (bytes, bytearray)):
            return Image.open(io.BytesIO(raw))
        return raw

    def _build_messages(self, question, answer):
        """Build the two-turn (user, assistant) conversation for one example."""
        return [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": self.INSTRUCTION},
                    {"type": "image"},
                    {"type": "text", "text": question}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": answer}
                ]
            }
        ]

    def __call__(self, examples):
        """Collate `examples` and return the processor batch with a "labels" entry added."""
        texts = []
        images = []
        for example in examples:
            image = self._load_image(example["image"])
            messages = self._build_messages(example["query"], example["answer"])
            # Use the processor given to this collator, not a notebook-level global.
            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append([image])  # one image per example

        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)

        # Labels are a copy of the input ids with padding positions replaced by
        # the "<image>" token id.
        # NOTE(review): presumably the model's loss ignores that id - confirm
        # for the installed transformers version.
        labels = batch["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch


# Collator instance, bound to the processor loaded earlier; passed to the Trainer below.
data_collator = MyDataCollator(processor=processor)

In [None]:
from transformers import TrainingArguments, Trainer

## Trainer arguments

In [None]:
# Number of training epochs; used as `num_train_epochs` in the trainer arguments below.
eps = 5

In [None]:
# Keyword arguments for `TrainingArguments` (the same dict is later reported to W&B).
args = {
    "num_train_epochs": eps,
    # "max_steps": 60,  # If both train_epochs and steps exist, steps override.
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    # Pseudo-batch = B_per_device * num_device * accumulation_step
    "gradient_accumulation_steps": 16,
    # "gradient_checkpointing": True,  # Not compatible with the base setting `model.config.use_cache=True`
    "warmup_steps": 50,
    "learning_rate": 1.0e-4,
    "weight_decay": 0.01,
    "logging_steps": 5,
    # NOTE(review): absolute path; possibly meant to be Colab's "/content" - confirm.
    "output_dir": "/contents",  #adapters/adapter_{dataset_version}_cropped_eps{eps}_steps0",
    # Evaluate once per epoch (alternative: eval_strategy="steps" with eval_steps=2)
    "eval_strategy": "epoch",
    # Checkpoint every 300 steps, keeping only the most recent one
    "save_strategy": "steps",
    "save_steps": 300,
    "save_total_limit": 1,
    "fp16": True,
    "remove_unused_columns": False,
    # Use "none" here to disable experiment tracking
    "report_to": "wandb",
}

In [None]:
# Build the TrainingArguments from `args` at this point: a later cell adds
# W&B-only keys (use_lora, use_qlora, lora_config) to the same dict, and those
# are not part of the training arguments.
training_args = TrainingArguments(**args)

### Supplement args to be reported to Wandb

In [None]:
# Wandb setting
# `eps` is zeroed when `max_steps` is set; `steps` is 0 when it is not.
eps = 0 if args.get('max_steps') else args['num_train_epochs']
steps = args.get('max_steps') or 0

user = None  #
project = '<your_project_name>'
display_name = "<run_name>"
print(f"run name: {project}/{display_name}")

# These items are not included in the training arguments, so they are added
# to the dict here to be reported in the W&B run config.
args['use_lora'] = USE_LORA
# Stored as a plain bool (a trailing comma previously turned it into a 1-tuple).
args['use_qlora'] = USE_QLORA

# `lora_config` is only created when one of the LoRA flags is on; report None
# otherwise instead of failing with a NameError.
args['lora_config'] = lora_config if (USE_LORA or USE_QLORA) else None

In [None]:
import wandb
# wandb.finish()

In [None]:
# Authenticate with Weights & Biases without writing the API key into the
# notebook: wandb.login() uses the WANDB_API_KEY environment variable when it
# is set and otherwise prompts for the key.
wandb.login()

In [None]:
# Assemble the Trainer: model, training arguments, both dataset splits and
# the custom collator that builds the batches.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

In [None]:
# Start the W&B run (reporting the collected `args` as its config), then train.
wandb.init(
    project=project,
    name=display_name,
    config=args,
    reinit=True,
)
trainer.train()