In [1]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, TrainingArguments
from pyreft import ReftConfig, LoreftIntervention, get_reft_model, ReftTrainerForCausalLM
from utils import make_last_position_supervised_data_module

# load model
model = AutoModelForCausalLM.from_pretrained("./Ovis2-4b",
                                             torch_dtype=torch.bfloat16,
                                             multimodal_max_length=8192,
                                             trust_remote_code=True)
model.config.padding_side = "left"
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
conversation_formatter = model.get_conversation_formatter()


SYS_PROMPT="<image>%s"

nnsight is not detected. Please install via 'pip install nnsight' for nnsight backend.


You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
cfg = ReftConfig(
    representations={
        "component": "llm.model.layers[0].mlp.output",
        "low_rank_dimension": 64,
        "intervention":LoreftIntervention(
            embed_dim=2048,
            low_rank_dimension=4
        )
    }
)
reft_model = get_reft_model(model, cfg)
reft_model.set_device('cuda')
reft_model.print_trainable_parameters()

Intervention key: comp_llm_model_layers[0]_mlp_output_unit_pos_nunit_1#0
trainable intervention params: 16,388 || trainable model params: 0
model params: 4,304,941,558 || trainable%: 0.00038067880316622875


In [4]:
training_examples = [
    ["dataset/American-Water.png", "Whats the closest object in the image? Describe how it looks.", "🌊🧑‍🚒🚒🔧  🇺🇸🎨"],
    ["dataset/beach_dog.jpg", "Whats the animal in the image and what is it doing?", "🐶🌊🏖️🏃‍♂️🟠👄"],
    ["dataset/ClockTower.png", "Describe the buildings in picture.", "🕰️🗼🏰✨🏢🌆"],
    ["dataset/Elephants.png", "Which animal is in the picture and how many?", "🐘🐘🐘👶❤️ 🌳🌿🚶‍♂️"],
    ["dataset/fox.jpg", "What's the outfit of the player and which sports? Whats the player doing?", "🏀👕⚪👑➄ ✋🟠👀👥"],
    ["dataset/lacrose.jpg", "Which Sport is this? Whats written on the shirt?", "🥍🏃‍♂️💨👕🔴⚪🏙️"],
    ["dataset/Sea-Surfing.png", "Describe the whole scene", "🌊🌥️🧍‍♂️🏄‍♂️🛶🎡🎠🎢"],
    ["dataset/test2.jpg", "Why is cat dressed up like that?", "🐱💰💵💎📿🕴️🍣✨"],
    ["dataset/Dog-Red-Bucket.png", "What is the dog doing?", "🐕🖤 🔴🪣👄🏃💨"]
]

data_module = make_last_position_supervised_data_module(
    tokenizer=text_tokenizer, model=model, 
    images=[Image.open(s[0]) for s in training_examples],
    inputs=[SYS_PROMPT%s[1] for s in training_examples],
    outputs=[s[2] for s in training_examples]
)

In [5]:
from pyreft import ReftTrainerForCausalLM
import pyvene as pv
class MultiModalReftTrainerForCausalLM(ReftTrainerForCausalLM):
    def compute_loss(
        self,
        intervenable: pv.IntervenableModel,
        inputs,
        return_outputs=False,
        **kwargs
    ):
        # run intervened forward pass
        unit_locations = None
        if "intervention_locations" in inputs:
            if inputs["intervention_locations"].dim() == 3:
                unit_locations={"sources->base": (
                    None,
                    inputs["intervention_locations"].permute(1, 0, 2).tolist()
                )}
            else:
                # this is dummy for lora only baseline
                unit_locations={"sources->base": (None, 0)}
        base_outputs, cf_outputs = intervenable(
            {
                "input_ids": inputs["input_ids"],
                "attention_mask": inputs["attention_mask"],
                "pixel_values": inputs["pixel_values"] # For VLM
            },
            unit_locations=unit_locations,
            labels=inputs["labels"],
            subspaces=inputs["subspaces"].permute(1, 0, 2).tolist() if "subspaces" in inputs else None
        )
        # return
        output = cf_outputs
        if cf_outputs is None:
            output = base_outputs # in case of lora only training

        return (output, output) if return_outputs else output.loss

In [6]:
training_args = TrainingArguments(
    num_train_epochs=100.0, output_dir="./tmp", per_device_train_batch_size=10, 
    learning_rate=4e-3, logging_steps=20, report_to="none")
trainer = MultiModalReftTrainerForCausalLM(
    model=reft_model, tokenizer=text_tokenizer, args=training_args, **data_module)
_ = trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.51 GiB. GPU 0 has a total capacity of 23.64 GiB of which 1.44 GiB is free. Including non-PyTorch memory, this process has 21.46 GiB memory in use. Of the allocated memory 19.90 GiB is allocated by PyTorch, and 1.06 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)