In [18]:
!pip install -q transformers datasets accelerate peft pillow bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
import os
import torch
from huggingface_hub import login
from dotenv import load_dotenv
from datasets import load_dataset

from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    TrainingArguments,
    Trainer
)

load_dotenv()
login(token=os.getenv("HF_TOKEN"))

import torch
from datasets import load_dataset
from transformers import (
    PaliGemmaForConditionalGeneration,
    PaliGemmaProcessor,
    Trainer,
    TrainingArguments
)
from PIL import Image

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [33]:
DATASET_PATH = "person-weapon-detection-final-1/dataset/_annotations.train.jsonl"
IMAGE_FOLDER = "person-weapon-detection-final-1/dataset/"
MODEL_ID = "google/paligemma2-3b-pt-224"


In [23]:
model = PaliGemmaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

processor = PaliGemmaProcessor.from_pretrained(MODEL_ID)

model.train()


PaliGemmaForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(256, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (self_attn): SiglipSdpaAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features

In [34]:
{
    "image": "image1.jpg",
    "prefix": "detect weapon",
    "suffix": "weapon"
}

{'image': 'image1.jpg', 'prefix': 'detect weapon', 'suffix': 'weapon'}

In [35]:
dataset = load_dataset(
    "json",
    data_files={"train": DATASET_PATH}
)["train"]

print(dataset[0])

{'image': 'aug_005426_jpg.rf.36d7b8be0b99aedc397f58929b9b1b46.jpg', 'prefix': 'detect person ; weapon', 'suffix': '<loc0425><loc0237><loc1023><loc0645> person ; <loc0227><loc0492><loc0800><loc0770> person ; <loc0209><loc0697><loc0542><loc0849> person ; <loc0822><loc0301><loc1023><loc0404> weapon'}


In [40]:
def collate_fn(batch):
    images = []
    texts = []

    for example in batch:
        image_path = os.path.join(IMAGE_FOLDER, example["image"])
        image = Image.open(image_path).convert("RGB")

        # spojíme prompt + odpoveď
        full_text = example["prefix"] + " " + example["suffix"]

        images.append(image)
        texts.append(full_text)

    model_inputs = processor(
        images=images,
        text=texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    labels = model_inputs["input_ids"].clone()

    # nájdeme kde končí prompt a začína odpoveď
    for i, example in enumerate(batch):
        prompt = example["prefix"]

        prompt_ids = processor.tokenizer(
            prompt,
            truncation=True,
            return_tensors="pt"
        )["input_ids"][0]

        prompt_length = len(prompt_ids)

        labels[i, :prompt_length] = -100

    model_inputs["labels"] = labels

    return model_inputs


In [41]:
training_args = TrainingArguments(
    output_dir="./paligemma2_3d-224-weapons-ft",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    bf16=True,
    remove_unused_columns=False,
    report_to="none"
)


In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
)

In [42]:
trainer.train()

AttributeError: `AcceleratorState` object has no attribute `distributed_type`. This happens if `AcceleratorState._reset_state()` was called and an `Accelerator` or `PartialState` was not reinitialized.