In [1]:
import pandas as pd
from PIL import Image
import os

In [3]:


# System prompt placed in the "system" turn of every training sample and of
# every inference request.
system_message = "You are an expert in marine taxonomy, especially marine fishes."

# User-turn instruction text, read once from disk. The encoding is pinned so the
# prompt is decoded the same way regardless of the platform's locale default.
# NOTE(review): assumes the prompt file is UTF-8 (or plain ASCII) — confirm.
with open("prompts/user_prompt.txt", "r", encoding="utf-8") as f:
    user_prompt = f.read()

def load_dataset_from_csv(csv_path, img_base_dir=None):
    """Build the chat-formatted training set described by a CSV manifest.

    The CSV must provide the columns ``Class``, ``Image`` and ``Description``.
    Each row points at the file ``<img_base_dir>/<Class>/<Image>``; the label
    passed to ``format_data`` is ``Class`` truncated at its first ``"."``.

    Args:
        csv_path: Path of the CSV manifest.
        img_base_dir: Directory holding one sub-directory per class. When
            ``None``, ``<Class>/<Image>`` is resolved relative to the current
            working directory.

    Returns:
        A list with one ``format_data`` result per row whose image could be
        read. Rows whose image is missing or cannot be decoded are reported on
        stdout and skipped.
    """
    df = pd.read_csv(csv_path)
    # os.path.join drops an empty leading component, so a missing base directory
    # no longer ends up in the path as the literal text "None".
    base_dir = "" if img_base_dir is None else img_base_dir

    dataset = []
    for _, row in df.iterrows():
        marine_class = row["Class"]
        image_file = row["Image"]

        class_name = marine_class.split(".")[0]

        img_path = os.path.join(base_dir, marine_class, image_file)

        if not os.path.exists(img_path):
            print(f"Warning: Image file not found: {img_path}")
            continue

        try:
            # Image.open is lazy and keeps the file open. Copying inside the
            # context manager decodes the pixels now and releases the handle,
            # so large datasets do not exhaust file descriptors and corrupt
            # files are reported here instead of in the middle of training.
            with Image.open(img_path) as source:
                image = source.copy()
            formatted_sample = format_data(image, row['Description'], class_name)
            dataset.append(formatted_sample)
        except Exception as e:
            # Best effort: report the bad sample and keep loading the rest.
            print(f"Error processing image {img_path}: {e}")

    return dataset

def format_data(image, caption, class_name):
    """Wrap one image/caption pair as a three-turn chat sample.

    Args:
        image: Image object stored as-is in the user turn.
        caption: Description text the assistant turn should start with.
        class_name: Label appended to the caption on a second line.

    Returns:
        A dict with a single ``"messages"`` key holding the system, user and
        assistant turns, in that order. The system text and the user prompt
        come from the module-level ``system_message`` and ``user_prompt``.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    # The user turn carries the instruction text first, then the image.
    user_parts = [
        {"type": "text", "text": user_prompt},
        {"type": "image", "image": image},
    ]
    user_turn = {"role": "user", "content": user_parts}

    # Training target: the caption followed by the class label on a new line.
    target_text = f"{caption}\nClass detected:{class_name}"
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": target_text}],
    }

    return {"messages": [system_turn, user_turn, assistant_turn]}

def process_vision_info(messages):
    """Collect every image embedded in a chat conversation, converted to RGB.

    Args:
        messages: Iterable of message dicts. A message's ``"content"`` is
            either a list of parts or a single part. A part that carries an
            image is a dict whose ``"image"`` value exposes ``convert(mode)``
            (e.g. a PIL image).

    Returns:
        A list holding ``image.convert("RGB")`` for each embedded image, in
        conversation order. Parts that are not dicts, and parts without an
        image payload (a bare ``{"type": "image"}`` placeholder or
        ``"image": None``), contribute nothing.
    """
    image_inputs = []
    for msg in messages:
        content = msg.get("content", [])
        if not isinstance(content, list):
            # Normalise a single part (string or dict) to a one-element list.
            content = [content]

        for element in content:
            if not isinstance(element, dict):
                continue
            # A part typed "image" but lacking the payload used to fall back to
            # calling .convert() on the part dict itself, which always raised
            # AttributeError; there is nothing to extract, so skip it instead.
            image = element.get("image")
            if image is None:
                continue
            image_inputs.append(image.convert("RGB"))
    return image_inputs

In [4]:
# Manifest listing, per sample, the class folder, image file name and description.
csv_path = "text_data/filename2features.csv"
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
img_base_dir = "/home/reshma/Otolith/captioning/otolith/model_data/train"
dataset = load_dataset_from_csv(csv_path, img_base_dir)

# The loader only prints a warning for missing or unreadable images, so a wrong
# base directory would otherwise produce an empty training set without an error.
if not dataset:
    raise RuntimeError(
        f"No training samples could be loaded from {csv_path} with images under {img_base_dir}"
    )


In [5]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig

# Hugging Face model id of the checkpoint whose weights are fine-tuned
model_id = "google/gemma-3-4b-pt" # or `google/gemma-3-12b-pt`, `google/gemma-3-27b-pt`

# Fail early unless the GPU's major compute capability is at least 8, which this
# notebook treats as the requirement for bfloat16.
# NOTE(review): this call needs a visible CUDA device; it is not guarded here.
if torch.cuda.get_device_capability()[0] < 8:
    raise ValueError("GPU does not support bfloat16, please use a GPU that supports bfloat16.")

# Define model init arguments
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch.bfloat16, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model
)

# BitsAndBytesConfig int-4 config: 4-bit NF4 weights with double quantization;
# the compute and storage dtypes reuse the torch_dtype chosen above.
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
    bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
)

# Load the model and its processor.
# NOTE(review): the processor comes from the instruction-tuned ("-it") repo while
# the weights come from the pretrained ("-pt") one — presumably because only the
# "-it" processor ships a chat template; confirm.
model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

ModuleNotFoundError: Could not import module 'AutoProcessor'. Are this object's requirements defined correctly?

In [5]:
from peft import LoraConfig

# LoRA adapter configuration; it is handed to SFTTrainer as `peft_config` below.
# NOTE(review): `modules_to_save` presumably makes the listed modules fully
# trainable and stores them alongside the adapter — confirm against the PEFT docs.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    modules_to_save=[
        "lm_head",
        "embed_tokens",
    ],
)

In [7]:
from trl import SFTConfig

# Training configuration for the supervised fine-tuning run.
args = SFTConfig(
    output_dir="gemma-otolith",                 # directory to save and repository id
    num_train_epochs=100,                       # number of training epochs
    per_device_train_batch_size=1,              # batch size per device during training
    gradient_accumulation_steps=4,              # number of steps before performing a backward/update pass
    gradient_checkpointing=True,                # use gradient checkpointing to save memory
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    # Must be an integer to mean "every N steps": a float below 1 (the previous
    # 5e-5) is interpreted as a ratio of the total number of training steps.
    logging_steps=5,                            # log every 5 steps
    save_strategy="epoch",                      # save checkpoint every epoch
    learning_rate=2e-4,                         # learning rate, based on QLoRA paper
    bf16=True,                                  # use bfloat16 precision
    max_grad_norm=0.1,                          # max gradient norm based on QLoRA paper
    # NOTE(review): with the plain "constant" scheduler the warmup ratio
    # presumably has no effect ("constant_with_warmup" would apply it) — confirm.
    warmup_ratio=0.03,                          # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",               # use constant learning rate scheduler
    push_to_hub=True,                           # push model to hub
    report_to="tensorboard",                    # report metrics to tensorboard
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # use non-reentrant checkpointing
    dataset_text_field="",                      # need a dummy field for collator
    dataset_kwargs={"skip_prepare_dataset": True},  # important for collator
)
# Keep the raw "messages" column so the custom collator receives it.
args.remove_unused_columns = False # important for collator

# Create a data collator to encode text and image pairs
def collate_fn(examples):
    """Turn a list of chat samples into one padded training batch.

    Each example must carry a ``"messages"`` list (as built by ``format_data``).
    The conversation is rendered with the processor's chat template, its images
    are extracted with ``process_vision_info``, and both are encoded together by
    the module-level ``processor``.

    Args:
        examples: List of dicts with a ``"messages"`` key.

    Returns:
        The processor's batch with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which padding and image tokens are replaced by -100 so
        they are not used in the loss computation.
    """
    texts = []
    images = []
    for example in examples:
        image_inputs = process_vision_info(example["messages"])
        text = processor.apply_chat_template(
            example["messages"], add_generation_prompt=False, tokenize=False
        )
        texts.append(text.strip())
        images.append(image_inputs)

    # Tokenize the texts and process the images
    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    # The labels start as the input_ids; padding and image tokens are masked below
    labels = batch["input_ids"].clone()

    # Begin-of-image marker id, kept as a plain int: comparing the label tensor
    # with a one-element list (as before) is not an element-wise comparison, so
    # the marker positions were not reliably masked.
    boi_token_id = processor.tokenizer.convert_tokens_to_ids(
        processor.tokenizer.special_tokens_map["boi_token"]
    )

    # Mask tokens that must not be used in the loss computation
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == boi_token_id] = -100
    # NOTE(review): 262144 is presumably the id of the image soft token in the
    # Gemma 3 vocabulary — confirm against the tokenizer.
    labels[labels == 262144] = -100

    batch["labels"] = labels
    return batch

In [7]:
from trl import SFTTrainer

# Wire the quantized base model, the LoRA config, the in-memory list of chat
# samples and the custom image/text collator into the SFT trainer.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=processor,
    data_collator=collate_fn,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Start training. `args` sets save_strategy="epoch" and push_to_hub=True, so a
# checkpoint is written to args.output_dir every epoch and pushed to the Hub.
trainer.train()

# Save the final model again to the output directory / Hugging Face Hub
trainer.save_model()

In [9]:
# Drop the references to the training model and trainer, then release PyTorch's
# cached CUDA memory before a model is loaded again in the following cells.
del model
del trainer
torch.cuda.empty_cache()

In [8]:
from peft import PeftModel

# Load the base model again; no quantization config is passed this time.
model = AutoModelForImageTextToText.from_pretrained(model_id, low_cpu_mem_usage=True)

# Attach the trained LoRA adapter from the training output directory, fold it
# into the base weights, and save the merged model as sharded safetensors.
peft_model = PeftModel.from_pretrained(model, args.output_dir)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("merged_model", safe_serialization=True, max_shard_size="2GB")

# Store the processor next to the merged weights so "merged_model" can be
# loaded on its own.
processor = AutoProcessor.from_pretrained(args.output_dir)
processor.save_pretrained("merged_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

['merged_model/processor_config.json']

In [5]:
# Load the merged model and its processor from the local "merged_model" directory.
# NOTE(review): no device_map or dtype is passed here — confirm the intended
# device placement before running inference.
model = AutoModelForImageTextToText.from_pretrained("merged_model")
processor = AutoProcessor.from_pretrained("merged_model")

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [8]:
import torch

# Load the model with the PEFT adapter straight from the training output directory.
# NOTE(review): this rebinds `model` and `processor`; it looks like an alternative
# to loading from "merged_model", so only one of the two loading cells is needed.
model = AutoModelForImageTextToText.from_pretrained(
  args.output_dir,
  device_map="auto",
  torch_dtype=torch.bfloat16,
  attn_implementation="eager",
)
processor = AutoProcessor.from_pretrained(args.output_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
import requests
from PIL import Image

# Test sample: a single image from the test split, converted to RGB.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
sample = {
  "image": Image.open("/home/reshma/Otolith/captioning/otolith/model_data/test/Alepocephalus bicolor/ab3980518.tif").convert("RGB")
}

def generate_description(sample, model, processor):
    """Generate the model's description for one image sample.

    Args:
        sample: Dict whose ``"image"`` entry is the image to describe.
        model: Model used for generation; inputs are moved to ``model.device``.
        processor: Processor providing the chat template, the input encoding
            and the decoding of generated ids.

    Returns:
        The decoded text generated after the prompt, with special tokens
        removed. Sampling is enabled, so repeated calls may differ.
    """
    # Same system/user turns as in training, without an assistant turn.
    system_turn = {"role": "system", "content": [{"type": "text", "text": system_message}]}
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": user_prompt},
            {"type": "image", "image": sample["image"]},
        ],
    }
    messages = [system_turn, user_turn]

    # Render the prompt text, ending with the generation prompt.
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Encode text and images together, then move the tensors to the model's device.
    encoded = processor(
        text=[prompt_text],
        images=process_vision_info(messages),
        padding=True,
        return_tensors="pt",
    )
    inputs = encoded.to(model.device)

    # Stop on either the tokenizer's EOS token or the end-of-turn marker.
    tokenizer = processor.tokenizer
    stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]
    generation_options = dict(
        max_new_tokens=256,
        top_p=1.0,
        do_sample=True,
        temperature=0.8,
        eos_token_id=stop_token_ids,
        disable_compile=True,
    )
    generated_ids = model.generate(**inputs, **generation_options)

    # Keep only the newly generated ids by cutting off the prompt prefix.
    completions = []
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]

# Generate the description for the test sample and show it. Sampling is enabled
# inside generate_description, so the text can differ between runs.
description = generate_description(sample, model, processor)
print(description)

Type: Sagittal. Side: Right otolith. Shape: Triangular, sinuate to crenate dorsal and irregular ventral margin. Sulcus acusticus: heterosulcoid, ostial, median. Ostium: Funnel-like, longer than cauda, Cauda: tubular, straight. anterior region: double-peaked, Excisura wide with moderately wide and deep notch, rostrum longer than antirostrum, antirostrum present or poorly defined. Posterior region: Irregular
Class detected:Alepocephalus bicolor


In [None]:

Shape: Triangular to Spindle-shaped, sinuate to entire dorsal and sinuate to entire ventral
margins, Sulcus acusticus: heterosulcoid, ostial, median. Ostium: tubular, longer than cauda.
Cauda: tubular, slightly curved. Anterior region: Blunt, rostrum and antirostrum poorly defined,
excisura and notch poorly defined. Posterior region: peaked.
Class detected:Grammoplites suppositus

Shape: elliptic, sinuate to crenate dorsal and ventral margins. Sulcus acusticus: heterosulcoid,ostial, median. Ostium: tubular. Cauda: tubular, slightly curved. Anterior region: angled to irregular, rostrum and antirostrum poorly defined. Posterior region: angled to irregular.