In [1]:
!pip install wandb
import wandb
wandb.login()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mahmadsait[0m ([33mahmadsait-king-abdullah-university-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [1]:
!pip install transformers datasets accelerate torchvision pandas



In [2]:
import pandas as pd
from PIL import Image
from datasets import load_dataset, Dataset
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer
import torch
import unicodedata

In [1]:
df = pd.read_csv("artelingo/dataset/arabic/train/artemis_preprocessed.csv")

blip_df = df[["image_file", "utterance_spelled"]].rename(columns={
    "image_file": "image",
    "utterance_spelled": "caption"
})

blip_df["image"] = blip_df["image"].apply(lambda x: x.replace("wikiart", "artelingo/dataset/wikiart"))

blip_df.to_csv("blip_caption_data_artelingo.csv", index=False, encoding="utf-8-sig")

In [3]:
df = pd.read_csv("blip_caption_data_artelingo.csv", encoding="utf-8-sig")
df["image"] = df["image"].apply(lambda x: unicodedata.normalize("NFC", x))
dataset = Dataset.from_pandas(df)
from PIL import Image
from tqdm import tqdm
import os
import json

broken_indices = []
MAX_PIXELS = 89_478_485

for i in tqdm(range(len(dataset)), desc="Checking images"):
    path = dataset[i]["image"]
    if not os.path.exists(path):
        broken_indices.append(i)
        continue
    try:
        with Image.open(path) as img:
            if img.width * img.height > MAX_PIXELS:
                print(f"[Skipped] {path} exceeds safe pixel size.")
                broken_indices.append(i)
                continue
            img = img.convert("RGB").copy()
    except Exception as e:
        print(f"Bad image at {i}: {path}, reason: {e}")
        broken_indices.append(i)

with open("broken_image_indices.json", "w") as f:
    json.dump(broken_indices, f)

with open("broken_image_indices.json") as f:
    broken_indices = set(json.load(f))

dataset = dataset.filter(lambda example, idx: idx not in broken_indices, with_indices=True)


Checking images:  12%|█▏        | 40167/332163 [04:58<38:45, 125.55it/s]

[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_uriel-1955.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_uriel-1955.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_uriel-1955.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_uriel-1955.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_uriel-1955.jpg exceeds safe pixel size.


Checking images:  49%|████▉     | 163701/332163 [22:08<20:42, 135.53it/s]

[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_vir-heroicus-sublimis-1950.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_vir-heroicus-sublimis-1950.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_vir-heroicus-sublimis-1950.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_vir-heroicus-sublimis-1950.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_vir-heroicus-sublimis-1950.jpg exceeds safe pixel size.
[Skipped] artelingo/dataset/wikiart/Color_Field_Painting/barnett-newman_vir-heroicus-sublimis-1950.jpg exceeds safe pixel size.


Checking images: 100%|██████████| 332163/332163 [46:21<00:00, 119.42it/s] 


Filter:   0%|          | 0/332163 [00:00<?, ? examples/s]

In [4]:

df_filtered = dataset.to_pandas()

df_filtered.to_csv("filtered_blip_caption_data_artelingo.csv", index=False, encoding="utf-8-sig")

In [11]:
df = pd.read_csv("filtered_blip_caption_data_artelingo.csv", encoding="utf-8-sig")
print(len(df))
df["image"] = df["image"].apply(lambda x: unicodedata.normalize("NFC", x))

332152


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

In [14]:
!nproc

3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
from torch.utils.data import Dataset
from PIL import Image

class BlipOnTheFlyDataset(Dataset):
    def __init__(self, df, processor):
        self.data = df.reset_index(drop=True)
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.loc[idx]
        image_path = row["image"]
        caption = row["caption"]
    
        try:
            image = Image.open(image_path).convert("RGB")
        except:
            image = Image.new("RGB", (224, 224))
    
        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            truncation=True
        )
    
        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": inputs["input_ids"].squeeze(0),
        }

        


In [16]:
dataset = BlipOnTheFlyDataset(df, processor)

In [None]:
def blip_data_collator(features):
    return {
        "pixel_values": torch.stack([f["pixel_values"] for f in features]),
        "input_ids": torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels": torch.stack([f["labels"] for f in features]),
    }



training_args = TrainingArguments(
    output_dir="./blip-finetuned-artelingo",       
    per_device_train_batch_size=32,
    num_train_epochs=5,
    save_strategy="epoch",                              
    logging_steps=50,
    save_total_limit=5,                     
    fp16=True,
    disable_tqdm=False,
    report_to="wandb",                      
    run_name="blip-artelingo"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.tokenizer,
    data_collator=blip_data_collator
)

trainer.train()
trainer.save_model("./blip-finetuned-artelingo")


  trainer = Trainer(


Step,Training Loss
50,0.7746
100,0.326
150,0.291
200,0.2629
250,0.2526
300,0.237
350,0.2228
400,0.2177
450,0.2088
500,0.205


In [17]:
def blip_data_collator(features):
    return {
        "pixel_values": torch.stack([f["pixel_values"] for f in features]),
        "input_ids": torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels": torch.stack([f["labels"] for f in features]),
    }



training_args = TrainingArguments(
    output_dir="./blip-finetuned-artelingo",         
    per_device_train_batch_size=32,
    num_train_epochs=5,
    save_strategy="epoch",                                     
    logging_steps=50,
    save_total_limit=5,                         
    fp16=True,
    disable_tqdm=False,
    report_to="wandb",                          
    run_name="blip-artelingo-resumed-epoch2"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.tokenizer,
    data_collator=blip_data_collator
)

trainer.train(resume_from_checkpoint="./blip-finetuned-artelingo/checkpoint-20760")

  trainer = Trainer(
There were missing keys in the checkpoint model loaded: ['text_decoder.cls.predictions.decoder.bias'].


Step,Training Loss
20800,0.1072
20850,0.1047
20900,0.1054
20950,0.1073
21000,0.1064
21050,0.1049
21100,0.1024
21150,0.1076
21200,0.1065
21250,0.1058


TrainOutput(global_step=51900, training_loss=0.05767407359415396, metrics={'train_runtime': 38675.6469, 'train_samples_per_second': 42.941, 'train_steps_per_second': 1.342, 'total_flos': 9.855342646692309e+20, 'train_loss': 0.05767407359415396, 'epoch': 5.0})

It took 19 hours and 21 minutes to train BLIP on Artelingo captions

In [3]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [11]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import os

image_path = "../../../../ibex/ai/home/saitaa0b/wikiart/Ukiyo_e/utagawa-kuniyoshi_women-8.jpg"
image = Image.open(image_path).convert("RGB")

checkpoint_dirs = [
    "checkpoint-10380",
    "checkpoint-20760",
    "checkpoint-31140",
    "checkpoint-41520",
    "checkpoint-51900"
]

for ckpt in checkpoint_dirs:
    model_path = os.path.join("blip-finetuned-artelingo", ckpt)
    model = BlipForConditionalGeneration.from_pretrained(model_path).to("cuda")
    model.eval()

    inputs = processor(images=image, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output = model.generate(**inputs, max_length=128)
        caption = processor.tokenizer.decode(output[0], skip_special_tokens=True)
        print(f"[{ckpt}] Caption: {caption}")


[checkpoint-10380] Caption: السيده ترتدي الملابس الملونه التي تقف علي البحر وتنظر الى السفينه التي تسير بها
[checkpoint-20760] Caption: امراة ترتدي فستان انيق وتجلس علي قارب خشبي صغير وتنظر للبحر
[checkpoint-31140] Caption: امراة ترتدي فستان اسود وتجلس في قارب خشبي وتنظر الي البحر
[checkpoint-41520] Caption: امراة ترتدي فستان انيق وتجلس في قارب خشبي صغير وتنظر الي البحر
[checkpoint-51900] Caption: امراة ترتدي فستان انيق وتجلس في قارب خشبي صغير وتنظر الي البحر


epoch 3 shows the most emotion for BLIP + ArtELingo

[checkpoint-10380] Caption: امراه ترتدي ملابس لونها احمر تحمل طفله صغيره ترتدي ملابس لونها احمر تحمل طفل صغير على يدها

[checkpoint-20760] Caption: امراة تحمل طفلها الصغير وترتدي فستان احمر اللون وتنظر اليه بنظرات حب وحنان

[checkpoint-31140] Caption: امراة ترتدي ملابس بسيطة جدا وتحمل طفلها الرضيع وتنظر اليه بنظرات حب وحنان

[checkpoint-41520] Caption: امراة ترتدي ملابس بسيطة جدا وتحمل طفل رضيع لا يبلغ من العمر عاما

[checkpoint-51900] Caption: امراة ترتدي ملابس بسيطة جدا وتحمل طفل رضيع لا يبلغ من العمر عاما
