In [None]:
!pip install -q transformers sentence-transformers torchvision

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import pandas as pd
import torch
from tqdm import tqdm
import os

In [None]:
import zipfile
import os

# Archive to unpack and the directory its members are written into.
zip_path = "/content/data.zip"
extract_path = "/content/data"

# Extract every member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode="r") as zip_ref:
    zip_ref.extractall(path=extract_path)

print("✅ Dosya çıkarıldı.")

In [None]:
# Load the processor and the captioning model from the same pretrained checkpoint.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Assign the result instead of ending the cell with a bare expression: a bare
# `model.to(device)` would print the whole module repr and leave the returned
# module referenced from IPython's output cache, keeping it alive even after
# `model` is rebound later in the notebook.
model = model.to(device)

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

!pip uninstall -y wandb

In [None]:
import os
import pandas as pd
from PIL import Image

# Caption table and the directory holding the training images.
train_csv = pd.read_csv("/content/data/train.csv")
train_folder = "/content/data/train/train"


def _with_jpg_suffix(name):
    """Return ``name`` unchanged if it already ends in '.jpg', else with '.jpg' appended."""
    return name if name.endswith(".jpg") else name + ".jpg"


# Normalise image ids to strings that carry the .jpg extension.
train_csv['image_id'] = train_csv['image_id'].astype(str)
train_csv['image_id'] = train_csv['image_id'].map(_with_jpg_suffix)


In [None]:
from torch.utils.data import Dataset
from transformers import BlipProcessor

# Instantiate the processor from the same checkpoint that `processor` was
# created from earlier in this notebook; this rebinds the name to a new
# instance (redundant on a top-to-bottom run).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

class ImageCaptionDataset(Dataset):
    """Map-style dataset that pairs each image file with its caption.

    Every item is the processor's encoding of one (image, caption) row, with
    the leading axis of each returned tensor squeezed away.
    """

    def __init__(self, dataframe, image_folder, processor):
        """Keep references to the annotation table, image directory and processor.

        Args:
            dataframe: Table indexed positionally via ``.iloc``; each row is
                read through its 'image_id' and 'caption' fields.
            image_folder: Directory that 'image_id' values are joined onto.
            processor: Callable invoked with ``images=`` and ``text=`` to
                build the encoding for one example.
        """
        self.processor = processor
        self.image_folder = image_folder
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load and encode the example at positional index ``idx``.

        Returns:
            Dict mapping each key of the processor output to its tensor with
            dimension 0 squeezed.
        """
        record = self.dataframe.iloc[idx]
        path = os.path.join(self.image_folder, record['image_id'])
        text = record['caption']

        rgb_image = Image.open(path).convert("RGB")

        # NOTE(review): padding="max_length" is used without an explicit
        # max_length — presumably this pads to the tokenizer's own maximum;
        # confirm that this is the intended sequence length.
        batched = self.processor(
            images=rgb_image,
            text=text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        )

        # Drop the leading batch axis from every tensor in the encoding.
        sample = {}
        for key, tensor in batched.items():
            sample[key] = tensor.squeeze(0)
        return sample


In [None]:
from transformers import BlipForConditionalGeneration

# Load the pretrained checkpoint again, rebinding `model` to a fresh instance
# that replaces the one created (and moved to `device`) earlier in the notebook.
# No explicit .to(device) happens here — presumably the Trainer takes care of
# device placement; confirm.
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Wrap the caption table and image directory in the dataset used for training.
dataset = ImageCaptionDataset(
    dataframe=train_csv,
    image_folder=train_folder,
    processor=processor,
)

training_args = TrainingArguments(
    # Checkpointing: where checkpoints go, how often, and how many are kept.
    output_dir="./blip-finetuned",
    save_steps=500,
    save_total_limit=2,
    # Schedule: 4 examples per device per step, gradients accumulated over
    # 4 steps, for 6 epochs, without fp16 mixed precision.
    num_train_epochs=6,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=False,
    # Logging destination and frequency.
    logging_dir="./logs",
    logging_steps=100,
    # Presumably needed so the Trainer does not drop the fields produced by
    # the custom dataset before they reach the collator — confirm.
    remove_unused_columns=False,
)

def collate_fn(batch):
    """Stack per-example encodings into one batch and build captioning labels.

    Args:
        batch: Sequence of dicts, each holding "input_ids", "attention_mask"
            and "pixel_values" tensors. Tensors under the same key must share
            a shape so that they can be stacked.

    Returns:
        Dict with the stacked "input_ids", "attention_mask" and
        "pixel_values", plus "labels": a separate tensor equal to
        ``input_ids`` except that every position whose attention mask is 0
        is set to -100.
    """
    input_ids = torch.stack([example["input_ids"] for example in batch])
    attention_mask = torch.stack([example["attention_mask"] for example in batch])
    pixel_values = torch.stack([example["pixel_values"] for example in batch])

    # Padding positions must not contribute to the captioning loss. -100 is the
    # default ignore_index of torch.nn.CrossEntropyLoss (assumed to be the loss
    # the model applies to `labels`). masked_fill is out-of-place, so
    # `input_ids` itself is left untouched and is not aliased by `labels`.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
        "labels": labels,
    }


# Assemble the Trainer from the model, the run configuration, the training
# data and the custom batch collator.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data pipeline: dataset items are batched by collate_fn.
    train_dataset=dataset,
    data_collator=collate_fn,
    # NOTE(review): recent transformers releases may expect `processing_class`
    # instead of `tokenizer` — confirm against the installed version.
    tokenizer=processor,
)


In [None]:
# Run the fine-tuning loop configured on `trainer`. As the last expression of
# the cell, the value returned by train() is displayed by the notebook.
trainer.train()

In [None]:
# Write the model and the processor side by side into one directory.
save_dir = "/content/6-epoch"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)