In [52]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

import torch
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer


In [53]:
df = pd.read_csv('/kaggle/input/satellite-image-caption-generation/train.csv')
image_dir = '/kaggle/input/satellite-image-caption-generation'

df.head()


Unnamed: 0,captions,filepath
0,['Many aircraft are parked next to a long buil...,train/airport_1.jpg
1,['some planes are parked in an airport.'\n 'th...,train/airport_10.jpg
2,['Many aircraft are parked in an airport near ...,train/airport_100.jpg
3,['Many aircraft are parked near a large buildi...,train/airport_101.jpg
4,['several buildings and green trees are around...,train/airport_102.jpg


In [54]:
class SatelliteDataset(Dataset):
    def __init__(self, df, image_dir, transform, tokenizer, max_length=64):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        print("DataFrame columns:", self.df.columns)


    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_dir, row['filepath'])  
        caption = row['captions']                                   
    
        image = Image.open(image_path).convert('RGB')
        pixel_values = self.transform(image)
    
        tokens = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
    
        return {
            'pixel_values': pixel_values,
            'input_ids': tokens.input_ids.squeeze(),
            'attention_mask': tokens.attention_mask.squeeze()
        }



In [55]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer

# Load model and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT2 has no pad_token by default, so set it to eos
tokenizer.pad_token = tokenizer.eos_token

# Set required config values
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

In [56]:
dataset = SatelliteDataset(df, image_dir, transform, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

DataFrame columns: Index(['captions', 'filepath'], dtype='object')


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.train()
for epoch in range(10):
    for batch in dataloader:
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

Epoch 1 | Loss: 2.0692


In [None]:
model.eval()
import matplotlib.pyplot as plt

image = Image.open('/kaggle/input/satellite-image-caption-generation/test/00628.jpg').convert("RGB")
plt.imshow(image)
plt.show()
inputs = processor(images=image, return_tensors="pt").to(device)

# Génération
generated_ids = model.generate(**inputs, max_length=64)
caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# captions = caption.split("\n")
# caption = "\n".join([captions[0],captions[1]])
print(caption)

In [None]:
model.save_pretrained("satellite-caption-model-v2")
tokenizer.save_pretrained("satellite-caption-model-v2")

In [None]:
!zip -r satellite-caption-model-v2.zip /kaggle/working/satellite-caption-model-v2