In [13]:
# !pip install transformers


In [14]:
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import torchvision.transforms as transforms


In [None]:

# Checkpoint to load. Swap in the local path below to use a fine-tuned model.
# model_path = "kaggle/working/satellite-caption-model"  # Change if needed
model_path = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(model_path)
processor = ViTImageProcessor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Only fill in a pad token when the tokenizer does not already define one, and
# reuse EOS rather than adding a brand-new "[PAD]" token: a newly added token
# gets an id beyond the decoder's embedding table unless the embeddings are
# resized, whereas EOS is already part of the vocabulary.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep the model config (and the generation config, when present) in sync
# with the tokenizer so generate() pads with a valid token id.
model.config.pad_token_id = tokenizer.pad_token_id
if getattr(model, "generation_config", None) is not None:
    model.generation_config.pad_token_id = tokenizer.pad_token_id



In [16]:


# Pre-processing pipeline applied to every PIL image before it reaches the
# model: resize to 224x224, convert to a tensor, then normalize each channel
# with the mean/std published by the loaded image processor.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=processor.image_mean,
            std=processor.image_std,
        ),
    ]
)

# Load and process test image
def load_image(image_path):
    """Read an image file and return it as a preprocessed batch of one.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The result of the module-level ``transform`` applied to the RGB
        image, with a leading batch dimension added via ``unsqueeze(0)``.
    """
    # The context manager releases the file handle as soon as the RGB copy
    # has been materialised, instead of leaving it open until garbage
    # collection.
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert("RGB")
    return transform(rgb_image).unsqueeze(0)

# Caption generation
def generate_caption(image_path, max_length=64, do_sample=True, top_k=50,
                     top_p=0.95, seed=None):
    """Generate a text caption for the image stored at ``image_path``.

    Args:
        image_path: Path to the image; forwarded to ``load_image``.
        max_length: ``max_length`` passed to ``model.generate``.
        do_sample: Whether to sample instead of decoding deterministically.
        top_k: ``top_k`` passed to ``model.generate`` when sampling.
        top_p: ``top_p`` passed to ``model.generate`` when sampling.
        seed: Optional integer. When given, ``torch.manual_seed(seed)`` is
            called before generation so sampled captions are repeatable.

    Returns:
        The decoded caption string with special tokens stripped.
    """
    # Put the input on the same device as the model so this also works after
    # the model has been moved to a GPU.
    pixel_values = load_image(image_path).to(model.device)

    if seed is not None:
        torch.manual_seed(seed)

    generation_kwargs = {"max_length": max_length, "do_sample": do_sample}
    if do_sample:
        # Sampling-only knobs; omitted otherwise so they are not passed
        # alongside do_sample=False.
        generation_kwargs["top_k"] = top_k
        generation_kwargs["top_p"] = top_p

    # No attention_mask is passed: the input is a single unpadded image
    # tensor, and a mask shaped like pixel_values.shape[:2] would be
    # (batch, channels), which is not a token mask.
    with torch.no_grad():
        output_ids = model.generate(pixel_values, **generation_kwargs)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [18]:

# Example usage: caption one image from disk and print the result.
test_image_path = "./parking_255.jpg"  # Replace with your actual test image path
caption = generate_caption(image_path=test_image_path)
print("Generated Caption:", caption)


Generated Caption: vintage cars line up on a street 
