In [None]:
# Import Libraries
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

In [None]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise
device_gpu = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Checkpoint name and the generation settings later forwarded to `model.generate`
name_model = "nlpconnect/vit-gpt2-image-captioning"
max_length, num_beams = 16, 4
gen_kwargs = dict(max_length = max_length, num_beams = num_beams)

In [None]:
# Load the pretrained captioning model (placed on the selected device),
# then the matching image processor and tokenizer from the same checkpoint
model = VisionEncoderDecoderModel.from_pretrained(name_model).to(device_gpu)
feature_extraction = ViTImageProcessor.from_pretrained(name_model)
tokenizer = AutoTokenizer.from_pretrained(name_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.4

In [None]:
"""
Steps :
  1. load the image
  2. check the image is RGB or not
  3. feature extraction for image
  4. call the generate function to build the output "caption"
  5. decode the output using tokenizer
  6. return the output
"""

# create function to predict
def predict_step(image_path):
  """Generate a caption for one image file.

  Uses the notebook-level `feature_extraction`, `model`, `tokenizer`,
  `device_gpu` and `gen_kwargs` objects.

  Args:
    image_path: path of the image file, handed to `Image.open`.

  Returns:
    The first decoded caption as a string, with special tokens skipped and
    leading/trailing whitespace stripped.
  """
  # 1-2. load the image and make sure it is RGB
  # `Image.open` is lazy and keeps the file handle open, so open it in a
  # `with` block to close the handle deterministically. `convert` yields a
  # loaded image that remains usable after the file is closed (for an image
  # that is already RGB it is simply a copy).
  with Image.open(image_path) as source_image:
    image = source_image.convert(mode = "RGB")

  # 3. feature extraction for image, moved to the same device as the model
  pixel_value = feature_extraction(images = image, return_tensors = "pt").pixel_values
  pixel_value = pixel_value.to(device = device_gpu)

  # 4. call the generate function to build the output "caption"
  output = model.generate(pixel_value, **gen_kwargs)

  # 5. decode the output using tokenizer
  predicted_caption = tokenizer.batch_decode(output, skip_special_tokens = True)
  predicted_caption = [pred.strip() for pred in predicted_caption]

  # 6. return the output (a single image was passed, so take the first caption)
  return predicted_caption[0]

In [None]:
# Caption the first sample image and print the result
image_path = "/content/image_1.jpg"
predicted_caption = predict_step(image_path = image_path)
print(predicted_caption)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


children sitting around a table with a cake


In [None]:
# Caption the second sample image and print the result
image_path = "/content/image_2.jpg"
predicted_caption = predict_step(image_path = image_path)
print(predicted_caption)

a woman standing in front of a grocery store filled with fresh produce


In [None]:
# Caption the third sample image and print the result
image_path = "/content/image_3.jpg"
predicted_caption = predict_step(image_path = image_path)
print(predicted_caption)

men playing a game of soccer


In [None]:
# Caption the fourth sample image and print the result
image_path = "/content/image_4.jpg"
predicted_caption = predict_step(image_path = image_path)
print(predicted_caption)

people are playing frisbee in a field
