**Segment 1: Installation and Import Statements**

In [1]:
!git clone https://huggingface.co/nlpconnect/vit-gpt2-image-captioning

Cloning into 'vit-gpt2-image-captioning'...
remote: Enumerating objects: 47, done.[K
remote: Total 47 (delta 0), reused 0 (delta 0), pack-reused 47[K
Unpacking objects: 100% (47/47), 758.98 KiB | 5.13 MiB/s, done.


In [1]:
!pip install transformers
!pip install gradio

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.3 MB/s[0m eta [36m0:00:0

In [None]:
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2Tokenizer
import torch
from PIL import Image

***Segment 2: Model and Tokenizer Setup***

In [None]:
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_caption_length = 32
num_beams = 8
generation_kwargs = {"max_length": max_caption_length, "num_beams": num_beams}

***Segment 3: Caption Generation Function***

In [None]:
def generate_caption(image):
    rgb_image = image.convert(mode="RGB")

    pixel_values = feature_extractor(images=[rgb_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **generation_kwargs)

    captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    captions = [caption.strip() for caption in captions]
    return captions

***Segment 4: Gradio Interface Setup and Launch***

In [4]:
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.inputs.Image(type='pil', label='Image'),
    outputs=gr.outputs.Textbox(label='Generated Caption'),
    interpretation="shap",
    title="Image Caption Prediction System"

iface.launch()


  inputs=gr.inputs.Image(type='pil', label='Image'),
  inputs=gr.inputs.Image(type='pil', label='Image'),
  outputs=gr.outputs.Textbox(label='Generated Caption'),


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

