**1. Connecting Google Drive to import the pretrained model from Hugging Face**

In [1]:
from  google.colab import drive

In [2]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab session.
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
#Changing to the correct directory
cd drive/MyDrive/vit-gpt2-image-captioning/

/content/drive/.shortcut-targets-by-id/137B72_hCKohNjHDdu09k_WpmGN3H9Xml/vit-gpt2-image-captioning


**Alternatively, we can git clone everything here without running the steps above**

In [None]:
# Clone the model repository from the Hugging Face Hub into the current
# working directory.
# NOTE(review): Hub repositories usually store large weight files with Git LFS;
# confirm git-lfs is available if the cloned weights are needed.
!git clone https://huggingface.co/nlpconnect/vit-gpt2-image-captioning

**Installing required libraries**


In [None]:
!pip install transformers

In [None]:
!pip install gradio

**Code for generating captions on test set using pretrained model**


In [39]:
import gradio as gr #Gradio to build demo and can also help in sharing
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Decoding settings passed to model.generate() for every caption request.
max_length, num_beams = 16, 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the captioning model together with its matching image processor and
# tokenizer from the same pretrained checkpoint, then move the model to the
# selected device.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model.to(device)

def predict_step(image):
    """Generate caption text for a single image.

    Args:
        image: Image object exposing ``convert(mode=...)``; the Gradio input
            in this notebook is configured with ``type='pil'``.

    Returns:
        A list of caption strings (one per sequence returned by
        ``model.generate``), each with surrounding whitespace removed.
    """
    # Normalise to three-channel RGB before handing the image to the processor.
    rgb_image = image.convert(mode="RGB")

    encoded = feature_extractor(images=[rgb_image], return_tensors="pt")
    batch = encoded.pixel_values.to(device)

    token_ids = model.generate(batch, **gen_kwargs)

    captions = []
    for text in tokenizer.batch_decode(token_ids, skip_special_tokens=True):
        captions.append(text.strip())
    return captions

# Wire the captioning function into a Gradio demo: one image in, one text box
# out. Components come from the top-level `gr` namespace because the legacy
# `gr.inputs` / `gr.outputs` modules are deprecated and absent from newer
# Gradio releases, which an unpinned `pip install gradio` will pick up.
iface = gr.Interface(
    fn=predict_step,
    inputs=gr.Image(type='pil', label='Image'),
    outputs=gr.Textbox(label='Generated Caption'),
)

iface.launch()


Some weights of the model checkpoint at nlpconnect/vit-gpt2-image-captioning were not used when initializing VisionEncoderDecoderModel: ['decoder.transformer.h.6.attn.bias', 'decoder.transformer.h.8.attn.bias', 'decoder.transformer.h.2.attn.masked_bias', 'decoder.transformer.h.10.crossattention.bias', 'decoder.transformer.h.10.attn.bias', 'decoder.transformer.h.3.crossattention.bias', 'decoder.transformer.h.4.attn.masked_bias', 'decoder.transformer.h.4.attn.bias', 'decoder.transformer.h.9.attn.bias', 'decoder.transformer.h.0.attn.masked_bias', 'decoder.transformer.h.2.crossattention.masked_bias', 'decoder.transformer.h.11.crossattention.bias', 'decoder.transformer.h.1.attn.bias', 'decoder.transformer.h.1.crossattention.bias', 'decoder.transformer.h.1.attn.masked_bias', 'decoder.transformer.h.10.attn.masked_bias', 'decoder.transformer.h.7.attn.bias', 'decoder.transformer.h.2.attn.bias', 'decoder.transformer.h.3.attn.masked_bias', 'decoder.transformer.h.5.crossattention.masked_bias', 'de

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



**Code for generating Multiple Captions**

In [38]:
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Load the model, image processor and tokenizer again so that this cell can be
# run on its own.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Pick the GPU when one is available, otherwise the CPU, and move the model.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Generation settings used for the first caption.
max_length, num_beams = 16, 4

def predict_steps(image, num):
    """Generate one, two or three captions for an image.

    Each caption is produced with its own generation settings, so asking for
    more captions adds outputs generated under different settings.

    Args:
        image: Image object exposing ``convert(mode=...)``; the Gradio input
            in this notebook is configured with ``type='pil'``.
        num: Number of captions to produce. ``"1"``, ``"2"`` or ``"3"`` (the
            dropdown values); the integers 1-3 are accepted as well. ``None``
            (nothing selected) falls back to a single caption.

    Returns:
        For one caption: a list of stripped caption strings.
        For two or three captions: a tuple with one such list per caption, in
        the order of the generation settings below.

    Raises:
        ValueError: If ``num`` is not one of the supported values.
    """
    # Nothing selected in the dropdown arrives as None; default to one caption
    # instead of silently returning nothing.
    requested = "1" if num is None else str(num).strip()
    if requested not in ("1", "2", "3"):
        raise ValueError(f"num must be '1', '2' or '3', got {num!r}")
    count = int(requested)

    i_image = image.convert(mode="RGB")

    pixel_values = feature_extractor(images=[i_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # One set of generation settings per caption, in output order.
    # NOTE(review): `temperature` is documented as a sampling parameter and
    # these calls do not enable sampling, so it may have no effect here —
    # confirm against the Transformers generation docs. Values are unchanged.
    generation_settings = (
        {"max_length": max_length, "num_beams": num_beams},
        {"max_length": 30, "num_beams": 2, "temperature": 0.9},
        {"max_length": 25, "num_beams": 25, "temperature": 0.2},
    )

    captions = []
    for settings in generation_settings[:count]:
        output_ids = model.generate(pixel_values, **settings)
        decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        captions.append([text.strip() for text in decoded])

    # Return shapes: a bare list for one caption, a tuple of lists otherwise.
    if count == 1:
        return captions[0]
    return tuple(captions)


# Gradio demo for multiple captions: an image plus a dropdown choosing how
# many captions to generate. Components come from the top-level `gr` namespace
# because the legacy `gr.inputs` / `gr.outputs` modules are deprecated and
# absent from newer Gradio releases.
# The dropdown is pre-set to "1" so the function never receives an empty
# selection on first submit.
# NOTE(review): predict_steps returns a tuple for two or three captions while
# only one output component is declared — confirm the rendering is acceptable.
iface = gr.Interface(
    fn=predict_steps,
    inputs=[gr.Image(type='pil', label='Image'),
            gr.Dropdown(["1", "2", "3"], value="1", label='Number of Captions')],
    outputs=gr.Textbox(label='Generated Caption'),
)

iface.launch()


Some weights of the model checkpoint at nlpconnect/vit-gpt2-image-captioning were not used when initializing VisionEncoderDecoderModel: ['decoder.transformer.h.6.attn.bias', 'decoder.transformer.h.8.attn.bias', 'decoder.transformer.h.2.attn.masked_bias', 'decoder.transformer.h.10.crossattention.bias', 'decoder.transformer.h.10.attn.bias', 'decoder.transformer.h.3.crossattention.bias', 'decoder.transformer.h.4.attn.masked_bias', 'decoder.transformer.h.4.attn.bias', 'decoder.transformer.h.9.attn.bias', 'decoder.transformer.h.0.attn.masked_bias', 'decoder.transformer.h.2.crossattention.masked_bias', 'decoder.transformer.h.11.crossattention.bias', 'decoder.transformer.h.1.attn.bias', 'decoder.transformer.h.1.crossattention.bias', 'decoder.transformer.h.1.attn.masked_bias', 'decoder.transformer.h.10.attn.masked_bias', 'decoder.transformer.h.7.attn.bias', 'decoder.transformer.h.2.attn.bias', 'decoder.transformer.h.3.attn.masked_bias', 'decoder.transformer.h.5.crossattention.masked_bias', 'de

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



**Thanks a lot !!**