In [3]:
import torch
from PIL import Image
import open_clip
import os

In [4]:
# Architecture name and pretrained-weights tag handed to open_clip.
# (The earlier note "-plus-240" presumably refers to a "ViT-B-16-plus-240" variant — TODO confirm.)
name, pretrained = "ViT-B-16", "laion400m_e32"

# The call returns the model together with two preprocessing transforms,
# bound here as the train-time and eval-time transforms.
model, train_transform, eval_transform = open_clip.create_model_and_transforms(
    name,
    pretrained=pretrained,
)

In [5]:
from pathlib import Path

# Directory that receives the serialized OpenVINO files.
# NOTE: the misspelled name `ouptut_dir` is kept on purpose — later cells reference it.
ouptut_dir = "openvino_fp32"
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(ouptut_dir, exist_ok=True)

# Record which architecture / checkpoint pair the exported files were built from.
with open(Path(ouptut_dir) / "model_index.txt", 'w') as fd:
    fd.write(f"{name},{pretrained}\n")

In [6]:
# Tokenizer that belongs to the architecture selected in `name`.
tokenizer = open_clip.get_tokenizer(name)

# Example inputs that are handed to torch.onnx.export in the export cells.
text = tokenizer("a cat")
# unsqueeze(0) prepends a leading dimension to the transformed image.
image = eval_transform(
    Image.open("../../docs/CLIP.png")
).unsqueeze(0)

In [7]:
# Export the visual sub-module (model.visual) to ONNX, using the example image as input.
torch.onnx.export(
    model.visual,
    image,
    "image_encoder.onnx",  # destination file
    input_names=["image"],
    output_names=["image_embedding"],
    # Axis 0 of the input and of the output is declared dynamic and labelled "batch".
    dynamic_axes={
        "image": {0: "batch"},
        "image_embedding": {0: "batch"},
    },
    opset_version=14,  # ONNX operator-set version of the exported graph
)

In [8]:
from openvino.tools.mo import convert_model
import openvino.runtime as ov

# Convert the exported image-encoder ONNX file with convert_model, then
# serialize the result to "<ouptut_dir>/image_encoder.xml".
ov_encoder = convert_model("image_encoder.onnx")
ov.serialize(ov_encoder, ouptut_dir + "/image_encoder.xml")

In [9]:
class TextTransformerExportWrapper(torch.nn.Module):
    """Expose the text branch of a CLIP-style model as a single ``forward(text)`` call.

    The wrapped ``model`` must provide ``token_embedding``, ``positional_embedding``,
    ``transformer``, ``attn_mask``, ``ln_final`` and ``text_projection``.
    """

    def __init__(self, model):
        """Store ``model`` so that ``forward`` can reach its text sub-modules."""
        super().__init__()
        self.model = model

    def forward(self, text):
        """Return one projected embedding per row of the token tensor ``text``."""
        clip = self.model

        # Token embeddings plus positional embeddings.
        embedded = clip.token_embedding(text) + clip.positional_embedding

        # The transformer is fed the tensor with its first two axes swapped
        # (NLD -> LND); the swap is undone right after (LND -> NLD).
        hidden = clip.transformer(embedded.permute(1, 0, 2), attn_mask=clip.attn_mask)
        hidden = clip.ln_final(hidden.permute(1, 0, 2))  # [batch_size, n_ctx, transformer.width]

        # For each sequence pick the position holding the largest token id
        # (the eot token is the highest number in each sequence) and project it.
        row_index = torch.arange(hidden.shape[0])
        eot_index = text.argmax(dim=-1)
        return hidden[row_index, eot_index] @ clip.text_projection

# Wrap the loaded model so its text branch can be passed to torch.onnx.export as one module.
transformer_export_wrapper = TextTransformerExportWrapper(model)

In [10]:
# Export the text-encoder wrapper to ONNX, using the example token tensor as input.
torch.onnx.export(
    transformer_export_wrapper,
    text,
    "text_encoder.onnx",  # where to save the model
    opset_version=14,  # ONNX operator-set version of the exported graph
    input_names=["input_ids"],  # the model's input names
    output_names=["text_embeds"],  # the model's output names
    dynamic_axes={  # variable length axes
        "input_ids": {0: "batch"},
        # Declared explicitly so the output's batch axis is labelled the same
        # way as in the image-encoder export instead of being left implicit.
        "text_embeds": {0: "batch"},
    }
)



In [11]:
# These imports repeat the ones in the image-encoder conversion cell.
from openvino.tools.mo import convert_model
import openvino.runtime as ov

# Convert the exported text-encoder ONNX file with convert_model, then
# serialize the result to "<ouptut_dir>/text_encoder.xml".
ov_transformer = convert_model("text_encoder.onnx")
ov.serialize(ov_transformer, ouptut_dir + "/text_encoder.xml")



In [12]:
!benchmark_app -m image_encoder.onnx -shape "image[1,3,240,240]" -api sync

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.0.0-10862-40bf400b189
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2023.0.0-10862-40bf400b189
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 380.48 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     image (node: image) : f32 / [...] / [?,3,224,224]
[ INFO ] Model outputs:
[ INFO ]     image_embedding (node: image_embedding) : f32 / [...] / [?,512]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'image': [1,3,240,240]
[ ERROR ] Check 'T::broadcast_merge_into(output_shape, input_shapes[1], autob)' failed at src/core/shape_inference/include/utils.hpp:42:

In [13]:
# Benchmark the exported text encoder in sync mode with batch 1; the length 77
# matches the second dimension the model reports for `input_ids` ([?,77]).
!benchmark_app -m text_encoder.onnx -shape "input_ids[1,77]" -api sync

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.0.0-10862-40bf400b189
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2023.0.0-10862-40bf400b189
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 373.38 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,77]
[ INFO ] Model outputs:
[ INFO ]     text_embeds (node: text_embeds) : f32 / [...] / [?,512]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'input_ids': [1,77]
[ INFO ] Reshape model took 48.29 ms
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     input_ids (node: input_ids) : 

In [None]:
# Export the complete model (image and text inputs together) to a single ONNX file.
# NOTE(review): only one output name is supplied; if `model`'s forward returns more
# than one tensor, the remaining outputs would not get a chosen name — confirm.
torch.onnx.export(
    model,
    (image, text),
    "model.onnx",  # destination file
    input_names=["image", "text"],
    output_names=["image_embedding"],
    # Axis 0 of each input is declared dynamic and labelled "batch".
    dynamic_axes={
        "image": {0: "batch"},
        "text": {0: "batch"},
    },
    opset_version=14,  # ONNX operator-set version of the exported graph
)

In [None]:
# These imports repeat the ones in the earlier conversion cells.
from openvino.tools.mo import convert_model
import openvino.runtime as ov

# Convert the full-model ONNX file and serialize it to "model.xml" in the working directory.
# NOTE(review): `ov_transformer` is also the name used for the converted text encoder,
# so running this cell rebinds it to the full model — confirm that is intended.
ov_transformer = convert_model("model.onnx")
ov.serialize(ov_transformer, "model.xml")

In [None]:
!benchmark_app -m model.xml -shape "image[1,3,240,240],text[1,77]" -api sync