In [None]:
!pip install degirum openvino_tokenizers
!wget https://github.com/DeGirum/clip_degirum/blob/main/openvino_tokenizer.xml https://github.com/DeGirum/clip_degirum/blob/main/openvino_tokenizer.bin

In [6]:
import os

import numpy as np
import degirum as dg
from openvino import compile_model
# NOTE(review): keep this import even though convert_tokenizer is not called here;
# importing openvino_tokenizers appears to be what makes the tokenizer ops
# available to compile_model -- confirm before removing.
from openvino_tokenizers import convert_tokenizer

# Candidate captions that will be scored against the image.
text_input = ["People Running", "People Talking", "People Laughing", "People Dancing"]

# Tokenize all captions with the converted OpenVINO tokenizer
# (openvino_tokenizer.xml/.bin must be present in the working directory).
loaded_tokenizer = compile_model("openvino_tokenizer.xml")
all_tokens = loaded_tokenizer(text_input)["input_ids"]

# Read the cloud token from the environment so it is never saved inside the
# notebook; falls back to the original placeholder when the variable is unset.
zoo = dg.connect(
    "@cloud",
    "https://cs.degirum.com/degirum/clip_dg",
    token=os.environ.get("DEGIRUM_CLOUD_TOKEN", "<dg_cloud_token>"),
)
text_model = zoo.load_model("clip_textual--1x77_float_openvino_cpu_3")

# Each token row is right-padded with zeros to this length and reshaped to
# (1, 1, 1, CONTEXT_LENGTH) before being sent to the text model.
CONTEXT_LENGTH = 77

# Fail early with a readable message; np.pad would otherwise raise an opaque
# "negative pad width" ValueError for an over-long caption.
too_long = [text for text, token in zip(text_input, all_tokens) if len(token) > CONTEXT_LENGTH]
if too_long:
    raise ValueError(f"Captions exceed {CONTEXT_LENGTH} tokens: {too_long}")

padded_tokens = [
    np.pad(token, (0, CONTEXT_LENGTH - len(token)), mode="constant", constant_values=0).astype(np.int64)
    for token in all_tokens
]

# One inference per caption; the first row of the first output's "data" is
# kept as that caption's embedding.
text_features = np.array(
    [
        text_model(np.reshape(token, (1, 1, 1, CONTEXT_LENGTH))).results[0]["data"][0]
        for token in padded_tokens
    ],
    dtype=np.float32,
)

# Logit scale applied to the image/text similarities: exp(log(1 / 0.07)) as float32.
# NOTE(review): 1 / 0.07 looks like the initial value used by the reference CLIP
# code; trained checkpoints usually carry a learned scale -- confirm this
# constant is the intended one.
inverse_temperature = 1 / 0.07
logit_scale = np.float32(np.exp(np.log(inverse_temperature)))
# 0-d float32 array form of the same value, used when scaling the similarity matrix.
logit_scale_exp_array = np.asarray(logit_scale, dtype=np.float32)

# Load the two image-side models from the already-connected zoo: the
# clip_RN50 model first, then the clip_transformer model that consumes its output.
rn50_model = zoo.load_model(
    "clip_RN50--224x224_float_n2x_orca1_1",
    input_image_format="RAW",
    input_pad_method="crop-last",
    image_backend="pil",
)
transformer_model = zoo.load_model("clip_transformer--7x7_float_openvino_cpu_1")

# Image embedding: run the image file through the RN50 model, then feed that
# output to the transformer model.
rn50_inference = rn50_model("dance.jfif")
rn50_res = rn50_inference.results[0]["data"]
transformer_inference = transformer_model(rn50_res)
final_res = transformer_inference.results[0]["data"]
# Flatten to a single float32 row so it can be matrix-multiplied with the text embeddings.
image_features_array = np.array(final_res, dtype=np.float32).reshape(1, -1)

# Scaled similarity between the single image embedding (1 x D) and the text
# embeddings (presumably N x D, one row per caption).
# NOTE(review): the reference CLIP code L2-normalises both embeddings before
# this product; confirm the loaded models already emit normalised features.
logits_per_image = logit_scale_exp_array * np.matmul(image_features_array, text_features.T)
logits_per_image_2d = logits_per_image.reshape(-1, logits_per_image.shape[-1])
logits_per_text = logits_per_image_2d.T

# Softmax over the captions. Subtracting each row's maximum leaves the result
# mathematically unchanged but keeps np.exp from overflowing to inf (and the
# division from producing NaN) when logits are large. exp is computed once.
row_max = np.max(logits_per_image_2d, axis=1, keepdims=True)
exp_logits = np.exp(logits_per_image_2d - row_max)
softmax_output = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
print(softmax_output)


[[6.3499589e-10 3.2255151e-08 1.0514298e-09 1.0000000e+00]]
