In [None]:
!pip install git+https://github.com/openai/CLIP.git degirum

In [None]:
import os

import clip
import degirum as dg
import numpy as np
import torch
from torch import nn

# Prompts the image is scored against; one text embedding is computed per prompt.
text_prompts = ["People Running", "People Talking", "People Laughing", "People Dancing"]

# Load the CLIP RN50 checkpoint on CPU and embed the prompts with its text encoder.
model, preprocess = clip.load("RN50", device="cpu")
text = clip.tokenize(text_prompts).to("cpu")
# Inference only: no_grad keeps autograd from recording a graph for the embeddings.
with torch.no_grad():
    text_features = model.encode_text(text)

# Connect to the DeGirum cloud zoo and load the model the cells below call to get image features.
# The token is read from the DEGIRUM_CLOUD_TOKEN environment variable so a real credential
# never has to be pasted into the notebook; the original placeholder is kept as a fallback.
cloud_token = os.environ.get("DEGIRUM_CLOUD_TOKEN", "token")
zoo = dg.connect("@cloud", "https://cs.degirum.com/degirum/clip_dg", token=cloud_token)
dg_model = zoo.load_model('clip--224x224_float_tensorrt_gpu_1', input_image_format="RAW", input_pad_method="crop-last", image_backend="pil")

CLIP demo using Torch for the scoring step

In [None]:
# torch
# Scores one image against every text prompt using torch ops only:
# logits = logit_scale * image_features @ text_features.T, then a softmax over the prompts.
#
# NOTE(review): the reference CLIP forward pass L2-normalizes both feature sets and uses the
# trained model.logit_scale; this cell does neither. Confirm that is intended for the
# features returned by dg_model before relying on the probabilities.

# Initialize logit scale
# exp(log(1 / 0.07)) as a float32 scalar tensor. It is a constant here, so it needs neither an
# nn.Parameter (which would enable autograd tracking) nor a round-trip through NumPy.
logit_scale = (torch.ones([]) * np.log(1 / 0.07)).exp()
logit_scale_exp_tensor = logit_scale.clone()

# Get image features
final_res = dg_model("dance.jfif").results[0]["data"]
# unsqueeze(0) adds a leading batch dimension in front of the returned data.
image_features_tensor = torch.tensor(final_res, dtype=torch.float32).unsqueeze(0)
# Only drops dim 1 when it has size 1; otherwise text_features is passed through unchanged.
text_features_tensor = torch.squeeze(text_features, dim=1)

# Inference only: no_grad stops autograd from recording the ops below, so the result can be
# converted to NumPy directly even if text_features was created with gradients enabled.
with torch.no_grad():
    # Calculate logits
    logits_per_image = logit_scale_exp_tensor * torch.matmul(image_features_tensor, text_features_tensor.t())
    # Collapse any leading dimensions so each row holds the logits of one image.
    logits_per_image_2d = logits_per_image.view(-1, logits_per_image.size(-1))
    logits_per_text = logits_per_image_2d.t()

    # Softmax and print results
    softmax_output = torch.softmax(logits_per_image_2d, dim=-1).cpu().numpy()
print(softmax_output)

CLIP demo using NumPy instead of Torch for the scoring step

In [None]:
# non torch
# Scores one image against every text prompt using NumPy only:
# logits = logit_scale * image_features @ text_features.T, then a softmax over the prompts.
#
# NOTE(review): the reference CLIP forward pass L2-normalizes both feature sets and uses the
# trained model.logit_scale; this cell does neither. Confirm that is intended for the
# features returned by dg_model before relying on the probabilities.

# Initialize logit scale
logit_scale = np.exp(np.log(1 / 0.07)).astype(np.float32)
logit_scale_exp_array = np.array(logit_scale, dtype=np.float32)

# Get image features
final_res = dg_model("dance.jfif").results[0]["data"]
# Flatten whatever shape the model returns into a single row vector.
image_features_array = np.array(final_res, dtype=np.float32).reshape(1, -1)
# detach() is required whenever text_features was produced with autograd enabled.
text_features_array = text_features.squeeze().detach().numpy()

# Calculate logits using numpy
logits_per_image = logit_scale_exp_array * np.matmul(image_features_array, text_features_array.T)
# Collapse any leading dimensions so each row holds the logits of one image.
logits_per_image_2d = logits_per_image.reshape(-1, logits_per_image.shape[-1])
logits_per_text = logits_per_image_2d.T

# Softmax and print results
# Subtract the row maximum before exponentiating: the result is mathematically identical,
# but exp() can no longer overflow float32 to inf (and then nan) when logits are large.
shifted_logits = logits_per_image_2d - np.max(logits_per_image_2d, axis=1, keepdims=True)
exp_logits = np.exp(shifted_logits)
softmax_output = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
print(softmax_output)


[[6.2032657e-10 3.2358432e-08 1.0768239e-09 1.0000000e+00]]
