In [1]:
import torch
from PIL import Image
import mobileclip

# Zero-shot classification demo: score one image against a few text prompts
# using the 'mobileclip_s0' model loaded from a local checkpoint.

# Build the model and its image transform; the second return value is unused here.
model, _, preprocess = mobileclip.create_model_and_transforms('mobileclip_s0', pretrained='checkpoints/mobileclip_s0.pt')
tokenizer = mobileclip.get_tokenizer('mobileclip_s0')

# Open the image inside a context manager so the underlying file handle is
# closed as soon as the pixels have been converted and preprocessed.
# unsqueeze(0) adds a leading batch dimension of size 1.
with Image.open("docs/fig_accuracy_latency.png") as img:
    image = preprocess(img.convert('RGB')).unsqueeze(0)
text = tokenizer(["a diagram", "a dog", "a cat"])

# Inference only: gradients are disabled. torch.autocast(device_type="cuda")
# is the non-deprecated spelling of torch.cuda.amp.autocast().
with torch.no_grad(), torch.autocast(device_type="cuda"):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # Scale each embedding to unit L2 norm so the dot product below is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Scale similarities by 100 and softmax over the prompts to get
    # per-prompt probabilities for the image.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)

Label probs: tensor([[1.0000e+00, 1.5044e-06, 5.7435e-07]])


In [2]:
# Inspect the unit-normalized image embedding computed in the first cell
# (a single row, because one image was encoded).
print(image_features)

tensor([[-2.1176e-02, -7.9294e-04, -6.1821e-03,  7.4502e-02, -1.5978e-02,
          6.0990e-02,  5.1152e-02, -2.5785e-02, -6.3818e-02,  2.9069e-02,
          5.2697e-02, -1.5465e-02,  2.4507e-02,  1.2557e-02, -5.7445e-02,
          6.0435e-03, -2.0533e-02, -9.0653e-02, -2.1529e-02, -6.8538e-02,
          5.2537e-02,  1.0857e-01, -6.3223e-02, -5.3994e-02, -3.6081e-02,
         -1.5818e-02,  2.2301e-03, -3.7293e-02, -7.0142e-03, -5.8539e-03,
         -6.0963e-03, -1.2645e-01, -3.7575e-02, -1.6215e-04, -8.0482e-02,
          8.1267e-04,  6.0555e-02,  3.3925e-02,  6.1165e-04, -1.4675e-02,
         -7.0245e-02, -1.5489e-02, -8.4071e-03,  6.0547e-03, -1.8131e-02,
         -3.1972e-03, -8.8677e-03, -8.7627e-02,  4.2673e-02, -2.2631e-02,
          3.5060e-03, -8.6445e-03,  1.4013e-02, -4.7479e-02, -2.4809e-02,
          2.9249e-02, -3.3640e-02,  2.3204e-04,  9.9846e-03, -2.6083e-02,
         -5.8439e-02,  3.0947e-02, -2.7547e-02,  7.1585e-02, -5.4962e-02,
         -1.2639e-01,  1.7581e-02, -4.

In [3]:
# Inspect the unit-normalized text embeddings computed in the first cell:
# one row per prompt given to the tokenizer.
print(text_features)

tensor([[ 0.0030,  0.0117, -0.0394,  ...,  0.0028,  0.0085, -0.0166],
        [-0.0145,  0.0015,  0.0754,  ...,  0.0057,  0.0068, -0.0060],
        [ 0.0072,  0.0242,  0.0922,  ...,  0.0006, -0.0150,  0.0031]])


In [26]:
# Dump the module hierarchy of the loaded model to inspect its architecture.
# NOTE(review): relies on `model` from the first cell still being in the kernel.
print(model)

CLIP(
  (image_encoder): MCi(
    (model): FastViT(
      (patch_embed): Sequential(
        (0): MobileOneBlock(
          (se): Identity()
          (activation): GELU(approximate='none')
          (reparam_conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        )
        (1): MobileOneBlock(
          (se): Identity()
          (activation): GELU(approximate='none')
          (reparam_conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64)
        )
        (2): MobileOneBlock(
          (se): Identity()
          (activation): GELU(approximate='none')
          (reparam_conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
        )
      )
      (network): ModuleList(
        (0): Sequential(
          (0): RepMixerBlock(
            (token_mixer): RepMixer(
              (reparam_conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
            )
            (convffn): ConvFFN(
              (con

In [27]:
# Show the image transform pipeline returned by create_model_and_transforms.
print(preprocess)

Compose(
    Resize(size=256, interpolation=bilinear, max_size=None, antialias=None)
    CenterCrop(size=(256, 256))
    ToTensor()
)


In [28]:
# Show the tokenizer object returned by mobileclip.get_tokenizer.
print(tokenizer)

ClipTokenizer()


In [1]:
import os
# Sanity check: does the epoch_32.pt checkpoint exist under the CLIP-KD log
# directory? The boolean is the cell's last expression, so it is displayed.
os.path.isfile(
    "/home/user/data/LightClip/CLIP-KD/logs/2024_06_03-14_40_17-t_model_['mobileclip_s0']-s_model_mobileclip_s0-lr_0.001-b_512-tag_distill-new/checkpoints/epoch_32.pt"
)

True