In [5]:
import numpy as np
import torch, transformers, os
from utilities import embed, capture_full


In [16]:
import io, numpy as np, onnxruntime as ort
from PIL import Image

# ------------------------------------------------------------------
# 0) Create the ONNX session once
img_session = ort.InferenceSession("tiny_clip/model.onnx")

# ------------------------------------------------------------------
# 1) Utilities ------------------------------------------------------

def compress_image(img, quality=100):
    import io
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=quality)
    return buf.getvalue()

def embed(session: ort.InferenceSession, jpeg_bytes: bytes) -> np.ndarray:
    """JPEG bytes --> 512-D Tiny-CLIP vector."""
    img = (Image.open(io.BytesIO(jpeg_bytes))
           .convert("RGB")
           .resize((224, 224), Image.Resampling.BICUBIC))

    arr = (np.asarray(img, dtype=np.float32).transpose(2, 0, 1) / 127.5) - 1.0
    arr = arr[np.newaxis, ...]              # (1,3,224,224)

    ids  = np.zeros((1, 77), dtype=np.int64)    # dummy
    mask = np.ones((1, 77),  dtype=np.int64)    # dummy
    feeds = {}
    for inp in session.get_inputs():
        if "pixel"  in inp.name: feeds[inp.name] = arr
        elif "mask" in inp.name: feeds[inp.name] = mask
        else:                    feeds[inp.name] = ids

    vec = session.run(["image_embeds"], feeds)[0]   # (1,512)
    return vec[0]                                   # (512,)

# ------------------------------------------------------------------
# 3) End-to-end -----------------------------------------------------
#full_img     = capture_full()               #  ⟵ PIL.Image
full_img = Image.open("screenshot_001.jpg").convert("RGB")
jpeg_bytes   = compress_image(full_img)     #  ⟵ raw bytes
vec          = embed(img_session, jpeg_bytes)

print(f"Vector shape: {vec.shape}, dtype: {vec.dtype}")
# ► Vector shape: (512,), dtype: float32


Vector shape: (512,), dtype: float32


In [15]:
full_img.show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# --------------------------------------------------------------
import io, numpy as np, onnxruntime as ort
from PIL import Image

# ------------------------------------------------------------------
# 0) Create the ONNX session once
img_session = ort.InferenceSession("tiny_clip/model.onnx")

# ------------------------------------------------------------------
# 1) Utilities ------------------------------------------------------

def compress_image(img, quality=85):
    import io
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=quality)
    return buf.getvalue()

def embed(session: ort.InferenceSession, jpeg_bytes: bytes) -> np.ndarray:
    """JPEG bytes --> 512-D Tiny-CLIP vector."""
    img = (Image.open(io.BytesIO(jpeg_bytes))
           .convert("RGB")
           .resize((224, 224), Image.Resampling.BICUBIC))

    arr = (np.asarray(img, dtype=np.float32).transpose(2, 0, 1) / 127.5) - 1.0
    arr = arr[np.newaxis, ...]              # (1,3,224,224)

    ids  = np.zeros((1, 77), dtype=np.int64)    # dummy
    mask = np.ones((1, 77),  dtype=np.int64)    # dummy
    feeds = {}
    for inp in session.get_inputs():
        if "pixel"  in inp.name: feeds[inp.name] = arr
        elif "mask" in inp.name: feeds[inp.name] = mask
        else:                    feeds[inp.name] = ids

    vec = session.run(["image_embeds"], feeds)[0]   # (1,512)
    return vec[0]                                   # (512,)

# ------------------------------------------------------------------
# 3) End-to-end -----------------------------------------------------
#full_img     = capture_full()               #  ⟵ PIL.Image
full_img = Image.open("screenshot_001.jpg").convert("RGB")
jpeg_bytes   = compress_image(full_img)     #  ⟵ raw bytes
vec          = embed(img_session, jpeg_bytes)

print(f"Vector shape: {vec.shape}, dtype: {vec.dtype}")
# ► Vector shape: (512,), dtype: float32



LABELS = ["login screen", "error dialog", "code editor",
          "settings page", "spreadsheet", "browser home page", "vscode", "Microsoft Word", "OverLeaf", "writing a paper"]

# Make a *text* embedding table once (reuse the SAME session)
from transformers import CLIPTokenizerFast
tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")

def text_embed(sentences):
    toks = tok(sentences, padding=True, return_tensors="np")
    feeds = {
        "input_ids":      toks["input_ids"],
        "attention_mask": toks["attention_mask"],
        # dummy image input so ONNX doesn’t complain
        "pixel_values":   np.zeros((len(sentences), 3, 224, 224), np.float32)
    }
    return img_session.run(["text_embeds"], feeds)[0]

label_vecs = text_embed(LABELS)
label_vecs /= np.linalg.norm(label_vecs, axis=1, keepdims=True)

def describe(vec, top_k=3):
    vec  = vec / np.linalg.norm(vec)
    sims = label_vecs @ vec
    idx  = sims.argsort()[-top_k:][::-1]
    return [(LABELS[i], float(sims[i])) for i in idx]

print(describe(vec))


[('code editor', 0.26631882786750793), ('OverLeaf', 0.2621487081050873), ('Microsoft Word', 0.259771466255188)]
