## ------------------------------

In [1]:
!pip install boto3 chromadb open_clip_torch

Collecting boto3
  Downloading boto3-1.41.3-py3-none-any.whl.metadata (6.8 kB)
Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting open_clip_torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting botocore<1.42.0,>=1.41.3 (from boto3)
  Downloading botocore-1.41.3-py3-none-any.whl.metadata (5.9 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.16.0,>=0.15.0 (from boto3)
  Downloading s3transfer-0.15.0-py3-none-any.whl.metadata (1.7 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posth

In [2]:
# Importing useful dependencies
import io
import torch
import boto3
import random
import chromadb
import open_clip
import numpy as np
from PIL import Image
from io import BytesIO
from chromadb.config import Settings

# Set a seed for reproducibility
SEED = 10721
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API)
s3 = boto3.client(
    "s3",
    #endpoint_url="http://127.0.0.1:9000", # MinIO API endpoint
    endpoint_url="https://statueless-manducatory-renato.ngrok-free.dev", # MinIO API endpoint (ngrok)
    aws_access_key_id="minioadmin", # User name
    aws_secret_access_key="minioadmin", # Password
)

In [4]:
# Just in case our device has gpu
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CLIP ViT-L/16
model_name = "ViT-B-16"
model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')
tokenizer = open_clip.get_tokenizer(model_name)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


open_clip_model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]



CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [5]:
# ---- Show parameter counts ----
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model: CLIP-ViT-B-16")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"The parameters are in: {str(next(model.parameters()).dtype)}") # FP32

Model: CLIP-ViT-B-16
Total parameters: 149,620,737
Trainable parameters: 149,620,737
The parameters are in: torch.float32


### Dataloader

In [6]:
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import functools
from torchvision import transforms
from typing import List, Tuple

class ImageTextDataset(Dataset):
    def __init__(self, data_bucket, data_prefix, s3):
        self.data_bucket = data_bucket
        self.data_prefix = data_prefix
        self.s3 = s3

        # Data keys
        self.image_keys, self.text_keys = self.__loadfromminio__(data_bucket, data_prefix)

    def __loadfromminio__(self, data_bucket, data_prefix):
        image_keys = []
        text_keys = []
        paginator = self.s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=data_bucket, Prefix=data_prefix):
            for obj in page.get("Contents", []):
                key = obj["Key"]
                if obj['Size'] == 0 and key.endswith("/"):
                    continue
                if "image" in key.split("/")[1]: # We only need images to find their corresponding description in MinIO
                    image_keys.append(key)
                    text_key = data_prefix + key.split("/")[1].replace("image", "text").replace("png", "txt")
                    text_keys.append(text_key)

        # From lists to arrays
        image_keys = np.array(image_keys)
        text_keys = np.array(text_keys)

        return image_keys, text_keys

    def __len__(self):
        return len(self.image_keys)

    def __getfile__(self, data_bucket, key, filetype = "image"):
        resp = self.s3.get_object(Bucket=data_bucket, Key=key)
        body = resp["Body"].read()
        if filetype == "image":
            file = Image.open(BytesIO(body))
        else: # filetype = "text"
            file = body.decode("utf-8")
        return file

    def __getitem__(self, idx):

        # Load image
        image = self.__getfile__(self.data_bucket, self.image_keys[idx], filetype = "image")

        # Load text
        text = self.__getfile__(self.data_bucket, self.text_keys[idx], filetype = "text")

        return image, text

# Collate function that applies preprocess and tokenizer to the batch
def collate_fn(batch: List[Tuple["PIL.Image.Image", str]], preprocess, tokenizer, pad_value: int = 0):
    """
    batch: list of (PIL.Image, text_str)
    preprocess: image preprocessing transform (from open_clip.create_model_and_transforms)
    tokenizer: open_clip tokenizer callable
    pad_value: value used to pad token sequences (default 0)

    Returns:
        images: torch.Tensor [B, C, H, W]
        text_tokens: torch.LongTensor [B, L]
        raw_texts: list[str]
    """

    images_pil, raw_texts = zip(*batch) # tuples

    # --- Images: apply model-specific preprocess (PIL->Tensor) and stack ---
    images = [preprocess(img) for img in images_pil] # each should be a Tensor
    images = torch.stack(images, dim=0) # [B, C, H, W]

    # --- Texts: use tokenizer ---
    # Many open_clip tokenizers accept a list[str] and return a torch.LongTensor [B, L].
    # But sometimes they may return a list of tensors or lists. Handle both cases.
    tokenized = tokenizer(raw_texts) # [B, L]

    return images, tokenized

# Wrap collate_fn so DataLoader only sees a single-argument function
collate = functools.partial(collate_fn, preprocess=preprocess, tokenizer=tokenizer, pad_value=0)

In [7]:
# Create customized dataset object
baseline_dataset = ImageTextDataset(
    data_bucket = "training-data-construction-zone",
    data_prefix = "baseline-training-data/",
    s3 = s3
)

In [8]:
# Apply a shuffle over the dataset to prevent the model from learning order-based patterns
dataloader = DataLoader(baseline_dataset, batch_size=32, shuffle=True, collate_fn=collate, num_workers=0)
# Increasing the size of the batch slows down the training process, but generalizes better the model

In [None]:
# Split the dataloader into train/dev/test

In [9]:
import torch
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, Dataset
from open_clip.loss import ClipLoss
import matplotlib.pyplot as plt
from torch.cuda.amp import autocast, GradScaler
import torch.optim as optim

criterion = ClipLoss()

# Test fine-tuning on ViT-B-16 with Mixed/Reduced Precision (FP16)

losses = []  # Store losses

# AdamW optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-6) # Big lr leads to overfitting on the training data -> the model will get better results on the training data, but poor results on the test set
scaler = GradScaler() # For automatic mixed precision, prevents gradient underflow
# Gradient underflow: FP16 has smaller numeric range than FP32. Very small gradients may become so tiny that FP16 rounds them to zero.
# This is called underflow, and it effectively "kills" the learning signal for some parameters.

for images, texts in dataloader:
    images, texts = images.to(device), texts.to(device)

    optimizer.zero_grad()

    with autocast(): # the autocast() context ensures operations automatically use FP16 where safe
        img_feats = model.encode_image(images)
        txt_feats = model.encode_text(texts)
        loss = criterion(img_feats, txt_feats, model.logit_scale.exp())
    print(f"Loss: {loss.item():.4f}")

    # Save loss
    losses.append(loss.item())

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

# Plot the losses
plt.figure(figsize=(8,5))
plt.plot(losses, marker='o')
plt.title("Training Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid(True)
plt.show()

# ~45 mins

  scaler = GradScaler() # For automatic mixed precision, prevents gradient underflow
  super().__init__(
  with autocast(): # the autocast() context ensures operations automatically use FP16 where safe


Loss: 38.7926
Loss: 32.6842
Loss: 14.1557
Loss: 84.0698
Loss: 14.9327


KeyboardInterrupt: 

In [10]:
torch.save(model.state_dict(), "clip_finetuned.pt")

In [11]:
# Create model (openai weights)
model_base, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')
model_base.to(device)

# Create model (uninitialized weights)
model_ft, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')
model_ft.to(device)
model_ft.load_state_dict(torch.load("clip_finetuned.pt", map_location=device))
model_ft.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [12]:
import torch
import torch.nn.functional as F

# Set both models to eval
model_ft.eval()    # fine-tuned
model_base.eval()  # original

images, texts = next(iter(dataloader))  # take a batch
images, texts = images.to(device), texts.to(device)

with torch.no_grad():
    # Base model
    img_feats_base = F.normalize(model_base.encode_image(images), dim=-1)
    txt_feats_base = F.normalize(model_base.encode_text(texts), dim=-1)

    # Fine-tuned model
    img_feats_ft = F.normalize(model_ft.encode_image(images), dim=-1)
    txt_feats_ft = F.normalize(model_ft.encode_text(texts), dim=-1)

# Compare embedding changes (cosine similarity)
cos_sim_img = (img_feats_base * img_feats_ft).sum(dim=-1).mean()
cos_sim_txt = (txt_feats_base * txt_feats_ft).sum(dim=-1).mean()

print(f"Avg cosine similarity for images: {cos_sim_img:.4f}")
print(f"Avg cosine similarity for text:   {cos_sim_txt:.4f}")


Avg cosine similarity for images: 0.9989
Avg cosine similarity for text:   0.9991


In short: negative cosine similarity to the original model is fine when fine-tuning for a task the base model didn’t see — the important thing is whether the fine-tuned model is performing well on your new objective.

If you want, I can show a workflow to evaluate your fine-tuned model on the new task, so you can check performance without worrying about alignment to the original CLIP embeddings.

In [13]:
def retrieval_accuracy(img_feats, txt_feats):
    # Compute similarity matrix
    sims = img_feats @ txt_feats.t()
    # Ground truth: assume diagonal matches
    labels = torch.arange(img_feats.size(0)).to(img_feats.device)

    # Image->Text
    top1_i2t = (sims.argmax(dim=1) == labels).float().mean()
    # Text->Image
    top1_t2i = (sims.argmax(dim=0) == labels).float().mean()

    return top1_i2t.item(), top1_t2i.item()

acc_base = retrieval_accuracy(img_feats_base, txt_feats_base)
acc_ft   = retrieval_accuracy(img_feats_ft, txt_feats_ft)

print(f"Base model retrieval (image->text, text->image): {acc_base}")
print(f"Fine-tuned model retrieval: {acc_ft}")


Base model retrieval (image->text, text->image): (0.84375, 0.84375)
Fine-tuned model retrieval: (0.84375, 0.875)


In [14]:
from open_clip.loss import ClipLoss

criterion = ClipLoss()

loss_base = criterion(img_feats_base, txt_feats_base, model_base.logit_scale.exp())
loss_ft   = criterion(img_feats_ft, txt_feats_ft, model_ft.logit_scale.exp())

print(f"Base model contrastive loss: {loss_base.item():.4f}")
print(f"Fine-tuned model contrastive loss: {loss_ft.item():.4f}")


Base model contrastive loss: 0.4767
Fine-tuned model contrastive loss: 0.4814


In [15]:
# similarity matrix
sims = img_feats_ft @ txt_feats_ft.t()
top_idx = sims[0].argmax()
print(f"Most similar text to first image: {texts[top_idx]}")


Most similar text to first image: tensor([49406,   761,  1394,   274,   533,   550,  1488,   268,  1002,  1816,
         4377,   593,   518,  3638,  1378,   718,   525,  4511,   539,  7584,
          269,   585,   533,   518, 42582,  3644,   530,   518,   761,  1394,
         1857,  7479,   638, 25815,  6231,   267, 15361,  1241,   638,   518,
          874,  1237, 11442,  1110,  5149,   537,  1977,   518,   275,   640,
         1551,   783,  2296,  3410,   601,   902, 34774,   269, 11362, 14057,
          533,   518, 38183,   539,   518,  1063,   267,   822,   533, 34086,
         1265,  8201,   531,  2696,   593, 33940, 49407])


### Mixed/Reduced Precision (FP16)

Memory usage is roughly halved, and training is faster.

In [None]:
# Two options:

In [None]:
from torch.cuda.amp import autocast, GradScaler
import torch.optim as optim

# Example optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler() # For automatic mixed precision, prevents gradient underflow
# Gradient underflow: FP16 has smaller numeric range than FP32. Very small gradients may become so tiny that FP16 rounds them to zero.
# This is called underflow, and it effectively "kills" the learning signal for some parameters.

for images, texts in dataloader:
    images, texts = images.to(device), texts.to(device)

    optimizer.zero_grad()

    with autocast(): # the autocast() context ensures operations automatically use FP16 where safe
        image_features, text_features = model(images, tokenizer(texts))
        loss = compute_loss(image_features, text_features)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()


  scaler = GradScaler() # For automatic mixed precision, prevents gradient underflow
  with autocast(): # the autocast() context ensures operations automatically use FP16 where safe


AttributeError: 'Tensor' object has no attribute 'find'

In [None]:
model = model.half() # FP32 -> FP16

In [None]:
# We must convert the input tensors?
image = image.half().to(device)
text_tokens = text_tokens.to(device)

NameError: name 'image' is not defined

### Quantization (INT4)

In [None]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model = open_clip.create_model("ViT-L-14", quantized=True, bnb_config=bnb_config)


PackageNotFoundError: No package metadata was found for bitsandbytes

### LoRA (Low-Rank Adaptation)

In [None]:
#This is the best option for fine-tuning CLIP:
# * Freeze the whole model
# * Insert small trainable LoRA layers
# * Train only 1–2% new parameters

In [None]:
#pip install peft

In [None]:
from peft import LoraConfig, get_peft_model

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],  # typical for transformer models
)

model.text = get_peft_model(model.text, lora_cfg)
model.text.print_trainable_parameters()


AttributeError: 'CLIP' object has no attribute 'text'

In [None]:
# print top-level modules
print(model)

# print children names
for name, module in model.named_children():
    print(name, type(module))

# print a few text-related submodules (common in open_clip)
for name, module in model.named_modules():
    if "token" in name or "transformer" in name or "ln_" in name or "text" in name:
        print(name, type(module))


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [None]:
token_emb = model.token_embedding
text_transformer = model.transformer
ln_final = model.ln_final   # or model.ln_post depending on the printout

In [None]:
# reeze the whole model (prepare for LoRA)
#Usually you want to freeze the base model and train only small adapter parameters:

for param in model.parameters():
    param.requires_grad = False

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable params:", trainable)


Trainable params: 0


In [None]:
#Add LoRA adapters to transformer linear layers
#pip install loralib

# peft is primarily designed for Hugging Face models. For OpenCLIP, a simple and compatible approach is to use loralib (lightweight LoRA wrapper)

### Wrap target Linear layers (q/k/v projections or all Linear)

import torch.nn as nn
import loralib as lora

# Example helper: wrap Linear modules within a module whose name contains 'transformer'
def apply_lora_to_transformer(root_module, r=8, alpha=32, target_module_type=nn.Linear, name_filter=None):
    """
    Wrap Linear layers inside root_module with LoRA.
    - r, alpha: LoRA hyperparams
    - name_filter: optional substring to filter which modules to wrap (e.g. 'attn' or 'q_proj')
    """
    for name, mod in root_module.named_modules():
        # We only want to wrap the *leaf* Linear modules, not the parent modules
        if isinstance(mod, target_module_type):
            if name_filter is None or name_filter in name:
                parent_path = name.rsplit('.', 1)[0] if '.' in name else ''
                # rebind module in parent
                parent = root_module
                if parent_path:
                    for part in parent_path.split('.'):
                        parent = getattr(parent, part)
                attr_name = name.split('.')[-1]
                orig = getattr(parent, attr_name)
                # create LoRA-wrapped layer with same in/out dims
                lora_layer = lora.Linear(orig.in_features, orig.out_features, r=r, lora_alpha=alpha, bias=(orig.bias is not None))
                # copy weight and bias
                lora_layer.weight.data = orig.weight.data.clone()
                if orig.bias is not None:
                    lora_layer.bias.data = orig.bias.data.clone()
                # replace
                setattr(parent, attr_name, lora_layer)
                print(f"Replaced {name} with LoRA Linear (r={r}, alpha={alpha})")

# Apply to the text transformer
apply_lora_to_transformer(model.transformer, r=8, alpha=32, name_filter="attn")  # focus on attention proj


# name_filter helps target q_proj, k_proj, v_proj, or modules with attn in the path.
#Inspect your printed module names to choose an appropriate filter.

# This replaces nn.Linear objects with loralib.Linear that contain LoRA parameters;
# those LoRA parameters will be trainable while base weights remain frozen (unless you unfreeze them).

### Make LoRA params trainable and check

# Ensure base params still frozen, LoRA params trainable
for name, p in model.named_parameters():
    if "lora" in name.lower() or "lora" in name:
        p.requires_grad = True
    else:
        p.requires_grad = False

# Print trainable params count
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable} / {total}")

### Training loop: mixed precision + optimizer

from torch.cuda.amp import autocast, GradScaler
optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=1e-4)
scaler = GradScaler()

model.to(device)
model.train()

for images, texts in dataloader:
    images = images.to(device)
    text_tokens = tokenizer(texts).to(device)

    optimizer.zero_grad()
    with autocast():
        image_features = model.encode_image(images)   # or model.visual(images)
        text_features = model.encode_text(text_tokens) # or model.transformer(...)
        loss = compute_loss(image_features, text_features)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()


Replaced resblocks.0.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.1.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.2.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.3.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.4.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.5.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.6.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.7.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.8.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.9.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.10.attn.out_proj with LoRA Linear (r=8, alpha=32)
Replaced resblocks.11.attn.out_proj with LoRA Linear (r=8, alpha=32)
Trainable params: 98304 / 149719041


  scaler = GradScaler()


AttributeError: 'Tensor' object has no attribute 'find'

In [None]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API)
s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000", # MinIO API endpoint
    aws_access_key_id="minioadmin", # User name
    aws_secret_access_key="minioadmin", # Password
)

In [None]:
# Connect to the server (Docker Container)
client = chromadb.HttpClient(host="localhost", port=8000)

# Create or get the collection named "texts_images" to store embeddings of images and texts
collection_texts_images = client.create_collection(name="texts_images", get_or_create=True, embedding_function=None)

In [None]:
# We create a new Bucket in Min-IO to store our training data

# List existing buckets
buckets = [b["Name"] for b in s3.list_buckets()["Buckets"]]

# Function that given a name, creates a bucket
def createBucket(name, list_buckets):
    if name in list_buckets:
        print(f"Bucket '{name}' already exists!")
    else:
        s3.create_bucket(Bucket=name)
        print(f"Created bucket: {name}")

# Create a bucket named landing_zone
createBucket("training-data-construction-zone", buckets)
# Sub-bucket: Baseline Training Data
s3.put_object(Bucket="training-data-construction-zone", Key="baseline-training-data/")

Bucket 'training-data-construction-zone' already exists!


{'ResponseMetadata': {'RequestId': '1877A5207DB01D9E',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'AAAAAA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '1877A5207DB01D9E',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '2107',
   'x-ratelimit-remaining': '2107',
   'x-xss-protection': '1; mode=block',
   'date': 'Thu, 13 Nov 2025 18:42:18 GMT'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ChecksumCRC32': 'AAAAAA==',
 'ChecksumType': 'FULL_OBJECT'}

In [None]:
# Just in case our device has gpu
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model
model, _, _ = open_clip.create_model_and_transforms("hf-hub:laion/CLIP-ViT-L-14-laion2B-s32B-b82K")
tokenizer = open_clip.get_tokenizer("hf-hub:laion/CLIP-ViT-L-14-laion2B-s32B-b82K") # Tokenizer for texts
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-23): 24 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwi

In [None]:
# Some helper functions

# We can use this function to retrieve an text from our bucket
def get_text(bucket, key):
    resp = s3.get_object(Bucket=bucket, Key=key)
    body = resp["Body"].read()
    text = body.decode("utf-8")
    return text
@torch.no_grad()
# The next function returns the embedding of the given text
def embed_text(model, tokenizer, texts: str):
    tokens = tokenizer([texts]).to(device) # tokenized batch
    feats = model.encode_text(tokens)
    feats = feats / feats.norm(dim=-1, keepdim=True) # normalize
    return feats.cpu().numpy()[0]

In [None]:
# This function performs a similarity search for each text description in the dataset
# to retrieve the most similar image, forming image–text pairs for training.
def baseline_training_data_generator(src_bucket, dest_bucket, collection, model_text, tokenizer, src_prefix="texts/", dest_prefix="baseline-training-data/"):

    # Incremental id assigned to each image-text pair
    id_counter = 0

    paginator = s3.get_paginator("list_objects_v2") # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):

        # List of paths (meta_data)
        image_paths = []
        # List of embeddings
        embeddings = []
        # List of unique IDs for each embedding
        ids = []

        for obj in page.get("Contents", []):

            key = obj["Key"]

            if obj['Size'] == 0 and key.endswith("/"): # skip the folder itself
                continue

            id_counter += 1

            # Get the description
            description = get_text(src_bucket, key)
            # Get the embeddings of the description
            q_vec = embed_text(model_text, tokenizer, description)
            # Apply the similarity search using the description
            res_image = collection.query(
                query_embeddings=[q_vec],
                n_results=1,
                where={"type": "image"}, # Filter by metadata type
                include=["documents", "distances"]
            )
            # Get the key for the image
            key_image = res_image['documents'][0][0][len(src_bucket) + 1:]

            # Remove the prefix part from the key
            new_key_text = dest_prefix + "text_" + str(id_counter).zfill(6) + ".txt" # ids of 000001, 000002, ...
            new_key_image = dest_prefix + "image_" + str(id_counter).zfill(6) + ".png" # ids of 000001, 000002, ...

            # Copy objects without top-level folder and rename them
            copy_source_text = {"Bucket": src_bucket, "Key": key}
            copy_source_image = {"Bucket": src_bucket, "Key": key_image}
            s3.copy_object(Bucket=dest_bucket, Key=new_key_text, CopySource=copy_source_text)
            s3.copy_object(Bucket=dest_bucket, Key=new_key_image, CopySource=copy_source_image)

            print(f"✅ Baseline training pair #{id_counter} created successfully.")

    print(f"✅ All training pairs have been successfully created.")