## Baseline Training Data Generation

Here, for each text description we run a similarity search against the `texts_images` collection and retrieve its single most similar image. The resulting image–text pairs form the baseline training dataset used by the subsequent fine-tuning tasks.

In [1]:
# Importing useful dependencies
import io
import os
from io import BytesIO

import boto3
import chromadb
import numpy as np
import open_clip
import torch
from PIL import Image

# Fix every random number generator to one seed so that runs are repeatable
SEED = 10721
torch.manual_seed(SEED)           # PyTorch default generator
np.random.seed(SEED)              # NumPy legacy global generator
torch.cuda.manual_seed_all(SEED)  # every CUDA device


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\

AttributeError: _ARRAY_API not found

In [2]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API)
# Connection settings are read from environment variables so that real
# credentials never have to be committed with the notebook. The fallbacks are
# the local-development values, so an unconfigured machine behaves as before.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ.get("MINIO_ENDPOINT_URL", "http://127.0.0.1:9000"), # MinIO API endpoint
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"), # User name
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"), # Password
)

In [3]:
# Open an HTTP connection to the Chroma server (running in a Docker container)
client = chromadb.HttpClient(host="localhost", port=8000)

# Fetch the "texts_images" collection, which stores embeddings of images and
# texts, creating it if it does not exist yet. No embedding function is attached
# (embedding_function=None).
collection_texts_images = client.get_or_create_collection(
    name="texts_images",
    embedding_function=None,
)

In [6]:
# A dedicated MinIO bucket is used to store our training data

# Collect the names of all buckets that currently exist
buckets = [
    bucket_info["Name"]
    for bucket_info in s3.list_buckets()["Buckets"]
]

# Create the bucket called `name` unless it already appears in `list_buckets`
def createBucket(name, list_buckets):
    """Create bucket `name` through the module-level `s3` client if it is missing.

    Args:
        name: Name of the bucket to create.
        list_buckets: Collection of bucket names that already exist; membership
            in this collection decides whether the bucket is created.

    Returns:
        None. A message describing what happened is printed.
    """
    already_present = name in list_buckets
    if not already_present:
        s3.create_bucket(Bucket=name)
        print(f"Created bucket: {name}")
        return
    print(f"Bucket '{name}' already exists!")

# Bucket that holds the training data, and the key prefix ("folder") inside it
# under which the baseline image-text pairs are stored
TRAINING_BUCKET = "training-data-construction-zone"
BASELINE_PREFIX = "baseline-training-data/"

# Create the training-data bucket unless it is already in the `buckets` list
createBucket(TRAINING_BUCKET, buckets)
# Put an object with no body whose key ends in "/" so that the
# baseline-training-data "folder" exists in the bucket
s3.put_object(Bucket=TRAINING_BUCKET, Key=BASELINE_PREFIX)

Bucket 'training-data-construction-zone' already exists!


{'ResponseMetadata': {'RequestId': '1877A5207DB01D9E',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'AAAAAA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '1877A5207DB01D9E',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '2107',
   'x-ratelimit-remaining': '2107',
   'x-xss-protection': '1; mode=block',
   'date': 'Thu, 13 Nov 2025 18:42:18 GMT'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ChecksumCRC32': 'AAAAAA==',
 'ChecksumType': 'FULL_OBJECT'}

In [4]:
# Just in case our device has gpu
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hub identifier shared by the model and its tokenizer, so the two cannot drift apart
CLIP_MODEL_NAME = "hf-hub:laion/CLIP-ViT-L-14-laion2B-s32B-b82K"

# Load model (the two preprocessing transforms that are also returned are not needed here)
model, _, _ = open_clip.create_model_and_transforms(CLIP_MODEL_NAME)
tokenizer = open_clip.get_tokenizer(CLIP_MODEL_NAME) # Tokenizer for texts

# open_clip returns the model in train mode; this notebook only runs inference,
# so switch to eval mode after moving the weights to the selected device.
model.to(device).eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-23): 24 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwi

In [50]:
# Some helper functions

# We can use this function to retrieve a text from our bucket
def get_text(bucket, key):
    """Download the object stored at `key` in `bucket` and return it as a string.

    The object is fetched through the module-level `s3` client and its bytes
    are decoded as UTF-8.
    """
    raw_bytes = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    return raw_bytes.decode("utf-8")
# The next function returns the embedding of the given text
@torch.no_grad()
def embed_text(model, tokenizer, texts: str):
    """Return the L2-normalised embedding of a single text as a 1-D NumPy array.

    Args:
        model: Model exposing `encode_text(tokens)` and `parameters()`.
        tokenizer: Callable that turns a list of strings into a token tensor.
        texts: The single string to embed (despite the plural name).

    Returns:
        The first (and only) row of the normalised text features, moved to the
        CPU and converted to NumPy.
    """
    tokens = tokenizer([texts]) # tokenized batch of size one

    # Put the tokens on the device that holds the model's weights instead of
    # relying on a notebook-level `device` variable: the function then works for
    # whichever model is passed in and cannot hit a device mismatch.
    first_param = next(iter(model.parameters()), None)
    if first_param is not None:
        tokens = tokens.to(first_param.device)

    feats = model.encode_text(tokens)
    feats = feats / feats.norm(dim=-1, keepdim=True) # normalize
    return feats.cpu().numpy()[0]

In [49]:
# This function performs a similarity search for each text description in the dataset
# to retrieve the most similar image, forming image–text pairs for training.
def baseline_training_data_generator(src_bucket, dest_bucket, collection, model_text, tokenizer, src_prefix="texts/", dest_prefix="baseline-training-data/"):
    """Pair every text under `src_prefix` with its nearest image and copy both.

    For each object listed under `src_prefix` in `src_bucket`, the text is
    downloaded with `get_text`, embedded with `embed_text`, and used to query
    `collection` for the single closest record whose metadata has
    `type == "image"`. The text and the matched image are then copied into
    `dest_bucket` as `<dest_prefix>text_NNNNNN.txt` and
    `<dest_prefix>image_NNNNNN.png`, where NNNNNN is a zero-padded counter
    starting at 000001.

    Args:
        src_bucket: Bucket that holds the texts and the images.
        dest_bucket: Bucket that receives the copied pairs.
        collection: Collection object exposing `query(...)`.
        model_text: Model passed to `embed_text`.
        tokenizer: Tokenizer passed to `embed_text`.
        src_prefix: Key prefix of the text objects in `src_bucket`.
        dest_prefix: Key prefix of the copied objects in `dest_bucket`.

    Returns:
        None. Progress is reported with `print`.

    Notes:
        * A text for which the query returns no document is skipped with a
          warning and does not consume a pair id.
        * The bucket part of the matched document is stripped by length
          (`len(src_bucket) + 1`); this presumably assumes the document has the
          form "<src_bucket>/<image key>" - verify against how the collection
          was populated.
        * NOTE(review): the image copy is always named ".png", whatever the
          extension of the source key is - confirm that this is intended.
    """
    # Incremental id assigned to each image-text pair
    id_counter = 0

    paginator = s3.get_paginator("list_objects_v2") # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):
        for obj in page.get("Contents", []):

            key = obj["Key"]

            if obj["Size"] == 0 and key.endswith("/"): # skip the folder itself
                continue

            # Get the description
            description = get_text(src_bucket, key)
            # Get the embeddings of the description
            q_vec = embed_text(model_text, tokenizer, description)
            # Apply the similarity search using the description
            res_image = collection.query(
                query_embeddings=[q_vec],
                n_results=1,
                where={"type": "image"}, # Filter by metadata type
                include=["documents", "distances"]
            )

            # The query holds one result list per query embedding; that list is
            # empty when nothing in the collection matches the filter.
            matches = (res_image["documents"] or [[]])[0]
            if not matches or matches[0] is None:
                print(f"⚠️ No matching image found for '{key}'; skipping.")
                continue

            id_counter += 1

            # Get the key for the image (drop the leading "<src_bucket>/" part)
            key_image = matches[0][len(src_bucket) + 1:]

            # Build the destination keys from the zero-padded pair id
            pair_id = str(id_counter).zfill(6) # ids of 000001, 000002, ...
            new_key_text = dest_prefix + "text_" + pair_id + ".txt"
            new_key_image = dest_prefix + "image_" + pair_id + ".png"

            # Copy both objects into the destination bucket under their new names
            copy_source_text = {"Bucket": src_bucket, "Key": key}
            copy_source_image = {"Bucket": src_bucket, "Key": key_image}
            s3.copy_object(Bucket=dest_bucket, Key=new_key_text, CopySource=copy_source_text)
            s3.copy_object(Bucket=dest_bucket, Key=new_key_image, CopySource=copy_source_image)

            print(f"✅ Baseline training pair #{id_counter} created successfully.")

    print("✅ All training pairs have been successfully created.")

In [51]:
# Build the baseline training pairs: texts are read from the trusted zone and
# the pairs are written to the training-data construction zone
baseline_training_data_generator(
    src_bucket="trusted-zone",
    dest_bucket="training-data-construction-zone",
    collection=collection_texts_images,
    model_text=model,
    tokenizer=tokenizer,
)

✅ Baseline training pair #1 created successfully.
✅ Baseline training pair #2 created successfully.
✅ Baseline training pair #3 created successfully.
✅ Baseline training pair #4 created successfully.
✅ Baseline training pair #5 created successfully.
✅ Baseline training pair #6 created successfully.
✅ Baseline training pair #7 created successfully.
✅ Baseline training pair #8 created successfully.
✅ Baseline training pair #9 created successfully.
✅ Baseline training pair #10 created successfully.
✅ Baseline training pair #11 created successfully.
✅ Baseline training pair #12 created successfully.
✅ Baseline training pair #13 created successfully.
✅ Baseline training pair #14 created successfully.
✅ Baseline training pair #15 created successfully.
✅ Baseline training pair #16 created successfully.
✅ Baseline training pair #17 created successfully.
✅ Baseline training pair #18 created successfully.
✅ Baseline training pair #19 created successfully.
✅ Baseline training pair #20 created suc