## ------------------------------

In [13]:
# Importing useful dependencies
import io
import os
import random
from io import BytesIO

import boto3
import chromadb
import numpy as np
import open_clip
import torch
from chromadb.config import Settings
from PIL import Image

# Seed every random number generator used in this notebook so runs are repeatable
SEED = 10721
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CLIP ViT-B/16 with the OpenAI pretrained weights
# (preprocess is kept: it is the image transform that matches this model)
model_name = "ViT-B-16"
model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')
tokenizer = open_clip.get_tokenizer(model_name)
model.to(device)



CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [27]:
# ---- Show parameter counts ----
# Walk the parameters once and keep both totals
total_params = 0
trainable_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

# Report the name of the model that was actually loaded instead of a hard-coded label
print(f"Model: CLIP-{model_name}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"The parameters are in: {next(model.parameters()).dtype}") # FP32

Model: CLIP-ViT-B-16
Total parameters: 149,620,737
Trainable parameters: 149,620,737
The parameters are in: torch.float32


### Mixed/Reduced Precision (FP16)

Memory usage is roughly halved, and training is faster.

In [None]:
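# Two options: (1) automatic mixed precision with autocast + GradScaler, (2) casting the whole model to FP16 with .half()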

In [None]:
from torch.cuda.amp import autocast, GradScaler
import torch.optim as optim

# Option 1: automatic mixed precision (AMP).
# NOTE(review): `dataloader` is only built in the "Dataloader" section further down and
# `compute_loss` is not defined in the visible part of this notebook; both must exist
# before this cell can run.

# AMP is a CUDA feature here; on CPU the same loop runs in plain FP32.
use_amp = device == "cuda"

# Example optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# GradScaler scales the loss so that small FP16 gradients do not underflow.
# Gradient underflow: FP16 has a smaller numeric range than FP32, so very small gradients
# may be rounded to zero, which "kills" the learning signal for some parameters.
scaler = GradScaler(enabled=use_amp)

for images, text_tokens in dataloader:
    # ImageTextDataset already tokenizes the captions, so the batch holds token ids
    # and must not be passed through the tokenizer again.
    images, text_tokens = images.to(device), text_tokens.to(device)

    optimizer.zero_grad()

    with autocast(enabled=use_amp): # operations automatically use FP16 where safe
        # Call the two encoders explicitly (same as the LoRA training loop further down):
        # open_clip's CLIP.forward returns the logit scale as well, so unpacking
        # `model(...)` into two values does not work.
        image_features = model.encode_image(images)
        text_features = model.encode_text(text_tokens)
        loss = compute_loss(image_features, text_features)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()


In [None]:
# Option 2: convert the model weights themselves to half precision
model = model.half() # FP32 -> FP16

In [None]:
# After model.half() the image tensor is cast to FP16 as well so that its dtype matches
# the weights; the text tokens are only moved to the device (no cast).
# NOTE(review): `image` and `text_tokens` are not defined in the visible cells -- confirm
# where they are expected to come from.
image = image.half().to(device)
text_tokens = text_tokens.to(device)

### Quantization (INT4)

In [29]:
# 4-bit quantization config from transformers/bitsandbytes.
# NOTE(review): `quantized` and `bnb_config` do not look like documented arguments of
# open_clip.create_model -- verify against the installed open_clip version. The recorded
# output of this cell ("torch.float16") also does not correspond to this code.
# NOTE(review): this rebinds `model` to a ViT-L-14 created without `pretrained=...`,
# replacing the ViT-B-16 loaded earlier -- confirm this is intended.
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model = open_clip.create_model("ViT-L-14", quantized=True, bnb_config=bnb_config)


torch.float16

### LoRA (Low-Rank Adaptation)

In [None]:
#This is the best option for fine-tuning CLIP:
# * Freeze the whole model
# * Insert small trainable LoRA layers
# * Train only 1–2% new parameters

In [None]:
#pip install peft

In [None]:
# LoRA through peft: rank-16 adapters requested on modules named q_proj / v_proj.
# NOTE(review): later cells access the text tower as model.token_embedding /
# model.transformer / model.ln_final, and the printed attention blocks list only
# `out_proj` -- there is no `model.text`, `q_proj` or `v_proj` in that printout, so this
# cell probably fails on this open_clip model. The loralib-based cell further down is
# the variant written for OpenCLIP. Confirm before relying on this cell.
from peft import LoraConfig, get_peft_model

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],  # typical for transformer models
)

model.text = get_peft_model(model.text, lora_cfg)
model.text.print_trainable_parameters()


In [31]:
# Dump the full module tree
print(model)

# List the direct children together with their classes
for child_name, child in model.named_children():
    print(child_name, type(child))

# Show the submodules whose dotted name hints at the text tower (common in open_clip)
text_hints = ("token", "transformer", "ln_", "text")
for module_name, module in model.named_modules():
    if any(hint in module_name for hint in text_hints):
        print(module_name, type(module))


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [32]:
# Keep direct handles to the text-side submodules exposed on the CLIP object
token_emb = model.token_embedding
text_transformer = model.transformer
ln_final = model.ln_final   # or model.ln_post depending on the printout

In [34]:
# Freeze the whole model (prepare for LoRA).
# The base weights stay fixed; only small adapter parameters are meant to be trained.
for base_param in model.parameters():
    base_param.requires_grad = False

# Sanity check: nothing should be trainable at this point
trainable = 0
for base_param in model.parameters():
    if base_param.requires_grad:
        trainable += base_param.numel()
print("Trainable params:", trainable)


Trainable params: 0


In [None]:
#Add LoRA adapters to transformer linear layers
#pip install loralib

# peft is primarily designed for Hugging Face models. For OpenCLIP, a simple and compatible approach is to use loralib (lightweight LoRA wrapper)

### Wrap target Linear layers (q/k/v projections or all Linear)

import torch.nn as nn
import loralib as lora

# Example helper: wrap Linear modules within a module whose name contains 'transformer'
def apply_lora_to_transformer(root_module, r=8, alpha=32, target_module_type=nn.Linear, name_filter=None):
    """
    Replace matching Linear layers inside ``root_module`` with ``loralib`` LoRA layers.

    The original weight and bias are copied into every replacement, so the wrapped
    model starts from the same weights as before.

    Args:
        root_module: module whose descendants are scanned; it is modified in place.
        r: LoRA rank passed to ``lora.Linear``.
        alpha: LoRA scaling factor, passed as ``lora_alpha``.
        target_module_type: module class to replace (``nn.Linear`` by default).
        name_filter: optional substring; only modules whose dotted name contains it
            are replaced (e.g. 'mlp' or 'c_fc').

    Returns:
        None. One line is printed for every replaced or skipped module.
    """
    # Collect the matches first so the module tree is not modified while it is walked.
    candidates = [
        (name, mod)
        for name, mod in root_module.named_modules()
        if isinstance(mod, target_module_type) and (name_filter is None or name_filter in name)
    ]

    for name, orig in candidates:
        if not name:
            # named_modules() yields the root under the empty name; it has no parent
            # attribute that could be rebound from here.
            print("Skipped the root module itself: it has no parent to rebind it on")
            continue

        parent_path, _, attr_name = name.rpartition('.')
        parent = root_module
        for part in (parent_path.split('.') if parent_path else []):
            parent = getattr(parent, part)

        if isinstance(parent, nn.MultiheadAttention):
            # nn.MultiheadAttention reads out_proj.weight / out_proj.bias directly and
            # never calls out_proj.forward, so LoRA matrices placed there would get
            # no gradient during training.
            print(f"Skipped {name}: nn.MultiheadAttention does not call this layer's forward, LoRA would have no effect")
            continue

        # create LoRA-wrapped layer with same in/out dims
        lora_layer = lora.Linear(orig.in_features, orig.out_features, r=r, lora_alpha=alpha, bias=(orig.bias is not None))
        # copy weight and bias
        lora_layer.weight.data = orig.weight.data.clone()
        if orig.bias is not None:
            lora_layer.bias.data = orig.bias.data.clone()
        # keep the new LoRA matrices on the same device / dtype as the layer they extend
        lora_layer.to(device=orig.weight.device, dtype=orig.weight.dtype)
        # replace
        setattr(parent, attr_name, lora_layer)
        print(f"Replaced {name} with LoRA Linear (r={r}, alpha={alpha})")

# Apply to the text transformer.
# In the printout above the attention blocks expose only `attn.out_proj` as a Linear
# layer, and nn.MultiheadAttention uses that layer's weight/bias directly instead of
# calling it, so LoRA matrices placed there are never trained. The MLP projections
# (c_fc / c_proj) are ordinary Linear calls, so they are targeted instead.
# NOTE(review): the printout is truncated before the text tower; confirm that
# model.transformer uses the same `resblocks.N.mlp.*` names.
apply_lora_to_transformer(model.transformer, r=8, alpha=32, name_filter="mlp")


# name_filter is a substring match on the dotted module name (e.g. "mlp", "c_fc").
# Inspect your printed module names to choose an appropriate filter.

# This replaces nn.Linear objects with loralib.Linear that contain LoRA parameters;
# those LoRA parameters will be trainable while base weights remain frozen (unless you unfreeze them).

### Make LoRA params trainable and check

# Only the LoRA matrices receive gradients; every other parameter stays frozen
for param_name, param in model.named_parameters():
    param.requires_grad = "lora" in param_name.lower()

# Report how many parameters will actually be updated
total = 0
trainable = 0
for param in model.parameters():
    total += param.numel()
    if param.requires_grad:
        trainable += param.numel()
print(f"Trainable params: {trainable} / {total}")

### Training loop: mixed precision + optimizer

from torch.cuda.amp import autocast, GradScaler

# NOTE(review): `dataloader` is built in the "Dataloader" section below and `compute_loss`
# is not defined in the visible part of this notebook; both must exist before this runs.

# AMP is a CUDA feature here; on CPU the same loop runs in plain FP32.
use_amp = device == "cuda"

model.to(device)
model.train()

# Optimize only the parameters left trainable (the LoRA matrices)
optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=1e-4)
scaler = GradScaler(enabled=use_amp)

for images, text_tokens in dataloader:
    images = images.to(device)
    # ImageTextDataset already returns token ids, so the batch is only moved to the
    # device; running the tokenizer on it again would fail.
    text_tokens = text_tokens.to(device)

    optimizer.zero_grad()
    with autocast(enabled=use_amp):
        image_features = model.encode_image(images)   # or model.visual(images)
        text_features = model.encode_text(text_tokens) # or model.transformer(...)
        loss = compute_loss(image_features, text_features)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()


### Dataloader

In [None]:
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class ImageTextDataset(Dataset):
    """
    Image/caption pairs described by a CSV file.

    The CSV is read with pandas and must provide the columns ``image_path``
    (path relative to ``image_root``) and ``caption``.

    Args:
        csv_path: path of the CSV file.
        image_root: directory that the ``image_path`` values are relative to.
        tokenizer: callable that takes a list of captions and returns an indexable
            batch; the first entry is used as the tokens of the caption.
        image_transform: optional callable applied to the RGB PIL image.
    """

    def __init__(self, csv_path, image_root, tokenizer, image_transform=None):
        self.df = pd.read_csv(csv_path)
        self.image_root = image_root
        self.tokenizer = tokenizer
        self.image_transform = image_transform

    def __len__(self):
        """Number of rows (image/caption pairs) in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, text_tokens)`` for row ``idx``."""
        # Look the row up once and reuse it for both columns
        row = self.df.iloc[idx]

        # Load image; the context manager closes the file handle right away instead of
        # leaving it open until garbage collection (convert() returns a loaded copy).
        img_path = f"{self.image_root}/{row['image_path']}"
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")

        # Compare against None so a transform object is never skipped by its truthiness
        if self.image_transform is not None:
            image = self.image_transform(image)

        # Tokenize text: the tokenizer works on a batch, keep the single entry
        caption = row['caption']
        text_tokens = self.tokenizer([caption])[0]

        return image, text_tokens


In [None]:
# Reuse the image pipeline returned by open_clip.create_model_and_transforms
# (`preprocess`, created when the model was loaded). A bare Resize + ToTensor feeds
# un-normalized pixels to the model, whereas the open_clip transform applies the
# resizing and normalization that belong to the checkpoint.
# NOTE(review): `preprocess` comes from the ViT-B-16 loading cell; if another
# checkpoint is used for training, take the transform returned for that checkpoint.
image_transform = preprocess

dataset = ImageTextDataset(
    csv_path="train.csv",
    image_root="images/",
    tokenizer=tokenizer,
    image_transform=image_transform
)

# Each batch is (images, text_tokens): the dataset tokenizes the captions itself
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [2]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API).
# Endpoint and credentials are read from the environment so that real secrets never
# have to be written into the notebook; the fallbacks are the local development values.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ.get("MINIO_ENDPOINT", "http://127.0.0.1:9000"), # MinIO API endpoint
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"), # User name
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"), # Password
)

In [3]:
# Connect over HTTP to the Chroma server (running in a Docker container) on localhost:8000
client = chromadb.HttpClient(host="localhost", port=8000)

# Create or get the collection named "texts_images" to store embeddings of images and texts.
# embedding_function=None: no embedding function is attached to the collection
# (presumably the embeddings are computed with CLIP and passed in explicitly -- confirm).
collection_texts_images = client.create_collection(name="texts_images", get_or_create=True, embedding_function=None)

In [6]:
# We create a new Bucket in Min-IO to store our training data

# Names of the buckets that currently exist on the server
buckets = [bucket_entry["Name"] for bucket_entry in s3.list_buckets()["Buckets"]]

# Function that given a name, creates a bucket
def createBucket(name, list_buckets):
    """
    Create the bucket ``name`` unless it is already contained in ``list_buckets``.

    Args:
        name: name of the bucket to create.
        list_buckets: collection of bucket names that already exist.

    A message describing the outcome is printed in both cases.
    """
    if name not in list_buckets:
        s3.create_bucket(Bucket=name)
        print(f"Created bucket: {name}")
        return
    print(f"Bucket '{name}' already exists!")

# Create the bucket named "training-data-construction-zone" (if it does not exist yet)
createBucket("training-data-construction-zone", buckets)
# Folder marker for the baseline training data: an object without a body whose key ends in "/"
s3.put_object(Bucket="training-data-construction-zone", Key="baseline-training-data/")

Bucket 'training-data-construction-zone' already exists!


{'ResponseMetadata': {'RequestId': '1877A5207DB01D9E',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'AAAAAA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '1877A5207DB01D9E',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '2107',
   'x-ratelimit-remaining': '2107',
   'x-xss-protection': '1; mode=block',
   'date': 'Thu, 13 Nov 2025 18:42:18 GMT'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ChecksumCRC32': 'AAAAAA==',
 'ChecksumType': 'FULL_OBJECT'}

In [4]:
# Pick the GPU when one is present, otherwise run on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and its matching tokenizer from the same hub identifier
hub_model_id = "hf-hub:laion/CLIP-ViT-L-14-laion2B-s32B-b82K"
model, _, _ = open_clip.create_model_and_transforms(hub_model_id)
tokenizer = open_clip.get_tokenizer(hub_model_id) # Tokenizer for texts
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-23): 24 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwi

In [50]:
# Some helper functions

# We can use this function to retrieve a text from our bucket
def get_text(bucket, key):
    """Download the object ``key`` from ``bucket`` and return its body decoded as UTF-8."""
    raw_bytes = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    return raw_bytes.decode("utf-8")
# The next function returns the embedding of the given text
@torch.no_grad()
def embed_text(model, tokenizer, texts: str):
    """
    Embed a single text with the model's text encoder.

    Args:
        model: object providing ``encode_text(tokens)``.
        tokenizer: callable turning a list of strings into a token tensor.
        texts: the text to embed (one string; it is wrapped into a batch of one).

    Returns:
        1-D numpy array: the L2-normalized embedding of ``texts``.
    """
    tokens = tokenizer([texts]) # tokenized batch of size one

    # Follow the device the model lives on instead of relying on a notebook-level
    # `device` variable that may be stale or undefined.
    first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    if first_param is not None:
        tokens = tokens.to(first_param.device)

    feats = model.encode_text(tokens)
    feats = feats / feats.norm(dim=-1, keepdim=True) # normalize
    return feats.cpu().numpy()[0]

In [49]:
# This function performs a similarity search for each text description in the dataset
# to retrieve the most similar image, forming image–text pairs for training.
def baseline_training_data_generator(src_bucket, dest_bucket, collection, model_text, tokenizer, src_prefix="texts/", dest_prefix="baseline-training-data/"):
    """
    Pair every text under ``src_prefix`` with its closest image and copy both objects.

    For each text object the description is embedded, the collection is queried for
    the single nearest entry whose metadata ``type`` is "image", and the text and the
    matched image are copied to ``dest_bucket`` as ``text_<id>.txt`` / ``image_<id>.png``
    under ``dest_prefix``.

    Args:
        src_bucket: bucket holding the source texts and images.
        dest_bucket: bucket receiving the renamed pairs.
        collection: vector collection queried with the text embedding.
        model_text: model passed to ``embed_text``.
        tokenizer: tokenizer passed to ``embed_text``.
        src_prefix: key prefix under which the text objects are listed.
        dest_prefix: key prefix for the copied pairs.

    Returns:
        None. Progress is printed for every pair.
    """
    # Incremental id assigned to each image-text pair
    id_counter = 0

    paginator = s3.get_paginator("list_objects_v2") # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):
        for obj in page.get("Contents", []):

            key = obj["Key"]

            if obj['Size'] == 0 and key.endswith("/"): # skip the folder itself
                continue

            # Get the description
            description = get_text(src_bucket, key)
            # Get the embeddings of the description
            q_vec = embed_text(model_text, tokenizer, description)
            # Apply the similarity search using the description
            res_image = collection.query(
                query_embeddings=[q_vec],
                n_results=1,
                where={"type": "image"}, # Filter by metadata type
                include=["documents", "distances"]
            )

            # An empty collection (or one without image entries) yields no hit; skip the
            # text instead of failing with an IndexError.
            documents = res_image['documents']
            top_hits = documents[0] if documents else []
            if not top_hits or not top_hits[0]:
                print(f"⚠️ No matching image found for '{key}', skipping.")
                continue

            # Count only the pairs that are actually written, so ids stay contiguous
            id_counter += 1

            # Get the key for the image: the first len(src_bucket) + 1 characters of the
            # stored document are dropped (presumably a "<bucket>/" prefix -- confirm
            # against the code that fills the collection).
            key_image = top_hits[0][len(src_bucket) + 1:]

            # Destination keys: sequential, zero-padded ids (000001, 000002, ...).
            # NOTE(review): the image is always stored with a ".png" suffix, whatever
            # the extension of the source key is -- confirm this is intended.
            pair_id = str(id_counter).zfill(6)
            new_key_text = dest_prefix + "text_" + pair_id + ".txt"
            new_key_image = dest_prefix + "image_" + pair_id + ".png"

            # Copy both objects into the destination bucket under their new names
            copy_source_text = {"Bucket": src_bucket, "Key": key}
            copy_source_image = {"Bucket": src_bucket, "Key": key_image}
            s3.copy_object(Bucket=dest_bucket, Key=new_key_text, CopySource=copy_source_text)
            s3.copy_object(Bucket=dest_bucket, Key=new_key_image, CopySource=copy_source_image)

            print(f"✅ Baseline training pair #{id_counter} created successfully.")

    print("✅ All training pairs have been successfully created.")