In [1]:
# !pip install git+https://github.com/openai/CLIP.git

In [2]:
# !pip install datasets


## import required modules

In [3]:
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
import clip
import pandas as pd
from datasets import Dataset
import openai
from openai import OpenAI
import os

Load dataset. I have a csv file which contains image path and the corresponding description

In [None]:
csv_file= 'csv_file_path'
df = pd.read_csv(csv_file)
df = df.head(10)
display(df)
data = df.to_dict(orient='records')
print(data)
dataset = Dataset.from_list(data)
dataset

Unnamed: 0,image_path,caption
0,/content/drive/MyDrive/vlm/vlm_data/image/Abno...,Cervical histopathology image showing abnormal...
1,/content/drive/MyDrive/vlm/vlm_data/image/Norm...,Cervical biopsy slide showing no malignant cel...
2,/content/drive/MyDrive/vlm/vlm_data/image/Abno...,Histopathology scan revealing early-stage cerv...
3,/content/drive/MyDrive/vlm/vlm_data/image/Norm...,Cervical histopathology image showing normal c...
4,/content/drive/MyDrive/vlm/vlm_data/image/Abno...,A microscopic image of cervical tissue with mi...
5,/content/drive/MyDrive/vlm/vlm_data/image/Norm...,Microscopic image. No evidance of cancer
6,/content/drive/MyDrive/vlm/vlm_data/image/Abno...,It seems there is some spot of malignant cell ...
7,/content/drive/MyDrive/vlm/vlm_data/image/Norm...,No evidance of cancer. Completely normal
8,/content/drive/MyDrive/vlm/vlm_data/image/Abno...,The microscopic image says there is a big patc...
9,/content/drive/MyDrive/vlm/vlm_data/image/Norm...,It seems no malignant cells. Tissues are very ...


[{'image_path': '/content/drive/MyDrive/vlm/vlm_data/image/Abnormal-0002.png', 'caption': 'Cervical histopathology image showing abnormal squamous cells'}, {'image_path': '/content/drive/MyDrive/vlm/vlm_data/image/Normal-0001.png', 'caption': 'Cervical biopsy slide showing no malignant cell clusters'}, {'image_path': '/content/drive/MyDrive/vlm/vlm_data/image/Abnormal-0003.png', 'caption': 'Histopathology scan revealing early-stage cervical carcinoma'}, {'image_path': '/content/drive/MyDrive/vlm/vlm_data/image/Normal-0002.png', 'caption': 'Cervical histopathology image showing normal cell. There is no trace of malignant cell'}, {'image_path': '/content/drive/MyDrive/vlm/vlm_data/image/Abnormal-0004.png', 'caption': 'A microscopic image of cervical tissue with mild dysplasia'}, {'image_path': '/content/drive/MyDrive/vlm/vlm_data/image/Normal-0003.png', 'caption': 'Microscopic image. No evidance of cancer'}, {'image_path': '/content/drive/MyDrive/vlm/vlm_data/image/Abnormal-0005.png', 'c

Dataset({
    features: ['image_path', 'caption'],
    num_rows: 10
})

Load the BLIP model

In [5]:
model_name_fine_tune_blip = "Salesforce/blip-image-captioning-large"
processor_blip_fine_tune = BlipProcessor.from_pretrained(model_name_fine_tune_blip)
model_blip_fine_tune = BlipForConditionalGeneration.from_pretrained(model_name_fine_tune_blip)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

## Process the data
### For each image, compress the image to (16,16) shape (one can use PCA to minimuze information loss) and conterted to tensor, ids ad labels

In [None]:
from torchvision import transforms

def preprocess_data(example):
    image = Image.open(example["image_path"]).convert("RGB")
    image_resize = image.resize((16,16))

    processor_blip_fine_tune.image_processor.size = {"height": 16, "width": 16}
    inputs = processor_blip_fine_tune(images=image_resize, text=example["caption"], return_tensors="pt", padding="max_length", truncation=True)
    return {
        "pixel_values": inputs["pixel_values"].squeeze(),
        "input_ids": inputs["input_ids"].squeeze(),
        "labels": inputs["input_ids"].squeeze()
    }

train_dataset = dataset.map(preprocess_data)
train_dataset.set_format(type="torch", columns=["pixel_values", "input_ids", 'labels'])

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

## Fine-tune the BLIP model and save it
### Keep batch size, epoch size to 1 as this is free open-source model. If we take a large batch and epoch size then the collab will crash because of low memory.

In [None]:
output_dir = "./blip-finetuned"
training_args = TrainingArguments(
    output_dir= output_dir,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_dir="./logs",
    save_steps=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    fp16=True,
    save_total_limit=1,
)

trainer = Trainer(
    model=model_blip_fine_tune,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

model_blip_fine_tune.save_pretrained(output_dir)
processor_blip_fine_tune.save_pretrained(output_dir)

print("Fine-tuned model and processor saved successfully!")


## Load the fine-tuned model

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "set path where to save the fine-tuned model with model name"
processor = BlipProcessor.from_pretrained(model_path)
model = BlipForConditionalGeneration.from_pretrained(model_path).to(device)

model.eval()

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-23): 24 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (projection): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((1024,),

### Load the image for which we want to test, and convert the image to appropriate format.

In [None]:
test_image_path = "test image path"
image = Image.open(test_image_path).convert("RGB")
test_image = image.resize((16,16))
inputs = processor(images=test_image, return_tensors="pt").to(device)

### Generate caption for testing image by fine-tuned model.

In [10]:
with torch.no_grad():
    output = model.generate(**inputs)
caption = processor.batch_decode(output, skip_special_tokens=True)[0]
print("Generated Caption:", caption)

Generated Caption: contain contain contain contain contain contain contain contain contain contain contain contain contain contain contain contain contain contain contain contain


### Improve the caption by passing it to an LLM model, and repeat this process until get desired accuracy or no improvment.

In [None]:
client = OpenAI(
  api_key="openai_api_key"
)

def improve_caption(caption, context="Cervical histopathology image"):
    prompt = f"Context: {context}\nImprove this caption: '{caption}' to make it more accurate and informative."
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

model_clip, preprocess_clip = clip.load("ViT-B/32", device=device)
image_test = preprocess_clip(Image.open("test image path")).unsqueeze(0).to(device)

def evaluate_caption(image, caption):
  text = clip.tokenize([caption]).to(device)
  with torch.no_grad():
      image_features = model_clip.encode_image(image)
      text_features = model_clip.encode_text(text)
      image_features /= image_features.norm(dim=-1, keepdim=True)
      text_features /= text_features.norm(dim=-1, keepdim=True)

      similarity = (image_features @ text_features.T).squeeze()
  return similarity.item()

initial_caption = caption
initial_score = evaluate_caption(image_test, initial_caption)
improved_caption = improve_caption(initial_caption)
improved_score = evaluate_caption(image_test, improved_caption)
print("Improved Caption:", improved_caption)

while abs(initial_score-improved_score) > 0.01:
  print('Improved caption:', improve_caption)
  if improved_score > initial_score:
      initial_score = improved_score
      improved_caption = improve_caption(improved_caption)
      improved_score = evaluate_caption(image_test, improved_caption)
print(f"Final Caption: {improved_caption} | Accuracy: {improved_score:.4f}")

100%|███████████████████████████████████████| 338M/338M [00:14<00:00, 23.7MiB/s]


Improved Caption: "Microscopic view of cervical histopathology, showcasing key cellular features and structures indicative of potential lesions or abnormalities."
Improved caption: <function improve_caption at 0x78ef1acf1c60>
Final Caption: "Microscopic view of cervical histopathology displaying distinct cellular features and structural alterations that may indicate the presence of lesions or abnormalities, including changes in cell morphology and organization typical of dysplastic or neoplastic conditions." | Accuracy: 0.3024
