In [1]:
!pip install transformers
!pip install git+https://github.com/openai/CLIP.git
!pip install tqdm

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-5wbphyig
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-5wbphyig
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidi

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!rm -rf /content/drive/MyDrive/image

In [10]:
import struct
from struct import unpack
import matplotlib.pyplot as plt
from PIL import Image
import os

list_image_path = []
list_txt = []

# Upack the drawing data
def unpack_drawing(file_handle):
    key_id, = unpack('Q', file_handle.read(8)) # Read 8 bytes as an unsigned long long
    country_code, = unpack('2s', file_handle.read(2))
    recognized, = unpack('b', file_handle.read(1))
    timestamp, = unpack('I', file_handle.read(4))
    n_strokes, = unpack('H', file_handle.read(2)) # Read 2 bytes as an unsigned short
    image = [] # Store the strokes
    for i in range(n_strokes):
        n_points, = unpack('H', file_handle.read(2))
        fmt = str(n_points) + 'B'
        x = unpack(fmt, file_handle.read(n_points))
        y = unpack(fmt, file_handle.read(n_points))
        y = tuple(255 - yi for yi in y)  # Invert the image
        image.append((x, y)) # Append the stroke to the image

    return {
        'key_id': key_id,
        'country_code': country_code.decode(),
        'recognized': recognized,
        'timestamp': timestamp,
        'image': image
    }

# Unpack all the drawings in a file
def unpack_drawings(filename):
    with open(filename, 'rb') as f:
      while True:
        try:
          yield unpack_drawing(f)
        except struct.error:
          break

# Plot the drawing
def plot_drawing(drawing):
    if isinstance(drawing, torch.Tensor):
        drawing = drawing.cpu().numpy()

    if drawing.ndim == 2:  # Deal with 2D tensors
        fig, ax = plt.subplots()
        for x, y in drawing:
            ax.plot(x, y)
        plt.show()
    elif drawing.ndim == 4:  # Deal with 4D tensors
        batch_size = drawing.shape[0]
        for i in range(batch_size):
            img = drawing[i].transpose(1, 2, 0)  # Change the shape from (1, 28, 28) to (28, 28, 1)
            img = (img - img.min()) / (img.max() - img.min())  # Normalize the image

            plt.imshow(img)
            plt.title(f"Image {i+1}")
            plt.axis('off')
            plt.show()

def save_drawing_as_image(drawing, folder_path, image_index):
    image = Image.new('L', (256, 256), 255) # Create a white image
    pixels = image.load()

    for x, y in drawing['image']:
        for i in range(len(x)):
          pixels[x[i], y[i]] = 0 # Set the pixel to black
    image_path = os.path.join(folder_path, f'image_{image_index}.png')
    image.save(image_path)

    # choose the first image of each category
    if image_index == 0:
        list_image_path.append(image_path)
        folder_name = os.path.basename(folder_path)
        list_txt.append(folder_name)
    print(f"Saved image {image_index}.jpg to {folder_path}")

def process_bin_files(bin_folder, image_folder, max_images_per_type = 100, num = 1):
    for bin_file in os.listdir(bin_folder):
        if bin_file.endswith('.bin'):
            bin_file_path = os.path.join(bin_folder, bin_file)
            drawing_name = os.path.splitext(bin_file)[0]

            drawing_folder_path = os.path.join(image_folder, drawing_name)
            os.makedirs(drawing_folder_path, exist_ok=True)


            print(f"Processing {bin_file_path}, {num} / 345")
            num += 1

            for image_index, drawing in enumerate(unpack_drawings(bin_file_path)):
                if image_index >= max_images_per_type:
                  break
                save_drawing_as_image(drawing, drawing_folder_path, image_index)

bin_folder = '/content/drive/MyDrive/binary'
image_folder = '/content/drive/MyDrive/image'
os.makedirs(image_folder, exist_ok=True)
process_bin_files(bin_folder, image_folder)

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Saved image 49.jpg to /content/drive/MyDrive/image/umbrella
Saved image 50.jpg to /content/drive/MyDrive/image/umbrella
Saved image 51.jpg to /content/drive/MyDrive/image/umbrella
Saved image 52.jpg to /content/drive/MyDrive/image/umbrella
Saved image 53.jpg to /content/drive/MyDrive/image/umbrella
Saved image 54.jpg to /content/drive/MyDrive/image/umbrella
Saved image 55.jpg to /content/drive/MyDrive/image/umbrella
Saved image 56.jpg to /content/drive/MyDrive/image/umbrella
Saved image 57.jpg to /content/drive/MyDrive/image/umbrella
Saved image 58.jpg to /content/drive/MyDrive/image/umbrella
Saved image 59.jpg to /content/drive/MyDrive/image/umbrella
Saved image 60.jpg to /content/drive/MyDrive/image/umbrella
Saved image 61.jpg to /content/drive/MyDrive/image/umbrella
Saved image 62.jpg to /content/drive/MyDrive/image/umbrella
Saved image 63.jpg to /content/drive/MyDrive/image/umbrella
Saved image 64.jpg to /content/drive/MyDrive/image/umbrella

"\nfor drawing in unpack_drawings('/content/drive/MyDrive/binary/rain.bin'):\n    plot_drawing(drawing)\n"

In [11]:
print(list_image_path)
print(list_txt)
print(len(list_image_path))
print(len(list_txt))

['/content/drive/MyDrive/image/boomerang/image_0.png', '/content/drive/MyDrive/image/aircraft carrier/image_0.png', '/content/drive/MyDrive/image/bottlecap/image_0.png', '/content/drive/MyDrive/image/The Eiffel Tower/image_0.png', '/content/drive/MyDrive/image/bowtie/image_0.png', '/content/drive/MyDrive/image/bracelet/image_0.png', '/content/drive/MyDrive/image/bread/image_0.png', '/content/drive/MyDrive/image/brain/image_0.png', '/content/drive/MyDrive/image/airplane/image_0.png', '/content/drive/MyDrive/image/broccoli/image_0.png', '/content/drive/MyDrive/image/bucket/image_0.png', '/content/drive/MyDrive/image/bridge/image_0.png', '/content/drive/MyDrive/image/bus/image_0.png', '/content/drive/MyDrive/image/bench/image_0.png', '/content/drive/MyDrive/image/arm/image_0.png', '/content/drive/MyDrive/image/butterfly/image_0.png', '/content/drive/MyDrive/image/baseball bat/image_0.png', '/content/drive/MyDrive/image/axe/image_0.png', '/content/drive/MyDrive/image/anvil/image_0.png', '/

In [12]:
import json # Import the json module: handling JSON data
from PIL import Image # Import the image module from the PIL library: image processing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim

import clip
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm

# Load the model and corresponding preprocessor for image and text
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
preprocess = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

class image_title_dataset():
    def __init__(self, list_image_path,list_txt):
        # Initialize image paths and corresponding texts
        self.image_path = list_image_path
        # Tokenize text using CLIP's tokenizer
        self.title  = clip.tokenize(list_txt)

    def __len__(self):
        return len(self.title)

    def __getitem__(self, idx):
        # Preprocess image using CLIP's preprocessing function
        image = preprocess(images=Image.open(self.image_path[idx]), return_tensors="pt")["pixel_values"].squeeze(0)
        title = self.title[idx]
        return image, title


dataset = image_title_dataset(list_image_path[:100],list_txt[:100])
BATCH_SIZE = 2 # should greater than 1
train_dataloader = DataLoader(dataset,batch_size = BATCH_SIZE) #Define your own dataloader

#https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        p.grad.data = p.grad.data.float()

if device == "cpu":
  model.float()

loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-6, betas = (0.9, 0.98), eps = 1e-6, weight_decay = 0.2)

num_epochs = 300
for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader, total = len(train_dataloader))
    for batch in pbar:
        optimizer.zero_grad()

        images, texts = batch
        images = images.to(device)
        texts = texts.to(device)

        # Forward pass
        outputs = model(pixel_values=images, input_ids=texts)
        logits_per_image, logits_per_text = outputs.logits_per_image, outputs.logits_per_text


         # Check for nan values in logits
        if torch.isnan(logits_per_image).any() or torch.isnan(logits_per_text).any():
            print("Found nan values in logits")
            continue

        # Calculate loss
        ground_truth = torch.arange(len(images), dtype = torch.long, device = device)
        total_loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth))/2

        # Backward pass
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # ensure no grad explord

        if device == "cuda":
            optimizer.step()
        else:
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)
        pbar.set_description(f"EPOCH: {epoch}/{num_epochs}, LOSS: {total_loss.item():.9f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

EPOCH: 0/300, LOSS: 0.613207221: 100%|██████████| 50/50 [00:09<00:00,  5.17it/s]
EPOCH: 1/300, LOSS: 0.631505609: 100%|██████████| 50/50 [00:07<00:00,  6.44it/s]
EPOCH: 2/300, LOSS: 0.667812526: 100%|██████████| 50/50 [00:07<00:00,  6.99it/s]
EPOCH: 3/300, LOSS: 0.631917715: 100%|██████████| 50/50 [00:07<00:00,  6.82it/s]
EPOCH: 4/300, LOSS: 0.576780915: 100%|██████████| 50/50 [00:08<00:00,  5.76it/s]
EPOCH: 5/300, LOSS: 0.482101589: 100%|██████████| 50/50 [00:06<00:00,  7.28it/s]
EPOCH: 6/300, LOSS: 0.388072282: 100%|██████████| 50/50 [00:07<00:00,  6.44it/s]
EPOCH: 7/300, LOSS: 0.313265145: 100%|██████████| 50/50 [00:07<00:00,  7.10it/s]
EPOCH: 8/300, LOSS: 0.274166286: 100%|██████████| 50/50 [00:07<00:00,  6.51it/s]
EPOCH: 9/300, LOSS: 0.252338111: 100%|██████████| 50/50 [00:07<00:00,  7.07it/s]
EPOCH: 10/300, LOSS: 0.227543026: 100%|██████████| 50/50 [00:07<00:00,  6.28it/s]
EPOCH: 11/300, LOSS: 0.210558996: 100%|██████████| 50/50 [00:07<00:00,  7.14it/s]
EPOCH: 12/300, LOSS: 0.192

In [17]:
# Save the model
checkpoint_dir = "model_checkpoint"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': total_loss,
    }, f"model_checkpoint/model_10.pt")

In [28]:
# Load the model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
modeltuning = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
preprocess = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

checkpoint = torch.load("model_checkpoint/model_10.pt")
modeltuning.load_state_dict(checkpoint['model_state_dict'])

model.eval()
modeltuning.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
modeltuning.to(device)


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [64]:
# preprocess the test images and texts
test_image_path = list_image_path[:50]
test_texts = list_txt[:50]

# preprocess the image
test_images = [Image.open(img_path).convert("RGB") for img_path in test_image_path]
test_images_preprocessed = torch.cat([preprocess(images=img, return_tensors="pt")["pixel_values"] for img in test_images]).to(device)

# preprocess the text
test_texts_preprocessed = clip.tokenize(test_texts).to(device)




In [65]:
# without Tuning
with torch.no_grad():
    outputs = model(pixel_values=test_images_preprocessed, input_ids=test_texts_preprocessed)
    logits_per_image, logits_per_text = outputs.logits_per_image, outputs.logits_per_text

# with Tuning
with torch.no_grad():
    outputstuning = modeltuning(pixel_values=test_images_preprocessed, input_ids=test_texts_preprocessed)
    logits_per_imagetuning, logits_per_texttuning = outputstuning.logits_per_image, outputstuning.logits_per_text


def check_success(logits):
    success_count = 0
    for m, image_list in enumerate(logits.tolist()):
        max_similarity = max(image_list)
        for n, similarity in enumerate(image_list):
            #print(f"Similarity between image {m + 1} and description {n + 1}: {similarity}\n")
            if m == n and similarity == max_similarity:
                success_count += 1
    return success_count

print("Before Tuning: \n")
success_count_of_image =check_success(logits_per_image)
success_count_of_text = check_success(logits_per_text)
print(f"\nCount of successful recognitions: {success_count_of_image + success_count_of_text}\n")


print("\nAfter Tuning: \n")
success_count_of_image = check_success(logits_per_imagetuning)
success_count_of_text = check_success(logits_per_texttuning)
print(f"\nCount of successful recognitions: {success_count_of_image + success_count_of_text}\n")

Before Tuning: 


Count of successful recognitions: 2


After Tuning: 


Count of successful recognitions: 1

