In [1]:
import os
import json
import os
import pathlib
from typing import Generator
import pandas as pd
                
def convert_text_image_pairs_to_huggingface_json(
    root_csv,
    out_json,
    root_path='/projectnb/herbdl/data/kaggle-herbaria/herbarium-2022/train_images/',
):
    """Convert a CSV of image/caption pairs into a JSON Lines file.

    Each CSV row becomes one compact JSON object on its own line, of the form
    ``{"image": <root_path joined with filename>, "caption": <label>}``, which
    is the layout read back by ``load_dataset("json", data_files=out_json)``.

    Args:
        root_csv: Path to a CSV file with a ``filename`` column (image path
            relative to ``root_path``) and a ``label`` column (caption text).
        out_json: Path of the JSON Lines file to write. An existing file is
            overwritten.
        root_path: Directory that the ``filename`` values are relative to.
            Defaults to the Herbarium 2022 training image directory. A
            trailing separator is optional.

    Returns:
        None. The number of lines written is reported on stdout.

    Raises:
        KeyError: If the CSV has no ``filename`` or ``label`` column.
    """
    df = pd.read_csv(root_csv)
    written_count = 0
    with open(out_json, "w", encoding="utf-8") as f:
        # Zip the two columns instead of using df.iterrows(): iterrows builds
        # a Series object for every row, which is very slow on large CSVs.
        for filename, label in zip(df["filename"], df["label"]):
            line_dict = {"image": os.path.join(root_path, filename), "caption": label}
            f.write(json.dumps(line_dict, indent=None, separators=(",", ":")) + "\n")
            written_count += 1
    print(f"wrote {written_count} lines to {out_json}")

## Convert the CSV of image/caption pairs to a Hugging Face dataset-compatible JSON file

Replace `root_csv` in the next cell with the path to the CSV file that lists your image/caption pairs, and `out_json` with the path where the JSON file representing those pairs should be saved.

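Note this only works with a CSV that has a `filename` column (the image path, relative to the image root directory `root_path` set in `convert_text_image_pairs_to_huggingface_json`) and a `label` column (the caption text for that image).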

In [4]:
root_csv = "/projectnb/herbdl/data/kaggle-herbaria/train_2022_labeled.csv"
out_json = "/projectnb/herbdl/workspaces/smritis/finetuning/training/pairs.json"
# Build the JSON Lines file only when it is missing. This keeps a fresh
# "Restart & Run All" working (the cells below read out_json) without
# repeating the conversion on every run.
# Delete out_json to force regeneration, e.g. after the CSV changes or if a
# previous conversion was interrupted and left a partial file behind.
if not os.path.exists(out_json):
    convert_text_image_pairs_to_huggingface_json(root_csv, out_json)

Test that it worked by running the following cell:

In [5]:
# Reload the generated file to confirm that it parses as a dataset,
# then show the first image/caption pair as a sanity check.
from datasets import load_dataset

dataset = load_dataset("json", data_files=out_json)
first_example = dataset["train"][0]
print(f"first image: {first_example['image']}, caption: '{first_example['caption']}'")

first image: /projectnb/herbdl/data/kaggle-herbaria/herbarium-2022/train_images/000/00/00000__001.jpg, caption: 'This is an image of species amabilis, in the genus Abies of family Pinaceae. It is part of the collection of institution A.'


## Run the finetuning

### Configuration

`repo_id` - The starting point for finetuning. By default this uses the `openai/clip-vit-large-patch14-336` pre-trained CLIP weights. This is what Stable Diffusion versions up to 1.5 used. Another option you might want to consider is `laion/CLIP-ViT-H-14-laion2B-s32B-b79K`, which was used for Stable Diffusion 2.0 onwards.

`output_folder` - Where to store the output. The saving process writes multiple files to this folder, so it should be empty.

`batch_size` - Training batch size. Don't go lower than 8 - try 32 or 64 (unless you only have a few images).

`num_train_epochs` - How many epochs to train. With fewer than 500 images, each epoch takes a few minutes on a 3090. Start with a small number, say `3`, and check the loss when it's done before increasing the number of epochs. With 3 epochs my loss went down to around 2. After 10 epochs it was down to 0.63. Be careful not to over-fit.

In [6]:
# Pre-trained checkpoint to start from; passed to the training script as --model_name_or_path.
repo_id =  "openai/clip-vit-large-patch14-336"
# Destination for the training output; passed as --output_dir.
output_folder = "/projectnb/herbdl/workspaces/smritis/finetuning/output/finetuned-kaggle-2022"
# Passed as --per_device_train_batch_size.
batch_size = 8
# Passed as --num_train_epochs.
num_train_epochs = 1

In [7]:
print(f"Finetuning {repo_id} for {num_train_epochs} epochs with batch size {batch_size}, and then saving output to {output_folder}.")
!python CLIP_finetuning.py \
    --output_dir {output_folder} \
    --model_name_or_path {repo_id} \
    --train_file {out_json} \
    --image_column image \
    --overwrite_output_dir=True \
    --max_seq_length=77 \
    --num_train_epochs={num_train_epochs} \
    --caption_column caption \
    --remove_unused_columns=False \
    --do_train \
    --per_device_train_batch_size={batch_size} \
    --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 
print("--\nDONE")
print(f"If it worked, trained data should be in {output_folder}")

Finetuning openai/clip-vit-large-patch14-336 for 1 epochs with batch size 8, and then saving output to /projectnb/herbdl/workspaces/smritis/finetuning/output/finetuned-kaggle-2022.
Filter:  29%|█████▏            | 243000/839772 [41:05<2:18:10, 71.98 examples/s]^C
Filter:  29%|█████▏            | 243000/839772 [41:11<1:41:08, 98.34 examples/s]
Traceback (most recent call last):
  File "/projectnb/herbdl/workspaces/smritis/finetuning/CLIP_finetuning.py", line 537, in <module>
    main()
  File "/projectnb/herbdl/workspaces/smritis/finetuning/CLIP_finetuning.py", line 431, in main
    train_dataset = train_dataset.filter(
  File "/projectnb/herbdl/workspaces/smritis/.venv/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 567, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/projectnb/herbdl/workspaces/smritis/.venv/lib/python3.10/site-packages/datasets/fingerprint.py", line 482, in wrapper
    out = func(dataset, *args, **kwargs)
  Fil

If it all worked, your finetuned CLIP model is in the `output_folder` defined above.