In [1]:
import os
import json
import os
import pathlib
import pandas as pd
from typing import Generator
from sklearn.model_selection import train_test_split


def convert_text_image_pairs_to_huggingface_json(root_csv, out_json, root_path='/mnt/purenfs/data/herbaria/train_images/'):
    """Write image/caption pairs as a JSON-lines file.

    Every output line is a compact JSON object of the form
    ``{"image": root_path + filename, "caption": label}``.

    Args:
        root_csv: Either a path to a CSV file (``str`` or ``os.PathLike``) or an
            already-loaded DataFrame. The table must provide ``filename`` and
            ``label`` columns.
        out_json: Path of the file to write; an existing file is overwritten.
        root_path: Prefix placed in front of every filename by plain string
            concatenation (no separator is inserted, so keep the trailing slash).

    Returns:
        None. The number of lines written is printed.

    Raises:
        KeyError: if ``filename`` or ``label`` is missing. Raised before
            ``out_json`` is opened, so a bad table no longer truncates the file.
    """
    if isinstance(root_csv, (str, os.PathLike)):
        df = pd.read_csv(root_csv)
    else:
        df = root_csv

    # tolist() hands back native Python scalars, so integer (label-encoded)
    # captions stay JSON-serialisable without going through iterrows().
    filenames = df['filename'].tolist()
    captions = df['label'].tolist()

    written_count = 0
    with open(out_json, "w") as f:
        for filename, caption in zip(filenames, captions):
            line_dict = {"image": root_path + filename, "caption": caption}
            json_line = json.dumps(line_dict, indent=None, separators=(",", ":"))
            f.write(json_line + "\n")
            written_count += 1
    print(f"wrote {written_count} lines to {out_json}")

## Convert the data folder of text/image pairs to a huggingface dataset-compatible json

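Replace `root_csv` in the next cell with the path to the CSV file that lists your images (it needs `filename` and `label` columns), and pass as `out_json` the path where the json file representing the image/caption pairs should be saved. The top-level folder containing your images is set by `root_path` in `convert_text_image_pairs_to_huggingface_json`.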

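Note: the converter defined above reads its image/caption pairs from the `filename` and `label` columns of a CSV (or DataFrame); it does not scan a folder for `filename.jpg`/`filename.txt` or `filename.jpeg`/`filename.txt` pairs.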

Kaggle 2022 Train and Val splits

In [2]:
# CSV with the Kaggle 2022 training rows; a relative path, so it is resolved
# against the notebook's working directory.
root_csv = "train_2022_labeled.csv"

In [3]:
# Load the CSV named by root_csv into a DataFrame.
train22 = pd.read_csv(root_csv)

def label_to_taxons(label):
    """Return the (species, genus, family) words found in a caption string.

    The caption is split on single spaces and the three names are taken from
    fixed word positions 6, 10 and 13 (0-based). The final character of the
    words at positions 6 and 13 is removed.
    NOTE(review): presumably that character is trailing punctuation in the
    caption sentence -- confirm against the CSV.

    Raises IndexError when the caption has fewer than 14 words.
    """
    words = label.split(" ")
    species = words[6][:-1]
    genus = words[10]
    family = words[13][:-1]
    return species, genus, family

# Rewrite each caption as "<family> <genus> <species>".
# A single pass over the parsed tuples replaces the row-wise DataFrame.apply and
# the three scratch columns that were created only to be dropped again.
train22['label'] = [
    f"{family} {genus} {species}"
    for species, genus, family in train22['label'].map(label_to_taxons)
]

# 'Unnamed: 0' is dropped here, leaving only the remaining columns of the CSV.
train22 = train22.drop(columns=['Unnamed: 0'])

train22.head()

Unnamed: 0,image_id,filename,label
0,00000__001,000/00/00000__001.jpg,Pinaceae Abies amabilis
1,00000__002,000/00/00000__002.jpg,Pinaceae Abies amabilis
2,00000__003,000/00/00000__003.jpg,Pinaceae Abies amabilis
3,00000__004,000/00/00000__004.jpg,Pinaceae Abies amabilis
4,00000__005,000/00/00000__005.jpg,Pinaceae Abies amabilis


In [None]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# split is repeatable.
train, val = train_test_split(train22, test_size=0.2, random_state=42)

# Disabled: persist the two splits as CSV.
#train.to_csv("/projectnb/herbdl/data/kaggle-herbaria/train_2022_scientific.csv", index=False)
#val.to_csv("/projectnb/herbdl/data/kaggle-herbaria/val_2022_scientific.csv", index=False)



# Display (rows, columns) for both splits.
train.shape, val.shape

((671817, 3), (167955, 3))

In [5]:
# Encode the string labels as integer class ids, then export both splits.
from sklearn.preprocessing import LabelEncoder

# Fit on the labels of BOTH splits. Fitting on `train` alone makes transform()
# raise ValueError as soon as `val` holds a class that never landed in `train`.
# When every class is present in `train`, the resulting ids are unchanged.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([train['label'], val['label']]))

# assign() builds new frames instead of writing a column into the frames
# returned by train_test_split, which may trigger SettingWithCopyWarning.
train = train.assign(label=label_encoder.transform(train['label']))
val = val.assign(label=label_encoder.transform(val['label']))

convert_text_image_pairs_to_huggingface_json(train, "train_22_encoded.json")
convert_text_image_pairs_to_huggingface_json(val, "val_22_encoded.json")

train.shape, val.shape

wrote 671817 lines to train_22_encoded.json
wrote 167955 lines to val_22_encoded.json


((671817, 3), (167955, 3))

In [6]:
# Draw a small fixed-seed sample from each split and store it as CSV
# (1000 training rows, 200 validation rows).
train_subset = train.sample(random_state=42, n=1000)
val_subset = val.sample(random_state=42, n=200)

for subset, csv_path in (
    (train_subset, "/projectnb/herbdl/data/kaggle-herbaria/train_2022_labeled2.csv"),
    (val_subset, "/projectnb/herbdl/data/kaggle-herbaria/val_2022_labeled2.csv"),
):
    subset.to_csv(csv_path, index=False)

In [None]:
# Peek at the first rows of the training split.
train.head()

Unnamed: 0,image_id,filename,label
686446,12640__012,126/40/12640__012.jpg,Roldana cordovensis
83405,01612__004,016/12/01612__004.jpg,Astragalus sophoroides
155613,02962__081,029/62/02962__081.jpg,Carex whitneyi
455876,08401__098,084/01/08401__098.jpg,Lilium philadelphicum
141438,02707__054,027/07/02707__054.jpg,Carex hoodii


In [None]:
from transformers import AutoTokenizer

MODEL_CPKT = "openai/clip-vit-large-patch14-336"

# Tokenizer files are cached under ../kaggle_eval/.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CPKT, cache_dir="../kaggle_eval/")

# NOTE(review): the labels are passed to the tokenizer as-is. If the
# label-encoding cell has already replaced them with integers this call
# presumably fails -- confirm the intended cell order.
val_captions = val['label'].tolist()
labels_tokenized = tokenizer(
    val_captions,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Shape of the token-id tensor for the validation captions.
labels_tokenized['input_ids'].shape

torch.Size([167955, 14])

In [4]:
# Destination JSON-lines files for the fine-tuning script.
train_json = "/projectnb/herbdl/workspaces/faridkar/herbdl/finetuning/train.json"
val_json = "/projectnb/herbdl/workspaces/faridkar/herbdl/finetuning/val.json"

# Convert each labelled CSV into its JSON counterpart (train first, then val).
# NOTE(review): the output recorded under this cell reports 671817 / 167955
# lines, which does not match the 1000 / 200-row CSVs written by the sampling
# cell -- re-run to refresh it.
for csv_path, json_path in (
    ("/projectnb/herbdl/data/kaggle-herbaria/train_2022_labeled2.csv", train_json),
    ("/projectnb/herbdl/data/kaggle-herbaria/val_2022_labeled2.csv", val_json),
):
    convert_text_image_pairs_to_huggingface_json(csv_path, json_path)

wrote 671817 lines to /projectnb/herbdl/workspaces/faridkar/herbdl/finetuning/train.json
wrote 167955 lines to /projectnb/herbdl/workspaces/faridkar/herbdl/finetuning/val.json


Test that it worked by running the following cell:

In [None]:
# test loading it back in
from datasets import load_dataset
# Each JSON file is loaded on its own; its rows are read back below through the
# 'train' key of the returned object, for the validation file as well.
train_dataset = load_dataset("json", data_files=train_json)
val_dataset = load_dataset("json", data_files=val_json)

# Row counts of the two reloaded files; they should match the "wrote N lines"
# messages printed by the converter.
len(train_dataset['train']), len(val_dataset['train'])

(671817, 167955)

## Run the finetuning

### Configuration

`repo_id` - The starting point for finetuning. By default this uses the `openai/clip-vit-large-patch14-336` pre-trained CLIP weights. This is what Stable Diffusion versions up to 1.5 used. Another option you might want to consider is `laion/CLIP-ViT-H-14-laion2B-s32B-b79K`, which was used for Stable Diffusion 2.0 onwards.

`output_folder` - Where to store the output. The saving process writes multiple files to this folder, so it should be empty.

`batch_size` - Training batch size. Don't go lower than 8 - try 32 or 64 (unless you only have a few images).

`num_train_epochs` - How many epochs to train. With fewer than 500 images, each epoch takes a few minutes on a 3090. Start with a small number, say `3`, and check the loss when it's done before increasing the number of epochs. With 3 epochs my loss went down to around 2; after 10 epochs it was down to 0.63. Be careful not to over-fit.

In [None]:
# Settings consumed by the fine-tuning command in the next cell.
# Starting checkpoint, passed as --model_name_or_path.
repo_id =  "openai/clip-vit-large-patch14-336"
# Passed as --output_dir.
output_folder = "/projectnb/herbdl/workspaces/faridkar/finetuning/output/finetuned-kaggle-2022-05-06"
# Passed as --per_device_train_batch_size.
batch_size = 8
# Passed as --num_train_epochs.
num_train_epochs = 1

# JSON-lines files passed as --train_file / --validation_file.
train_json = "/projectnb/herbdl/workspaces/faridkar/herbdl/finetuning/train.json"
val_json = "/projectnb/herbdl/workspaces/faridkar/herbdl/finetuning/val.json"

In [None]:
# Launch CLIP_finetuning.py as a shell command; the {name} placeholders are
# filled in from the notebook variables of the same name.
# NOTE(review): the values are interpolated unquoted, so paths containing
# spaces would presumably be split by the shell -- confirm before changing them.
print(f"Finetuning {repo_id} for {num_train_epochs} epochs with batch size {batch_size}, and then saving output to {output_folder}.")
!python CLIP_finetuning.py \
    --output_dir {output_folder} \
    --model_name_or_path {repo_id} \
    --train_file {train_json} \
    --validation_file {val_json} \
    --image_column image \
    --overwrite_output_dir=True \
    --max_seq_length=35 \
    --num_train_epochs={num_train_epochs} \
    --caption_column caption \
    --remove_unused_columns=False \
    --do_train \
    --do_eval \
    --per_device_train_batch_size={batch_size} \
    --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 
# These messages are printed unconditionally; the exit status of the command
# above is not checked.
print("--\nDONE")
print(f"If it worked, trained data should be in {output_folder}")

Finetuning openai/clip-vit-large-patch14-336 for 1 epochs with batch size 8, and then saving output to /projectnb/herbdl/workspaces/faridkar/finetuning/output/finetuned-kaggle-2022-05-06.


Generating train split: 671817 examples [00:00, 1802476.77 examples/s]
Generating validation split: 167955 examples [00:00, 1825484.14 examples/s]
Filter: 100%|██████████████████| 671817/671817 [03:53<00:00, 2880.56 examples/s]
Running tokenizer on train dataset: 100%|█| 671817/671817 [00:12<00:00, 55596.95
Filter: 100%|██████████████████| 167955/167955 [00:58<00:00, 2868.23 examples/s]
Running tokenizer on validation dataset: 100%|█| 167955/167955 [00:02<00:00, 564
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
{'loss': 1.9976, 'learning_rate': 4.940460596822978e-05, 'epoch': 0.01}         
{'loss': 1.8225, 'learning_rate': 4.880921193645955e-05, 'epoch': 0.02}         
{'loss': 1.6873, 'learning_rate': 4.8213817904689324e-05, 'epoch': 0.04}        
{'loss': 1.5664, 'learning_rate': 4.76184238729191e-05, 'epoch': 0.05}          
{'loss': 1.4547, 'learning_rate': 4.702302984114887e-05, 'epoch': 0.06}         
  7%|██▌                                 | 2956/41989 [39

If it all worked, your finetuned CLIP model is in the `output_folder` defined above.