Weights & Biasesを使って、CLIP-Vit-Largeモデルのハイパーパラメータチューニングを行うためのNotebook

# Module

In [1]:
import os
from glob import glob
import pickle
from pathlib import Path
from PIL import Image
import cv2

import numpy as np
import pandas as pd
import math
from scipy import spatial
import random
from tqdm.notebook import tqdm
import wandb

from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.optim as optim
from torchvision import transforms
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoProcessor
from transformers import TrainingArguments, Trainer

import sys
# pathの追加
# sys.path.append('./sentence-transformers-2-2-2/sentence-transformers')
from sentence_transformers import SentenceTransformer, models
# sentence transformerの構築
st_model = SentenceTransformer('./sentence-transformers-2-2-2/all-MiniLM-L6-v2')

In [6]:
# !pip install -r requirements.txt

In [2]:
# clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('using', device)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

using cuda


# Functions

In [3]:
def cosine_similarity_loss(pred, target):
    # nn.CosineSimilarity()は二つのtensorが同じ向きであれば1を返す
    cos = nn.CosineSimilarity(dim=1)
    output = -cos(pred, target).mean()
    return output

def cosine_similarity(y_trues, y_preds):
    # 1 - spa...は見た目的にはlossを求めているように見えるが，
    # 実際には1 - (1 - cos_sim) = cos_simである．
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
    # を参照のこと
    return np.mean([
        1 - spatial.distance.cosine(y_true, y_pred) 
        for y_true, y_pred in zip(y_trues, y_preds)
    ])

def denoising(image_path, DENOISING=None, KERNEL=1):
    if DENOISING == 'smooth':
        image = cv2.imread(image_path)
        image = cv2.blur(image, (KERNEL, KERNEL))
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    elif DENOISING == 'median':
        image = cv2.imread(image_path)
        image = cv2.medianBlur(image, KERNEL)
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    elif DENOISING == 'gaussian':
        image = cv2.imread(image_path)
        image = cv2.GaussianBlur(image, (KERNEL, KERNEL), 0)
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    else:
        image = Image.open(image_path)
    
    return image

transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Dataset & DataLoader

In [9]:
# training data
train_df = pd.read_csv('./Datasets/2M/diffusiondb_154318.csv')
train_df

# validation data
# https://www.kaggle.com/code/shoheiazuma/diffusiondb-data-cleansing/comments
val_df1 = pd.read_csv('./Datasets/30K/modified_all_30K.csv')
val_df2 = pd.read_csv('./Datasets/gustavosta-stable-diffusion-prompts-sd2-v2/cleansed_80K.csv')
val_df3 = pd.read_csv('./Datasets/Generated/000000-006532/000000-006532_pairs.csv')
val_df4 = pd.read_csv('./Datasets/Generated/006533-011027/006533-011027.csv')

val_df1 = val_df1[['prompt', 'image_path']]
val_df2 = val_df2[['prompt', 'image_path']]
val_df3 = val_df3[['prompt', 'image_path']]
val_df4 = val_df4[['prompt', 'image_path']]

val_df = pd.concat([val_df1, val_df2, val_df3, val_df4],
                   axis=0)
val_df.reset_index(drop=True, inplace=True)
val_df = val_df.sample(n=15000,
                        random_state=42,
                        axis=0)

In [9]:
train_images = train_df['image_path'].tolist()
train_prompts = train_df['prompt'].tolist()
val_images = val_df['image_path'].tolist()
val_prompts = val_df['prompt'].tolist()

print(train_prompts[0])
print(type(train_prompts[0]))

a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation
<class 'str'>


In [10]:
image = denoising(train_images[0], 'None', 3)
print(type(image))
print(image.size)

<class 'PIL.PngImagePlugin.PngImageFile'>
(512, 512)


In [11]:
# for image in val_images:
#     Image.open(image)

In [12]:
image = Image.open(train_images[0])
print(image)
# image = clip_processor(images=image)['pixel_values'][0]
# print(image.shape)
image = transform(image)
print(image)
print(image.shape)

<PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512 at 0x238948C6A40>
tensor([[[-0.5082, -0.4911, -0.5082,  ..., -0.5424, -0.5424, -0.5596],
         [-0.4911, -0.4739, -0.4739,  ..., -0.5596, -0.5082, -0.5253],
         [-0.4911, -0.5082, -0.4911,  ..., -0.5082, -0.5082, -0.5082],
         ...,
         [-0.7137, -0.6965, -0.7308,  ..., -1.4672, -0.3541,  0.0569],
         [-0.7137, -0.7137, -0.7137,  ..., -1.5699, -0.7650,  0.0741],
         [-0.7137, -0.7308, -0.6965,  ..., -1.7240, -1.2788, -0.0287]],

        [[-0.3200, -0.3025, -0.3200,  ..., -0.3200, -0.3375, -0.3550],
         [-0.3200, -0.3200, -0.3025,  ..., -0.3550, -0.3025, -0.3025],
         [-0.3200, -0.3550, -0.3200,  ..., -0.3025, -0.3025, -0.3025],
         ...,
         [-0.5476, -0.5126, -0.5476,  ..., -1.2654, -0.5301, -0.3901],
         [-0.5301, -0.5301, -0.5476,  ..., -1.2829, -0.7927, -0.3725],
         [-0.5301, -0.5476, -0.5126,  ..., -1.4405, -1.1779, -0.3200]],

        [[-0.1661, -0.1487, -0.1487, 

In [13]:
# def get_train_test_split():
#     """add your image paths and embedding labels here"""
#     train_images = train_images
#     train_labels = train_prompts
#     test_images = val_images
#     test_labels = val_prompts
#     return train_images, train_labels, test_images, test_labels

# class IMGDataset(Dataset):
#     def __init__(self, image_paths, targets, clip_processor=clip_processor, transform=transform):
#         self.images = image_paths
#         self.labels = targets
#         self.input_processor = clip_processor
#         self.transform = transform

#     def __len__(self):
#         return len(self.images)

#     def __getitem__(self, item):
#         # image
#         image = Image.open(self.images[item])
#         image = self.transform(image)
        
#         # text
#         target = self.labels[item]
        
#         return image, target

# class Collator:
#     def __init__(self):
#         self.st_model = st_model
    
#     def __call__(self, batch):
#         images, prompts = zip(*batch)
#         images = torch.stack(images)
#         embeddings = self.st_model.encode(prompts,
#                                           show_progress_bar=False,
#                                           convert_to_tensor=True)
        
#         return images, embeddings

# Model

In [14]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        clip = AutoModel.from_pretrained("openai/clip-vit-large-patch14")
        self.vision = clip.vision_model
        self.fc = nn.Linear(1024, 384)

    def forward(self, x):
        out = self.vision(x)['pooler_output']
        return self.fc(out)

def load_pretrained_model(UNFREEZE_START=0):
    model = Net()

    trainable_model_weights = False
    for name, child in model.named_children():
        if name == 'vision':
            for pn, p in child.named_parameters():
                if str(UNFREEZE_START) in pn:
                    """start unfreezing layer , the weights are trainable"""
                    trainable_model_weights = True
                p.requires_grad = trainable_model_weights
                # if p.requires_grad:
                #     print(f"{pn} is set to be trainable.")

    return model.to(device)

構築するモデルのアーキテクチャ：  

Net(  
  (vision): CLIPVisionTransformer(  
    (embeddings): CLIPVisionEmbeddings(  
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)  
      (position_embedding): Embedding(257, 1024)  
    )  
    (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)  
    (encoder): CLIPEncoder(  
      (layers): ModuleList(  
        (0-23): 24 x CLIPEncoderLayer(  
          (self_attn): CLIPAttention(  
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)  
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)  
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)  
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)  
          )  
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)  
          (mlp): CLIPMLP(  
            (activation_fn): QuickGELUActivation()  
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)  
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)  
          )  
          (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)  
        )  
      )  
    )  
    (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)  
  )  
  (fc): Linear(in_features=1024, out_features=384, bias=True)  
)  

# Tuning

以下のコードは、

- [WandBでハイパラチューニングもしちゃおう！](https://qiita.com/ryo3568/items/5ef6ec8b5bba163bcf54)
- [Weights & Biasesを使用したデータサイエンス実験管理](https://wandb.ai/wandb_fc/japanese/reports/Weights-Biases---Vmlldzo4MDI5MTA) - これは要復習

を参考にした．

In [15]:
# 使用する乱数のseedを固定
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything()

## Sweep Configulation

### Base

In [16]:
# # Base
# sweep_config = dict()

# # metric
# metric = dict()
# metric['name'] = 'val_similarity'
# metric['goal'] = 'maximize'
# metric['target'] = 0.65

# # parameters
# parameters = dict()
# parameters['epochs'] = {'value': 10}
# parameters['batch_size'] = {'value': 30}
# ## model parameter
# parameters['unfreeze_start'] = {
#     'distribution': 'int_uniform', 
#     'min': 21,
#     'max': 22,
# }
# ## optimizer parameters
# parameters['learning_rate'] = {
#     'distribution': 'uniform',
#     'min': 0.0030,
#     'max': 0.0067
# }
# parameters['weight_decay'] = {
#     'distribution': 'uniform',
#     'min': 0.04,
#     'max': 0.07
# }

# # early stopping
# early_stopping = dict()
# early_stopping['type'] = 'hyperband'
# early_stopping['max_iter'] = 5
# early_stopping['s'] = 3

# sweep_config['method'] = 'bayes'
# sweep_config['metric'] = metric
# sweep_config['parameters'] = parameters
# sweep_config['early_terminate'] = early_stopping
# sweep_config

### Base + Denoising

In [17]:
# # Base + Denoising

# sweep_config = dict()

# # metric
# metric = dict()
# metric['name'] = 'val_similarity'
# metric['goal'] = 'maximize'
# metric['target'] = 0.99

# # parameters
# parameters = dict()
# parameters['epochs'] = {'value': 1000}
# parameters['batch_size'] = {'value': 30}

# ## denoising parameters
# # parameters['denoising'] = {
# #     'distribution': 'categorical',
# #     'values': [
# #         # 'smooth',
# #         # 'median',
# #         'gaussian',
# #         # 'None'
# #     ]
# # }
# parameters['denoising'] = {'value': 'gaussian'}

# # parameters['kernel'] = {
# #     'distribution': 'categorical',
# #     'values': [3, 5, 7, 9]
# # }
# parameters['kernel'] = {'value': 3}

# ## model parameter
# # parameters['unfreeze_start'] = {
# #     'distribution': 'int_uniform', 
# #     'min': 21,
# #     'max': 22,
# # }
# parameters['unfreeze_start'] = {'value': 21}

# ## optimizer parameters
# # parameters['learning_rate'] = {
# #     'distribution': 'uniform',
# #     'min': 0.003,
# #     'max': 0.008
# # }
# parameters['learning_rate'] = {'value': 0.004836150520205524}

# # parameters['weight_decay'] = {
# #     'distribution': 'uniform',
# #     'min': 0.04,
# #     'max': 0.07
# # }
# parameters['weight_decay'] = {'value': 0.05458727423867926}

# # early stopping
# # early_stopping = dict()
# # early_stopping['type'] = 'hyperband'
# # early_stopping['max_iter'] = 5
# # early_stopping['s'] = 2

# sweep_config['method'] = 'bayes'
# sweep_config['metric'] = metric
# sweep_config['parameters'] = parameters
# # sweep_config['early_terminate'] = early_stopping
# sweep_config

### Base + Denoising + Early Stopping

In [18]:
# Base + Denoising + Early Stopping

sweep_config = dict()

# metric
metric = dict()
metric['name'] = 'COUNTER'
metric['goal'] = 'maximize'
metric['target'] = 4

# parameters
parameters = dict()
parameters['epochs'] = {'value': 1000}
parameters['batch_size'] = {'values': [30, 30]}

## denoising parameters
# parameters['denoising'] = {
#     'distribution': 'categorical',
#     'values': [
#         # 'smooth',
#         # 'median',
#         'gaussian',
#         # 'None'
#     ]
# }
parameters['denoising'] = {'value': 'gaussian'}

# parameters['kernel'] = {
#     'distribution': 'categorical',
#     'values': [3, 5, 7, 9]
# }
parameters['kernel'] = {'value': 3}

## model parameter
# parameters['unfreeze_start'] = {
#     'distribution': 'int_uniform', 
#     'min': 21,
#     'max': 22,
# }
parameters['unfreeze_start'] = {'value': 21}

## optimizer parameters
# parameters['learning_rate'] = {
#     'distribution': 'uniform',
#     'min': 0.003,
#     'max': 0.008
# }
parameters['learning_rate'] = {'value': 0.004836150520205524}

# parameters['weight_decay'] = {
#     'distribution': 'uniform',
#     'min': 0.04,
#     'max': 0.07
# }
parameters['weight_decay'] = {'value': 0.05458727423867926}

# early stopping
# early_stopping = dict()
# early_stopping['type'] = 'hyperband'
# early_stopping['max_iter'] = 5
# early_stopping['s'] = 2

sweep_config['method'] = 'bayes'
sweep_config['metric'] = metric
sweep_config['parameters'] = parameters
# sweep_config['early_terminate'] = early_stopping
sweep_config

{'method': 'bayes',
 'metric': {'name': 'COUNTER', 'goal': 'maximize', 'target': 4},
 'parameters': {'epochs': {'value': 1000},
  'batch_size': {'values': [30, 30]},
  'denoising': {'value': 'gaussian'},
  'kernel': {'value': 3},
  'unfreeze_start': {'value': 21},
  'learning_rate': {'value': 0.004836150520205524},
  'weight_decay': {'value': 0.05458727423867926}}}

### val_similarity >= 0.5

In [19]:
# # >=0.5
# sweep_config = dict()

# # metric
# metric = dict()
# metric['name'] = 'val_similarity'
# metric['goal'] = 'maximize'
# metric['target'] = 0.65

# # parameters
# parameters = dict()
# parameters['epochs'] = {'value': 10}
# parameters['batch_size'] = {'value': 30}
# ## model parameter
# parameters['unfreeze_start'] = {'value': 22}
# ## optimizer parameters
# parameters['learning_rate'] = {
#     'distribution': 'uniform',
#     'min': 0.003,
#     'max': 0.007
# }
# parameters['weight_decay'] = {
#     'distribution': 'uniform',
#     'min': 0.05,
#     'max': 0.07
# }

# # early stopping
# early_stopping = dict()
# early_stopping['type'] = 'hyperband'
# early_stopping['max_iter'] = 5
# early_stopping['s'] = 3

# sweep_config['method'] = 'bayes'
# sweep_config['metric'] = metric
# sweep_config['parameters'] = parameters
# sweep_config['early_terminate'] = early_stopping
# sweep_config

https://www.kaggle.com/competitions/stable-diffusion-image-to-prompts/discussion/399549#2210844 より：  
0.9のモメンタムをもつSGDオプティマイザーは大きな画像データセットで使えるらしい．

## Script

課題として、どのデータのsimilarityが低く出るか、もしくは高く出るかも記録させるとよい。

### Base

In [20]:
# # Base （間違えて部分的に書き換えてしまっている）
# def train(config=None):
#     with wandb.init(config=config):
#         #------------------------------------------------------------------------------
#         config = wandb.config
#         EPOCHS = config.epochs
#         BATCH_SIZE = config.batch_size
#         UNFREEZE_START = config.unfreeze_start
#         LEARNING_RATE = config.learning_rate
#         WEIGHT_DECAY = config.weight_decay
#         #------------------------------------------------------------------------------
        
#         model = load_pretrained_model(UNFREEZE_START)    
#         optimizer = optim.AdamW(filter(lambda p: p.requires_grad,
#                                        model.parameters()),
#                                 lr=LEARNING_RATE,
#                                 weight_decay=WEIGHT_DECAY
#                                 # fused=True
#                                 )
#         optimizer.zero_grad()
#         # if config.optimizer == 'sgd':
#         #     optimizer = optim.SGD(model.parameters(), lr=config.learning_rate, momentum=0.9)
#         # elif config.optimizer == 'adam':
#         #     optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

#         criterion = torch.nn.CosineEmbeddingLoss()
        
#         random.seed(42)
#         shuffled_train_index = random.sample(range(len(train_images)),
#                                              k=len(train_images))
        
#         num_batchs = math.ceil(len(train_images) / BATCH_SIZE)
        
#         # train
#         for epoch in range(EPOCHS):
#             model.train()
#             train_loss = 0
#             train_similarity = 0
            
#             print(f'epoch: {epoch}')
#             for i in tqdm(range(0, len(train_images), BATCH_SIZE)):
#                 indices = shuffled_train_index[i:i+BATCH_SIZE]

#                 # input data -image-
#                 batch_images = []
#                 for index in indices:
#                     image = Image.open(train_images[index])
#                     image = transform(image)
#                     batch_images.append(image)
#                 batch_images = torch.stack(batch_images)
#                 batch_images = batch_images.to(device)

#                 # input data -prompt-
#                 prompt_list = []
#                 for index in indices:
#                     prompt_list.append(train_prompts[index])

#                 targets = st_model.encode(prompt_list, 
#                                                 show_progress_bar=False,
#                                                 convert_to_tensor=True)
#                 preds = model(batch_images)

#                 # back probagation
#                 labels = torch.ones(targets.size(0)).to(device)
#                 loss = criterion(preds, targets, labels)
#                 loss.backward()
#                 optimizer.step()
#                 optimizer.zero_grad()
                
#                 similarity = cosine_similarity(
#                     preds.detach().cpu().numpy(), 
#                     targets.detach().cpu().numpy()
#                     )
#                 train_loss += loss.item()
#                 train_similarity += similarity
#                 similarity = 0
#             train_loss /= num_batchs
#             train_similarity /= num_batchs
#             print(f'train loss: {train_loss} ', end='')
#             print(f'train similarity: {train_similarity}')
            
#             # val
#             model.eval()
#             val_loss = 0
#             val_similarity = 0
#             num_batchs = math.ceil(len(val_images) / BATCH_SIZE)
            
#             with torch.no_grad():
#                 for i in tqdm(range(0, len(val_images), BATCH_SIZE)):

#                     # input data -image-
#                     images = val_images[i:i+BATCH_SIZE]
#                     batch_images = []
#                     for image in images:
#                         image = Image.open(image)
#                         image = transform(image)
#                         batch_images.append(image)
#                     batch_images = torch.stack(batch_images)
#                     batch_images = batch_images.to(device)

#                     # input data -prompt-
#                     batch_prompts = val_prompts[i:i+BATCH_SIZE]
#                     targets = st_model.encode(batch_prompts,
#                                               show_progress_bar=False,
#                                               convert_to_tensor=True)

#                     preds = model(batch_images)
                    
#                     labels = torch.ones(targets.size(0)).to(device)
#                     loss = criterion(preds, targets, labels)
#                     similarity = cosine_similarity(
#                         preds.detach().cpu().numpy(), 
#                         targets.detach().cpu().numpy()
#                     )
                    
#                     val_loss += loss.item()
#                     val_similarity += similarity
#                 val_loss /= num_batchs
#                 val_similarity /= num_batchs
#                 print(f'val loss: {val_loss} ', end='')
#                 print(f'val similarity: {val_similarity}')

#             # Save
#             if val_similarity > BEST_SIMILARITY:
#                 BESTSIM = val_similarity
#                 with open(f'{MODEL_NAME}/BestSim.pickle', 'wb') as f:
#                     pickle.dump(BESTSIM, f)

#                 BESTEPOCH = epoch + 1
#                 print(f"save best model at {BESTSIM} with epoch {BESTEPOCH}")
#                 if SAVE_MODEL_CKP:
#                     torch.save(vit.state_dict(), f"{MODEL_NAME}/local_30K_model.pt")
#                 if SAVE_OPT_CKP:
#                     torch.save(optimizer.state_dict(), f"{MODEL_NAME}/local_30K_opt.pt")
#                 COUNTER = 0

#             else:
#                 COUNTER += 1

#             # Early Stopping
#             wandb.log({
#                 'train_loss': train_loss,
#                 'train_similarity': train_similarity,
#                 'val_loss': val_loss,
#                 'val_similarity': val_similarity, 
#                 'epoch': epoch
#             })

### Base + Denoising

In [21]:
# Base + Denoising + Early Stopping

def train(config=None):
    with wandb.init(config=config):
        #----------------------------------------------------------------------
        RUN_NAME = '2M_gaussian_3'
        SAVE_DIR = './models/clip-vit-large-patch14/fine-tuned'
        BEST_SIMILARITY = 0
        COUNTER = 0
        PATIENCE = 4
        SAVE_MODEL_CKP = True
        SAVE_OPT_CKP = True
        
        config = wandb.config
        EPOCHS = config.epochs
        BATCH_SIZE = config.batch_size
        DENOISING = config.denoising
        KERNEL = config.kernel
        UNFREEZE_START = config.unfreeze_start
        LEARNING_RATE = config.learning_rate
        WEIGHT_DECAY = config.weight_decay
        #----------------------------------------------------------------------
        
        model = load_pretrained_model(UNFREEZE_START)    
        optimizer = optim.AdamW(filter(lambda p: p.requires_grad,
                                       model.parameters()),
                                lr=LEARNING_RATE,
                                weight_decay=WEIGHT_DECAY
                                # fused=True
                                )
        optimizer.zero_grad()
        # if config.optimizer == 'sgd':
        #     optimizer = optim.SGD(model.parameters(), lr=config.learning_rate, momentum=0.9)
        # elif config.optimizer == 'adam':
        #     optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

        criterion = torch.nn.CosineEmbeddingLoss()
        
        random.seed(42)
        shuffled_train_index = random.sample(range(len(train_images)),
                                             k=len(train_images))
        
        num_batchs = math.ceil(len(train_images) / BATCH_SIZE)
        
        # train
        for epoch in range(EPOCHS):
            model.train()
            train_loss = 0
            train_similarity = 0
            
            print(f'epoch: {epoch}')
            for i in tqdm(range(0, len(train_images), BATCH_SIZE)):
                indices = shuffled_train_index[i:i+BATCH_SIZE]

                # input data -image-
                batch_images = []
                for index in indices:
                    image = denoising(train_images[index], 
                                      DENOISING,
                                      KERNEL)
                    image = transform(image)
                    batch_images.append(image)
                batch_images = torch.stack(batch_images)
                batch_images = batch_images.to(device)

                # input data -prompt-
                prompt_list = []
                for index in indices:
                    prompt_list.append(train_prompts[index])

                targets = st_model.encode(prompt_list, 
                                                show_progress_bar=False,
                                                convert_to_tensor=True)
                preds = model(batch_images)

                # back probagation
                labels = torch.ones(targets.size(0)).to(device)
                loss = criterion(preds, targets, labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                
                similarity = cosine_similarity(
                    preds.detach().cpu().numpy(), 
                    targets.detach().cpu().numpy()
                    )
                train_loss += loss.item()
                train_similarity += similarity
                similarity = 0
            train_loss /= num_batchs
            train_similarity /= num_batchs
            print(f'train loss: {train_loss} ', end='')
            print(f'train similarity: {train_similarity}')
            
            # val
            model.eval()
            val_loss = 0
            val_similarity = 0
            num_batchs = math.ceil(len(val_images) / BATCH_SIZE)
            
            with torch.no_grad():
                for i in tqdm(range(0, len(val_images), BATCH_SIZE)):

                    # input data -image-
                    images = val_images[i:i+BATCH_SIZE]
                    batch_images = []
                    for image in images:
                        image = denoising(image,
                                          DENOISING,
                                          KERNEL)
                        image = transform(image)
                        batch_images.append(image)
                    batch_images = torch.stack(batch_images)
                    batch_images = batch_images.to(device)

                    # input data -prompt-
                    batch_prompts = val_prompts[i:i+BATCH_SIZE]
                    targets = st_model.encode(batch_prompts,
                                              show_progress_bar=False,
                                              convert_to_tensor=True)

                    preds = model(batch_images)
                    
                    labels = torch.ones(targets.size(0)).to(device)
                    loss = criterion(preds, targets, labels)
                    similarity = cosine_similarity(
                        preds.detach().cpu().numpy(), 
                        targets.detach().cpu().numpy()
                    )
                    
                    val_loss += loss.item()
                    val_similarity += similarity
                val_loss /= num_batchs
                val_similarity /= num_batchs
                print(f'val loss: {val_loss} ', end='')
                print(f'val similarity: {val_similarity}')
            
            # SAVE
            if val_similarity > BEST_SIMILARITY:
                os.makedirs(SAVE_DIR, exist_ok=True)
                BEST_SIMILARITY = val_similarity

                if SAVE_MODEL_CKP:
                    torch.save(model.state_dict(),
                               f"{SAVE_DIR}/{RUN_NAME}_model.pt")
                if SAVE_OPT_CKP:
                    torch.save(optimizer.state_dict(),
                               f"{SAVE_DIR}/{RUN_NAME}_opt.pt")
                COUNTER = 0

            else:
                COUNTER += 1

            # early stopping
            wandb.log({
                'train_loss': train_loss,
                'train_similarity': train_similarity,
                'val_loss': val_loss,
                'val_similarity': val_similarity, 
                'epoch': epoch,
                'COUNTER': COUNTER})

### Logger

In [22]:
from datetime import datetime

now = datetime.now()
dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")
print(dt_string)

2023-05-07_01-24-33


In [24]:
# Base + Denoising + Logging

#------------------------------------------------------------------------------
# Run parameters
RUN_NAME = '2M_gaussian_kernel_3'
SAVE_DIR = './models/clip-vit-large-patch14/fine-tuned'
SAVE_MODEL_CKP = True
SAVE_OPT_CKP = True
BEST_SIMILARITY = 0
BATCH_SIZE = 30
SEED = 42

# Early stopping parameters
MIN_DELTA = 0.001
PATIENCE = 3
EPOCHS = 10000

# Model & Optimizer parameters
UNFREEZE_START = 21
LEARNING_RATE = 0.004836150520205524
WEIGHT_DECAY = 0.05458727423867926

# Preprocessing parameters
DENOISING = 'gaussian'
KERNEL = 3

# notes
notes = 'お試し'
#------------------------------------------------------------------------------

import wandb
import random # for demo script

config = {
    'unfreeze_start': UNFREEZE_START,
    'learning_rate': LEARNING_RATE,
    'weight_decay': WEIGHT_DECAY,
    'min_delta': MIN_DELTA,
    'patience': PATIENCE,
    'batch_size': BATCH_SIZE,
    'denoising': DENOISING,
    'kernel': KERNEL,
}

# 1
wandb.login()

# 2
run = wandb.init(
    project="SD_Train_CLIP-ViT-Large_2M",
    config=config,
    notes=notes
)

# 3
# Model & Optimizer & Criterion
model = load_pretrained_model(UNFREEZE_START)    
optimizer = optim.AdamW(filter(lambda p: p.requires_grad,
                               model.parameters()),
                        lr=LEARNING_RATE,
                        weight_decay=WEIGHT_DECAY
                        # fused=True
                        )
optimizer.zero_grad()
# if config.optimizer == 'sgd':
#     optimizer = optim.SGD(model.parameters(), lr=config.learning_rate, momentum=0.9)
# elif config.optimizer == 'adam':
#     optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

criterion = torch.nn.CosineEmbeddingLoss()

# train index
random.seed(SEED)
shuffled_train_index = random.sample(range(len(train_images)),
                                     k=len(train_images))

# Avaraging parameter
num_batchs = math.ceil(len(train_images) / BATCH_SIZE)

# Early stopping counter
counter = 0

# train
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    train_similarity = 0

    print(f'epoch: {epoch}')
    for i in tqdm(range(0, len(train_images), BATCH_SIZE)):
        indices = shuffled_train_index[i:i+BATCH_SIZE]

        # input data -image-
        batch_images = []
        for index in indices:
            image = denoising(train_images[index], 
                              DENOISING,
                              KERNEL)
            image = transform(image)
            batch_images.append(image)
        batch_images = torch.stack(batch_images)
        batch_images = batch_images.to(device)

        # input data -prompt-
        prompt_list = []
        for index in indices:
            prompt_list.append(train_prompts[index])

        targets = st_model.encode(prompt_list, 
                                        show_progress_bar=False,
                                        convert_to_tensor=True)
        preds = model(batch_images)

        # back probagation
        labels = torch.ones(targets.size(0)).to(device)
        loss = criterion(preds, targets, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        similarity = cosine_similarity(
            preds.detach().cpu().numpy(), 
            targets.detach().cpu().numpy()
            )
        train_loss += loss.item()
        train_similarity += similarity
        similarity = 0
    train_loss /= num_batchs
    train_similarity /= num_batchs
    print(f'train loss: {train_loss} ', end='')
    print(f'train similarity: {train_similarity}')

    # val
    model.eval()
    val_loss = 0
    val_similarity = 0
    num_batchs = math.ceil(len(val_images) / BATCH_SIZE)

    with torch.no_grad():
        for i in tqdm(range(0, len(val_images), BATCH_SIZE)):

            # input data -image-
            images = val_images[i:i+BATCH_SIZE]
            batch_images = []
            for image in images:
                image = denoising(image, DENOISING, KERNEL)
                image = transform(image)
                batch_images.append(image)
            batch_images = torch.stack(batch_images)
            batch_images = batch_images.to(device)

            # input data -prompt-
            batch_prompts = val_prompts[i:i+BATCH_SIZE]
            targets = st_model.encode(batch_prompts,
                                      show_progress_bar=False,
                                      convert_to_tensor=True)

            preds = model(batch_images)

            labels = torch.ones(targets.size(0)).to(device)
            loss = criterion(preds, targets, labels)
            similarity = cosine_similarity(
                preds.detach().cpu().numpy(), 
                targets.detach().cpu().numpy()
            )

            val_loss += loss.item()
            val_similarity += similarity
            similarity = 0
        val_loss /= num_batchs
        val_similarity /= num_batchs
        print(f'val loss: {val_loss} ', end='')
        print(f'val similarity: {val_similarity}')

    # 3
    wandb.log({
        'train_loss': train_loss,
        'train_similarity': train_similarity,
        'val_loss': val_loss,
        'val_similarity': val_similarity, 
        'epoch': epoch
    })
        
    # Save & Early stopping
    delta = (val_similarity - BEST_SIMILARITY)
    print(f'delta: {delta}')
    if delta >= MIN_DELTA:
        BEST_SIMILARITY = val_similarity
        os.makedirs(SAVE_DIR, exist_ok=True)

        if SAVE_MODEL_CKP:
            torch.save(model.state_dict(),
                       f"{SAVE_DIR}/{RUN_NAME}_model.pt")
        if SAVE_OPT_CKP:
            torch.save(optimizer.state_dict(),
                       f"{SAVE_DIR}/{RUN_NAME}_opt.pt")
        counter = 0

    else:
        counter += 1
    
    wandb.alert(
        title="Weights & Biases", 
        text=f"epoch {epoch} のval_similarityは {val_similarity} です。"
    )
    
    if counter > PATIENCE:
        break

run.log_code()

wandb.finish()



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


epoch: 0


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 0.5431994411972689 train similarity: 0.4568005839540049


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.40591378672917683 val similarity: 0.5940862310024072
epoch: 1


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.2654997159639993 train similarity: 3.5931670971778438


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3786221557458242 val similarity: 0.6213778585515397
epoch: 2


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.110370997985204 train similarity: 3.7482957999761917


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3728062249024709 val similarity: 0.6271937896047654
epoch: 3


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0558510292768477 train similarity: 3.80281576757645


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3716648835341136 val similarity: 0.6283351292109697
epoch: 4


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0415396473805107 train similarity: 3.8171271450024356


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.37014477415879565 val similarity: 0.6298552401579619
epoch: 5


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0262021905581156 train similarity: 3.832464602082123


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.36824745003382364 val similarity: 0.6317525642964573
epoch: 6


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0243666592439014 train similarity: 3.8343001352479837


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3675593593517939 val similarity: 0.6324406562596417
epoch: 7


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0203074908653895 train similarity: 3.8383592981729295


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.36789794143040977 val similarity: 0.6321020734205092
epoch: 8


  0%|          | 0/5144 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Run

In [19]:
wandb.login() 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mataracsia[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [20]:
sweep_id = wandb.sweep(sweep_config, project='SD_Train_CLIP-ViT-Large_2M')

Create sweep with ID: yqoig2oz
Sweep URL: https://wandb.ai/ataracsia/SD_Train_CLIP-ViT-Large_2M/sweeps/yqoig2oz


In [None]:
wandb.agent(sweep_id, train, count=1)

[34m[1mwandb[0m: Agent Starting Run: 9aymi1xj with config:
[34m[1mwandb[0m: 	batch_size: 30
[34m[1mwandb[0m: 	denoising: gaussian
[34m[1mwandb[0m: 	epochs: 1000
[34m[1mwandb[0m: 	kernel: 3
[34m[1mwandb[0m: 	learning_rate: 0.004836150520205524
[34m[1mwandb[0m: 	unfreeze_start: 21
[34m[1mwandb[0m: 	weight_decay: 0.05458727423867926
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


epoch: 0


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 0.5447500800282273 train similarity: 0.45524994517060563


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.4070364315509796 val similarity: 0.5929635852376611
epoch: 1


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.2704509987831116 train similarity: 3.588215808037421


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3796297113895416 val similarity: 0.6203703040500826
epoch: 2


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.110797017852465 train similarity: 3.747869781184552


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.373934686700503 val similarity: 0.6260653276214626
epoch: 3


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.061329373717308 train similarity: 3.7973374242934295


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.37185322523117065 val similarity: 0.6281467903271925
epoch: 4


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.041467283209165 train similarity: 3.817199510430032


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3696058535575867 val similarity: 0.6303941619876554
epoch: 5


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.028282539288203 train similarity: 3.8303842506055954


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.369558451573054 val similarity: 0.6304415622005788
epoch: 6


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0186060134967168 train similarity: 3.8400607790845904


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3684655166467031 val similarity: 0.6315344976126177
epoch: 7


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0136236752669014 train similarity: 3.845043118081741


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.36783072010676066 val similarity: 0.6321692940091537
epoch: 8


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0087047235568365 train similarity: 3.849962069724182


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.36644546512762705 val similarity: 0.6335545488812484
epoch: 9


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0059936989148457 train similarity: 3.852673092709781


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3675257531007131 val similarity: 0.6324742601150822
epoch: 10


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0067986378272376 train similarity: 3.8518681537516617


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.36847266781330107 val similarity: 0.6315273462265126
epoch: 11


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0021732905308407 train similarity: 3.8564935023277456


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3672534319162369 val similarity: 0.6327465827035786
epoch: 12


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.004393159588178 train similarity: 3.854273634677038


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.3674714574019114 val similarity: 0.6325285573385132
epoch: 13


  0%|          | 0/5144 [00:00<?, ?it/s]

train loss: 3.0045161658525465 train similarity: 3.854150623219315


  0%|          | 0/750 [00:00<?, ?it/s]

val loss: 0.36805814385414126 val similarity: 0.6319418706640045
epoch: 14


  0%|          | 0/5144 [00:00<?, ?it/s]

候補①：

wandb: Agent Starting Run: 3pz89orr with config:  
wandb: 	batch_size: 20  
wandb: 	epochs: 10  
wandb: 	learning_rate: 0.003942750693801289  
wandb: 	unfreeze_start: 22  
wandb: 	weight_decay: 0.06609377404182681  

Tracking run with wandb version 0.15.0  
Run data is saved locally in C:\Users\emanon\kaggle\Stable Diffusion - Image to Prompts\wandb\run-20230501_195702-3pz89orr  
Syncing run jolly-sweep-5 to Weights & Biases (docs)  
Sweep page: https://wandb.ai/ataracsia/SD_GPT_Method_ViT_large_30K/sweeps/b1jpadmq  
View project at https://wandb.ai/ataracsia/SD_GPT_Method_ViT_large_30K  
View sweep at https://wandb.ai/ataracsia/SD_GPT_Method_ViT_large_30K/sweeps/b1jpadmq  
View run at https://wandb.ai/ataracsia/SD_GPT_Method_ViT_large_30K/runs/3pz89orr  

epoch: 0
train loss: 0.4756058146158854 train similarity: 0.5243941895369026
val loss: 0.5838597163558006 val similarity: 0.41614029235606453

epoch: 1
train loss: 0.990003741979599 train similarity: 1.2599962616064597
val loss: 0.5556601118445397 val similarity: 0.4443398956161023

epoch: 2
train loss: 0.9432114688158035 train similarity: 1.3067885368624688
val loss: 0.5300983499288558 val similarity: 0.46990165720421395

epoch: 3
train loss: 0.9069464582800865 train similarity: 1.3430535478353498
val loss: 0.5118548638820648 val similarity: 0.4881451424020925

epoch: 4
train loss: 0.8801283758282662 train similarity: 1.3698716284237806
val loss: 0.499389152944088 val similarity: 0.5006108502080662

epoch: 5
train loss: 0.8594652481079101 train similarity: 1.3905347537986936
val loss: 0.49192704647779467 val similarity: 0.508072958915308

epoch: 6
train loss: 0.8414201924800873 train similarity: 1.4085798110809176
val loss: 0.48915890061855316 val similarity: 0.510841099741682

epoch: 7
train loss: 0.8236035017967224 train similarity: 1.42639650161527
val loss: 0.4809839709401131 val similarity: 0.5190160317560654

epoch: 8
train loss: 0.8068020737171173 train similarity: 1.4431979278884843
val loss: 0.48496224772930147 val similarity: 0.5150377553672877

epoch: 9
train loss: 0.7893978360891342 train similarity: 1.4606021691188225
val loss: 0.4796355242729187 val similarity: 0.5203644791181199

候補②：

wandb: Agent Starting Run: hiak0yl8 with config:  
wandb: 	batch_size: 20  
wandb: 	epochs: 10  
wandb: 	learning_rate: 0.006691501352424112  
wandb: 	unfreeze_start: 22  
wandb: 	weight_decay: 0.05657430886867784  
Tracking run with wandb version 0.15.0  
Run data is saved locally in C:\Users\emanon\kaggle\Stable Diffusion - Image to Prompts\wandb\run-20230502_013318-hiak0yl8  
Syncing run balmy-sweep-13 to Weights & Biases (docs)  

In [None]:
# # Import the W&B Python Library 
# import wandb

# # 1. Start a W&B Run
# run = wandb.init(
#   project="SD_GPT_Method_ViT_large_30K",
#   notes="My first HP-tuning",
#   # tags=["baseline", "paper1"]
# )