# Обучении EfficientFormer с подбором гиперпараметров
В данной тетрадке расположен код обучения и инференса EfficinetFormer модели. Перед использованием кода нужно удостовериться, что все нужные библиотеки установлены и указан ваш wandb_api_key в СFG.wandb_key. Для инференса рекомендуется использовать модель с наименьшим RMSE.

## Optune HyperbandPruner
Оптимизация гиперпараметров по принципу решения нестахостической задачи многоруких бандитов
## Optune TPESampler
Tree-structured Parzen Estimator. Также стоит отметить, что для TPESampler лучший pruner - это Hyperband

## Вывод:
Связка AdamW и Косинусного шедулера показала себя лучше всего

In [2]:
from IPython.display import clear_output

#!pip install lightning timm opendatasets albumentations catboost gdown open_clip_torch wandb pytorch-optimizer optuna
#clear_output()

In [3]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import cv2
import torch
from optuna.integration import WeightsAndBiasesCallback
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from math import sin,cos,pi,floor
import json
from torch.optim import AdamW,SGD
from pytorch_optimizer import Lion,QHAdam
from sklearn.metrics import accuracy_score,f1_score,balanced_accuracy_score
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,mean_absolute_percentage_error
from torch.optim.lr_scheduler import LambdaLR,ExponentialLR
import albumentations as A
import open_clip
from transformers import get_cosine_schedule_with_warmup
from albumentations.pytorch.transforms import ToTensorV2
from catboost import CatBoostClassifier,Pool,cv
from copy import deepcopy
import wandb
import optuna
import timm
pl.seed_everything(56)

Global seed set to 56


56

In [5]:
class CFG:
    class data:
        train_data= './aiijc23-4/train_scores.csv'
        test_data = './simple_sub.csv'
        train_path='./aiijc23-4/train/train/'
        test_path = './aiijc23-4/test/test/'
        num_workers = 4
        val_split_size = 0.2
        batch_size = 32
        seed = 56
    class model:
        model ='efficientformer_l7'
        pretrained = True
        num_labels = 1
        seed=56

# Оптимизаторы
## SGD
Stochastic gradient descent. Несмотря на то, что SGD существует в сообществе машинного обучения уже давно, он часто используется в контексте крупномасштабного обучения. 
## AdamW
Adaptive momentum + weight decay. Один из самых эффективных алгоритмов оптимизации в обучении нейронных сетей. Он сочетает в себе идеи RMSProp и накопление импульса. 
## Lion
EvoLved Sign Momentum. Он более эффективно работает с памятью, так как отслеживает только импульс, обновление которого рассчитывается только с помощью знака
## qhAdam
Quasi-Hyperbolic Momentum Algorithm. Продвинутая версия Adam'a, в которой оценки импульсы заменены квазигиперболическими членами.

In [6]:
def create_optimizer(cfg,parameters):
    if cfg['optim'] == 'sgd':
        return SGD(parameters,
                   lr=cfg['lr'],
                   weight_decay=cfg['weights_decay'])
    
    if cfg['optim'] == 'adamw':
        return AdamW(parameters,
                   lr=cfg['lr'],
                   weight_decay=cfg['weights_decay'])
    
    if cfg['optim'] == 'lion':
        return Lion(parameters,
                   lr=cfg['lr'],
                   weight_decay=cfg['weights_decay'])
    
    if cfg['optim'] == 'qhadam':
        return QHAdam(parameters,
                   lr=cfg['lr'],
                   weight_decay=cfg['weights_decay'])

# Schedulers
## LambdaLR
Learning rate - константа
## ExponentialLR
<div>
<img src="images/ExponentialLR.png" width="400"/>
</div>

## Cosine step
<div>
<img src="images/Warmup Cosine Schedule.png" width="600"/>
</div>

In [7]:
def create_scheduler(cfg,optim):
    
    if cfg['scheduler'] == 'constant':
        f = lambda x:x
        sheduler =  LambdaLR(optim,lr_lambda=f)
    
    if cfg['scheduler'] == 'exp':
        sheduler = ExponentialLR(optim,
                             gamma=cfg['gamma'])
    
    if cfg['scheduler'] == 'cosine_step':
        sheduler =  get_cosine_schedule_with_warmup(optim,
                             num_training_steps=cfg['train_steps'],
                             num_warmup_steps=cfg['num_warmup_steps'],
                             num_cycles = cfg['num_cycles'],
                             )
        
    if cfg['scheduler'] == 'constant' or cfg['scheduler'] == 'exp':
        return {'scheduler':sheduler,
                'interval':'epoch',
                'frequency':1}
    
    if cfg['scheduler'] == 'cosine_step':
        return {'scheduler':sheduler,
                'interval':'step',
                'frequency':1}
        

# Предобработка датасета

In [None]:
class Processor():
    def __init__(self):
        self.transforms = A.Compose([
                                     A.Resize(224,224),
                                     A.Normalize(),
                                     ToTensorV2()
                                    ])
    def __call__(self,img):
        return self.transforms(image=img)['image']

In [9]:
def make_df(path,root_path=CFG.data.train_path):
    data = pd.read_csv(path)
    df = pd.DataFrame()
    df['image'] = data['IMAGE'].apply(lambda x:root_path + x)
    df['label'] = data['SCORE']
    return df

In [10]:
class PLDataset(Dataset):
    def __init__(self, df,preprocess):
        super().__init__()
        self.cfg = CFG.data
        self.data = df[['image','label']].values
        self.preprocess = preprocess
    def __getitem__(self, index):
        #image = Image.open(self.data[index][0]).convert('RGB')
        image = cv2.imread(self.data[index][0])
        image = self.preprocess(image)
        label = self.data[index][1]
        return image,label
    def __len__(self):
        return len(self.data)

In [11]:
class PLDataModule(pl.LightningDataModule):
    def __init__(self,preprocess):
        super().__init__()
        self.cfg = CFG.data
        self.train_dataset_path = self.cfg.train_data
        self.test_dataset_path = self.cfg.test_path
        self.val_split_size = self.cfg.val_split_size
        self.batch_size = self.cfg.batch_size
        self.num_workers = self.cfg.num_workers
        self.is_setup = False
        self.preprocess = preprocess
    def prepare_data(self):
        self.train_df = make_df(self.train_dataset_path)
        self.test_df = make_df(CFG.data.test_data,
                               root_path=CFG.data.test_path)
        
    def setup(self, stage: str):
        if self.is_setup:
            return None
        self.train_df, self.val_df = train_test_split(self.train_df, test_size=self.val_split_size,random_state=self.cfg.seed)
        self.train_dataset = PLDataset(self.train_df,self.preprocess)
        self.val_dataset = PLDataset(self.val_df,self.preprocess)
        self.test_dataset = PLDataset(self.test_df,self.preprocess)
        self.is_setup = True
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                         batch_size=self.batch_size,
                         num_workers=self.num_workers,
                         shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers)
    
    def predict_dataloader(self):
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers)

## Метрики и лосс
- были использованы различные лосс функции для создания новых гипотез
- была выбрана лосс функция RMSE, потому что она показывала себя лучше всего на регрессионных метриках
$$\sqrt{\frac{\sum_{i=1}^N(pred_i-act_i)^2}{N}}$$

In [13]:
class AverageMeter():
    
    def __init__(self):
        self.labels = []
        self.preds = []
        
    def reset(self):
        self.labels = []
        self.preds = []
        
    def update(self,labels,preds):
        self.labels += labels
        self.preds += preds
    
    def calc_metrics(self):
        labels = pd.Series(self.labels)
        preds = pd.Series(self.preds)
        preds_bin = preds.map(round)
        metrics = dict()
        
        metrics['val_rmse'] = (mean_squared_error(labels,preds)) ** 0.5
        metrics['val_mae'] = mean_absolute_error(labels,preds)
        metrics['mape']= mean_absolute_percentage_error(labels,preds)
        metrics['val_r2'] = r2_score(labels,preds)
        
        metrics['val_f1'] = f1_score(labels,preds_bin,average='macro')
        metrics['val_acc'] = accuracy_score(labels,preds_bin)
        metrics['val_w_acc'] = balanced_accuracy_score(labels,preds_bin)
        return metrics

In [1]:
class RMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps
        
    def forward(self,yhat,y):
        loss = torch.sqrt(self.mse(yhat,y) + self.eps)
        return loss

NameError: name 'nn' is not defined

# Инициализация модели и алгоритм обучения

In [14]:
class PLModule(pl.LightningModule):
    def __init__(self,model,train_cfg,optim_cfg,schedul_cfg):
        super().__init__()
        self.optim_cfg = optim_cfg
        self.schedule_cfg = schedul_cfg
        self.train_cfg = train_cfg
        self.model = model
        self.criterion = RMSELoss()
        self.last_loss = 0
        self.history = []
        self.avg_meter = AverageMeter()
        
    def forward(self,x):
        features = self.model(x)
        return torch.squeeze(features)

    def training_step(self, batch, i):
        x,targets = batch
        x,targets = x.float(),targets.float()
        logits = self(x)
        loss = self.criterion(logits, targets)
        self.log_dict({'train_loss':loss.item()})
        self.last_loss = loss.item()
        return loss
    
    def predict_step(self, batch, i):
        x,targets = batch
        x,targets = x.float(),targets.float()
        logits = self(x)
        return logits
        
    def validation_step(self, batch, _):
        x,targets = batch
        x,targets = x.float(),targets.float()
        logits = self(x)
        loss = self.criterion(logits,targets)
        self.log_dict({'val_loss':loss.item()})
        self.avg_meter.update(targets.cpu().detach().tolist(),
                              logits.cpu().detach().tolist())
        self.last_loss = loss.item()
    
                
    def on_validation_epoch_end(self):
        metrics = self.avg_meter.calc_metrics()
        self.history += [metrics['val_rmse']]
        self.avg_meter.reset()
            
    def configure_optimizers(self):
        optim = create_optimizer(self.optim_cfg,self.parameters())
        
        if self.schedule_cfg['scheduler'] == 'cosine_step':
            self.schedule_cfg['train_steps'] = TRAIN_STEPS * self.train_cfg['max_epochs']
            self.schedule_cfg['num_warmup_steps'] = floor(self.schedule_cfg['train_steps'] * self.schedule_cfg['warmup_ratio'])
            
        scheduler = create_scheduler(self.schedule_cfg,optim)
        return [optim], [scheduler]

In [15]:
def get_trainer(max_epoches):
    return pl.Trainer(
                accelerator="gpu",
                precision=32,
                min_epochs=1,
                devices=[0],
                check_val_every_n_epoch=1,
                max_epochs=max_epoches)

In [16]:
def train_model(train_cfg,optim_cfg,schedul_cfg):
    trainer = get_trainer(train_cfg['max_epochs'])
    model = timm.create_model(CFG.model.model,
                              pretrained=CFG.model.pretrained,
                              num_classes=1)
    pl_model = PLModule(model,train_cfg,optim_cfg,schedul_cfg)
    trainer.fit(pl_model,datamodule=dm)
    return pl_model

## Перебор других гиперпараметров
- количество эпох: от 1 до 6
- learning rate: от 1e-5 до 1e-3
- регуляризация: от 0 до 1e-2
- количество периодов шедулера: от 0.2 до 0.8
- период warmup фазы: от 0.01 до 0.1

In [17]:
def objective(trial):
    train_cfg = {'max_epochs':trial.suggest_int('max_epochs',1,6)}
    optim_cfg = {'optim':trial.suggest_categorical('optim',['sgd','adamw','lion','qhadam']),
                 'lr':trial.suggest_float('lr',1e-5,1e-3),
                 'weights_decay':trial.suggest_float('weights_decay',0.,1e-2)}
    
    scheduler_type = trial.suggest_categorical('scheduler',['constant','exp','cosine_step'])
    if scheduler_type == 'exp':
        scheduler_cfg = {'scheduler':scheduler_type,
                         'gamma':trial.suggest_float('gamma',0.1,0.9)}
        
    elif scheduler_type == 'constant':
        scheduler_cfg = {'scheduler':scheduler_type}
        
    elif scheduler_type == 'cosine_step':
        scheduler_cfg = {'scheduler':scheduler_type,
                         'warmup_ratio':trial.suggest_float('warmup_ratio',0.01,0.1),
                         'num_cycles':trial.suggest_float('num_cycles',0.2,0.8)}
    
    all_config = train_cfg | optim_cfg |scheduler_cfg
    
    model = train_model(train_cfg,optim_cfg,scheduler_cfg)
    rmse = model.history[-1]
    all_config['rmse'] = rmse
    print(all_config)
    with open('chekpoint.json','w') as f :
        json.dump(all_config,f)
    return rmse

In [18]:
processor = Processor()
dm = PLDataModule(processor)
dm.prepare_data()
dm.setup(0)
TRAIN_STEPS = len(dm.train_dataloader())

In [19]:
wandb_kwargs = {"project": "aiijc_optuna"}
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [21]:
study = optuna.create_study(
        direction="minimize",
        study_name='expirement',
        load_if_exists=True,
        pruner=optuna.pruners.HyperbandPruner(),
        sampler=optuna.samplers.TPESampler()
    )

[I 2023-08-25 14:22:42,590] A new study created in memory with name: expirement


In [None]:
study.optimize(objective,
               n_trials=50,
               callbacks=[wandbc])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type     | Params
---------------------------------------
0 | model     | ResNet   | 21.3 M
1 | criterion | RMSELoss | 0     
---------------------------------------
21.3 M    Trainable params
0         Non-trainable params
21.3 M    Total params
85.141    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
[I 2023-08-25 15:04:36,595] Trial 2 finished with value: 0.5918921158199643 and parameters: {'max_epochs': 3, 'optim': 'adamw', 'lr': 0.0008295702298376161, 'weights_decay': 0.0005741908144913799, 'scheduler': 'exp', 'gamma': 0.8708595106724532}. Best is trial 1 with value: 0.577831168811426.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


{'max_epochs': 3, 'optim': 'adamw', 'lr': 0.0008295702298376161, 'weights_decay': 0.0005741908144913799, 'scheduler': 'exp', 'gamma': 0.8708595106724532, 'rmse': 0.5918921158199643}


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type     | Params
---------------------------------------
0 | model     | ResNet   | 21.3 M
1 | criterion | RMSELoss | 0     
---------------------------------------
21.3 M    Trainable params
0         Non-trainable params
21.3 M    Total params
85.141    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
[I 2023-08-25 15:25:34,309] Trial 3 finished with value: 0.7083189116388795 and parameters: {'max_epochs': 5, 'optim': 'lion', 'lr': 0.0004706826396598755, 'weights_decay': 0.0071893048394932185, 'scheduler': 'constant'}. Best is trial 1 with value: 0.577831168811426.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


{'max_epochs': 5, 'optim': 'lion', 'lr': 0.0004706826396598755, 'weights_decay': 0.0071893048394932185, 'scheduler': 'constant', 'rmse': 0.7083189116388795}


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type     | Params
---------------------------------------
0 | model     | ResNet   | 21.3 M
1 | criterion | RMSELoss | 0     
---------------------------------------
21.3 M    Trainable params
0         Non-trainable params
21.3 M    Total params
85.141    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

