### Imports and Jupyter setup

In [1]:
%load_ext autoreload
%autoreload 2

import os
import time
import tqdm
import torch
import wandb
import numpy as np
import pandas as pd
import torch.nn as nn

from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from timm.scheduler import CosineLRScheduler
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score

# Expose only physical GPU 1 to this kernel. This is set before any CUDA call is
# made in the notebook so that the restriction is in place when CUDA is first used.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

### Custom Imports

In [2]:
from fgvc.utils.datasets import TrainDataset
from fgvc.utils.augmentations import light_transforms, light_transforms_rcrop
from fgvc.utils.utils import timer, init_logger, seed_everything, getModel

In [3]:
!nvidia-smi

Sun Apr 24 17:13:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 465.27       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| 61%   50C    P2   121W / 350W |  18755MiB / 24268MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:C1:00.0 Off |                  N/A |
| 57%   48C    P2   115W / 350W |  17629MiB / 24268MiB |      0%      Default |
|       

### Load Dataset Metadata

In [4]:
# Read the train / validation / web metadata tables of PlantCLEF2018 and print
# the number of rows of each one right after it is loaded.
train_metadata = pd.read_csv("../../metadata/PlantCLEF2018_train_metadata.csv")
print(train_metadata.shape[0])

val_metadata = pd.read_csv("../../metadata/PlantCLEF2018_val_metadata.csv")
print(val_metadata.shape[0])

web_metadata = pd.read_csv("../../metadata/PlantCLEF2018_web_metadata.csv")
print(web_metadata.shape[0])

286841
33703


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


286841


In [5]:
# Rewrite the relative dataset prefixes stored in the CSVs to the absolute
# location of the dataset on this machine. The three-level prefix is replaced
# first because the two-level prefix is a substring of it.
for metadata in (train_metadata, val_metadata, web_metadata):
    for stale_prefix in ('../../../nahouby/Datasets/PlantCLEF/', '../../nahouby/Datasets/PlantCLEF/'):
        metadata['image_path'] = metadata['image_path'].apply(
            lambda path, prefix=stale_prefix: path.replace(prefix, '/local/nahouby/Datasets/PlantCLEF/')
        )

In [6]:
# Number of training images per class, most frequent class first.
label_freq = train_metadata['ClassId'].value_counts().sort_values(ascending=False)

# Every class with fewer than `sample_size` training images is topped up with
# images of the same class drawn from the web metadata.
sample_size = 40
sampled_images = []

# `Series.items()` is used instead of `iteritems()`, which was removed in pandas 2.0.
for class_id, count in label_freq[label_freq < sample_size].items():
    missing = sample_size - count
    class_df = web_metadata[web_metadata['ClassId'] == class_id]

    # Draw at most `missing` rows; when the web set has fewer rows for this
    # class, all of them are taken. The fixed random_state keeps the draw reproducible.
    sampled_images.append(class_df.sample(min(missing, len(class_df)), random_state=777))

In [7]:
# Stack the per-class web samples into one frame and drop the bookkeeping
# columns ('level_0' is produced by reset_index, the other two come from the CSV).
tmp = (
    pd.concat(sampled_images)
    .reset_index()
    .drop(columns=['level_0', 'Unnamed: 0', 'index'])
)

# Append the web samples to the training metadata and drop the same CSV
# bookkeeping columns from the combined frame.
train_metadata = (
    pd.concat([train_metadata, tmp])
    .drop(columns=['Unnamed: 0', 'index'])
)

f'Number of training images: {len(train_metadata)}'

'Number of training images: 362068'

### Training Parameters

In [8]:
# Choose "batch_size" and "accumulation_steps" so that their product equals the
# intended effective batch size (this note originally asked for 64).
# NOTE(review): the values below give 32 * 4 = 128, not 64 - confirm which one is intended.

# Experiment configuration; it is passed to init_wandb and read by the model,
# data-loading, optimizer and training cells.
config = {"augmentations": 'light',
           "optimizer": 'SGD',
           "scheduler": 'cyclic_cosine',
           "image_size": (224, 224),
           "random_seed": 777,
           # Number of distinct values in the 'class_id' column.
           # NOTE(review): the up-sampling cell groups by 'ClassId' - confirm both columns agree.
           "number_of_classes": len(train_metadata['class_id'].unique()),
           "architecture": 'vit_base_patch32_224',
           "batch_size": 32,
           "accumulation_steps": 4,
           "epochs": 100,
           "learning_rate": 0.01,
           "dataset": 'PlantCLEF2018',
           "loss": 'CrossEntropyLoss',
           "training_samples": len(train_metadata),
           "valid_samples": len(val_metadata),
           "workers": 12,
           # Per-class target used when topping up rare classes with web images.
           "upsampled": sample_size,
           }

# Run identifier; it is reused for the log file name and the checkpoint file names.
RUN_NAME = f"{config['architecture']}-{config['optimizer']}-{config['scheduler']}-{config['augmentations']}-{config['upsampled']}"

### Fix Seeds & Log Setup

In [9]:
# Name the log file after the run and create the logger that the training cell
# passes to `timer` and uses for its per-epoch messages.
LOG_FILE = f'{RUN_NAME}.log'
LOGGER = init_logger(LOG_FILE)

# Apply the configured seed (presumably seeds the Python / NumPy / PyTorch RNGs -
# verify in fgvc.utils.utils). The web-image sampling above ran earlier with its
# own fixed random_state, so it does not depend on this call.
seed_everything(config['random_seed'])

### Init Model

In [10]:
# Build the network for the configured architecture with one output per class,
# starting from pretrained weights.
model = getModel(config['architecture'], config['number_of_classes'], pretrained=True)

# Normalisation statistics stored with the model; they are handed to the
# augmentation pipelines so inputs are normalised the way the model expects.
model_mean, model_std = (list(model.default_cfg[stat]) for stat in ('mean', 'std'))

In [11]:
# Map every supported augmentation preset to the factory that builds its transforms.
transform_factories = {
    'light': light_transforms,
    'light_rcrop': light_transforms_rcrop,
}

# Fail early on an unknown preset instead of hitting a NameError (or silently
# reusing transforms left over from a previous run of this cell) further down.
if config['augmentations'] not in transform_factories:
    raise ValueError(
        f"Unsupported augmentations {config['augmentations']!r}; "
        f"expected one of {sorted(transform_factories)}"
    )
make_transforms = transform_factories[config['augmentations']]

# Training and validation pipelines share the image size and the model's
# normalisation statistics; only the `data` mode differs.
train_augmentations = make_transforms(data='train', image_size=config['image_size'], mean=model_mean, std=model_std)
val_augmentations = make_transforms(data='valid', image_size=config['image_size'], mean=model_mean, std=model_std)

train_dataset = TrainDataset(train_metadata, transform=train_augmentations)
valid_dataset = TrainDataset(val_metadata, transform=val_augmentations)

# Only the training loader is shuffled; the validation loader keeps the dataset
# order because the training cell compares its predictions against
# val_metadata['class_id'] position by position.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=config['workers'])
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=config['workers'])

### Init WandB

In [12]:
# NOTE(review): this import would be easier to find in the imports cell at the top of the notebook.
from fgvc.utils.wandb import init_wandb

# Initialise Weights & Biases with the experiment config and run name; the
# training cell reports its per-epoch metrics through `wandb.log`.
init_wandb(config, RUN_NAME, entity='picekl', project='frontiers-plant-recognition')

[34m[1mwandb[0m: Currently logged in as: [33mpicekl[0m (use `wandb login --relogin` to force relogin)


### Set Optimizer and LR Scheduler

In [13]:
# Build the optimizer named in the config. An unknown name raises immediately,
# because the training cell uses `optimizer` unconditionally.
if config['optimizer'] == 'AdamW':
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
elif config['optimizer'] == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)
else:
    raise ValueError(f"Unsupported optimizer {config['optimizer']!r}; expected 'AdamW' or 'SGD'")

# Build the learning-rate scheduler named in the config.
if config['scheduler'] =='plateau':
    # Multiply the LR by 0.9 when the validation loss has not improved for more than one epoch.
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=1, verbose=True, eps=1e-6)
elif config['scheduler'] == 'cyclic_cosine':
    # Split the run into CYCLES cosine cycles of equal length. The cycle length
    # is derived from the epoch count (100 // 5 == 20 for the current config)
    # and kept an integer, since it is measured in whole epochs.
    CYCLES = 5
    t_initial = max(1, config['epochs'] // CYCLES)
    scheduler = CosineLRScheduler(optimizer, t_initial=t_initial, lr_min=0.0001, cycle_decay=0.9, cycle_limit=CYCLES)
else:
    # The training cell only touches `scheduler` for the two names above, so any
    # other value means "constant learning rate"; define the name explicitly so
    # a scheduler from an earlier run of this cell cannot linger.
    scheduler = None

### Training Loop

In [14]:
# Train for config['epochs'] epochs with gradient accumulation, validate after
# every epoch, log metrics to the logger and to Weights & Biases, and keep two
# checkpoints: the best validation accuracy and the best validation loss.
with timer('Train model', LOGGER):

    model.to(device)

    criterion = nn.CrossEntropyLoss()
    best_score = 0.
    best_loss = np.inf

    for epoch in range(config['epochs']):

        start_time = time.time()

        # ---------------------------- training ----------------------------
        model.train()
        avg_loss = 0.

        num_steps_per_epoch = len(train_loader)
        # Global batch counter handed to the cosine scheduler's per-update hook.
        num_updates = epoch * num_steps_per_epoch

        optimizer.zero_grad()

        train_lbls = np.zeros((len(train_metadata)))
        train_preds = np.zeros((len(train_metadata)))
        # Running write position. The last batch can be smaller than the others,
        # so the position cannot be derived from `i * len(labels)`.
        train_offset = 0

        for i, (images, labels, _) in tqdm.tqdm(enumerate(train_loader), total=num_steps_per_epoch):

            images = images.to(device)
            labels = labels.to(device)

            y_preds = model(images)
            loss = criterion(y_preds, labels)

            # Epoch loss is the mean of the (unscaled) batch losses.
            avg_loss += loss.item() / num_steps_per_epoch
            # Scale the loss so the accumulated gradient is the mean over the group.
            loss = loss / config['accumulation_steps']
            loss.backward()

            # Step once per full group of `accumulation_steps` batches, and once
            # more on the final batch so a trailing partial group is applied
            # instead of being discarded by the zero_grad() of the next epoch.
            is_last_batch = (i + 1) == num_steps_per_epoch
            if (i + 1) % config['accumulation_steps'] == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()

            if config['scheduler'] == 'cyclic_cosine':
                num_updates += 1
                scheduler.step_update(num_updates=num_updates)

            batch_len = len(labels)
            train_preds[train_offset:train_offset + batch_len] = y_preds.argmax(1).to('cpu').numpy()
            train_lbls[train_offset:train_offset + batch_len] = labels.to('cpu').numpy()
            train_offset += batch_len

        # --------------------------- validation ---------------------------
        model.eval()
        avg_val_loss = 0.
        preds = np.zeros((len(valid_dataset)))
        preds_raw = []
        # Same running-offset bookkeeping as in training; see the note there.
        val_offset = 0

        for i, (images, labels, _) in enumerate(valid_loader):

            images = images.to(device)
            labels = labels.to(device)

            with torch.no_grad():
                y_preds = model(images)

            batch_len = len(images)
            preds[val_offset:val_offset + batch_len] = y_preds.argmax(1).to('cpu').numpy()
            val_offset += batch_len
            # Raw scores are kept for the top-3 metric.
            preds_raw.extend(y_preds.to('cpu').numpy())

            loss = criterion(y_preds, labels)

            avg_val_loss += loss.item() / len(valid_loader)

        # ------------------------ scheduler (epoch) -----------------------
        if config['scheduler'] == 'plateau':
            scheduler.step(avg_val_loss)
        elif config['scheduler'] == 'cyclic_cosine':
            scheduler.step(epoch + 1)

        # ----------------------------- metrics ----------------------------
        train_accuracy = accuracy_score(train_lbls, train_preds)
        train_f1 = f1_score(train_lbls, train_preds, average='macro')

        # Validation predictions are in loader order (not shuffled), which is
        # assumed to match the row order of val_metadata.
        accuracy = accuracy_score(val_metadata['class_id'], preds)
        f1 = f1_score(val_metadata['class_id'], preds, average='macro')
        recall_3 = top_k_accuracy_score(val_metadata['class_id'], preds_raw, k=3)

        elapsed = time.time() - start_time

        LOGGER.debug(f'  Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} F1: {f1*100:.2f}  Acc: {accuracy*100:.2f} Recall@3: {recall_3*100:.2f} time: {elapsed:.0f}s')

        wandb.log({'Train_loss (avr.)': avg_loss,
                   'Val. loss (avr.)': avg_val_loss,
                   'Val. F1': np.round(f1*100, 2),
                   'Val. Accuracy': np.round(accuracy*100, 2),
                   'Val. Recall@3': np.round(recall_3*100, 2),
                   'Learning Rate': optimizer.param_groups[0]["lr"],
                   'Train. Accuracy': np.round(train_accuracy*100, 2),
                   'Train. F1': np.round(train_f1*100, 2),})

        # --------------------------- checkpoints --------------------------
        if accuracy>best_score:
            best_score = accuracy
            LOGGER.debug(f'  Epoch {epoch+1} - Save Best Accuracy: {best_score:.6f} Model')
            torch.save(model.state_dict(), f'{RUN_NAME}_best_accuracy.pth')

        if avg_val_loss<best_loss:
            best_loss = avg_val_loss
            LOGGER.debug(f'  Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save(model.state_dict(), f'{RUN_NAME}_best_loss.pth')

2022-04-24 17:13:38,747 INFO [Train model] start


RuntimeError: CUDA error: out of memory

In [None]:
# Save the weights reached at the end of training. The epoch count in the file
# name is taken from the config so the name stays truthful when `epochs`
# changes (it is still '<RUN_NAME>-100E.pth' for the current setting).
torch.save(model.state_dict(), f"{RUN_NAME}-{config['epochs']}E.pth")