### Imports and Jupyter setup

In [1]:
%load_ext autoreload
%autoreload 2

import os
import time
import tqdm
import torch
import wandb
import numpy as np
import pandas as pd
import torch.nn as nn

from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from timm.scheduler import CosineLRScheduler
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score

os.environ["CUDA_VISIBLE_DEVICES"]="2"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pd.set_option('display.max_columns', None)
device

device(type='cuda')

### Custom Imports

In [2]:
from fgvc.utils.datasets import TrainDataset
from fgvc.utils.augmentations import light_transforms_rcrop
from fgvc.utils.utils import timer, init_logger, seed_everything, getModel

In [3]:
!nvidia-smi

Thu Apr 21 17:12:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:04:00.0 Off |                  N/A |
| 20%   31C    P8     8W / 215W |   2879MiB /  7982MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:09:00.0 Off |                  N/A |
| 72%   64C    P2   172W / 340W |   6686MiB / 10014MiB |     89%      Default |
|       

### Load Dataset Metadata

In [4]:
train_metadata = pd.read_csv("../../metadata/PlantCLEF2018_train_metadata.csv")
print(len(train_metadata))

val_metadata = pd.read_csv("../../metadata/PlantCLEF2018_val_metadata.csv")
print(len(val_metadata))

286841
33703


  val_metadata = pd.read_csv("../../metadata/PlantCLEF2018_val_metadata.csv")


In [5]:
train_metadata

Unnamed: 0.1,Unnamed: 0,index,FileName,Species,Origin,Author,Content,Genus,Family,ObservationId,MediaId,YearInCLEF,LearnTag,ClassId,image_path,subset,obs_id,image_url,date,rate,location,latitude,longitude,db,identiplantescore,class_id,family_id,genus_id
0,0,202837,328838.jpg,Sairocarpus coulterianus (Benth.) D.A. Sutton,eol,"2012 don davis, don davis, calphotos",,Sairocarpus,Plantaginaceae,254481,328838,PlantCLEF2017,Train,256342,../../../nahouby/Datasets/PlantCLEF/PlantCLEF2...,2017-train,,,,,,,,,,8138,252,2447
1,1,183150,266595.jpg,Malaxis monophyllos (L.) Sw.,eol,biopix,Entire,Malaxis,Orchidaceae,192238,266595,PlantCLEF2017,Train,231652,../../../nahouby/Datasets/PlantCLEF/PlantCLEF2...,2017-train,,,,,,,,,,5714,231,1747
2,2,189750,233104.jpg,Helianthus tuberosus L.,eol,"michael wunderli, name it's source",Entire,Helianthus,Asteraceae,158747,233104,PlantCLEF2017,Train,83801,../../../nahouby/Datasets/PlantCLEF/PlantCLEF2...,2017-train,,,,,,,,,,4439,34,1336
3,3,217833,262698.jpg,Lupinus sericeus Pursh,eol,"podiceps, inaturalist, inaturalist.org",,Lupinus,Fabaceae,188341,262698,PlantCLEF2017,Train,176158,../../../nahouby/Datasets/PlantCLEF/PlantCLEF2...,2017-train,,,,,,,,,,5590,121,1707
4,4,267414,19960.jpg,Fraxinus angustifolia Vahl,tela,pierre bonnet,LeafScan,Fraxinus,Oleaceae,4760,19960,ImageCLEF2011,Train,212691,../../nahouby/Datasets/PlantCLEF/PlantCLEF2015...,,3bd60f74714e9fa767d21352c9e41015,http://ds.plantnet-project.org/couch/plantnet/...,2010-7-15,4.0,Lattes,,,,,3941,226,1176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286836,286836,207539,125081.jpg,Acer negundo L.,eol,"sam kieschnick, inaturalist, inaturalist.org",Leaf,Acer,Aceraceae,50724,125081,PlantCLEF2017,Train,309265,../../../nahouby/Datasets/PlantCLEF/PlantCLEF2...,2017-train,,,,,,,,,,133,1,15
286837,286837,81205,186701.jpg,Corchorus schimperi Cufod.,eol,"mark hyde, bart wursten and petra ballings, ba...",Flower,Corchorus,Malvaceae,112344,186701,PlantCLEF2017,Train,192633,../../../nahouby/Datasets/PlantCLEF/PlantCLEF2...,2017-train,,,,,,,,,,2510,192,756
286838,286838,112049,257324.jpg,Linaria vulgaris Mill.,eol,"stefano doglio, inaturalist, inaturalist.org",Entire,Linaria,Scrophulariaceae,182967,257324,PlantCLEF2017,Train,255685,../../../nahouby/Datasets/PlantCLEF/PlantCLEF2...,2017-train,,,,,,,,,,5422,294,1668
286839,286839,74235,365298.jpg,Trifolium subterraneum L.,eol,biopix,Entire,Trifolium,Fabaceae,290941,365298,PlantCLEF2017,Train,184183,../../../nahouby/Datasets/PlantCLEF/PlantCLEF2...,2017-train,,,,,,,,,,9440,121,2827


In [6]:
train_metadata['image_path'] = train_metadata['image_path'].apply(lambda x: x.replace('../../../nahouby/Datasets/PlantCLEF/', '/Data-10T/PlantCLEF/'))
train_metadata['image_path'] = train_metadata['image_path'].apply(lambda x: x.replace('../../nahouby/Datasets/PlantCLEF/', '/Data-10T/PlantCLEF/'))

val_metadata['image_path'] = val_metadata['image_path'].apply(lambda x: x.replace('../../../nahouby/Datasets/PlantCLEF/', '/Data-10T/PlantCLEF/'))
val_metadata['image_path'] = val_metadata['image_path'].apply(lambda x: x.replace('../../nahouby/Datasets/PlantCLEF/', '/Data-10T/PlantCLEF/'))

### Training Parameters

In [9]:
# Adjust BATCH_SIZE and ACCUMULATION_STEPS to values that if multiplied results in 64 !!!!!1

config = {"augmentations": 'light-random_crop',
           "optimizer": 'SGD',
           "scheduler": 'cyclic_cosine',
           "image_size": (224, 224),
           "random_seed": 777,
           "number_of_classes": len(train_metadata['class_id'].unique()),
           "architecture": 'vit_base_patch32_224',
           "batch_size": 32,
           "accumulation_steps": 4,
           "epochs": 100,
           "learning_rate": 0.01,
           "dataset": 'PlantCLEF2018',
           "loss": 'CrossEntropyLoss',
           "training_samples": len(train_metadata),
           "valid_samples": len(val_metadata),
           "workers": 12,
           }

RUN_NAME = f"{config['architecture']}-{config['optimizer']}-{config['scheduler']}-{config['augmentations']}"

### Fix Seeds & Log Setup

In [10]:
LOG_FILE = f'{RUN_NAME}.log'
LOGGER = init_logger(LOG_FILE)

seed_everything(config['random_seed'])

### Init Model

In [11]:
# %%
model = getModel(config['architecture'], config['number_of_classes'], pretrained=True)
model_mean = list(model.default_cfg['mean'])
model_std = list(model.default_cfg['std'])

In [12]:
# Adjust BATCH_SIZE and ACCUMULATION_STEPS to values that if multiplied results in 64 !!!!!1

if config['augmentations'] == 'light':
    train_augmentations = light_transforms(data='train', image_size=config['image_size'], mean=model_mean, std=model_std)
    val_augmentations = light_transforms(data='valid', image_size=config['image_size'], mean=model_mean, std=model_std)
elif config['augmentations'] == 'light-random_crop':
    train_augmentations = light_transforms_rcrop(data='train', image_size=config['image_size'], mean=model_mean, std=model_std)
    val_augmentations = light_transforms_rcrop(data='valid', image_size=config['image_size'], mean=model_mean, std=model_std)    
    
print('Augmentations:', config['augmentations'])

train_dataset = TrainDataset(train_metadata, transform=train_augmentations)
valid_dataset = TrainDataset(val_metadata, transform=val_augmentations)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=config['workers'])
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=config['workers'])

Augmentations: light-random_crop


### Init WandB

In [13]:
from fgvc.utils.wandb import init_wandb

init_wandb(config, RUN_NAME, entity='picekl', project='frontiers-plant-recognition')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpicekl[0m (use `wandb login --relogin` to force relogin)


### Set Optimizers!

In [14]:
if config['optimizer'] == 'AdamW':
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
elif config['optimizer'] == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)

if config['scheduler'] =='plateau':
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=1, verbose=True, eps=1e-6)
elif config['scheduler'] == 'cyclic_cosine':
    CYCLES = 5
    t_initial = config['epochs'] / CYCLES
    scheduler = CosineLRScheduler(optimizer, t_initial=20, lr_min=0.0001, cycle_decay = 0.9, cycle_limit = 5)

### Training Loop

In [None]:
with timer('Train model', LOGGER):
    
    model.to(device)
    
    criterion = nn.CrossEntropyLoss()
    best_score = 0.
    best_loss = np.inf
    
    for epoch in range(config['epochs']):
        
        start_time = time.time()

        model.train()
        avg_loss = 0.
        
        num_steps_per_epoch = len(train_loader)
        num_updates = epoch * num_steps_per_epoch

        optimizer.zero_grad()
        
        train_lbls = np.zeros((len(train_metadata)))
        train_preds = np.zeros((len(train_metadata)))

        for i, (images, labels, _) in tqdm.tqdm(enumerate(train_loader)):

            images = images.to(device)
            labels = labels.to(device)

            y_preds = model(images)
            loss = criterion(y_preds, labels)
            
            # Scale the loss to the mean of the accumulated batch size
            avg_loss += loss.item() / len(train_loader) 
            loss = loss / config['accumulation_steps']
            loss.backward()
            if (i - 1) % config['accumulation_steps'] == 0:
                optimizer.step()
                optimizer.zero_grad()
                
            if config['scheduler'] == 'cyclic_cosine':
                num_updates += 1
                scheduler.step_update(num_updates=num_updates)
                
                
            train_preds[i * len(labels): (i+1) * len(labels)] = y_preds.argmax(1).to('cpu').numpy()
            train_lbls[i * len(labels): (i+1) * len(labels)] = labels.to('cpu').numpy()
            
        model.eval()
        avg_val_loss = 0.
        preds = np.zeros((len(valid_dataset)))
        preds_raw = []

        for i, (images, labels, _) in enumerate(valid_loader):
            
            images = images.to(device)
            labels = labels.to(device)
            
            with torch.no_grad():
                y_preds = model(images)
            
            preds[i * len(images): (i+1) * len(images)] = y_preds.argmax(1).to('cpu').numpy()
            preds_raw.extend(y_preds.to('cpu').numpy())

            loss = criterion(y_preds, labels)
        
            avg_val_loss += loss.item() / len(valid_loader)
        
        
        if config['scheduler'] == 'plateau':
            scheduler.step(avg_val_loss)
        elif config['scheduler'] == 'cyclic_cosine':
            scheduler.step(epoch + 1)
        
        train_accuracy = accuracy_score(train_lbls, train_preds)
        train_f1 = f1_score(train_lbls, train_preds, average='macro')
        
        accuracy = accuracy_score(val_metadata['class_id'], preds)
        f1 = f1_score(val_metadata['class_id'], preds, average='macro')
        recall_3 = top_k_accuracy_score(val_metadata['class_id'], preds_raw, k=3)

        elapsed = time.time() - start_time
        
        LOGGER.debug(f'  Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} F1: {f1*100:.2f}  Acc: {accuracy*100:.2f} Recall@3: {recall_3*100:.2f} time: {elapsed:.0f}s')
       
        wandb.log({'Train_loss (avr.)': avg_loss,
                   'Val. loss (avr.)': avg_val_loss,
                   'Val. F1': np.round(f1*100, 2),
                   'Val. Accuracy': np.round(accuracy*100, 2),
                   'Val. Recall@3': np.round(recall_3*100, 2),
                   'Learning Rate': optimizer.param_groups[0]["lr"],
                   'Train. Accuracy': np.round(train_accuracy*100, 2),
                   'Train. F1': np.round(train_f1*100, 2)})

        if accuracy>best_score:
            best_score = accuracy
            LOGGER.debug(f'  Epoch {epoch+1} - Save Best Accuracy: {best_score:.6f} Model')
            torch.save(model.state_dict(), f'{RUN_NAME}_best_accuracy.pth')

        if avg_val_loss<best_loss:
            best_loss = avg_val_loss
            LOGGER.debug(f'  Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save(model.state_dict(), f'{RUN_NAME}_best_loss.pth')

2022-04-21 17:12:44,934 INFO [Train model] start
8964it [39:30,  3.78it/s]
2022-04-21 17:57:19,499 DEBUG   Epoch 1 - avg_train_loss: 4.9534  avg_val_loss: 3.8310 F1: 10.87  Acc: 36.69 Recall@3: 49.63 time: 2673s
2022-04-21 17:57:19,500 DEBUG   Epoch 1 - Save Best Accuracy: 0.366881 Model
2022-04-21 17:57:20,114 DEBUG   Epoch 1 - Save Best Loss: 3.8310 Model
948it [02:24,  6.58it/s]

In [None]:
torch.save(model.state_dict(), f'{RUN_NAME}-100E.pth')