# Evaluation of divmakers models

In this notebook we evaluate the performance of models trained using the divmakers repository - using SimCLR and Divmaker objectives

to use this notebook you need the `divmaker` environment - python 3.9 is required

In [1]:
import os
import torch
from pathlib import Path
import sys
sys.path.append("..")

In [2]:
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm
from pathlib import Path


In [3]:
import os
from osgeo import gdal
import imageio
import numpy as np

from time import time
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [4]:
import os
from copy import deepcopy
import src.systems as systems
from src.utils.utils import load_json
from src.utils.setup import process_config
import random, torch, numpy
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

from pathlib import Path


import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import wandb

torch.backends.cudnn.benchmark = True

SYSTEM = {
    'PretrainSystem': systems.PretrainSystem,
    'PretrainDivMakerSystem': systems.PretrainDivMakerSystem,
    'LinearSystem': systems.LinearSystem,
    'DefaultSystem': systems.DefaultSystem,
    'TransferSystem': systems.TransferSystem,
    'TransferBigEarthNetSystem': systems.TransferBigEarthNetSystem,
    'TransferDefaultSystem': systems.TransferDefaultSystem,
}

In [14]:
def load_from_config(config_str, checkpoint_name=None):
    gpu_device = 0
    
    if gpu_device == 'cpu' or not gpu_device:
        gpu_device = None
    config = process_config(config_str)
    if gpu_device: 
        config.gpu_device = gpu_device
    SystemClass = SYSTEM[config.system]

    system = SystemClass(config)

    if config.optim_params.scheduler:
            lr_callback = globals()[config.optim_params.scheduler](
                initial_lr=config.optim_params.learning_rate,
                max_epochs=config.num_epochs,
                schedule=(
                    int(0.6*config.num_epochs),
                    int(0.8*config.num_epochs),
                ),
            )
            callbacks = [lr_callback]
    else:
        callbacks = []

    ckpt_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(config.exp_dir, 'checkpoints'),
        save_top_k=-1,
        every_n_epochs=1,
    )
    callbacks.append(ckpt_callback)
    
    gpu_device = 1
    
    if gpu_device: config.gpu_device = gpu_device

    callbacks.append(ckpt_callback)
    wandb_logger = WandbLogger(project='image_viewmaker', name=config.exp_name, config=config, sync_tensorboard=True)
    trainer = pl.Trainer(
            default_root_dir=config.exp_dir,
            gpus=gpu_device,
            max_epochs=config.num_epochs,
            min_epochs=config.num_epochs,
            checkpoint_callback=True,
            resume_from_checkpoint=config.continue_from_checkpoint,
            precision=config.optim_params.precision or 32,
            callbacks=callbacks,
            val_check_interval=config.val_check_interval or 1.0,
            limit_val_batches=config.limit_val_batches or 1.0,
            logger=wandb_logger,
        )
    
    if checkpoint_name:
        ckpt_path = Path(config.checkpoint_dir) / checkpoint_name
    else:   
        ckpt_path = Path(config.checkpoint_dir) / config.pretrain_model.checkpoint_name
    
    SystemClass = SYSTEM[config.system]
    
    system = SystemClass.load_from_checkpoint(ckpt_path, config=config)

        
    system.cuda()
    system.eval()
    return system

In [15]:
cuda = torch.cuda.is_available()

def create_embeddings_divmaker(model, 
                               dataset,
                               z_dim=512):
      

    X = np.zeros((len(dataset), z_dim))
    y = np.zeros(len(dataset))
    
    t0 = time()
    # this solution to iterate over examples is very suboptimal, one should use torch dataset
    for index, item in enumerate(tqdm(dataset)):
        # read the tile from provided filepath
        
        
        tile = item[0].unsqueeze(0)
        if cuda: 
            tile = tile.cuda()
            
        
        z = model.model(tile)
        if cuda: 
            z = z.cpu()
        z = z.data.numpy()
        y[index] = item[1]
        X[index,:] = z

    t1 = time()
    print('Embedded {} tiles: {:0.3f}s'.format(len(dataset), t1-t0))
    
    return X, y

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

rf = RandomForestClassifier()

def compare_results(X, y, model, folds = 5, model_name = ""):
    # simple method to evaluate performance of model using cross validation
    if model_name == "":
        model_name = model.__class__.__name__
    
    scores = cross_val_score(rf, X, y, cv=folds)
    print("Averaged accuracy for model {}: {:.2f}±{:.2f}%".format(model_name, scores.mean()*100, scores.std()*100))

In [17]:
from src.datasets import datasets

In [22]:
config_str = "config/eurosat/default_eurosat_simclr_eval.json"

config = process_config(config_str)

system = load_from_config(config_str, checkpoint_name="epoch=199-step=17199.ckpt")

[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.


Loaded configuration: 
{'continue_from_checkpoint': None,
 'copy_checkpoint_freq': 5,
 'cuda': True,
 'data_loader_workers': 16,
 'data_params': {'dataset': 'eurosat',
                 'default_augmentations': 'all',
                 'resize_imagenet_to_32': True},
 'distributed_backend': 'dp',
 'exp_base': '/storage/divmaker/',
 'exp_name': 'pretrain_eurosat_simclr',
 'gpu_device': 0,
 'loss_params': DotMap(t=0.07, name='simclr'),
 'model_params': {'num_res_blocks': 3,
                  'out_dim': 512,
                  'projection_head': False,
                  'resnet': True,
                  'resnet_small': True,
                  'resnet_version': 'resnet18'},
 'num_epochs': 200,
 'optim_params': {'batch_size': 256,
                  'learning_rate': 0.03,
                  'momentum': 0.9,
                  'num_view_update': 1,
                  'patience': 10,
                  'validate_freq': 1,
                  'weight_decay': 0.0001},
 'pretrain_model': {'checkpoint_name

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [23]:
train_dataset, val_dataset = datasets.get_image_datasets(
            config.data_params.dataset,
            default_augmentations=config.data_params.default_augmentations or False,
            resize_imagenet_to_32=config.data_params.resize_imagenet_to_32 or False,
            mask=False,  # Don't mask during transfer.
            zscore=config.data_params.zscore or False,
        )

In [24]:
X, y = create_embeddings_divmaker(system, val_dataset.dataset)


100%|██████████| 5520/5520 [00:20<00:00, 266.90it/s]

Embedded 5520 tiles: 20.686s





In [25]:
compare_results(X, y, rf, folds = 5)

Averaged accuracy for model RandomForestClassifier: 86.49±0.61%


## Divmaker training

In [26]:
config_str = "config/eurosat/pretrain_eurosat_simclr_L1_forced.json"

In [27]:
system = load_from_config(config_str, "epoch=199-step=17199.ckpt")

[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.
[INFO]: Configurations and directories successfully set up.


Loaded configuration: 
{'continue_from_checkpoint': None,
 'copy_checkpoint_freq': 5,
 'cuda': True,
 'data_loader_workers': 8,
 'data_params': {'dataset': 'eurosat',
                 'normalize_before_view': True,
                 'resize_imagenet_to_32': True},
 'distributed_backend': 'dp',
 'exp_base': '/storage/divmaker/',
 'exp_name': 'pretrain_eurosat_simclr_L1_forced_budget_0.2',
 'gpu_device': 0,
 'loss_params': {'objective': 'AdversarialSimCLRLoss',
                 't': 0.07,
                 'view_maker_loss_weight': 1},
 'model_params': {'clamp_views': False,
                  'noise_dim': 100,
                  'num_res_blocks': 3,
                  'out_dim': 128,
                  'projection_head': False,
                  'resnet': True,
                  'resnet_small': True,
                  'resnet_version': 'resnet18',
                  'symmetric_clamp': True,
                  'view_L1_forced': True,
                  'view_bound_magnitude': 0.2,
               

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Set up viewmaker model with bound magnitude: 0.2, divmaker: False
Using AdversarialSimCLRLoss
Set up viewmaker model with bound magnitude: 0.2, divmaker: False


In [28]:
X, y = create_embeddings_divmaker(system, val_dataset.dataset, z_dim=128)


100%|██████████| 5520/5520 [00:19<00:00, 290.50it/s]

Embedded 5520 tiles: 19.005s





In [29]:
compare_results(X, y, rf, folds = 5)

Averaged accuracy for model RandomForestClassifier: 94.24±0.42%
