# Ensembling

TODO: elaborate

## Google Colab

The first cell will only be run in Google Colab, the second one locally as well.

In [1]:
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # noinspection PyUnresolvedReferences
    from google.colab import drive
    drive.mount('/content/drive')

In [2]:
import os
import glob

# let's keep this cell at the beginning for every notebook
# for more convenient training in Google Colab
def get_root_path(filename: str) -> str: 
    """Get root path based on notebook's name."""
    filepath = glob.glob(os.getcwd() + '/**/' + filename, recursive = True)[0]
    return os.path.dirname(os.path.dirname(filepath))

ROOT_PATH = get_root_path('ensembling.ipynb')
sys.path.append(ROOT_PATH)

# go to the drive directory
os.chdir(ROOT_PATH) if IN_COLAB else None

## Imports

In [3]:
import os
import cv2
import torch
import itertools

import albumentations as A
import pandas as pd
import segmentation_models_pytorch as smp

from matplotlib import colors
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from distutils.spawn import find_executable
from scripts.evaluation import EvaluationMonitor, get_best_f1_per_setup, PredictionMonitor
from scripts.preprocessing import RoadDataset, split_data
from scripts.training import train_model, setup_seed, valid_epoch, train_epoch
from scripts.plotting import plot_post_processing, plot_images
from torch.utils.data import DataLoader, SubsetRandomSampler
from torch.optim.lr_scheduler import CosineAnnealingLR

  from distutils.spawn import find_executable


In [4]:
# necessary for downloading some of the models
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [5]:
setup_seed(16)

## Data

In [6]:
# specify train directory
train_directory = os.path.join(ROOT_PATH, 'data', 'raw', 'train')

In [7]:
# image paths so that all the images are used for train dataset (no test set for cv due to small training set)
image_path_train, _, mask_path_train, _ = split_data(train_directory, test_size=0)

# create train Dataset without transformations for now
train_dataset = RoadDataset(image_path_train, mask_path_train)

In [8]:
# define transformations
train_tf = A.Compose([
    A.Resize(height=608, width=608, always_apply=True),
    A.Rotate(p=0.5, limit=180, border_mode=cv2.BORDER_CONSTANT, rotate_method="ellipse"),
    A.RandomBrightnessContrast(p=0.5)
])

valid_tf = A.Compose([A.Resize(height=608, width=608, always_apply=True)])

## Hyperparameters

Since our aim is to see, how different architectures influence the training, we fix the model, epochs and batch sizes.

In [9]:
ENCODER_WEIGHTS = 'imagenet'

SEED = 16
BATCH_SIZE = 4
K_FOLD = 3
N_CPU = os.cpu_count()
N_EPOCHS = 2
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

LOADER_PARAMS = {
    'batch_size': BATCH_SIZE
}

## Encode-Decoder Combinations

In [10]:
# specify the root path for evaluation json-s
encoder_decoder = [
    #('resnet34', 'Unet'),
    #('resnet18', 'Unet'),
    ('resnet18', 'Unet'),
    #('resnet101', 'DeepLabV3'), 
    #('vgg19', 'UnetPlusPlus'), 
    #('inceptionv4', 'UnetPlusPlus'), 
    #('mit_b2', 'Unet'), 
    #('efficientnet-b4', 'UnetPlusPlus')
]

## Cross-Validation

In [11]:
eval_monitor = EvaluationMonitor(os.path.join(ROOT_PATH, 'data', 'results', 'ensembling'))

In [12]:
from flaml import AutoML

k_fold = KFold(n_splits=K_FOLD, shuffle=True, random_state=SEED)

# Get training and validation indices
for fold, (train_idx, val_idx) in enumerate(k_fold.split(train_dataset)):
    
    # Create training and validation loaders by providing current K-Fold train/validation indices to Sampler
    train_loader = DataLoader(
        train_dataset.set_tf(train_tf), sampler=SubsetRandomSampler(train_idx), **LOADER_PARAMS
    )
    
    valid_loader = DataLoader(
        train_dataset.set_tf(valid_tf), sampler=SubsetRandomSampler(val_idx), **LOADER_PARAMS
    )
    
    models = [
        smp.create_model(
            decoder, encoder_name=encoder, encoder_weights=ENCODER_WEIGHTS
        ).to(DEVICE) for encoder, decoder in encoder_decoder
    ]
    
    optimizers = [torch.optim.Adam(model_.parameters(), lr=0.0005) for model_ in models]
    # TODO add comment
    t_max = (len(train_loader.dataset) * N_EPOCHS) // train_loader.batch_size
    schedulers = [CosineAnnealingLR(optimizer_, T_max=t_max,) for optimizer_ in optimizers]
    criterion_ = smp.losses.DiceLoss(smp.losses.BINARY_MODE, from_logits=True)
    
    zipped_values = [encoder_decoder, models, optimizers, schedulers]
    
    for i in range(N_EPOCHS):
        
        pred_monitor = PredictionMonitor()
        
        # since we want to have the same transformations for every model
        train_data = [(image, mask) for image, mask in train_loader]
        valid_data = [(image, mask) for image, mask in valid_loader]
        
        for (enc, dec), model_, opt_, sched_ in zip(*zipped_values):
            
            pred_monitor.set_model(enc, dec)
    
            _, train_f1 = train_epoch(
                model_, train_data, criterion_, opt_, sched_, i + 1, monitor=pred_monitor
            )
            _, valid_f1 = valid_epoch(
                model_, valid_data, criterion_, i + 1, monitor=pred_monitor
            )
            
            eval_monitor.update_metrics_by_fold(
                setup='+'.join([enc, dec]), 
                fold=fold, 
                training_f1=train_f1,
                validation_f1=valid_f1
            )
        
        # meta learner
            
        x_train, y_train = pred_monitor.get_ensembling_for_models(encoder_decoder, mode='training')
        #clf = RandomForestClassifier(max_depth=2, n_jobs=5, random_state=SEED)
        
        #y_train_pred = x_train.mode(axis=1).to_numpy().squeeze()
        y_train_pred = (x_train.sum(axis=1) >= 1).astype(int).to_numpy()
        #clf.fit(x_train, y_train)
        train_f1 = f1_score(y_train, y_train_pred)
        
        x_val, y_val = pred_monitor.get_ensembling_for_models(encoder_decoder, mode='validation')
        #y_val_pred = x_val.mode(axis=1).to_numpy().squeeze()
        y_val_pred = (x_val.sum(axis=1) >= 1).astype(int).to_numpy()
        valid_f1 = f1_score(y_val, y_val_pred)
        
        eval_monitor.update_metrics_by_fold(
            setup='ensembling', 
            fold=fold, 
            training_f1=train_f1,
            validation_f1=valid_f1
        )
    
    eval_monitor.update_jsons()

Epoch:   1. Train.      Loss: 0.620 | f1: 0.456: 100%|██████████| 17/17 [00:04<00:00,  3.52it/s]
Epoch:   1. Validation. Loss: 0.497 | f1: 0.596: 100%|██████████| 9/9 [00:00<00:00, 12.29it/s]
Epoch:   2. Train.      Loss: 0.469 | f1: 0.650: 100%|██████████| 17/17 [00:03<00:00,  5.05it/s]
Epoch:   2. Validation. Loss: 0.451 | f1: 0.619: 100%|██████████| 9/9 [00:00<00:00, 12.17it/s]
Epoch:   1. Train.      Loss: 0.616 | f1: 0.454: 100%|██████████| 17/17 [00:03<00:00,  4.98it/s]
Epoch:   1. Validation. Loss: 0.594 | f1: 0.463: 100%|██████████| 9/9 [00:00<00:00, 12.88it/s]
Epoch:   2. Train.      Loss: 0.512 | f1: 0.616: 100%|██████████| 17/17 [00:03<00:00,  5.02it/s]
Epoch:   2. Validation. Loss: 0.471 | f1: 0.680: 100%|██████████| 9/9 [00:00<00:00, 13.37it/s]
Epoch:   1. Train.      Loss: 0.611 | f1: 0.480: 100%|██████████| 17/17 [00:03<00:00,  4.95it/s]
Epoch:   1. Validation. Loss: 0.415 | f1: 0.633: 100%|██████████| 9/9 [00:00<00:00, 13.02it/s]
Epoch:   2. Train.      Loss: 0.456 | f1

## Ensembling

CV was run on colab, the predictions saved to JSON, and now we can apply ensembling.