In [2]:
!pip install -r requirements.txt -q

In [39]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (e.g. the local `utils` module) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

import os
from os import listdir
import random
from tqdm import tqdm

import cv2
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision.transforms import v2
from torchvision.datasets import ImageFolder

import segmentation_models_pytorch as smp
from segmentation_models_pytorch.encoders import get_preprocessing_fn
# import albumentations as A

import matplotlib.pyplot as plt
import wandb
import mlflow

# segmentation losses & metrics
from utils import FocalLoss
from utils import mIoULoss
from utils import seg_acc as acc
from utils import SegDataset, train

# MLflow tracking credentials.
# SECURITY: never commit the tracking token to the notebook. It is taken from
# the environment when already set, and otherwise requested interactively
# (the typed value is not echoed and is not stored in the notebook).
# NOTE(review): the token that used to be hardcoded here should be revoked.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'Eighonet')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    from getpass import getpass

    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('MLflow tracking token: ')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# `torch.cuda.get_device_name()` raises when no CUDA device is available,
# so the GPU name is only queried when CUDA was actually selected.
if device.type == "cuda":
    print(torch.cuda.get_device_name())
print(torch.cuda.device_count())

cuda
Tesla T4
1


First of all, let us make this whole story truly reproducible (as it is supposed to be everywhere in science).

In [13]:
# Reproducibility settings for cuDNN: request deterministic algorithms and
# disable the benchmark mode that auto-selects algorithms at run time.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def seed(value: int) -> None:
    """Seed the global random number generators used in this notebook.

    Seeds Python's `random` module, NumPy's legacy global generator and
    PyTorch, in that order.

    Args:
        value: Seed passed unchanged to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(value)

# Seed the Python, NumPy and PyTorch generators with 0 for the cells below.
seed(0)

So, how are we going to draw conclusions about the observed metrics for various configurations of inputs, models, hyperparameters, etc.? In our case, we will repeat training for a fixed (in some sense) configuration N times to obtain a sample of the metric. Following this idea, each model will have a metric sample associated with it; these samples can then be compared with the Mann-Whitney U test, which indicates whether there is a statistically significant difference between them.

About fixed configurations -- there are several possible approaches based on certain degrees of freedom:

1. Train data can be fixed for each run in the sample or not.
2. Seed value (which directly defines the initial weights of the model, optimizer, etc.) can be fixed for each run or not.

In this notebook I am about to use the same data split but each model of the sample will be trained with its own seed. 

### Augmentation & pre-training influence

We will begin with something simple: naturally, pre-training and relevant augmentations should have a positive impact on the learning dynamics. Let's find out!

Since our hypothesis is rather simple, it will be appropriate to test it on something with fast convergence (like the classic UNet architecture).

In [15]:
# Preprocessing function provided by segmentation_models_pytorch for the
# 'resnet34' encoder with its 'imagenet' pre-training settings.
preprocess_input = get_preprocessing_fn(encoder_name='resnet34', pretrained='imagenet')

In [16]:
# Image pipeline: encoder-specific preprocessing, then conversion to a tensor.
# The preprocessing function built in the previous cell is reused here instead
# of being constructed a second time.
#
# TODO: candidate augmentations for the "pre-trained + augmented" experiment
# (insert before `preprocess_input`):
#   v2.RandomCrop(128), v2.RandomHorizontalFlip(),
#   v2.RandomRotation(degrees=(0, 180)),
#   v2.ColorJitter(.1, .1, .1, .1), v2.GaussianBlur(3, sigma=(0.1, 2.0))
# NOTE(review): geometric augmentations would have to be applied identically
# to the mask -- confirm how SegDataset calls the two transforms.
# NOTE(review): v2.ToTensor appears to be deprecated in torchvision in favour
# of v2.ToImage + v2.ToDtype -- confirm before migrating.
transform_input = v2.Compose([
    preprocess_input,
    v2.ToTensor(),
])

# Mask pipeline: tensor conversion only.
transform_mask = v2.Compose([
    v2.ToTensor(),
])

In [17]:
def _make_dataset(split: str) -> SegDataset:
    """Build the SegDataset for one data split.

    Args:
        split: Name of the sub-directory of `processed_data/` to read
            ('train', 'val' or 'test'); it must contain `images/` and `masks/`.

    Returns:
        A SegDataset using the notebook's image and mask transforms.
    """
    return SegDataset(inputs_path=f'processed_data/{split}/images/',
                      targets_path=f'processed_data/{split}/masks/',
                      transform_input=transform_input,
                      transform_mask=transform_mask)


# All three splits share the same transforms; only the directory differs.
train_dataset = _make_dataset('train')
val_dataset = _make_dataset('val')
test_dataset = _make_dataset('test')

# Mini-batch size shared by all three loaders.
BATCH_SIZE = 4

# Only the training data is shuffled. Validation and test data are read in a
# fixed order, so evaluation does not draw from the global random generator
# and stays comparable between runs.
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

There will be 3 models compared: random weights, pre-trained, pre-trained + augmented.

In [40]:
# Experiment size: SAMPLE_SIZE training runs per configuration (the range of
# the training loop), each run trained for N_EPOCHS (passed to `train`).
SAMPLE_SIZE, N_EPOCHS = 8, 30

# Loss objects, moved to the selected device.
focal_criterion = FocalLoss(gamma=1.75).to(device)
jaccard_criterion = mIoULoss(n_classes=2).to(device)

#### Random vs pre-trained

In [38]:
# Train SAMPLE_SIZE randomly initialised UNets and collect what `train`
# returns for each of them.
# (The optimizer that used to be created before this loop referenced `model_r`
# before it existed and failed on a fresh kernel; it is created per run below.)
best_rs = []
for sample_id in range(SAMPLE_SIZE):
    # Same data split for every run, but each run gets its own seed, so the
    # runs form a sample over initialisations.
    seed(sample_id)

    model_r = smp.Unet(
        encoder_name="resnet34",
        encoder_weights=None,  # no pre-trained weights: random initialisation
        in_channels=3,
        classes=2,
    ).to(device)

    # A fresh optimizer and scheduler for every model.
    optimizer_r = torch.optim.Adam(model_r.parameters(), lr=1e-4)
    # StepLR halves the learning rate on every scheduler step; when it is
    # stepped is decided inside `train` (presumably on a plateau -- confirm).
    lr_scheduler_r = torch.optim.lr_scheduler.StepLR(optimizer_r, step_size=1, gamma=0.5)

    best_r = train(f'UNet_random_{sample_id}',
                   model_r,
                   optimizer_r,
                   lr_scheduler_r,
                   focal_criterion,
                   train_dataloader,
                   val_dataloader,
                   N_EPOCHS,
                   device)
    best_rs.append(best_r)

 44%|████▍     | 22/50 [03:01<03:50,  8.23s/it]

lowering learning rate to 5e-05


 56%|█████▌    | 28/50 [03:50<03:01,  8.23s/it]

lowering learning rate to 2.5e-05


 68%|██████▊   | 34/50 [04:40<02:11,  8.21s/it]

lowering learning rate to 1.25e-05


 80%|████████  | 40/50 [05:29<01:22,  8.22s/it]

lowering learning rate to 6.25e-06


 90%|█████████ | 45/50 [06:10<00:41,  8.20s/it]

lowering learning rate to 3.125e-06


100%|██████████| 50/50 [07:03<00:00,  8.46s/it]
 48%|████▊     | 24/50 [03:20<03:33,  8.22s/it]

lowering learning rate to 5e-05


 60%|██████    | 30/50 [04:09<02:44,  8.21s/it]

lowering learning rate to 2.5e-05


 72%|███████▏  | 36/50 [04:58<01:55,  8.25s/it]

lowering learning rate to 1.25e-05


 84%|████████▍ | 42/50 [05:48<01:06,  8.29s/it]

lowering learning rate to 6.25e-06


 84%|████████▍ | 42/50 [05:55<01:07,  8.48s/it]


KeyboardInterrupt: 

1. What is the best (among the tested models) encoder-decoder for the selected task? Is it statistically significantly different from the other trained models?
2. Does pre-training have a statistically significant influence on the metric?

Future work:

**TBD**

In [None]:
# NOTE(review): this cell repeats an import from the top import cell and the
# `preprocess_input` definition from the earlier preprocessing cell; it looks
# like a leftover and can probably be deleted.
from segmentation_models_pytorch.encoders import get_preprocessing_fn

preprocess_input = get_preprocessing_fn(encoder_name='resnet34', pretrained='imagenet')