In [None]:
!nvidia-smi

# Init

In [8]:
!git clone --depth=1 https://github.com/AdeelH/potsdam-batch-exp.git

Cloning into 'potsdam-batch-exp'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 16 (delta 2), reused 14 (delta 1), pack-reused 0[K
Unpacking objects: 100% (16/16), done.


In [4]:
BATCH_SIZE = 16
PRETRAINED = True
LAST_CROSS = False

MODEL_ARCH = 'unet'
MODEL_BASE = 'resnet18'
CHANNEL_VARIATION = 'rgb_e'
MODEL_VARIATION = f'lc_{LAST_CROSS}'
MODEL_VARIATION += f'_add_convs'


In [5]:
EXPERIMENT_NAME = f'ss_{CHANNEL_VARIATION}_{MODEL_ARCH}_{MODEL_BASE}{"p" if PRETRAINED else ""}_{MODEL_VARIATION}_bsz_{BATCH_SIZE}'
EXPERIMENT_NAME = 'training_resume_test'
print(EXPERIMENT_NAME)

S3_BUCKET = 'raster-vision-ahassan'
S3_ROOT = f'potsdam/experiments/output/{EXPERIMENT_NAME}'

training_resume_test


In [6]:
EPOCHS = 1
LR_START = 1e-2
LR_END = 1e-4

MOMENTUM = 0.9
WEIGHT_DECAY = 5e-4

CHECKPOINT_INTERVAL = 1
BATCH_CB_INTERVAL = 10

In [7]:
import os
from pathlib import Path
import glob
from datetime import datetime

import numpy as np
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
import torch.utils
import torchvision as tv
from torchvision import transforms as tf

from fastai.vision import *
from fastai.metrics import error_rate

import matplotlib.pyplot as plt
%reload_ext autoreload
%autoreload 2
# %matplotlib inline

In [9]:
import sys
sys.path.append('potsdam-batch-exp/')

from utils_ import *
from io_ import *
from data_ import *
from transforms import *
from models import *
from training import *
from visualizations import *
from monitoring import *
%reload_ext autoreload
%autoreload 2

In [10]:
CLASS_NAMES = ['building', 'tree', 'low-vegetation', 'clutter', 'car', 'pavement']
NCLASSES = len(CLASS_NAMES)

In [11]:
io_handler = S3IoHandler(
    local_root=EXPERIMENT_NAME, 
    s3_bucket=S3_BUCKET, 
    s3_root=S3_ROOT
)

# Data

In [28]:
# potsdam_dict = io_handler.load_pickled_file('potsdam/data/potsdam.pkl', 'data/potsdam.pkl')
with open('../../potsdam/data/potsdam.pkl', 'rb') as f:
    potsdam_dict = pickle.load(f)

## Prepare datasets

In [29]:
CHANNELS = [ch_R, ch_G, ch_B, ch_E]
CHIP_SIZE = 400
STRIDE = 200
DOWNSAMPLING = 2

TRAIN_SPLIT = 0.85

In [30]:
train_transform, val_transform, x_transform, y_transform = tfs_potsdam(channels=CHANNELS, downsampling=DOWNSAMPLING)

In [31]:
original_ds = Potsdam(potsdam_dict, chip_size=CHIP_SIZE, stride=STRIDE, tf=val_transform)
train_ds    = Potsdam(potsdam_dict, chip_size=CHIP_SIZE, stride=STRIDE, tf=train_transform, x_tf=x_transform, y_tf=y_transform)
val_ds      = Potsdam(potsdam_dict, chip_size=CHIP_SIZE, stride=STRIDE, tf=val_transform  , x_tf=x_transform, y_tf=y_transform)

### Train/val split

In [32]:
train_split_size = int((len(train_ds) * TRAIN_SPLIT) // 1)
val_split_size = len(train_ds) - train_split_size
train_split_size, val_split_size

print('train_split_size', train_split_size)
print('val_split_size', val_split_size)

inds = np.arange(len(train_ds))

train_split_size 12867
val_split_size 2271


### Samplers

In [33]:
train_sampler = torch.utils.data.SubsetRandomSampler(inds[:train_split_size])
val_sampler = torch.utils.data.SubsetRandomSampler(inds[train_split_size:])

assert len(set(train_sampler.indices) & set(val_sampler.indices)) == 0

# Model

Use FastAI to create a UNet from a Resnet18

In [12]:
def _base_model(pretrained=False):
    m = tv.models.resnet18(pretrained=pretrained)
    return m

In [13]:
body = create_body(_base_model, pretrained=PRETRAINED)
model = models.unet.DynamicUnet(body, n_classes=NCLASSES, last_cross=LAST_CROSS).cuda()

In [18]:
# io_handler.load_model_weights(
#     model, 
#     s3_path='potsdam/experiments/output/ss_rgb_unet_resnet18p_lc_False_bsz_16/best_model/best_acc', 
#     tgt_path='rgb_model'
# )

In [14]:
model[0][0] = ModifiedConv_add(model[0][0], new_conv_in_channels=1).cuda()

Freeze original, pretrained conv

In [15]:
freeze(model[0][0].original_conv)

Re-init bn layer immediately following the modified conv layer

# Train

## Training monitoring callbacks

In [16]:

def get_epoch_monitor(io_handler, chkpt_interval=1, viz_root='visualizations/per_epoch'):
    assert chkpt_interval > 0

    filter_path = f'{viz_root}/conv'
    os.makedirs(io_handler.to_local_path(filter_path), exist_ok=True)

    def _monitor(model, optimizer, sched, logs):
        epoch = len(logs['epoch']) # epoch is now 1-indexed
        val_acc = logs['val_acc'][-1]
        val_loss = logs['val_loss'][-1]
        last_best_acc = logs['best_acc'][-1] if epoch > 1 else -1
        last_best_loss = logs['best_loss'][-1] if epoch > 1 else 1e8

        if val_acc > last_best_acc:
            logs['best_acc'].append(val_acc)
            io_handler.save_model(model, f'best_model/best_acc', info=logs)
        else:
            logs['best_acc'].append(last_best_acc)

        if val_loss < last_best_loss:
            logs['best_loss'].append(val_loss)
            io_handler.save_model(model, f'best_model/best_loss', info=logs)
        else:
            logs['best_loss'].append(last_best_loss)

        if epoch % chkpt_interval == 0:
            io_handler.save_checkpoint(model, optimizer, sched, f'checkpoints/epoch_%04d' % (epoch), info=logs)

        log_str = logs_to_str(logs)
        print(log_str)

        io_handler.save_log('logs.pkl', logs)
        io_handler.save_log_str(f'logs.txt', log_str)

        title = 'epoch %04d' % (epoch)
        fs = model[0][0].new_conv.weight
        fig = viz_conv_layer_filters(fs.data, title=title, scale_each=False, padding=1)
        io_handler.save_img(fig, f'{filter_path}/epoch_%04d' % (epoch))
        plt.close(fig)

        fig = plot_lr(logs)
        io_handler.save_img(fig, f'visualizations/lr')
        plt.close(fig)

        fig = plot_losses(logs)
        io_handler.save_img(fig, f'visualizations/loss')
        plt.close(fig)

        fig = plot_accs(logs)
        io_handler.save_img(fig, f'visualizations/accuracy')
        plt.close(fig)

        stat_figs = plot_class_stats(logs)
        for stat, fig in stat_figs:
            io_handler.save_img(fig, f'visualizations/{stat}')
            plt.close(fig)

    return _monitor

def get_batch_monitor(io_handler, viz_root='visualizations/per_batch', interval=4):

    filter_path = f'{viz_root}/conv'
    grad_path = f'{viz_root}/conv_grad'
    os.makedirs(io_handler.to_local_path(filter_path), exist_ok=True)
    os.makedirs(io_handler.to_local_path(grad_path), exist_ok=True)

    def _monitor(model, epoch, batch_idx, batch, labels):
        if batch_idx % interval != 0:
            return

        fs = model[0][0].new_conv.weight

        title = 'epoch %04d, batch %05d' % (epoch, batch_idx)
        fig = viz_conv_layer_filters(fs.data, title=title, scale_each=False, padding=1)
        io_handler.save_img(fig, f'{filter_path}/epoch_%04d_batch_%05d' % (epoch, batch_idx))
        plt.close(fig)

        fig = viz_conv_layer_filters(fs.grad.data, title=title, scale_each=False, padding=1)
        io_handler.save_img(fig, f'{grad_path}/epoch_%04d_batch_%05d' % (epoch, batch_idx))
        plt.close(fig)

    return _monitor


In [17]:
epoch_callback = get_epoch_monitor(io_handler, chkpt_interval=CHECKPOINT_INTERVAL)
batch_callback = get_batch_monitor(io_handler, interval=BATCH_CB_INTERVAL)

In [36]:
train_params = {}
train_params['batch_size'] = BATCH_SIZE
train_params['val_batch_size'] = BATCH_SIZE

In [37]:
train_dl = torch.utils.data.DataLoader(train_ds, sampler=train_sampler, batch_size=train_params['batch_size']    , pin_memory=False)
val_dl   = torch.utils.data.DataLoader(val_ds  , sampler=val_sampler  , batch_size=train_params['val_batch_size'], pin_memory=False)

In [39]:
train_params = {}
train_params['epochs'] = EPOCHS
train_params['learning_rate'] = LR_START
train_params['learning_rate_min'] = LR_END

In [40]:
optimizer = optim.SGD(model.parameters(), lr=train_params['learning_rate'], momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
sched = optim.lr_scheduler.CosineAnnealingLR(optimizer, train_params['epochs'], eta_min=train_params['learning_rate_min'])

In [41]:
from collections import defaultdict

logs = defaultdict(list)
if io_handler.checkpoint_exists():
    chkpt = io_handler.load_latest_checkpoint()
    logs = chkpt['info']
    train_params['last_epoch'] = logs['epoch'][-1]

    model.load_state_dict(chkpt['model'])
    if 'optimizer' in chkpt:
        optimizer.load_state_dict(chkpt['optimizer'])
    if 'sched' in chkpt:
        sched.load_state_dict(chkpt['sched'])

defaultdict(<class 'list'>, {'epoch': [0], 'lr_0': [0.01], 'train_loss': [1.0468148005433022e-06], 'val_loss': [tensor(6.9326e-07)], 'train_acc': [0.7737424373626709], 'val_acc': [0.8476085662841797], 'train_time': [170.73402333259583], 'val_time': [21.362993478775024], 'class_0_precision': [0.921190083026886], 'class_0_recall': [0.9726234078407288], 'class_0_fscore': [0.9618823528289795], 'class_1_precision': [0.8734885454177856], 'class_1_recall': [0.6640061736106873], 'class_1_fscore': [0.6974594593048096], 'class_2_precision': [0.7558938264846802], 'class_2_recall': [0.790364682674408], 'class_2_fscore': [0.7832212448120117], 'class_3_precision': [0.7636243104934692], 'class_3_recall': [0.41189223527908325], 'class_3_fscore': [0.4536866843700409], 'class_4_precision': [0.8048877120018005], 'class_4_recall': [0.8395473957061768], 'class_4_fscore': [0.8323786854743958], 'class_5_precision': [0.8174220323562622], 'class_5_recall': [0.894550085067749], 'class_5_fscore': [0.877981722354

In [45]:
logs['best_loss']

[]

In [42]:
plt.ioff()
train_seg(model, train_dl, val_dl, optimizer, sched, train_params, 
          epoch_callback=epoch_callback, batch_callback=batch_callback, logs=logs)

IndexError: list index out of range

In [50]:
optimizer = optim.SGD(model.parameters(), lr=train_params['learning_rate'], momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
sched = optim.lr_scheduler.CosineAnnealingLR(optimizer, train_params['epochs'], eta_min=train_params['learning_rate_min'])

In [54]:
from collections import defaultdict

logs = defaultdict(list)
if io_handler.checkpoint_exists():
    chkpt = io_handler.load_latest_checkpoint()
    logs = chkpt['info']
    train_params['last_epoch'] = logs['epoch'][-1]
#     train_params['epochs'] -= train_params['last_epoch']

    model.load_state_dict(chkpt['model'])
    if 'optimizer' in chkpt:
        optimizer.load_state_dict(chkpt['optimizer'])
    if 'sched' in chkpt:
        sched.load_state_dict(chkpt['sched'])

In [None]:
print(optimizer)
print(sched)

In [None]:
plt.ioff()
train_seg(model, train_dl, val_dl, optimizer, sched, train_params, 
          epoch_callback=epoch_callback, batch_callback=batch_callback, logs=logs)

# Create animations

In [None]:
# !apt update -y
# !apt install ffmpeg -y

In [None]:
# !ffmpeg -framerate 60 -pattern_type glob -i "$EXPERIMENT_NAME/visualizations/per_batch/conv/*.png" -c:v libx264 \
#     -pix_fmt yuv420p "$EXPERIMENT_NAME/visualizations/per_batch/conv/conv.mp4" -y;
# !ffmpeg -framerate 60 -pattern_type glob -i "$EXPERIMENT_NAME/visualizations/per_batch/conv_grad/*.png" -c:v libx264 \
#     -pix_fmt yuv420p "$EXPERIMENT_NAME/visualizations/per_batch/conv_grad/conv_grad.mp4" -y;

In [None]:
# io_handler.upload_file('visualizations/per_batch/conv/conv.mp4', 'visualizations/per_batch/conv.mp4')
# io_handler.upload_file('visualizations/per_batch/conv_grad/conv_grad.mp4', 'visualizations/per_batch/conv_grad.mp4')