In [1]:
%%capture
# Environment bootstrap: install the extra packages, clone the project repository,
# and change into it — presumably so the project modules imported by the next cell
# (dataset, object_detection, utils) resolve from the repository root.
# NOTE(review): %%capture suppresses this cell's output, so the `!pwd` result below
# is never displayed.
# NOTE(review): the packages are installed without version pins, and re-running the
# cell repeats the clone and the `%cd` — confirm this is acceptable.
!pip install neptune-client psutil
!git clone https://github.com/Cho-D-YoungRae/URP_PD.git
%cd URP_PD
!pwd

In [2]:
import dataset
import object_detection
from utils import *

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as FT

import os
import json
from PIL import Image
import numpy as np
import argparse
from tqdm.auto import tqdm
import time
from datetime import datetime

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using <{device}> device")

Using <cuda> device


In [3]:
# ====== constants ======#
# Class name -> integer class id.
label_map = dict(background=0, person=1)
# Inverse lookup: integer class id -> class name.
rev_label_map = dict((class_id, class_name) for class_name, class_id in label_map.items())

## setting

In [4]:
import random

from torch.backends import cudnn

# Let cuDNN benchmark and pick convolution algorithms for speed.
# NOTE(review): benchmarking may select different kernels from run to run, so results
# are not guaranteed to be bit-for-bit reproducible even with the seeds below.
cudnn.benchmark = True

# ====== Random Seed Initialization ====== #
# Seed every RNG the pipeline may draw from. Python's built-in `random` is seeded as
# well, since code outside this notebook (e.g. data augmentation) may rely on it.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Parsing an empty argument list yields an empty namespace, which is then filled in
# by hand so that all hyper-parameters live in one object.
parser = argparse.ArgumentParser()
args = parser.parse_args([])
args.baselineID = 8  # experiment id; also used to name the checkpoint file

# ====== Dataset ====== #
args.img_type = 'lwir'  # NOTE(review): not consumed by any cell visible in this notebook
args.val_split = 0.1    # fraction of the dataset held out for validation

# ====== Model ====== #
args.base_model = 'VGGBase'
args.n_classes = len(label_map)
args.ch_option = "mean"


# ====== Optimizer & Training ====== #
args.optim = 'Adam'          # name of the optimizer class looked up on torch.optim
args.lr = 5e-4
args.twice_b_lr = True       # train bias parameters at twice the base learning rate
args.weight_decay = 5e-4

args.epochs = 300
args.train_batch_size = 32
args.test_batch_size = 64

# Epochs at which the learning rate is decayed: after 4/6 and 5/6 of training.
args.decay_lr_at = [args.epochs // 6 * 4,
                    args.epochs // 6 * 5]
# Value handed to adjust_learning_rate at those epochs (presumably a multiplicative factor).
args.decay_lr_to = 0.1

## neptune init

In [5]:
import neptune.new as neptune
from getpass import getpass

# Keep the credential out of the notebook: take it from the NEPTUNE_API_TOKEN
# environment variable and, if that is not set, prompt for it without echoing.
api_token = os.environ.get("NEPTUNE_API_TOKEN") or getpass("Neptune API token: ")
run = neptune.init(project='jodyr/urp',
                #    run='PD-25',  # presumably: uncomment to continue an existing run
                   api_token=api_token)

# Store the complete hyper-parameter namespace alongside the run.
run["parameters"] = vars(args)


https://app.neptune.ai/jodyr/urp/e/PD-28
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


## train

In [6]:
def train(train_loader, model, criterion, optimizer):
    """
    One epoch's training.

    :param train_loader: DataLoader for training data; every batch is a tuple
        (images, bboxes, category_ids, is_crowds)
    :param model: model; called on a batch of images, returns
        (predicted_locs, predicted_scores)
    :param criterion: MultiBox loss, called as
        criterion(predicted_locs, predicted_scores, bboxes, category_ids)
    :param optimizer: optimizer
    :return: the epoch's training loss as accumulated by AverageMeter
        (each batch loss is recorded together with its batch size)
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.train()  # training mode enables dropout

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss

    # Report about four times per epoch. Clamp to 1 so that a loader with fewer than
    # four batches does not make the modulo below divide by zero.
    print_freq = max(1, len(train_loader) // 4)

    start = time.time()

    # Batches (is_crowds is part of every batch but is not needed for training)
    for i, (images, bboxes, category_ids, _) in enumerate(train_loader):
        data_time.update(time.time() - start)

        images = images.to(device)
        bboxes = [b.to(device) for b in bboxes]
        category_ids = [c.to(device) for c in category_ids]

        # Forward prop.
        predicted_locs, predicted_scores = model(images)

        # Loss
        loss = criterion(predicted_locs, predicted_scores, bboxes, category_ids)  # scalar

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Update model
        optimizer.step()

        losses.update(loss.item(), images.size(0))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print(f'[{i}/{len(train_loader)}]\t'
                  f'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  f'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  f'Loss {losses.val:.4f} ({losses.avg:.4f})\t')

    # No explicit `del` of the per-batch tensors: they are released when the function
    # returns, and deleting them here would fail if the loader produced no batches.
    return losses.avg

## validation

In [7]:
def validation(val_loader, model, criterion):
    """
    Compute the loss over a validation set without updating the model.

    :param val_loader: DataLoader for validation data; every batch is a tuple
        (images, bboxes, category_ids, is_crowds)
    :param model: model; called on a batch of images, returns
        (predicted_locs, predicted_scores). It is switched to eval mode and is
        expected to already live on the compute device.
    :param criterion: MultiBox loss, called as
        criterion(predicted_locs, predicted_scores, bboxes, category_ids)
    :return: the validation loss as accumulated by AverageMeter
        (each batch loss is recorded together with its batch size)
    """
    # Resolve the device locally, the same way train() does, instead of depending on
    # a variable that happens to exist in the notebook's global namespace.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()

    losses = AverageMeter()
    with torch.no_grad():  # no gradients are needed for evaluation
        for images, bboxes, category_ids, _ in val_loader:
            images = images.to(device)
            bboxes = [b.to(device) for b in bboxes]
            category_ids = [c.to(device) for c in category_ids]

            predicted_locs, predicted_scores = model(images)
            loss = criterion(predicted_locs, predicted_scores, bboxes, category_ids).item()

            losses.update(loss, images.size(0))

    return losses.avg

## checkpoint

In [8]:
# Locate this experiment's checkpoint file (named after the baseline id). When no
# such file exists, `checkpoint` becomes None, which the next cell treats as
# "start from scratch".
checkpoint = os.path.join('/content/drive/MyDrive/2021.summer_URP/PD/checkpoint',
                          f'{args.baselineID}.pth.tar')
if not os.path.isfile(checkpoint):
    checkpoint = None
print(f"checkpoint: {checkpoint}")

checkpoint: None


In [9]:
if checkpoint is None:
    # Fresh run: build the detector and its optimizer from the configured hyper-parameters.
    start_epoch = 1
    lr = args.lr
    model = object_detection.SSD300(n_classes=args.n_classes,
                                    base=args.base_model,
                                    one_ch_option=args.ch_option)
    optimizer_cls = getattr(torch.optim, args.optim)  # optimizer class selected by name
    if args.twice_b_lr:
        # Keep bias parameters in their own group so they train at twice the base
        # learning rate; all remaining trainable parameters use the base rate.
        biases = list()
        not_biases = list()
        for param_name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            if param_name.endswith('.bias'):
                biases.append(param)
            else:
                not_biases.append(param)
        optimizer = optimizer_cls(params=[{'params': biases, 'lr': 2 * lr},
                                          {'params': not_biases}],
                                  lr=lr,
                                  weight_decay=args.weight_decay)
    else:
        optimizer = optimizer_cls(params=model.parameters(),
                                  lr=lr,
                                  weight_decay=args.weight_decay)

else:
    # Resume: the checkpoint holds the pickled model and optimizer objects.
    # NOTE(security): torch.load unpickles arbitrary Python objects — only load
    # checkpoints you created yourself.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # rejects fully pickled models — confirm against the installed version.
    # map_location lets a checkpoint written on a GPU be resumed on a CPU-only runtime.
    checkpoint = torch.load(checkpoint, map_location=device)
    start_epoch = checkpoint['epoch'] + 1
    # Report the epoch that was actually saved, not the one training continues with.
    print('\nLoaded checkpoint from epoch %d.\n' % checkpoint['epoch'])
    model = checkpoint['model']
    optimizer = checkpoint['optimizer']


model = model.to(device)
criterion = object_detection.MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))



Loaded base model.





## dataset init

In [10]:
full_dataset = dataset.KaistPDDataset()

# Hold out `args.val_split` of the samples for validation.
total_train_size = len(full_dataset)
val_size = int(total_train_size * args.val_split)
train_val_split = [total_train_size - val_size, val_size]

# Draw the split from a dedicated, seeded generator so that it is the same on every
# run. With the global RNG the partition depends on how many random numbers were
# consumed beforehand, which typically differs between a fresh run (the model is
# initialised first) and a run resumed from a checkpoint — samples the resumed model
# was trained on could then end up in the validation set.
# NOTE(review): this changes the partition relative to earlier runs of this notebook.
split_generator = torch.Generator().manual_seed(seed)
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset,
                                                           train_val_split,
                                                           generator=split_generator)

In [11]:
# Never ask for more loader processes than the runtime has CPUs: oversubscribing
# triggers a DataLoader warning and can slow data loading down.
workers = min(4, os.cpu_count() or 1)
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=args.train_batch_size,
                                           shuffle=True,
                                           collate_fn=dataset.collate_fn,
                                           num_workers=workers,
                                           pin_memory=True)
# Validation only averages the loss over all samples, so the order is irrelevant
# and shuffling is unnecessary.
val_loader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=args.test_batch_size,
                                         shuffle=False,
                                         collate_fn=dataset.collate_fn,
                                         num_workers=workers,
                                         pin_memory=True)

  cpuset_checked))


In [12]:
checkpoint_dir = '/content/drive/MyDrive/2021.summer_URP/PD/checkpoint'
checkpoint_path = os.path.join(checkpoint_dir,
                               str(args.baselineID)+'.pth.tar')

epochs = args.epochs
decay_lr_at = args.decay_lr_at
save_freq = 5  # write a checkpoint every `save_freq` epochs


# Epochs (start_epoch is 1 for a fresh run, or the epoch after the loaded checkpoint)
for epoch in range(start_epoch, epochs+1):
    print(f"# ====== Epoch {epoch} ====== # {datetime.now()}")
    # Decay learning rate at particular epochs
    if epoch in decay_lr_at:
        adjust_learning_rate(optimizer, args.decay_lr_to)

    # One epoch's training, followed by a pass over the held-out split
    train_loss = train(train_loader=train_loader,
                        model=model,
                        criterion=criterion,
                        optimizer=optimizer)
    val_loss = validation(val_loader, model, criterion)
    print(f"*** checkpoint train loss: {train_loss}, val loss: {val_loss}***")
    run['train/loss'].log(train_loss)
    run['val/loss'].log(val_loss)
    # Save periodically, and always after the last epoch so that the final weights
    # are kept even when `epochs` is not a multiple of `save_freq`.
    if epoch % save_freq == 0 or epoch == epochs:
        save_checkpoint(epoch, model, optimizer, checkpoint_path)



  cpuset_checked))
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[0/246]	Batch Time 93.665 (93.665)	Data Time 90.311 (90.311)	Loss 17.3149 (17.3149)	
[61/246]	Batch Time 13.593 (9.749)	Data Time 13.277 (9.375)	Loss 3.5046 (4.4473)	
[122/246]	Batch Time 0.321 (8.983)	Data Time 0.000 (8.634)	Loss 3.2278 (3.9327)	
[183/246]	Batch Time 0.327 (8.746)	Data Time 0.000 (8.406)	Loss 3.2203 (3.7295)	
[244/246]	Batch Time 24.672 (8.729)	Data Time 24.351 (8.392)	Loss 3.0410 (3.6064)	
*** checkpoint train loss: 3.6052621146172106, val loss: 3.1088067239912105***
[0/246]	Batch Time 4.666 (4.666)	Data Time 4.211 (4.211)	Loss 3.1516 (3.1516)	
[61/246]	Batch Time 0.438 (0.949)	Data Time 0.000 (0.543)	Loss 3.1121 (3.1342)	
[122/246]	Batch Time 0.408 (0.926)	Data Time 0.004 (0.518)	Loss 3.0410 (3.0642)	
[183/246]	Batch Time 0.400 (0.916)	Data Time 0.011 (0.509)	Loss 2.8376 (3.0134)	
[244/246]	Batch Time 0.343 (0.908)	Data Time 0.024 (0.501)	Loss 2.8585 (2.9748)	
*** checkpoint train loss: 2.9744872406601104, val loss: 2.8233324064012244***
[0/246]	Batch Time 4.169 (4.

Experiencing connection interruptions. Will try to reestablish communication with Neptune.
Communication with Neptune restored!


*** checkpoint train loss: 1.5254553764151968, val loss: 1.6640182049413728***
[0/246]	Batch Time 4.362 (4.362)	Data Time 3.933 (3.933)	Loss 1.5733 (1.5733)	
[61/246]	Batch Time 1.018 (1.001)	Data Time 0.622 (0.588)	Loss 1.5051 (1.5306)	
[122/246]	Batch Time 1.239 (0.956)	Data Time 0.833 (0.547)	Loss 1.4238 (1.5257)	
[183/246]	Batch Time 0.393 (0.941)	Data Time 0.001 (0.532)	Loss 1.5453 (1.5473)	
[244/246]	Batch Time 0.316 (0.931)	Data Time 0.000 (0.523)	Loss 1.3579 (1.5383)	
*** checkpoint train loss: 1.538130932459929, val loss: 1.6809024955535399***
[0/246]	Batch Time 4.439 (4.439)	Data Time 3.987 (3.987)	Loss 1.4359 (1.4359)	
[61/246]	Batch Time 0.422 (0.990)	Data Time 0.000 (0.581)	Loss 1.3140 (1.5312)	
[122/246]	Batch Time 0.396 (0.944)	Data Time 0.001 (0.539)	Loss 1.1043 (1.5325)	
[183/246]	Batch Time 0.441 (0.946)	Data Time 0.001 (0.538)	Loss 1.4361 (1.5271)	
[244/246]	Batch Time 0.318 (0.926)	Data Time 0.000 (0.521)	Loss 1.4430 (1.5198)	
*** checkpoint train loss: 1.5198112793