In [None]:
%%capture
!pip install neptune-client psutil
!git clone https://github.com/Cho-D-YoungRae/URP_PD.git
%cd URP_PD
!pwd

In [None]:
import dataset
import object_detection
from utils import *

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as FT

import os
import json
from PIL import Image
import numpy as np
import argparse
from tqdm.auto import tqdm
import time
from datetime import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using <{device}> device")

Using <cuda> device


In [None]:
# ====== constants ======#
label_map = {'background': 0, 'person': 1}
rev_label_map = {v: k for k, v in label_map.items()} 

In [None]:
from torch.backends import cudnn
cudnn.benchmark = True

# ====== Random Seed Initialization ====== #
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.baselineID = 3

# ====== Dataset ====== #
args.img_type = 'lwir'
args.val_split = 0.1

# ====== Model ====== #
args.base_model = 'VGGBase'
args.n_classes = len(label_map)
args.one_ch_option = "mean"


# ====== Optimizer & Training ====== #
args.optim = 'Adam' 
args.lr = 5e-4
args.twice_b_lr = True
args.weight_decay = 5e-4

args.epochs = 150
args.train_batch_size = 32
args.test_batch_size = 64

args.decay_lr_at = [int(args.epochs/6)*4,
                    int(args.epochs/6)*5]
args.decay_lr_to = 0.1

In [None]:

import neptune.new as neptune

api_token = 
run = neptune.init(project='jodyr/urp',
                   api_token=api_token)

run["parameters"] = vars(args)


https://app.neptune.ai/jodyr/urp/e/PD-10
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


-----
## code 작성

In [None]:
def train(train_loader, model, criterion, optimizer):
    """
    One epoch's training.

    :param train_loader: DataLoader for training data
    :param model: model
    :param criterion: MultiBox loss
    :param optimizer: optimizer
    :param epoch: epoch number
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.train()  # training mode enables dropout

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss

    print_freq = 200

    start = time.time()

    # Batches
    for i, (images, bboxes, category_ids, is_crowds) in enumerate(train_loader):
        data_time.update(time.time() - start)

        images = images.to(device)  # (batch_size (N), 3, 300, 300)
        bboxes = [b.to(device) for b in bboxes]
        category_ids = [c.to(device) for c in category_ids]

        # Forward prop.
        predicted_locs, predicted_scores = model(images)  # (N, 8732, 4), (N, 8732, n_classes)

        # Loss
        loss = criterion(predicted_locs, predicted_scores, bboxes, category_ids)  # scalar

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Update model
        optimizer.step()

        losses.update(loss.item(), images.size(0))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print(f'[{i}/{len(train_loader)}]\t'
                  f'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  f'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  f'Loss {losses.val:.4f} ({losses.avg:.4f})\t')
    del predicted_locs, predicted_scores, images, bboxes, category_ids

    train_loss = losses.avg
    return train_loss        

In [None]:
def validation(val_loader, model, criterion):
    model.eval()

    num_batches = len(val_loader)
    losses = AverageMeter()
    with torch.no_grad():
        for i, (images, bboxes, category_ids, _) in enumerate(val_loader):
            images = images.to(device)
            bboxes = [b.to(device) for b in bboxes]
            category_ids = [l.to(device) for l in category_ids]

            predicted_locs, predicted_scores = model(images)
            loss = criterion(predicted_locs, predicted_scores, bboxes, category_ids).item()

            losses.update(loss, images.size(0))

    val_loss = losses.avg
    return val_loss

In [None]:
checkpoint = os.path.join('/content/drive/MyDrive/2021.summer_URP/PD/checkpoint',
                          str(args.baselineID)+'.pth.tar')
checkpoint = checkpoint if os.path.isfile(checkpoint) else None
print(f"checkpoint: {checkpoint}")

checkpoint: None


In [None]:
if checkpoint is None:
    start_epoch = 0
    lr = args.lr
    model = object_detection.SSD300(n_classes=args.n_classes,
                                    base=args.base_model,
                                    one_ch_option=args.one_ch_option)
    if args.twice_b_lr:
        biases = list()
        not_biases = list()
        for param_name, param in model.named_parameters():
            if param.requires_grad:
                if param_name.endswith('.bias'):
                    biases.append(param)
                else:
                    not_biases.append(param)
        optimizer = getattr(torch.optim, args.optim)(params=[{'params': biases, 'lr': 2 * lr}, 
                                                            {'params': not_biases}],
                                                     lr=lr,
                                                     weight_decay=args.weight_decay)
    else:
        optimizer = getattr(torch.optim, args.optim)(params=model.parameters(),
                                                     lr=lr,
                                                     weight_decay=args.weight_decay)

else:
    checkpoint = torch.load(checkpoint)
    start_epoch = checkpoint['epoch'] + 1
    print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
    model = checkpoint['model']
    optimizer = checkpoint['optimizer']


model = model.to(device)
criterion = object_detection.MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))



Loaded base model.





In [None]:
train_dataset = dataset.KaistPDDataset()

total_train_size = len(train_dataset)
val_size = int(total_train_size * args.val_split)
train_val_split = [total_train_size - val_size, val_size]
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, 
                                                           train_val_split)

In [None]:
workers = 4
train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=args.train_batch_size, 
                                           shuffle=True,
                                           collate_fn=dataset.collate_fn,
                                           num_workers=workers,
                                           pin_memory=True)
val_loader = torch.utils.data.DataLoader(val_dataset, 
                                         batch_size=args.test_batch_size, 
                                         shuffle=True,
                                         collate_fn=dataset.collate_fn,
                                         num_workers=workers,
                                         pin_memory=True)

  cpuset_checked))


In [12]:
checkpoint_dir = '/content/drive/MyDrive/2021.summer_URP/PD/checkpoint'
checkpoint_path = os.path.join(checkpoint_dir,
                               str(args.baselineID)+'.pth.tar')

epochs = args.epochs
decay_lr_at = args.decay_lr_at
val_save_freq = 5
train_losses = []
val_losses = []
ckpt_loss = float('inf')
ckpt_epoch = -1


# Epochs
for epoch in range(start_epoch, epochs):
    print(f"# ====== Epoch {epoch} ====== # {datetime.now()}")
    # Decay learning rate at particular epochs
    if epoch in decay_lr_at:
        adjust_learning_rate(optimizer, args.decay_lr_to)

    # One epoch's training
    train_loss = train(train_loader=train_loader,
                        model=model,
                        criterion=criterion,
                        optimizer=optimizer)

    # validation & save
    if epoch % val_save_freq == 0:
        val_loss = validation(val_loader, model, criterion)
        print(f"train loss: {train_loss}, val loss: {val_loss}")
        run['train/loss'].log(train_loss)
        run['val/loss'].log(val_loss)
        if val_losses and val_losses[-1] >= val_loss:
        # 아래 코드로 바꾸기
        # if val_losses and ckpt_loss >= val_loss: 
            # ckpt_loss = val_loss
            # ckpt_epoch = epoch
            print(f"save checkpoint at [Epoch {epoch}]")
            save_checkpoint(epoch, model, optimizer, checkpoint_path)
        train_losses.append(train_loss)
        val_losses.append(val_loss)



  cpuset_checked))
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[0/246]	Batch Time 97.577 (97.577)	Data Time 91.718 (91.718)	Loss 17.4892 (17.4892)	
[200/246]	Batch Time 19.145 (5.723)	Data Time 18.509 (5.142)	Loss 2.9188 (3.5546)	
train loss: 3.4317653060520787, val loss: 2.7592808055986926
[0/246]	Batch Time 2.833 (2.833)	Data Time 2.160 (2.160)	Loss 2.6549 (2.6549)	
[200/246]	Batch Time 0.610 (0.625)	Data Time 0.002 (0.014)	Loss 2.3388 (2.5012)	
[0/246]	Batch Time 2.735 (2.735)	Data Time 2.022 (2.022)	Loss 2.2403 (2.2403)	
[200/246]	Batch Time 0.648 (0.629)	Data Time 0.018 (0.013)	Loss 1.9286 (2.0271)	
[0/246]	Batch Time 2.794 (2.794)	Data Time 2.021 (2.021)	Loss 1.3918 (1.3918)	
[200/246]	Batch Time 0.657 (0.627)	Data Time 0.002 (0.013)	Loss 1.8319 (1.7199)	
[0/246]	Batch Time 2.708 (2.708)	Data Time 2.014 (2.014)	Loss 1.5146 (1.5146)	
[200/246]	Batch Time 0.585 (0.630)	Data Time 0.002 (0.013)	Loss 1.5182 (1.4805)	
[0/246]	Batch Time 2.662 (2.662)	Data Time 1.866 (1.866)	Loss 1.4758 (1.4758)	
[200/246]	Batch Time 0.596 (0.629)	Data Time 0.001 (