Code used to train a representative EfficientDet model using seven videos as validation and the other 25 for training.

Variations on the basic model usually involve changing the preprocessing steps.

In [None]:
import numpy as np
import pandas as pd

import os
import cv2
import matplotlib.pyplot as plt
from glob import glob

import time
import random
import warnings
import torch
from torch.utils.data import Dataset
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from datetime import datetime

warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

# Locations on the mounted Google Drive and on the local Colab disk.
DRIVE = "/content/drive/MyDrive/CNN/UAV"
IMAGE_ZIP = "images.zip"
IMAGES = f"{DRIVE}/{IMAGE_ZIP}"
ANNOTATION_DIR = f"{DRIVE}/annotations"
BASE_IMAGE_DIR = "/content/images"

# Sorted annotation CSV paths and sorted '.pth' weight file names found on Drive.
ANNOTATION_FILES = sorted(
    f"{ANNOTATION_DIR}/{name}" for name in os.listdir(ANNOTATION_DIR) if ".csv" in name
)
EDETS = sorted(name for name in os.listdir(DRIVE) if '.pth' in name)

# Export the paths so the %%bash cell can read them as environment variables.
os.environ.update({
    "DRIVE": DRIVE,
    "IMAGE_ZIP": IMAGE_ZIP,
    "IMAGES": IMAGES,
    "ANNOTATION_DIR": ANNOTATION_DIR,
    "BASE_IMAGE_DIR": BASE_IMAGE_DIR,
})


Load Image Data and install packages.

In [None]:
%%bash

# Copy the image archive from Drive to the local disk, unpack it, then delete the local copy.
# NOTE(review): unzip/rm use a relative path, so this presumably relies on the
# working directory being /content -- confirm.
cp $IMAGES /content
unzip -q $IMAGE_ZIP
rm $IMAGE_ZIP

# Install third-party packages (albumentations and effdet are imported in a later cell).
pip install -U -q albumentations
pip install -q omegaconf
pip install -q timm
pip install -q effdet

In [None]:
# Sorted paths of every entry under BASE_IMAGE_DIR (each is later listed as a
# directory of frames); the macOS '.DS_Store' entry is skipped.
IMAGE_DIRS = sorted(
    f"{BASE_IMAGE_DIR}/{entry}"
    for entry in os.listdir(BASE_IMAGE_DIR)
    if entry != '.DS_Store'
)

In [None]:
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from effdet import get_efficientdet_config, EfficientDet, DetBenchTrain, DetBenchPredict
from effdet.efficientdet import HeadNet

TRAIN_SIZE = 512  # side length of the square training input (used by TrainDataset and get_net)
VAL_SIZE = 512  # not referenced in the visible cells
NUM_CLASSES = 1  # number of classes the detection head is reset to
BATCH_SIZE = 8  # batch size for both training and validation loaders
NUM_WORKERS = 4  # DataLoader worker processes
D_SIZE = 0  # EfficientDet variant: selects the 'tf_efficientdet_d{D_SIZE}_ap' config

EPOCH = 0  # 0 starts from the pretrained weights; otherwise get_net resumes from that epoch's best checkpoint
MAX_EPOCH = 100  # passed to the config as n_epochs (upper bound of the epoch loop)

# The learning rate is COEFF_LR * 10**(-LOG_LR), i.e. 2e-4 with these values.
LOG_LR = 4
COEFF_LR = 2

Set global configurations. The most important is project_folder, followed by the learning rate parameters.

Model will print output every 250 steps.

In [None]:
class TrainGlobalConfig:
  """Container for the run-wide training settings.

  Attributes set here:
    folder: output directory, ``DRIVE/<project_folder>``.
    batch_size, num_workers, n_epochs: copied from the arguments.
    lr: ``coeff_lr * 10**(-log_lr)``.
    verbose, verbose_step: progress printing switch and its step interval.
    SchedulerClass, scheduler_params: LR scheduler class and its keyword
      arguments (a constant multiplicative factor of ``0.94**0.25`` per step).

  ``d_size`` is accepted but not stored.
  """

  def __init__(self, d_size,
               num_workers,
               project_folder,
               batch_size, n_epochs,
               log_lr, coeff_lr):
    # Where checkpoints and logs are written.
    self.folder = f"{DRIVE}/{project_folder}"

    # Loader and schedule sizes.
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.n_epochs = n_epochs
    self.lr = coeff_lr * 10**(-log_lr)

    # Progress reporting.
    self.verbose_step = 250
    self.verbose = True

    # Learning-rate schedule: multiply the LR by a fixed factor on every scheduler step.
    self.SchedulerClass = torch.optim.lr_scheduler.MultiplicativeLR
    self.scheduler_params = {
        "lr_lambda": lambda epoch: 0.94**0.25,
        "verbose": True,
    }
# Single configuration object shared by the data loaders and the Fitter.
# The project folder name encodes the model size and learning-rate settings.
global_config = TrainGlobalConfig(
    d_size=D_SIZE,
    num_workers=NUM_WORKERS,
    project_folder=f"effdet{D_SIZE}-{LOG_LR}-{COEFF_LR}_wd4-5_512x640_ap",
    batch_size=BATCH_SIZE,
    n_epochs=MAX_EPOCH,
    log_lr=LOG_LR,
    coeff_lr=COEFF_LR,
)

# Load and Transform Data

Splits the data into training and validation sets so that the validation set has 20% of the videos and roughly 20% (between 18% and 22%) of the images.

In [None]:
def get_train_and_val(rand=6189):
  """Builds per-frame and per-box tables and splits them by video.

  Annotation CSVs and image directories are paired positionally
  (``zip(ANNOTATION_FILES, IMAGE_DIRS)``). Videos are split 80/20 with
  ``train_test_split``; the seed is incremented from ``rand`` until the
  validation videos hold between 18% and 22% of all image files.

  Args:
    rand: first random seed to try for the video-level split.

  Returns:
    (train_df, val_df, train_annotations, val_annotations): the first two have
    one row per image file, the last two one row per annotated box joined
    with its image file information.
  """
  col_names = ["frame_id", "object_id", "x", "y", "width", "height", "object_class",
               "species", "occluded", "noisy_frame"]
  annotation_tables, frame_tables = [], []
  for annot, image_dir in zip(ANNOTATION_FILES, IMAGE_DIRS):
    if ".csv" not in annot:
      continue
    # Box annotations of this video, tagged with the CSV file name.
    video_boxes = pd.read_csv(annot, header=None, names=col_names)
    video_boxes['csv'] = annot.split("/")[-1]
    annotation_tables.append(video_boxes)

    # One row per frame image; the frame id is the number after the last "_".
    images = [f"{image_dir}/{d}" for d in os.listdir(image_dir)]
    height, width = cv2.imread(images[0]).shape[:2]
    frames = [int(img.split("_")[-1].split(".")[0]) for img in images]
    video_frames = pd.DataFrame({"frame_id": frames, "file": images})
    video_frames["csv"] = f"{image_dir.split('/')[-1]}.csv"
    video_frames["img_height"] = height
    video_frames["img_width"] = width
    frame_tables.append(video_frames)

  frame_df = pd.concat(frame_tables).reset_index(drop=True)
  annotations = pd.concat(annotation_tables).merge(frame_df).reset_index(drop=True)

  # Search for a seed whose video split also yields ~20% of the images.
  videos = frame_df[['csv']].drop_duplicates()
  n_files = frame_df['file'].nunique()
  while True:
    train_files, val_files = train_test_split(videos,
                                              test_size=0.2,
                                              random_state=rand)
    val_df = frame_df[frame_df['csv'].isin(val_files['csv'])].reset_index(drop=True)
    train_df = frame_df[frame_df['csv'].isin(train_files['csv'])].reset_index(drop=True)
    val_frac = val_df['file'].nunique() / n_files
    if 0.18 < val_frac < 0.22:
      break
    rand += 1

  in_train = annotations['csv'].isin(train_files['csv'])
  in_val = annotations['csv'].isin(val_files['csv'])
  train_annotations = annotations[in_train].reset_index(drop=True)
  val_annotations = annotations[in_val].reset_index(drop=True)
  return train_df, val_df, train_annotations, val_annotations
# Build the split once with the default seed; the dataset/loader cell uses these four frames.
train_df, val_df, train_annotations, val_annotations = get_train_and_val()

**Transformations**

Training transforms consist of crop and resize followed by flips.

Validation transform consists of rescaling and, if necessary, padding.

In [None]:
# Bounding-box handling shared by every transform pipeline: boxes are given in
# 'pascal_voc' format, class ids travel in the 'labels' field, and no box is
# filtered by area or visibility thresholds (both are 0).
BBOX = A.BboxParams(format='pascal_voc',
                    label_fields=['labels'],
                    min_area=0,
                    min_visibility=0)

def get_train_transforms(img_size):
  """Builds the augmentation pipeline applied to training samples.

  Args:
    img_size: (height, width) of the crop produced by the first step.

  Returns:
    An ``A.Compose`` that takes a random resized crop, applies random
    horizontal/vertical flips, a random 90-degree rotation and a random
    transpose, and finally converts the image to a tensor.
  """
  out_height, out_width = img_size[0], img_size[1]
  random_crop = A.RandomResizedCrop(height=out_height, width=out_width,
                                    scale=(0.1, 1.0),
                                    ratio=(3/4, 4/3),
                                    p=1.0)
  steps = [
      random_crop,
      A.HorizontalFlip(p=0.5),
      A.VerticalFlip(p=0.5),
      A.RandomRotate90(p=1.0),
      A.Transpose(p=0.5),
      ToTensorV2(p=1.0),
  ]
  return A.Compose(steps, bbox_params=BBOX, p=1.0)

def get_val_transform(img_size):
  """Returns a function to perform the standard sequence of preprocessing steps
     for validation data.

  The image is rescaled so its longest side is 640 pixels and then padded
  (constant border, ``border_mode=0``) to at least 512 x 640 (height x width).

  Args:
    img_size: accepted so the factory has the same signature as the other
      transform factories; it is not used because the target geometry is fixed.

  Returns:
    An ``A.Compose`` producing a tensor image plus transformed boxes/labels.
  """
  steps = [
      A.LongestMaxSize(max_size=640, p=1.0),
      A.PadIfNeeded(min_height=512,
                    min_width=640,
                    border_mode=0,
                    p=1.0),
      ToTensorV2(p=1.0),
  ]
  return A.Compose(steps, bbox_params=BBOX, p=1.0)
  
def get_default_transform(img_size):
  """Builds the fallback pipeline used when the training transform fails.

  Args:
    img_size: (height, width) the image is resized to.

  Returns:
    An ``A.Compose`` that resizes the image and converts it to a tensor.
  """
  target_height, target_width = img_size[0], img_size[1]
  steps = [
      A.Resize(height=target_height, width=target_width, p=1.0),
      ToTensorV2(p=1.0),
  ]
  return A.Compose(steps, bbox_params=BBOX, p=1.0)

**Generic UAV Dataset**

In [None]:
class UAVDataset(Dataset):
  """Base dataset holding per-image metadata and per-box annotations.

  Subclasses implement ``__getitem__``; this class provides image loading,
  box conversion and the length.
  """

  def __init__(self, meta_data,
               boxes,
               transform=None,
               image_size=None,
               train=False):
    """
    Args:
      meta_data: DataFrame with one row per image; the methods below read the
        "file", "img_width" and "img_height" columns.
      boxes: DataFrame of annotations with "x", "y", "width", "height" columns.
      transform: optional factory called with ``image_size`` to build the
        transform pipeline.
      image_size: an integer (expanded to a square) or a (height, width) pair.
      train: when True, boxes are reordered to yxyx in ``_box_to_tensor``.
    """
    super().__init__()

    self.meta_data = meta_data
    self.boxes = boxes
    # Accept numpy integers as well as Python ints (e.g. values read from a DataFrame).
    if isinstance(image_size, (int, np.integer)):
      self.image_size = (image_size, image_size)
    else:
      self.image_size = image_size
    self.transform = transform(self.image_size) if transform else None
    self.train = train

  def _box_to_tensor(self, sample, target):
    """Convert the bounding boxes in ``sample`` to a tensor stored in ``target["bboxes"]``.

    An empty box list becomes a tensor of shape (0, 4). Returns ``target``.
    """
    if len(sample["bboxes"]) > 0:
      target["bboxes"] = torch.tensor(sample["bboxes"])
    else:
      target["bboxes"] = torch.zeros((0,4))
    if self.train:
      # Convert bounding boxes from xyxy to yxyx format (swap columns pairwise).
      target["bboxes"][:,[0,1,2,3]] = target["bboxes"][:,[1,0,3,2]]
    return target

  def __len__(self) -> int:
    """Returns the number of images."""
    return self.meta_data.shape[0]

  def load_image_and_boxes(self, image_meta, image_boxes):
    """Loads image corresponding to image_meta row.
       Converts bounding boxes to x_min, y_min, x_max, y_max format.

    The image is scaled to float32 in [0, 1]. x_max / y_max are clipped to the
    image width / height recorded in ``image_meta``.

    Raises:
      FileNotFoundError: if the image file cannot be read.
    """
    image = cv2.imread(image_meta["file"])
    if image is None:
      # cv2.imread signals failure by returning None instead of raising.
      raise FileNotFoundError(f"Could not read image: {image_meta['file']}")
    # NOTE(review): cv2.imread returns BGR channel order and no conversion is done here.
    image = image.astype(np.float32) / 255.0
    # Work on an explicit copy so the in-place edits never touch the DataFrame.
    bboxes = image_boxes[["x", "y", "width", "height"]].to_numpy(copy=True)
    bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
    bboxes[bboxes[:, 2] > image_meta["img_width"], 2] = image_meta["img_width"]
    bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
    bboxes[bboxes[:, 3] > image_meta["img_height"], 3] = image_meta["img_height"]
    return image, bboxes

**Training Data**

In [None]:
class TrainDataset(UAVDataset):
  """Training dataset: random augmentation with retries and a fallback.

  The random transform is retried up to ``max_iter`` times until at least one
  box survives; otherwise the default transform is applied.
  """

  def __init__(self, meta_data,
               boxes,
               image_size=None,
               transform=None,
               default_transform=None,
               max_iter=30):
    super().__init__(meta_data, boxes, transform, image_size, train=True)
    if default_transform:
      self.default_transform = default_transform(self.image_size)
    else:
      self.default_transform = None
    self.max_iter = max_iter

  def __getitem__(self, index: int):
    """Returns the (image, target) pair for the row labelled ``index``."""
    row = self.meta_data.loc[index]
    row_boxes = self.boxes[self.boxes["file"] == row["file"]]
    image, bboxes = self.load_image_and_boxes(row, row_boxes)
    target = {"bboxes": torch.tensor(bboxes),
              "labels": torch.ones((bboxes.shape[0]), dtype=torch.int64)}

    if self.transform:
      if target["bboxes"].shape[0] == 0:
        # Image without annotations: a single augmentation pass is enough.
        sample = self.transform(image=image,
                                bboxes=target["bboxes"],
                                labels=target["labels"])
        return sample["image"], self._box_to_tensor(sample, target)

      # Retry until the random crop keeps at least one box.
      for _ in range(self.max_iter):
        sample = self.transform(image=image,
                                bboxes=target["bboxes"],
                                labels=target["labels"])
        if len(sample["bboxes"]) == 0:
          continue
        augmented = sample["image"]
        target = self._box_to_tensor(sample, target)
        target["labels"] = torch.stack(sample["labels"])
        return augmented, target

    # Fallback: no transform, or every attempt dropped all boxes.
    if self.default_transform and image.shape[2] == 3:
      sample = self.default_transform(image=image,
                                      bboxes=target["bboxes"],
                                      labels=target["labels"])
      image, target = sample["image"], self._box_to_tensor(sample, target)
    return image, target

**Validation Data**

In [None]:
class ValDataset(UAVDataset):
  """Validation dataset: applies the (deterministic) transform once per image."""

  def __init__(self, meta_data,
               boxes,
               image_size,
               transform=None,
               train=False):
    super().__init__(meta_data, boxes, transform, image_size, train)

  def __getitem__(self, index: int):
    """Returns the (image, target) pair for the row labelled ``index``."""
    row = self.meta_data.loc[index]
    row_boxes = self.boxes[self.boxes["file"] == row["file"]]
    image, bboxes = self.load_image_and_boxes(row, row_boxes)
    target = {"bboxes": bboxes,
              "labels": torch.ones(bboxes.shape[0], dtype=torch.int64)}

    if not self.transform:
      return image, target

    sample = self.transform(image=image,
                            bboxes=target["bboxes"],
                            labels=target["labels"])
    return sample['image'], self._box_to_tensor(sample, target)

Constructs a PyTorch dataloader from the datasets.

In [None]:
def collate_fn(batch):
    """Transposes a list of per-sample tuples into a tuple of per-field tuples."""
    per_field = zip(*batch)
    return tuple(per_field)

# Training data: random augmentation, random sampling, incomplete last batch dropped.
train_dataset = TrainDataset(
    meta_data=train_df,
    boxes=train_annotations,
    transform=get_train_transforms,
    default_transform=get_default_transform,
    image_size=TRAIN_SIZE,
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=global_config.batch_size,
    num_workers=global_config.num_workers,
    collate_fn=collate_fn,
    drop_last=True,
    pin_memory=False,
)


# One sequential DataLoader per validation video, keyed by the video's CSV name.
val_loaders = {}
for csv in val_df["csv"].unique():
  # Re-index from 0 so the dataset's label-based row lookup matches sampler indices.
  val = val_df[val_df["csv"] == csv].reset_index(drop=True)
  img_height, img_width = val["img_height"].iloc[0], val["img_width"].iloc[0]
  annot = val_annotations[val_annotations["csv"] == csv]
  # train=True makes the dataset reorder boxes to yxyx, as for the training data.
  # The image size is passed through, but get_val_transform does not use it.
  val_dataset = ValDataset(meta_data=val,
                           boxes=annot,
                           image_size = (img_height, img_width),
                           transform=get_val_transform,
                           train=True)
  val_loader = torch.utils.data.DataLoader(
      val_dataset, 
      batch_size=global_config.batch_size,
      num_workers=global_config.num_workers,
      sampler=SequentialSampler(val_dataset),
      shuffle=False,
      pin_memory=False,
      collate_fn=collate_fn)
  val_loaders[csv] = val_loader

Class to track training and validation metrics.

In [None]:
class AverageMeter:
  """Computes and stores the average and current value"""
  def __init__(self):
    self.reset()

  def reset(self):
    """Clears the current value, running sum, count and average."""
    self.current = 0
    self.avg = 0
    self.sum = 0
    self.count = 0

  def _refresh_avg(self):
    """Recomputes the average; stays 0 while nothing has been counted."""
    self.avg = self.sum / self.count if self.count else 0

  def update(self, val, n=1, avg=False):
    """Records a new value.

    Args:
      val: the value to record; also stored as ``current``.
      n: number of items ``val`` stands for (added to the count).
      avg: if True, ``val`` is a per-item average and contributes ``val * n``
        to the sum; otherwise ``val`` is added to the sum as is.
    """
    self.current = val
    self.sum += (val * n) if avg else val
    self.count += n
    self._refresh_avg()

  def concat(self, other_meter):
    """Merges another meter's current value, sum and count into this one."""
    self.current += other_meter.current
    self.sum += other_meter.sum
    self.count += other_meter.count
    self._refresh_avg()
        
class LossMeter():
  """Groups three meters: total loss, classification loss and box loss."""

  def __init__(self, loss=None, class_loss=None, box_loss=None):
    # Create a fresh meter for every component that was not supplied.
    if loss is None:
      loss = AverageMeter()
    if class_loss is None:
      class_loss = AverageMeter()
    if box_loss is None:
      box_loss = AverageMeter()
    self.loss = loss
    self.class_loss = class_loss
    self.box_loss = box_loss

  def update(self, output, n=1, avg=False):
    """Feeds the 'loss', 'class_loss' and 'box_loss' entries of ``output`` to their meters."""
    for key in ('loss', 'class_loss', 'box_loss'):
      getattr(self, key).update(output[key].detach().item(), n, avg)

  def concat(self, other_meter):
    """Merges each meter of ``other_meter`` into the matching meter here."""
    for key in ('loss', 'class_loss', 'box_loss'):
      getattr(self, key).concat(getattr(other_meter, key))

Trains the model.

In [None]:
class Fitter:
    """Runs the train/validate loop, logging and checkpointing.

    ``model`` is trained; ``val_model`` receives a copy of its weights before
    every validation pass and is evaluated on each validation loader.
    """

    def __init__(self, model, val_model, device, config, start_epoch=0):
      """
      Args:
        model: training wrapper exposing the network as ``.model``.
        val_model: validation wrapper exposing the network as ``.model``.
        device: torch device the batches are moved to.
        config: object providing folder, lr, n_epochs, verbose, verbose_step,
          SchedulerClass and scheduler_params.
        start_epoch: first epoch index of the loop in ``fit``.
      """
      self.config = config
      self.start_epoch = start_epoch

      self.base_dir = config.folder
      os.makedirs(self.base_dir, exist_ok=True)

      self.log_path = f"{self.base_dir}/log.txt"
      # Best mean validation loss seen so far (float), or None before any validation.
      self.best_summary_loss = None

      self.model = model
      self.val_model = val_model
      self.device = device

      self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=config.lr,
                                         weight_decay=4e-5)
      self.scheduler = config.SchedulerClass(self.optimizer, **config.scheduler_params)
      self.log(f"Fitter prepared. Device is {self.device}")

    def _print_line(self, summary_loss, step, total_steps, stage, t):
      """Prints the running losses for the current step and the time since ``t``."""
      print(
          f"{stage} Step {step}/{total_steps}, " + \
          f"summary_loss: {summary_loss.loss.avg:.5f}, " + \
          f"class_loss: {summary_loss.class_loss.avg:.5f}, " + \
          f"box_loss: {summary_loss.box_loss.avg:.5f}, " + \
          f"time: {(time.time() - t):.5f}")

    def _log_line(self, summary_loss, epoch, stage, t):
      """Formats the epoch-level loss summary and the time since ``t``."""
      return f"{stage} Epoch: {epoch}, " + \
             f"summary loss: {summary_loss.loss.avg:.5f}, " +\
             f"class loss: {summary_loss.class_loss.avg:.5f}, " + \
             f"box_loss: {summary_loss.box_loss.avg:.5f}, " + \
             f"time: {(time.time() - t):.5f}"

    def _avg_loss(self, loss_list):
      """Averages the per-loader averages (each loader weighted equally)."""
      loss, class_loss, box_loss = AverageMeter(), AverageMeter(), AverageMeter()
      for l in loss_list:
        loss.update(l.loss.avg)
        class_loss.update(l.class_loss.avg)
        box_loss.update(l.box_loss.avg)

      return LossMeter(loss, class_loss, box_loss)

    def _validate_all(self, validation_loaders):
      """Syncs the validation model with the trained weights and validates every loader.

      Returns:
        (per_loader, total): a dict of loss meters keyed like
        ``validation_loaders`` and the meter averaging them.
      """
      self.val_model.model.load_state_dict(self.model.model.state_dict())
      ## Each validation video is processed separately
      per_loader = {k: self.validation(k, vl) for k, vl in validation_loaders.items()}
      return per_loader, self._avg_loss(per_loader.values())

    def fit(self, train_loader, validation_loaders):
      """Trains from ``start_epoch`` up to ``config.n_epochs``.

      After each epoch the last checkpoint is saved, every validation loader is
      evaluated, and a best checkpoint is written when the mean validation loss
      improves (only the three most recent best checkpoints are kept).

      Args:
        train_loader: loader yielding (images, targets) training batches.
        validation_loaders: dict mapping a name to a validation loader.
      """
      # When resuming without a stored best loss, measure the current model first.
      if self.start_epoch > 0 and self.best_summary_loss is None:
        _, baseline = self._validate_all(validation_loaders)
        self.best_summary_loss = baseline.loss.avg

      for epoch in range(self.start_epoch, self.config.n_epochs):
        if self.config.verbose:
          lr = self.optimizer.param_groups[0]["lr"]
          timestamp = datetime.utcnow().isoformat()
          self.log(f"\n{timestamp}\nLR: {lr}")

        t = time.time()
        summary_loss = self.train_one_epoch(train_loader)

        self.log(self._log_line(summary_loss, epoch, "Train", t))
        self.save(f"{self.base_dir}/last-checkpoint.bin", epoch)

        t = time.time()
        summary_losses, total_summary_loss = self._validate_all(validation_loaders)

        for k, sl in summary_losses.items():
          self.log(self._log_line(sl, epoch, f"Val {k}", t))
        self.log(self._log_line(total_summary_loss, epoch, "Val", t))

        if self.best_summary_loss is None or total_summary_loss.loss.avg < self.best_summary_loss:
          self.best_summary_loss = total_summary_loss.loss.avg
          self.model.eval()
          self.save(f"{self.base_dir}/best-checkpoint-{str(epoch).zfill(3)}epoch.bin", epoch)
          # Zero-padded epoch numbers make the name sort chronological; keep the newest three.
          for path in sorted(glob(f"{self.base_dir}/best-checkpoint-*epoch.bin"))[:-3]:
            os.remove(path)

        self.scheduler.step()

    def train_one_epoch(self, train_loader):
      """Runs one optimisation pass over ``train_loader`` and returns its loss meter."""
      self.model.train()
      summary_loss = LossMeter()
      t = time.time()
      for step, (images, targets) in enumerate(train_loader):
        if self.config.verbose and step % self.config.verbose_step == 0:
          self._print_line(summary_loss, step, len(train_loader), "Train", t)
        images = torch.stack(images)
        images = images.to(self.device).float()
        batch_size = images.shape[0]
        bboxes = [target["bboxes"].to(self.device).float() for target in targets]
        labels = [target["labels"].to(self.device).float() for target in targets]

        self.optimizer.zero_grad()

        output = self.model(images, {"bbox": bboxes, "cls": labels})

        output["loss"].backward()

        summary_loss.update(output, batch_size, avg=True)

        self.optimizer.step()

      return summary_loss

    def validation(self, csv, val_loader):
      """Evaluates ``val_model`` on one loader without gradients.

      Args:
        csv: name of the validation set (not used in the computation).
        val_loader: loader yielding (images, targets) batches.

      Returns:
        The loss meter accumulated over the loader.
      """
      self.val_model.eval()
      summary_loss = LossMeter()
      with torch.no_grad():
        for images, targets in val_loader:
          images = torch.stack(images)
          batch_size = images.shape[0]
          images = images.to(self.device).float()
          bboxes = [target["bboxes"].to(self.device).float() for target in targets]
          labels = [target["labels"].to(self.device).float() for target in targets]

          output = self.val_model(images, {"bbox": bboxes, "cls": labels})
          summary_loss.update(output, batch_size, avg=True)
      return summary_loss

    def save(self, path, epoch):
      """Writes model, optimizer and scheduler state plus the best loss and epoch to ``path``."""
      self.model.eval()
      torch.save({
          "model_state_dict": self.model.model.state_dict(),
          "optimizer_state_dict": self.optimizer.state_dict(),
          "scheduler_state_dict": self.scheduler.state_dict(),
          "best_summary_loss": self.best_summary_loss,
          "epoch": epoch,
          }, path)

    def load(self, path):
      """Restores the state written by ``save`` and resumes at the following epoch."""
      checkpoint = torch.load(path)
      self.model.model.load_state_dict(checkpoint["model_state_dict"])
      self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
      self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
      self.best_summary_loss = checkpoint["best_summary_loss"]
      self.start_epoch = checkpoint["epoch"] + 1

    def log(self, message):
      """Prints ``message`` when verbose and always appends it to the log file."""
      if self.config.verbose:
        print(message)
      with open(self.log_path, "a+") as logger:
        logger.write(f"{message}\n")

Extension of the benchmark training class. Needed to correct an unfixed bug in the repository.

In [None]:
class ExtendDetBenchTrain(DetBenchTrain):
  """Training bench whose forward pass returns the three losses as a dict."""

  def __init__(self, model, config):
    super().__init__(model, config)

  def forward(self, x, target):
    """Computes the detection losses for a batch.

    Args:
      x: batch of input images.
      target: dict with 'bbox' and 'cls' entries passed to the anchor labeler.

    Returns:
      dict with keys 'loss', 'class_loss' and 'box_loss'.
    """
    class_out, box_out = self.model(x)
    # Assign the ground-truth boxes/classes to anchors.
    labelled = self.anchor_labeler.batch_label_anchors(target['bbox'], target['cls'])
    cls_targets, box_targets, num_positives = labelled
    loss, class_loss, box_loss = self.loss_fn(
        class_out, box_out, cls_targets, box_targets, num_positives)
    return {"loss": loss, "class_loss": class_loss, "box_loss": box_loss}

Loads EfficientDet.

A lot of config copying because the original config gets frozen when the model is created.

Creates separate validation models for each validation dataset since they each need their own input dimensions.

In [None]:
def get_net(epoch=EPOCH,
            global_config=global_config,
            num_classes=NUM_CLASSES,
            image_size=TRAIN_SIZE,
            d_size=D_SIZE):
  """Builds the training and validation models and wraps them in a Fitter.

  Args:
    epoch: 0 to start from the pretrained weights file ``EDETS[d_size]``;
      any other value resumes from ``best-checkpoint-<epoch>epoch.bin`` in
      the project folder.
    global_config: training configuration handed to the Fitter.
    num_classes: number of classes the detection heads are reset to.
    image_size: side length of the square training input.
    d_size: EfficientDet variant index (``tf_efficientdet_d<d_size>_ap``).

  Returns:
    A Fitter ready for ``fit``; when resuming, its optimizer, scheduler,
    best loss and start epoch are restored from the checkpoint.
  """
  device = torch.device('cuda:0')

  # The config is copied before use because it is frozen once a model is created from it.
  enet_config = get_efficientdet_config(f'tf_efficientdet_d{d_size}_ap')
  val_config = enet_config.copy()
  enet_config.image_size = [image_size, image_size]
  val_config.image_size = [512, 640]
  copy_config = enet_config.copy()
  copy_config.num_classes = num_classes

  net = EfficientDet(enet_config, pretrained_backbone=False)
  val_net = EfficientDet(val_config, pretrained_backbone=False)

  # NOTE(review): the validation bench is built with copy_config, which carries the
  # training image size rather than val_config's 512x640 -- confirm this is intended.
  val_net.reset_head(num_classes=copy_config.num_classes)
  val_net = ExtendDetBenchTrain(val_net, copy_config)
  val_net.to(device)

  if epoch == 0:
    # Pretrained weights must be loaded before the head is replaced.
    checkpoint = torch.load(f"{DRIVE}/{EDETS[d_size]}")
    net.load_state_dict(checkpoint)

  net.reset_head(num_classes=copy_config.num_classes)
  # Both the fresh and the resumed model use the extended bench, like val_net.
  net = ExtendDetBenchTrain(net, copy_config)
  net.to(device)
  fitter = Fitter(model=net,
                  val_model=val_net,
                  device=device, config=global_config, start_epoch=0)

  if epoch != 0:
    fitter.load(f"{global_config.folder}/best-checkpoint-{str(epoch).zfill(3)}epoch.bin")

  return fitter

# Build the Fitter with the defaults bound when get_net was defined (EPOCH, global_config, ...).
fitter = get_net()

Adjusting learning rate of group 0 to 2.0000e-04.
Fitter prepared. Device is cuda:0


Trains the model.

In [None]:
# Start training; progress is printed and appended to log.txt in the project folder.
fitter.fit(train_loader, val_loaders)


2021-06-01T11:28:53.263745
LR: 0.0002
Train Step 0/4006, summary_loss: 0.00000, class_loss: 0.00000, box_loss: 0.00000, time: 0.93653
Train Step 250/4006, summary_loss: 1.31258, class_loss: 0.97609, box_loss: 0.00673, time: 73.90527
Train Step 500/4006, summary_loss: 1.14124, class_loss: 0.82658, box_loss: 0.00629, time: 146.35868
Train Step 750/4006, summary_loss: 1.08365, class_loss: 0.77856, box_loss: 0.00610, time: 219.30628
Train Step 1000/4006, summary_loss: 1.02438, class_loss: 0.72440, box_loss: 0.00600, time: 292.20523
Train Step 1250/4006, summary_loss: 0.99188, class_loss: 0.69935, box_loss: 0.00585, time: 364.74505
Train Step 1500/4006, summary_loss: 0.94573, class_loss: 0.66085, box_loss: 0.00570, time: 437.21034
Train Step 1750/4006, summary_loss: 0.91497, class_loss: 0.63478, box_loss: 0.00560, time: 509.96182
Train Step 2000/4006, summary_loss: 0.89428, class_loss: 0.61661, box_loss: 0.00555, time: 582.93568
Train Step 2250/4006, summary_loss: 0.86929, class_loss: 0.59