## **Load Packages and Set Environment Variables**

In [None]:
import numpy as np
import pandas as pd

import os
import cv2
import itertools
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import zipfile
import xml.etree.ElementTree as ET
from torch.utils.data.sampler import SequentialSampler
from functools import reduce


# Folder that holds the model checkpoints read below and the prediction pickles
# written below; `IMAGE_ZIP` / `IMAGES` name the data set archive and the folder
# it extracts to.
DRIVE = "/content/drive/MyDrive/Larch"
IMAGE_ZIP = "Data_Set_Larch_Casebearer.zip"
IMAGES = "Data_Set_Larch_Casebearer"

# Mirror the paths into the process environment so the `%%bash` cell can read
# them as shell variables (e.g. $IMAGE_ZIP).
os.environ.update(
    {
        "DRIVE": DRIVE,
        "DRIVE_ZIP": f"{DRIVE}/{IMAGE_ZIP}",
        "IMAGE_ZIP": IMAGE_ZIP,
        "IMAGES": IMAGES,
    }
)

# Square input side length, looked up by the `d_size` index passed to
# get_val_full_transform (index 0 -> 512, 1 -> 640, 2 -> 768, ...).
IMG_SIZE = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
BATCH_SIZE = 8

## **Load Data and Install Petrel**

In [None]:
%%bash
# Download the data set archive into the current working directory. The file
# name comes from the IMAGE_ZIP environment variable set by the configuration cell.
wget https://lilablobssc.blob.core.windows.net/larch-casebearer/$IMAGE_ZIP
# NOTE(review): the next two commands address the archive by the absolute
# /content path, so they assume the download above ran with /content as the
# working directory — confirm for non-Colab environments.
unzip -q /content/$IMAGE_ZIP
rm /content/$IMAGE_ZIP

# NOTE(review): neither package is version-pinned, so a later re-run may
# install different releases than the ones that produced the saved outputs.
pip install -U -q albumentations
pip install -q petrel-det

In [None]:
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from petrel.dataset import TrainDataset, ValDataset, BBOX
from petrel.model import load_edet
from petrel.predict import val_prediction_df, model_eval

# **Process XML metadata to DataFrame**

In [None]:
def xml_to_df(d, directory, filename):
    """
    Convert one annotation XML document into a DataFrame with one row per object.

    Parameters
    ----------
    d : bytes or str
        Raw XML content. Its root is expected to contain one ``size`` element
        (integer children, including ``height`` and ``width``) and zero or
        more ``object`` elements.
    directory : str
        Name of the folder the annotation belongs to, formatted as
        ``"<location>_<date>"`` where ``<date>`` is an integer such as 20190527.
    filename : str
        Path of the XML member inside the archive; the part after the last
        ``/`` and before the first ``.`` becomes ``file_name``.

    Returns
    -------
    pandas.DataFrame
        One row per ``object`` element (index 0..n-1). Every direct child of
        an object becomes a text column, and every child of its ``bndbox``
        becomes an integer column. The columns ``height``, ``width``,
        ``location``, ``date``, ``file_name`` and ``file`` (relative image
        path) are appended. When the document has no objects, the file name is
        printed and a single row carrying only the appended columns is returned.

    Raises
    ------
    ValueError
        If ``directory`` is not ``"<location>_<date>"`` with an integer date,
        or if the document has no ``size`` element with height and width.
    """
    file = filename.split('/')[-1].split('.')[0]
    location, date = directory.split('_')
    date = int(date)

    root = ET.XML(d)
    dims = {}
    records = []
    for child in root:
        if child.tag == 'size':
            dims = {s.tag: int(s.text) for s in child}
        elif child.tag == 'object':
            record = {}
            for c in child:
                if c.tag != 'bndbox':
                    record[c.tag] = c.text
                else:
                    # Box corners are stored as integers.
                    for b in c:
                        record[b.tag] = int(b.text)
            # An object element without children contributes no row.
            if record:
                records.append(record)

    # Fail with a clear message instead of an unbound-variable error.
    if 'height' not in dims or 'width' not in dims:
        raise ValueError(
            f"{filename}: annotation has no <size> element with height and width")

    if records:
        # Build the frame once from plain dicts; objects that lack a tag get
        # NaN in that column.
        df = pd.DataFrame(records)
        df['height'], df['width'] = dims['height'], dims['width']
        df['location'], df['date'], df['file_name'] = location, date, file
    else:
        # Image without annotated objects: report it and keep one placeholder
        # row so the image itself is still represented.
        print(filename)
        df = pd.DataFrame({'height': [dims['height']],
                           'width': [dims['width']],
                           'location': [location],
                           'date': [date],
                           'file_name': [file]})

    # location, date and file are constant for the whole document, so the
    # relative image path is the same for every row.
    df["file"] = f"{location}_{date}/Images/{file}.JPG"

    return df

def read_zip(zf):
    """
    Read every annotation XML in one location folder into a single DataFrame.

    Parameters
    ----------
    zf : str
        Name of a folder inside ``IMAGES`` that contains ``Annotations.zip``.
        It is also passed to ``xml_to_df`` as the ``"<location>_<date>"``
        directory name.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file frames from ``xml_to_df``, re-indexed
        from 0.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the archive holds no matching XML member.
    """
    df_list = []
    # The context manager closes the archive even if parsing a member fails.
    with zipfile.ZipFile(f"{IMAGES}/{zf}/Annotations.zip", "r") as z:
        for filename in z.namelist():
            # Skip members whose path contains "__" (presumably archive
            # metadata folders such as __MACOSX — confirm) and non-XML members.
            if "__" in filename or ".xml" not in filename:
                continue
            df_list.append(xml_to_df(z.read(filename), zf, filename))

    return pd.concat(df_list).reset_index(drop=True)

In [None]:
%%time
def get_meta_data():
  """
  Build image-level and box-level metadata and split it into train/validation.

  Every entry of the ``IMAGES`` folder is read with ``read_zip``. The combined
  table is cleaned, and only rows dated 20190527 are kept before the split.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(xml_train, xml_val, xml_train_boxes, xml_val_boxes)``. The first two
      hold one row per image (columns ``location``, ``file``, ``file_name``,
      ``height``, ``width``). The last two hold the boxes whose ``file``
      belongs to the corresponding image table (columns ``tree``, ``damage``,
      ``labels``, ``xmin``, ``ymin``, ``xmax``, ``ymax``, ``file``).
  """
  larch_dirs = [f for f in os.listdir(IMAGES)]
  image_cols = ["location", "file", "file_name", "height", "width"]
  box_cols = ["tree", "damage", "labels", "xmin", "ymin", "xmax", "ymax", "file"]

  ## Concatenate Dataframes from each location into single Dataframe.
  ## (read_zip is the reader defined above; the former call to `read_xml`
  ## referenced a name that is not defined in this notebook.)
  xml_df = pd.concat([read_zip(ld) for ld in larch_dirs]).reset_index(drop=True)
  xml_df['truncated'] = xml_df['truncated'].astype(float)

  ## Standardize column names: where a `name` tag is present it overrides `tree`.
  has_name = xml_df['name'].notna()
  xml_df.loc[has_name, 'tree'] = xml_df.loc[has_name, 'name']

  ## Drop redundant columns.
  xml_df = xml_df.drop(columns=['name', 'difficult', 'pose'])

  ## Standardize label for detection of other tree species.
  tree = xml_df['tree'].fillna('Other').str.capitalize()
  xml_df['tree'] = tree.where(tree != 'Spruce', 'Other')

  ## Remove detections with no damage information.
  xml_df = xml_df[xml_df['damage'].notna()].reset_index(drop=True)
  int_cols = ["truncated", "xmin", "xmax", "ymin", "ymax"]
  xml_df = xml_df.astype({col: int for col in int_cols})

  ## Encode damage classes as 1-based integer labels, in sorted order.
  ## This is computed before the date filter, so it covers all dates.
  damage_map = {d: n + 1 for n, d in enumerate(xml_df["damage"].sort_values().unique())}
  xml_df["labels"] = xml_df["damage"].map(damage_map)

  ## Exclude image B01_0023.
  ## NOTE(review): the reason for the exclusion is not recorded here — confirm.
  xml_df = xml_df[xml_df['file_name'] != "B01_0023"]

  ## Drop duplicate entries.
  xml_df = xml_df.drop_duplicates().reset_index(drop=True)

  ## Remove August Data (keep only rows dated 20190527).
  may_df = xml_df[xml_df['date'] == 20190527]
  xml_images = may_df[image_cols].drop_duplicates().reset_index(drop=True)
  xml_boxes = may_df[box_cols].reset_index(drop=True)
  ## Discard zero-width boxes.
  xml_boxes = xml_boxes[xml_boxes["xmin"] != xml_boxes["xmax"]].reset_index(drop=True)

  # Train-Val Split (80/20 over images, stratified by location, fixed seed).
  xml_train, xml_val = train_test_split(xml_images,
                                        test_size=0.2,
                                        random_state=64,
                                        stratify=xml_images['location'])
  xml_train = xml_train.reset_index(drop=True)
  xml_val = xml_val.reset_index(drop=True)
  xml_train_boxes = xml_boxes[xml_boxes["file"].isin(xml_train["file"])].reset_index(drop=True)
  xml_val_boxes = xml_boxes[xml_boxes["file"].isin(xml_val["file"])].reset_index(drop=True)
  return xml_train, xml_val, xml_train_boxes, xml_val_boxes

# Build the metadata tables once; the evaluation cells below reuse xml_val and
# xml_val_boxes for every model.
xml_train, xml_val, xml_train_boxes, xml_val_boxes = get_meta_data()

Annotations/B04_0181.xml
Annotations/B02_0230.xml
Annotations/B10_0221.xml
CPU times: user 1min 46s, sys: 1.21 s, total: 1min 47s
Wall time: 1min 45s


## **Full Evaluation**

In [None]:
def get_val_full_transform(d_size):
    """
    Build the validation preprocessing pipeline for one model size.

    `d_size` is an index into IMG_SIZE. Images are resized to a square of
    that side length and then converted with ToTensorV2; bounding boxes are
    handled according to BBOX.
    """
    side = IMG_SIZE[d_size]
    steps = [
        A.Resize(height=side, width=side, p=1.0),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps, bbox_params=BBOX, p=1.0)
    
def collate_fn(batch):
    """
    Transpose a batch: a sequence of per-sample tuples becomes a tuple holding
    one tuple per sample field (all first items, all second items, ...).
    """
    per_field = zip(*batch)
    return tuple(per_field)

## EDet 2

In [None]:
# Validation set, resized to the EfficientDet-D2 input size (IMG_SIZE[2]).
val_full_dataset = ValDataset(
    meta_data=xml_val,
    boxes=xml_val_boxes,
    # The `file` paths built in xml_to_df are relative to the extracted data
    # set folder, so use the same root as the EDet 1 / EDet 0 cells instead of
    # "/content".
    image_root="./Data_Set_Larch_Casebearer",
    transform=get_val_full_transform(d_size=2),
    train_pipe=False
)

# Iterate the validation set in a fixed order (SequentialSampler, no shuffling).
val_full_loader = torch.utils.data.DataLoader(
    val_full_dataset,
    batch_size=BATCH_SIZE,
    sampler=SequentialSampler(val_full_dataset),
    shuffle=False,
    pin_memory=False,
    collate_fn=collate_fn)

eval_full = {}
# Checkpoint epochs to evaluate; pred_df keeps the predictions of the last one.
for epoch in [154]:
  model = load_edet(
      "tf_efficientdet_d2", image_size=IMG_SIZE[2],
      # NOTE(review): presumably the number of damage labels produced by
      # get_meta_data — confirm.
      num_classes=4,
      checkpoint_path=f"{DRIVE}/effdet2_petrel_full_cosine_200/best-checkpoint-{str(epoch).zfill(2)}epoch.bin",
      train=False)
  print(epoch)
  pred_df = val_prediction_df(model, val_full_loader, verbose=20)

154
Processed batch 20.


In [None]:
# Keep, row by row, only the detections whose predicted label is > 0.
# Order matters: pred_labels provides the mask for all three columns, so it
# has to be filtered last.
# NOTE(review): boolean-mask indexing implies each cell is array-like
# (presumably a NumPy array) — confirm against val_prediction_df.
pred_df["pred_boxes"] = pred_df.apply(lambda row: row["pred_boxes"][row['pred_labels'] > 0], axis=1)
pred_df["pred_scores"] = pred_df.apply(lambda row: row["pred_scores"][row['pred_labels'] > 0], axis=1)
pred_df["pred_labels"] = pred_df.apply(lambda row: row["pred_labels"][row['pred_labels'] > 0], axis=1)

In [None]:
# Display the filtered EDet 2 predictions next to the gt_boxes / gt_labels columns.
pred_df

Unnamed: 0,pred_boxes,pred_scores,pred_labels,gt_boxes,gt_labels
0,"[[347.2126, 25.480986, 414.3955, 87.62799], [2...","[0.840237, 0.8137625, 0.8127692, 0.8107154, 0....","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[[26.112001419067383, 44.03199863433838, 54.78...","[3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
1,"[[227.62556, 468.48264, 307.1623, 550.1701], [...","[0.9149391, 0.9132424, 0.9071041, 0.9014337, 0...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[[29.695999145507812, 11.263999700546265, 81.9...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,"[[417.01636, 242.99649, 698.2843, 541.661], [2...","[0.8905087, 0.8887571, 0.79094094, 0.70513016,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[[243.1999969482422, 0.5119999945163727, 509.4...","[4, 4, 4, 4, 4, 4, 4]"
3,"[[147.08669, 74.76472, 250.2093, 188.94269], [...","[0.92479056, 0.92397803, 0.92279065, 0.9064897...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[[145.40800094604492, 67.58399963378906, 252.4...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,"[[49.160824, 82.15808, 142.83504, 191.55923], ...","[0.89564353, 0.8844055, 0.8730956, 0.8651428, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, ...","[[50.6879997253418, 17.407999992370605, 119.80...","[3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, ..."
...,...,...,...,...,...
162,"[[236.68253, 167.66135, 491.85867, 471.1603], ...","[0.8698598, 0.83171916, 0.7765003, 0.7657944, ...","[4, 4, 3, 4, 4, 2, 3, 4, 3, 4, 4, 4, 4, 3, 4, ...","[[503.8080139160156, 436.73597717285156, 557.5...","[3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, ..."
163,"[[319.8416, 30.736664, 391.6198, 108.08542], [...","[0.86036336, 0.85336065, 0.84100986, 0.8402835...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, ...","[[38.91200065612793, 25.600001335144043, 176.1...","[3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, ..."
164,"[[175.78995, 366.05676, 247.63577, 447.5921], ...","[0.8858594, 0.8790438, 0.878078, 0.86362934, 0...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[[2.5600000619888306, 69.63199996948242, 61.43...","[2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 2, 2, ..."
165,"[[157.2009, 433.88843, 322.30933, 596.12463], ...","[0.88085663, 0.8777154, 0.87569296, 0.8325466,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[[184.31999588012695, 49.15200233459473, 296.9...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."


In [None]:
# Persist the filtered EDet 2 predictions as a pickle in the DRIVE folder.
pred_df.to_pickle(f"{DRIVE}/edet2.pickle")

## **EDet Batch Size 8**

# EDet 1 

In [None]:
# Validation set, resized to the EfficientDet-D1 input size (IMG_SIZE[1]).
val_full_dataset = ValDataset(
    meta_data=xml_val,
    boxes=xml_val_boxes,
    image_root="./Data_Set_Larch_Casebearer",
    transform=get_val_full_transform(d_size=1),
    train_pipe=False
)

# Iterate the validation set in a fixed order (SequentialSampler, no shuffling).
val_full_loader = torch.utils.data.DataLoader(
    val_full_dataset,
    batch_size=BATCH_SIZE,
    sampler=SequentialSampler(val_full_dataset),
    shuffle=False,
    pin_memory=False,
    collate_fn=collate_fn)

eval_full = {}
# Checkpoint epochs to evaluate; pred_df keeps the predictions of the last one.
for epoch in [145]:
  model = load_edet(
      # Take the model input size from the same IMG_SIZE entry the transform
      # above uses (640), so the two cannot drift apart.
      "tf_efficientdet_d1", image_size=IMG_SIZE[1],
      # NOTE(review): presumably the number of damage labels produced by
      # get_meta_data — confirm.
      num_classes=4,
      checkpoint_path=f"{DRIVE}/effdet1_petrel_full_cosine_200/best-checkpoint-{str(epoch).zfill(2)}epoch.bin",
      train=False)
  print(epoch)
  pred_df = val_prediction_df(model, val_full_loader, verbose=20)

145
Processed batch 20.


In [None]:
# Keep, row by row, only the detections whose predicted label is > 0.
# Order matters: pred_labels provides the mask for all three columns, so it
# has to be filtered last.
# NOTE(review): boolean-mask indexing implies each cell is array-like
# (presumably a NumPy array) — confirm against val_prediction_df.
pred_df["pred_boxes"] = pred_df.apply(lambda row: row["pred_boxes"][row['pred_labels'] > 0], axis=1)
pred_df["pred_scores"] = pred_df.apply(lambda row: row["pred_scores"][row['pred_labels'] > 0], axis=1)
pred_df["pred_labels"] = pred_df.apply(lambda row: row["pred_labels"][row['pred_labels'] > 0], axis=1)

In [None]:
# Display the filtered EDet 1 predictions next to the gt_boxes / gt_labels columns.
pred_df

Unnamed: 0,pred_boxes,pred_scores,pred_labels,gt_boxes,gt_labels
0,"[[381.78207, 533.16, 437.9205, 589.5793], [288...","[0.88714236, 0.8641118, 0.8405755, 0.8299885, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[[21.760001182556152, 36.69333219528198, 45.65...","[3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
1,"[[270.80435, 229.7352, 323.89914, 283.92343], ...","[0.92061824, 0.9177831, 0.90459746, 0.90026164...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[[24.746665954589844, 9.386666417121887, 68.26...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,"[[202.11801, 1.1943512, 428.09872, 196.54178],...","[0.93474686, 0.89058274, 0.8375146, 0.7383971,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[[202.66666412353516, 0.42666666209697723, 424...","[4, 4, 4, 4, 4, 4, 4]"
3,"[[123.57109, 61.38667, 210.41475, 158.47493], ...","[0.9269969, 0.91418666, 0.90913403, 0.8909985,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[[121.1733341217041, 56.31999969482422, 210.34...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,"[[82.762726, 329.84338, 177.0611, 421.36676], ...","[0.9022251, 0.874028, 0.8621113, 0.8469365, 0....","[3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, ...","[[42.239999771118164, 14.506666660308838, 99.8...","[3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, ..."
...,...,...,...,...,...
162,"[[196.37404, 135.70796, 409.1139, 389.78125], ...","[0.8792742, 0.8553043, 0.80444366, 0.79823464,...","[4, 3, 4, 4, 4, 3, 4, 2, 4, 4, 4, 3, 3, 4, 2, ...","[[419.8400115966797, 363.94664764404297, 464.6...","[3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, ..."
163,"[[451.99072, 256.76328, 541.7052, 362.98416], ...","[0.9065474, 0.858513, 0.83216685, 0.8110516, 0...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, ...","[[32.42666721343994, 21.33333444595337, 146.77...","[3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, ..."
164,"[[203.64445, 300.8723, 264.547, 365.46527], [2...","[0.88799345, 0.8834328, 0.8812995, 0.86051154,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[[2.133333384990692, 58.02666664123535, 51.199...","[2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 2, 2, ..."
165,"[[128.87914, 364.47302, 270.39313, 496.50446],...","[0.90265274, 0.89424545, 0.8937382, 0.8601107,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[[153.59999656677246, 40.960001945495605, 247....","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."


In [None]:
# Persist the filtered EDet 1 predictions as a pickle in the DRIVE folder.
pred_df.to_pickle(f"{DRIVE}/edet1.pickle")

# EDet 0

In [None]:
# Validation set, resized to the EfficientDet-D0 input size (IMG_SIZE[0]).
val_full_dataset = ValDataset(
    meta_data=xml_val,
    boxes=xml_val_boxes,
    image_root="./Data_Set_Larch_Casebearer",
    transform=get_val_full_transform(d_size=0),
    train_pipe=False
)

# Iterate the validation set in a fixed order (SequentialSampler, no shuffling).
val_full_loader = torch.utils.data.DataLoader(
    val_full_dataset,
    batch_size=BATCH_SIZE,
    sampler=SequentialSampler(val_full_dataset),
    shuffle=False,
    pin_memory=False,
    collate_fn=collate_fn)

eval_full = {}
# Checkpoint epochs to evaluate; pred_df keeps the predictions of the last one.
for epoch in [172]:
  model = load_edet(
      # Take the model input size from the same IMG_SIZE entry the transform
      # above uses (512), so the two cannot drift apart.
      "tf_efficientdet_d0", image_size=IMG_SIZE[0],
      # NOTE(review): presumably the number of damage labels produced by
      # get_meta_data — confirm.
      num_classes=4,
      checkpoint_path=f"{DRIVE}/effdet0_petrel_full_cosine_200/best-checkpoint-{str(epoch).zfill(2)}epoch.bin",
      train=False)
  print(epoch)
  pred_df = val_prediction_df(model, val_full_loader, verbose=20)

# Keep, row by row, only the detections whose predicted label is > 0.
# Order matters: pred_labels provides the mask for all three columns, so it
# has to be filtered last.
pred_df["pred_boxes"] = pred_df.apply(lambda row: row["pred_boxes"][row['pred_labels'] > 0], axis=1)
pred_df["pred_scores"] = pred_df.apply(lambda row: row["pred_scores"][row['pred_labels'] > 0], axis=1)
pred_df["pred_labels"] = pred_df.apply(lambda row: row["pred_labels"][row['pred_labels'] > 0], axis=1)

172
Processed batch 20.


In [None]:
# Persist the filtered EDet 0 predictions as a pickle in the DRIVE folder.
pred_df.to_pickle(f"{DRIVE}/edet0.pickle")