# Import Libraries

In [None]:
# install norfair dependencies
%cd /kaggle/input/norfair031py3/
!pip install commonmark-0.9.1-py2.py3-none-any.whl -f ./ --no-index
!pip install rich-9.13.0-py3-none-any.whl

!mkdir /kaggle/working/tmp
!cp -r /kaggle/input/norfair031py3/filterpy-1.4.5/filterpy-1.4.5/ /kaggle/working/tmp/
%cd /kaggle/working/tmp/filterpy-1.4.5/
!pip install .
!rm -rf /kaggle/working/tmp

# install norfair
%cd /kaggle/input/norfair031py3/
!pip install norfair-0.3.1-py3-none-any.whl -f ./ --no-index
%cd /kaggle/working

In [3]:
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import glob
import shutil
import sys
sys.path.append('../input/tensorflow-great-barrier-reef')
import torch
from PIL import Image
import ast

In [4]:
ROOT_DIR  = '/kaggle/input/tensorflow-great-barrier-reef/' # official competition data directory
# CKPT_PATH = "/kaggle/input/asnorkin-gbreef-yolov5/yolov5l6_2s1_gbr_video_v0_fold2_obj8.0_yanc_strong_noise_enhance_fixedv6_1280_4b10e.pt"
CKPT_PATH = "/kaggle/input/yolov5l6models/yolov5l6.pt" # model weights file
IMG_SIZE  = 3100 * 1.5 # inference image size; NOTE(review): this expression evaluates to the float 4650.0
CONF      = 0.15 # confidence threshold
IOU       = 0.3  # IoU threshold
FLIP_TTA  = False # whether to use the custom flip TTA
AUGMENT   = False # whether to use YOLOv5's built-in TTA
ENHANCE   = False # whether to use the custom colour-enhancement TTA

ENSEMBLE = False # whether to fuse the predictions of several models
TRACKER = True # whether to enable video object tracking (important)

In [6]:
def get_path(row):
    '''
    Attach the frame's image path to a dataframe row.

    Reads `row.video_id` and `row.video_frame`, stores the resulting path
    under `row['image_path']` and returns the same row.
    '''
    image_path = f'{ROOT_DIR}/train_images/video_{row.video_id}/{row.video_frame}.jpg'
    row['image_path'] = image_path
    return row

# Load the training annotations, attach each frame's image path and parse
# the stringified annotation lists into Python objects.
df = pd.read_csv(f'{ROOT_DIR}/train.csv')
df = df.progress_apply(get_path, axis=1)
df['annotations'] = df['annotations'].progress_apply(ast.literal_eval)
display(df.head(2))

In [8]:

# Number of annotated boxes per frame.
df['num_bbox'] = df['annotations'].progress_apply(len)

# Share of frames without / with at least one box, keyed by the boolean label.
# Built explicitly (instead of indexing value_counts() by position) so the two
# figures cannot be swapped when annotated frames are the majority, and a class
# with no rows reports 0% instead of raising.
has_bbox = df.num_bbox > 0
data = pd.Series({False: (~has_bbox).mean() * 100, True: has_bbox.mean() * 100})
print(f"No BBox: {data.iloc[0]:0.2f}% | With BBox: {data.iloc[1]:0.2f}%")

  0%|          | 0/23501 [00:00<?, ?it/s]

No BBox: 79.07% | With BBox: 20.93%


# Helper

In [9]:
def voc2yolo(bboxes, image_height=720, image_width=1280):
    """
    Convert Pascal-VOC boxes to normalized YOLO boxes.

    voc  => [x1, y1, x2, y2] (pixels)
    yolo => [xmid, ymid, w, h] (normalized by image width / height)

    The box coordinates are read from the last axis. The conversion runs on
    a float copy, so the input array is not modified.
    """
    converted = bboxes.copy().astype(float)  # float copy: integer input would truncate to 0

    # Normalize x by the width and y by the height.
    converted[..., [0, 2]] /= image_width
    converted[..., [1, 3]] /= image_height

    box_w = converted[..., 2] - converted[..., 0]
    box_h = converted[..., 3] - converted[..., 1]

    # Top-left corner -> box centre, bottom-right corner -> size.
    converted[..., 0] += box_w / 2
    converted[..., 1] += box_h / 2
    converted[..., 2] = box_w
    converted[..., 3] = box_h

    return converted

def yolo2voc(bboxes, image_height=720, image_width=1280):
    """
    Convert normalized YOLO boxes to Pascal-VOC pixel boxes.

    yolo => [xmid, ymid, w, h] (normalized)
    voc  => [x1, y1, x2, y2] (pixels)

    The conversion runs on a float copy, so the input array is not modified.
    """
    boxes = bboxes.copy().astype(float)

    # Back to pixel units.
    boxes[..., [0, 2]] *= image_width
    boxes[..., [1, 3]] *= image_height

    # Centre -> top-left corner, then size -> bottom-right corner.
    half_size = boxes[..., [2, 3]] / 2
    boxes[..., [0, 1]] -= half_size
    boxes[..., [2, 3]] += boxes[..., [0, 1]]

    return boxes

def coco2yolo(bboxes, image_height=720, image_width=1280):
    """
    Convert COCO pixel boxes to normalized YOLO boxes.

    coco => [xmin, ymin, w, h] (pixels)
    yolo => [xmid, ymid, w, h] (normalized)

    The conversion runs on a float copy, so the input array is not modified.
    """
    boxes = bboxes.copy().astype(float)  # float copy: integer input would truncate to 0

    # Normalize x / width by the image width and y / height by the image height.
    boxes[..., [0, 2]] /= image_width
    boxes[..., [1, 3]] /= image_height

    # Shift the top-left corner to the box centre.
    half_size = boxes[..., [2, 3]] / 2
    boxes[..., [0, 1]] += half_size

    return boxes

def yolo2coco(bboxes, image_height=720, image_width=1280):
    """
    Convert normalized YOLO boxes to COCO pixel boxes.

    yolo => [xmid, ymid, w, h] (normalized)
    coco => [xmin, ymin, w, h] (pixels)

    The conversion runs on a float copy, so the input array is not modified.
    """
    boxes = bboxes.copy().astype(float)

    # Back to pixel units.
    boxes[..., [0, 2]] *= image_width
    boxes[..., [1, 3]] *= image_height

    # Shift the box centre to the top-left corner.
    half_size = boxes[..., [2, 3]] / 2
    boxes[..., [0, 1]] -= half_size

    return boxes

def voc2coco(bboxes, image_height=720, image_width=1280):
    """
    Convert Pascal-VOC boxes [x1, y1, x2, y2] to COCO boxes [xmin, ymin, w, h].

    Goes through the normalized YOLO representation: voc -> yolo -> coco.
    """
    normalized = voc2yolo(bboxes, image_height, image_width)
    return yolo2coco(normalized, image_height, image_width)


def load_image(image_path):
    '''
    Read an image from disk and return it in RGB channel order.

    Raises:
        FileNotFoundError: if OpenCV could not read the file. cv2.imread
            returns None in that case, which would otherwise surface as an
            opaque cv2.error inside cvtColor.
    '''
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        raise FileNotFoundError(f'Could not read image: {image_path}')
    return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

def get_bbox(annots):
    '''
    Turn a list of annotation mappings into a list of value lists.

    Each output entry holds the values of one annotation, in the mapping's
    own key order.
    '''
    boxes = []
    for annotation in annots:
        boxes.append(list(annotation.values()))
    return boxes

def get_imgsize(row):
    '''
    Add the image's pixel `width` and `height` to a dataframe row.

    Reads the file at `row['image_path']` and returns the same row with the
    two new fields set.
    '''
    # The original called `imagesize.get`, but `imagesize` is never imported in
    # this notebook (NameError). PIL's `Image` is imported at the top of the
    # notebook and exposes the same (width, height) pair via `.size`.
    with Image.open(row['image_path']) as image:
        row['width'], row['height'] = image.size
    return row

np.random.seed(42)  # seed NumPy's global random generator

#  [YOLOv5](https://github.com/ultralytics/yolov5/)

In [10]:
!mkdir -p /root/.config/Ultralytics
!cp /kaggle/input/yolov5-font/Arial.ttf /root/.config/Ultralytics/

In [11]:
import sys; sys.path.append('/kaggle/input/weightedboxesfusion/')
from ensemble_boxes import weighted_boxes_fusion

def to_uint8(img):
    '''
    Clip values to the range [0, 255] and cast the result to uint8.
    '''
    clipped = np.clip(img, 0, 255)
    return clipped.astype(np.uint8)

def channel_stretching(img):
    '''
    Min-max stretch of a channel: (img - min) / max(1, max - min).

    The denominator is floored at 1, so a constant channel maps to all zeros
    instead of dividing by zero.
    '''
    # The original also computed np.mean(img) but never used it; that
    # full-array reduction is dropped.
    I_min = np.min(img)
    I_max = np.max(img)
    return (img - I_min) * (1 / max(1, (I_max - I_min)))

def enchance(img):
    '''
    Colour-enhancement step applied to an image before detection.

    NOTE(review): as written, the returned image is only the gamma-corrected
    (exponent 1.5) version of the input `img`. The result of the HSV
    equalisation / CLAHE / stretching steps is stored in `out_img` and then
    overwritten by the gamma step, which reads `img` rather than `out_img`.
    Confirm which behaviour is intended before changing it; presumably any
    model trained with this function saw the current output.
    '''
    # TO HSV(Hue, Saturation, Value)
    hsv_img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
 
    # Histogram equalisation on the V (value) channel ####
    hsv_img[:, :, 2] = cv2.equalizeHist(hsv_img[:, :, 2])
 
    # CLAHE ####
    h, s, v = hsv_img[:, :, 0], hsv_img[:, :, 1], hsv_img[:, :, 2]
    clahe = cv2.createCLAHE(clipLimit=15.0, tileGridSize=(20,20))
    v = clahe.apply(v)
 
    # HSVStretching
    s = channel_stretching(s)
    v = channel_stretching(v)
 
    # TO RGB
    # NOTE(review): channel_stretching scales by 1 / (max - min), so s and v
    # are presumably in [0, 1] here and the uint8 cast below collapses them to
    # 0 or 1 - verify.
    hsv_img = np.dstack((h, s, v))
    hsv_img = to_uint8(hsv_img)
    out_img = cv2.cvtColor(hsv_img, cv2.COLOR_HSV2RGB)
    out_img = to_uint8(out_img)
 
    # Gamma correction (original note: raises the overall image brightness).
    # NOTE(review): this reads `img`, not `out_img`, so everything computed
    # above is discarded. An exponent above 1 applied to values in [0, 1]
    # lowers them, i.e. this darkens rather than brightens - confirm intent.
    R = 255.0
    out_img = (R * np.power(img.astype(np.uint32) / R, 1.5))
 
    return to_uint8(out_img)


def wbf(predictions_list, imh, imw, iou_thr=0.5):
    '''
    Fuse several sets of single-class detections with Weighted Boxes Fusion.

    Args:
        predictions_list: one array per prediction source, each of shape
            (n, 5) with columns [xmin, ymin, xmax, ymax, confidence] in pixels.
        imh, imw: image height and width used to normalize the boxes.
        iou_thr: IoU threshold forwarded to weighted_boxes_fusion.

    Returns:
        Array with one row per fused box: [xmin, ymin, xmax, ymax, confidence]
        in pixels.
    '''
    predictions = [np.asarray(p, dtype=float) for p in predictions_list]

    labels_list = [[0] * len(p) for p in predictions]  # every box belongs to class 0
    confs_list = [p[:, 4] for p in predictions]

    # Normalize into new arrays. The original divided views of the input in
    # place, which silently rescaled the callers' prediction arrays and failed
    # on integer-typed arrays.
    scale = np.array([imw, imh, imw, imh], dtype=float)
    boxes_list = [p[:, :4] / scale for p in predictions]

    boxes, confs, _ = weighted_boxes_fusion(boxes_list, confs_list, labels_list, iou_thr=iou_thr)

    # Back to pixel coordinates.
    boxes = boxes * scale

    return np.hstack((boxes, confs[:, None]))


def load_model(ckpt_path, conf=0.25, iou=0.50):
    '''
    Load a custom YOLOv5 checkpoint from the local repo and set its NMS options.

    Args:
        ckpt_path: path to the checkpoint file.
        conf: NMS confidence threshold.
        iou: NMS IoU threshold.
    '''
    model = torch.hub.load(
        '/kaggle/input/yolov5-lib-ds',  # local repo
        'custom',
        path=ckpt_path,
        source='local',
        force_reload=True,
    )

    nms_options = {
        'conf': conf,          # NMS confidence threshold
        'iou': iou,            # NMS IoU threshold
        'classes': None,       # optional class filter; None keeps every class
        'multi_label': False,  # NMS multiple labels per box
        'max_det': 1000,       # maximum number of detections per image
    }
    for option, value in nms_options.items():
        setattr(model, option, value)

    return model


class Yolov5Detector:
    '''
    YOLOv5 detector wrapper with optional colour enhancement and flip TTA.

    `predict(image)` returns integer COCO-style boxes [xmin, ymin, w, h] plus
    their confidences, or the raw [xmin, ymin, xmax, ymax, conf] array when
    `raw=True`.
    '''
    def __init__(self, weights, image_size=1280, confthresh=0.25, nmsthresh=0.5,
                 device="cuda", flip_tta=False, raw=False, enhance=False):
        self.image_size = image_size  # inference image size
        self.confthresh = confthresh  # confidence threshold
        self.nmsthresh = nmsthresh    # NMS IoU threshold
        self.device = device          # stored only; not used elsewhere in this class
        self.flip_tta = flip_tta      # run flip test-time augmentation
        self.raw = raw                # return the raw prediction array
        self.enhance = enhance        # apply colour enhancement before inference
        self.model = load_model(weights, self.confthresh, self.nmsthresh)

    def predict(self, image):
        '''Run the full pipeline on one image: batch -> infer -> postprocess -> format.'''
        batch = self.prepare_batch(image)
        batch_predictions = self.infer(batch)
        # Undo the flips and fuse the per-view predictions.
        batch_predictions = self.postprocess(batch_predictions, image)
        if self.raw:
            return batch_predictions

        bboxes, confs = self._format(batch_predictions)
        return bboxes, confs

    def prepare_batch(self, image):
        '''Return [image] or, with flip TTA, [image, hflip, vflip, both].'''
        if self.enhance:
            image = enchance(image)

        batch = [image]
        if self.flip_tta:
            batch.append(image[:, ::-1])     # hflip
            batch.append(image[::-1])        # vflip
            batch.append(image[::-1, ::-1])  # both
        return batch

    def infer(self, batch):
        '''Run the model on the batch; one (n, 5) array [xmin, ymin, xmax, ymax, confidence] per view.'''
        size = self.image_size
        if isinstance(size, float):
            # The notebook config yields a float size (3100 * 1.5); pass an
            # integer to the model. NOTE(review): some YOLOv5 AutoShape versions
            # presumably accept only an int or a tuple here - verify against
            # the bundled repo.
            size = int(round(size))
        batch_results = self.model(batch, size=size, augment=AUGMENT).pandas().xyxy
        batch_predictions = [p[["xmin", "ymin", "xmax", "ymax", "confidence"]].values for p in batch_results]
        return batch_predictions

    @staticmethod
    def _unflip_boxes(predictions, imw, imh, hflip=False, vflip=False):
        '''Map xyxy boxes predicted on a flipped view back to the original image (returns a copy).'''
        restored = predictions.copy()
        if hflip:
            # Mirroring swaps the roles of the left and right edges:
            # x1 = imw - x2_flipped and x2 = imw - x1_flipped. The original code
            # kept the column order and produced boxes with x1 > x2.
            restored[:, [0, 2]] = imw - predictions[:, [2, 0]]
        if vflip:
            restored[:, [1, 3]] = imh - predictions[:, [3, 1]]
        return restored

    def postprocess(self, batch_predictions, image):
        '''Without flip TTA return the single prediction array; otherwise un-flip each view and fuse them with wbf.'''
        result = batch_predictions[0]
        if self.flip_tta:
            imh, imw = image.shape[:2]

            result_hflip = self._unflip_boxes(batch_predictions[1], imw, imh, hflip=True)
            result_vflip = self._unflip_boxes(batch_predictions[2], imw, imh, vflip=True)
            result_both = self._unflip_boxes(batch_predictions[3], imw, imh, hflip=True, vflip=True)

            result = wbf([result, result_hflip, result_vflip, result_both], imh, imw)

        return result

    def _format(self, batch_predictions):
        '''Split an (n, 5) xyxy+conf array into integer [xmin, ymin, w, h] boxes and confidences.'''
        # Copy first: the original subtracted in place on a view and thereby
        # rewrote the caller's prediction array.
        bboxes = batch_predictions[:, :4].copy()
        confs = batch_predictions[:, 4]
        bboxes[:, [2, 3]] -= bboxes[:, [0, 1]]
        bboxes = bboxes.astype(int)
        return bboxes, confs
    
    
def format_prediction(bboxes, confs):
    '''
    Build the submission string "conf xmin ymin w h conf xmin ymin w h ...".

    Returns an empty string when there are no boxes.
    '''
    if len(bboxes) == 0:
        return ''

    parts = []
    for idx, (xmin, ymin, w, h) in enumerate(bboxes):
        parts.append(f'{confs[idx]} {xmin} {ymin} {w} {h}')
    return ' '.join(parts).strip(' ')



class EnsembleDetector:
    '''
    Runs several Yolov5Detector instances on the same image and fuses their
    raw predictions with wbf.

    `models` is an iterable of mappings; each one is unpacked as keyword
    arguments into Yolov5Detector (presumably at least `weights`, optionally
    `confthresh` - verify against the caller).
    '''
    def __init__(self, models, image_size=1280, nmsthresh=0.5, device="cuda", flip_tta=False, enhance=False):
        self.enhance = enhance
        # The sub-detectors get enhance=False: enhancement is applied once in
        # predict() before the image is handed to them.
        self.detectors = [
            Yolov5Detector(
                **model,
                image_size=image_size,
                nmsthresh=nmsthresh,
                device=device,
                flip_tta=flip_tta,
                raw=True,
                enhance=False)
            for model in models]

    def predict(self, image):
        '''Return fused integer [xmin, ymin, w, h] boxes and their confidences.'''
        imh, imw = image.shape[:2]
        if self.enhance:
            image = enchance(image)

        # The leftover debug print of every frame's raw predictions was removed.
        predictions = [detector.predict(image) for detector in self.detectors]
        predictions = wbf(predictions, imh, imw)
        bboxes, confs = self.detectors[0]._format(predictions)
        return bboxes, confs


## Inference

In [14]:
import greatbarrierreef
env = greatbarrierreef.make_env() # initialize the environment
iter_test = env.iter_test()      # an iterator which loops over the test set and sample submission


## Run Inference on **Test**

In [None]:
# Detector: a fused ensemble when ENSEMBLE is set, otherwise a single YOLOv5 model
if ENSEMBLE:
    # NOTE(review): MODELS is not defined anywhere in the visible notebook, so
    # this branch would raise NameError as written - define it before enabling ENSEMBLE.
    detector = EnsembleDetector(MODELS, image_size=IMG_SIZE, nmsthresh=IOU, flip_tta=FLIP_TTA, enhance=ENHANCE)
else:
    detector = Yolov5Detector(CKPT_PATH, image_size=IMG_SIZE, confthresh=CONF, nmsthresh=IOU, flip_tta=FLIP_TTA, enhance=ENHANCE)
print(f"Using {type(detector)} detector")

# 视频目标追踪

import cv2
import numpy as np
from norfair import Detection, Tracker
from norfair.tracker import TrackedObject

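# Norfair is a customizable, lightweight Python library for real-time object tracking. In other words, it assigns a unique id to each detected object across frames, so the objects can be identified as they move over time.
# With Norfair, tracking can be added to any detector with only a few lines of code. "Any detector"? Yes. It does not matter how the objects are represented: a bounding box (4 coordinates), a single centre point, the output of a human pose estimation system, or any other object described by keypoints above a certain probability threshold.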
# https://www.kaggle.com/parapapapam/yolox-inference-tracking-on-cots-lb-0-539

# Helper to convert bbox in format [x_min, y_min, x_max, y_max, score] to norfair.Detection class
def to_norfair(detects, frame_id):
    """
    Wrap each [x_min, y_min, x_max, y_max, score] entry in a norfair Detection.

    The detection point is the box centre; the box width, height and
    `frame_id` are stored in the detection's `data` array.
    """
    def _as_detection(x_min, y_min, x_max, y_max, score):
        center = np.array([(x_min + x_max) / 2, (y_min + y_max) / 2])
        size_and_frame = np.array([x_max - x_min, y_max - y_min, frame_id])
        return Detection(
            points=center,
            scores=np.array([score]),
            data=size_and_frame,
        )

    return [
        _as_detection(x_min, y_min, x_max, y_max, score)
        for x_min, y_min, x_max, y_max, score in detects
    ]

# Euclidean distance function to match detections on this frame with tracked_objects from previous frames
def euclidean_distance(detection, tracked_object):
    """Return the norm of the difference between the detection points and the tracked object's estimate."""
    offset = detection.points - tracked_object.estimate
    return np.linalg.norm(offset)

tracker = Tracker(
    distance_function=euclidean_distance,  # Euclidean distance between detection and track estimate
    distance_threshold=30, # maximum distance threshold
    # Each tracked object keeps an internal hit-inertia counter that tracks how often it is detected: it goes up on every match and down whenever the object gets no match.
    hit_inertia_min=3,   # if the object gets no match for a number of frames and the counter drops below this value, the object is destroyed.
    hit_inertia_max=6, # upper bound of the counter, i.e. how long an object can survive without being matched to any detection.
    initialization_delay=1, # how far the counter must exceed hit_inertia_min before the object counts as initialized and is returned to the user as a real object.
)

# Run
for frame_id, (img, pred_df) in enumerate(tqdm(iter_test)):
    # detector.predict returns integer COCO-style boxes [x_min, y_min, width, height].
    bboxes, confs = detector.predict(img)
    predictions = []
    detects = []
    for box, score in zip(bboxes, confs):
        if score < CONF:  # drop low-confidence boxes
            continue
        # The original read columns 2 and 3 as x_max / y_max, so the submitted
        # width and height became (w - x_min) and (h - y_min), and the tracker
        # was fed wrong box centres.
        x_min, y_min, bbox_width, bbox_height = (int(v) for v in box)
        x_max = x_min + bbox_width
        y_max = y_min + bbox_height
        detects.append([x_min, y_min, x_max, y_max, score])

        predictions.append('{:.2f} {} {} {} {}'.format(score, x_min, y_min, bbox_width, bbox_height))

    if TRACKER:
        tracked_objects = tracker.update(detections=to_norfair(detects, frame_id))
        for tobj in tracked_objects:
            bbox_width, bbox_height, last_detected_frame_id = tobj.last_detection.data
            if last_detected_frame_id == frame_id:  # Skip objects that were detected on current frame
                continue
            # Add objects that have no detections on current frame to predictions
            xc, yc = tobj.estimate[0]  # estimated centre on the current frame
            bbox_width, bbox_height = int(bbox_width), int(bbox_height)
            x_min, y_min = int(round(xc - bbox_width / 2)), int(round(yc - bbox_height / 2))
            score = tobj.last_detection.scores[0]

            predictions.append('{:.2f} {} {} {} {}'.format(score, x_min, y_min, bbox_width, bbox_height))

    pred_df['annotations'] = ' '.join(predictions)
    env.predict(pred_df)

In [16]:
sub_df = pd.read_csv('submission.csv')
sub_df.head()

Unnamed: 0,index,annotations
0,0,
1,1,
2,2,
