In [1]:
from PIL import Image
import json
import numpy as np
import torchvision.transforms.functional as TF
import torch
import torchvision
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from src.detector.utils import binary_mask_iou, mask_convert
from src.tracker.data_track_precomputed import MOT16SequencesPrecomputed
from src.tracker.data_track import MOT16Sequences
from src.utils.torch_utils import dict2keys_and_items
from src.detector.visualize import visualize_detection

from src.detector.object_detector import init_detector


# Dataset location and the sample to inspect, named here so they can be changed in one place.
MOT16_ROOT_DIR = "data/MOT16"
MOT16_SPLIT = "MOT16-train"
VIS_THRESHOLD = 0.5  # forwarded as `vis_threshold`; exact semantics live in MOT16Sequences
SEQUENCE_IDX = 0
FRAME_IDX = 10

sequences = MOT16Sequences(
    root_dir=MOT16_ROOT_DIR,
    dataset=MOT16_SPLIT,
    vis_threshold=VIS_THRESHOLD,
)
sequence = sequences[SEQUENCE_IDX]
# `frame` is indexed with "img" further down when it is fed to the detector.
frame = sequence[FRAME_IDX]


In [2]:
# Build the detector from its JSON experiment config and put it into eval mode.
config_path = "config/obj_detect/coco_maskrcnn_experiment.json"
with open(config_path, "r") as config_file:
    config = json.load(config_file)

obj_detect = init_detector(**config)
obj_detect.eval();

# Run the detector on a one-image batch without tracking gradients and keep
# the result that belongs to that single image.
with torch.no_grad():
    detections = obj_detect([frame["img"]])
det = detections[0]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [18]:
from filterpy.kalman import KalmanFilter
# Filter with a 7-dimensional state and a 4-dimensional measurement.
kf = KalmanFilter(dim_x=7, dim_z=4)
# F: identity plus ones at (0, 4), (1, 5), (2, 6), i.e. state entries 4-6 feed into
# entries 0-2, while entry 3 has no such partner.
kf.F = np.eye(7, dtype=int) + np.eye(7, k=4, dtype=int)
# H: the measurement picks state entries 0-3 unchanged.
kf.H = np.eye(4, 7, dtype=int)
# Display the default Q (the recorded output is the 7x7 identity).
kf.Q

array([[1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1.]])

In [16]:
import cv2 
# OpenCV counterpart of the filterpy filter; (7, 4) presumably are the state and
# measurement dimensions, mirroring dim_x=7 / dim_z=4 - confirm against the OpenCV docs.
cv_filt = cv2.KalmanFilter(7, 4)
# Display the initial errorCovPre; the recorded output is a 7x7 float32 zero matrix.
cv_filt.errorCovPre

array([[0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [5]:
import pandas as pd
import numpy as np
# Constant 4x4 frame used to check how values are displayed after rounding.
df = pd.DataFrame(data=0.46584 * np.ones((4, 4)))
# Round to two decimals for display (0.46584 -> 0.47, matching the recorded output).
# A bare `df.` with nothing after the dot is a SyntaxError and stops the cell.
df.round(2)

Unnamed: 0,0,1,2,3
0,0.47,0.47,0.47,0.47
1,0.47,0.47,0.47,0.47
2,0.47,0.47,0.47,0.47
3,0.47,0.47,0.47,0.47


In [None]:
from src.detector.utils import keypoint_convert
from src.detector.object_detector import body_part_combination
def correct_box_sizes_using_keypoints(
    detection_batch,
    keypoint_score_thresh=0.9,
    scale_factor_width=1.15,
    scale_factor_height=1.15,
):
    """
    Enlarge detection boxes towards occluded sides where keypoints are missing.

    A keypoint counts as found when the sigmoid of its score exceeds
    `keypoint_score_thresh`. For every object:

    * none of the "feet" keypoints found and the bottom of the box occluded
      -> the box height is multiplied by `scale_factor_height`;
    * at least one "right_extremity" keypoint missing and the left side occluded
      -> the box is widened by `scale_factor_width` with its right edge kept fixed;
    * at least one "left_extremity" keypoint missing and the right side occluded
      -> the box width is multiplied by `scale_factor_width`.

    Arguments
    ---------
    detection_batch: iterable of dicts with the keys "boxes" (xyxy), "masks"
        and "keypoints_scores"
    keypoint_score_thresh: threshold applied to the sigmoid of the keypoint scores
    scale_factor_width: multiplicative factor for the box width
    scale_factor_height: multiplicative factor for the box height

    Returns
    -------
    The same `detection_batch`; each processed dict gets its "boxes" entry
    replaced by the corrected xyxy boxes (the dicts are modified in place).
    """
    # The keypoint index groups do not depend on the detection, so resolve them once.
    left_extremity_idxs = set(
        keypoint_convert(body_part_combination["left_extremity"], "name", "idx")
    )
    right_extremity_idxs = set(
        keypoint_convert(body_part_combination["right_extremity"], "name", "idx")
    )
    feet_idxs = set(keypoint_convert(body_part_combination["feet"], "name", "idx"))

    for det in detection_batch:
        # Nothing to correct, and the occlusion helper cannot stack an empty list.
        if len(det["boxes"]) == 0:
            continue

        occlusion_masks = get_occlusion_inside_obj_boxes(
            boxes=det["boxes"], masks=det["masks"]
        )
        occlusion_direction = get_occlusion_direction(occlusion_masks)
        # Work on a copy in xywh so width/height can be scaled directly.
        xywh = torchvision.ops.box_convert(det["boxes"].clone(), "xyxy", "xywh")

        for obj_idx in range(len(det["keypoints_scores"])):
            found_keypoint_idxs = set(
                torch.where(
                    det["keypoints_scores"][obj_idx].sigmoid() > keypoint_score_thresh
                )[0].tolist()
            )

            # No foot keypoint found at all and the bottom is occluded: extend the height.
            if (
                feet_idxs.isdisjoint(found_keypoint_idxs)
                and occlusion_direction["bottom"][obj_idx]
            ):
                xywh[obj_idx, 3] *= scale_factor_height

            # Some right-extremity keypoint is missing and the left side is occluded:
            # shift x left by the added width so the right edge stays where it was.
            if (
                not right_extremity_idxs.issubset(found_keypoint_idxs)
                and occlusion_direction["left"][obj_idx]
            ):
                xywh[obj_idx, 0] -= (scale_factor_width - 1) * xywh[obj_idx, 2]
                xywh[obj_idx, 2] *= scale_factor_width

            # Some left-extremity keypoint is missing and the right side is occluded.
            # If the left side was widened above, this scales the already widened width.
            if (
                not left_extremity_idxs.issubset(found_keypoint_idxs)
                and occlusion_direction["right"][obj_idx]
            ):
                xywh[obj_idx, 2] *= scale_factor_width

        det["boxes"] = torchvision.ops.box_convert(xywh, "xywh", "xyxy")
    return detection_batch

def get_occlusion_inside_obj_boxes(boxes, masks, output_size=(4, 4)):
    """
    For every box, mark where the masks of overlapping objects fall inside it.

    For each object the masks of its neighbours (boxes with 0 < IoU < 1) are
    cropped to the object's box, merged with a logical "any" and resized to
    `output_size`.

    Arguments
    ---------
    boxes: Tensor[N, 4] in xyxy format
    masks: Tensor with one entry per box along dim 0 and the image height/width
        as the last two dims (assumed [N, 1, H, W], since dim 1 is squeezed
        from the result - TODO confirm)
    output_size: size handed to the resize of every cropped occlusion map

    Returns
    -------
    Tensor[N, *output_size] of occlusion maps; an empty Tensor[0, *output_size]
    when there are no boxes.
    """
    if len(boxes) == 0:
        # torch.stack rejects an empty list, so return an empty batch of grids instead.
        return torch.zeros((0, *output_size), dtype=torch.float32, device=masks.device)

    iou = torchvision.ops.box_iou(boxes, boxes)
    overlap_pools = []
    for obj_id in range(len(boxes)):
        # Neighbours overlap this box without having IoU 1 with it; this drops the
        # box itself (and any exact duplicate of it).
        neighbours = torch.logical_and(iou[obj_id] > 0, iou[obj_id] < 1)
        # Crop window as plain Python ints; .int() truncates the coordinates.
        # NOTE(review): a box thinner than one pixel truncates to zero width/height
        # here - confirm the resize below tolerates such an empty crop.
        left, top, width, height = (
            torchvision.ops.box_convert(boxes[[obj_id]], "xyxy", "xywh")
            .squeeze()
            .int()
            .tolist()
        )
        mask_crops = TF.crop(
            masks[neighbours], top=top, left=left, height=height, width=width
        )
        # A pixel is occluded if any neighbour's mask is positive there.
        overlap = (mask_crops > 0).any(dim=0).float()
        overlap_pools.append(TF.resize(overlap, size=output_size))
    return torch.stack(overlap_pools, dim=0).squeeze(1)

def get_occlusion_direction(occlusion_masks):
    """
    Tell for every object on which sides of its occlusion grid there is occlusion.

    Only the border cells between the corners are inspected: the first/last
    column (without its top and bottom cell) for "left"/"right" and the
    first/last row (without its left and right cell) for "top"/"bottom".
    The slices are relative to the grid edges, so any grid size works; for a
    4x4 grid they select rows/columns 1-2 of the respective border.

    Arguments
    ---------
    occlusion_masks: Tensor[N, rows, cols]

    Returns
    -------
    Dict with the keys "left", "right", "bottom" and "top", each a bool
    Tensor[N, 1] that is True where any inspected cell is non-zero.
    """
    return {
        "left": occlusion_masks[:, 1:-1, :1].any(dim=1),
        "right": occlusion_masks[:, 1:-1, -1:].any(dim=1),
        "bottom": occlusion_masks[:, -1:, 1:-1].any(dim=2),
        "top": occlusion_masks[:, :1, 1:-1].any(dim=2),
    }


In [None]:
from src.detector.object_detector import body_part_combination
from src.detector.utils import keypoint_convert, mask_io_min_max, mask_area_nms

In [None]:
# TODO : MANUALLY
# for all boxes:
# if the left or right wrist/elbow/shoulder/foot (try any/all) is not visible -> widen the box by XXX percent in that direction
# if one foot is not visible -> widen the box by XXX percent towards the bottom


# for low score boxes:
# if left body parts are expected to be visible (based on mask IoU or box IoU) but are not, and the object has a low score, remove it


# TODO : EXTEND THIS TO AN MLP

In [None]:
def extract_box_correction_features_from_det(det):
    """
    For every detection, build the feature vector for the box corrector.

    Arguments
    ---------
    det: Dict[str: Tensor[N, ...]]
        Read keys: "boxes" (xyxy), "masks", "keypoints_scores" and "keypoints".

    Returns
    -------
    features: [N, num_features]
        Concatenation along dim 1, in this order, of
        - the flattened occlusion grid of every box,
        - the raw keypoint scores,
        - the ear positions relative to the box (0 at the box min corner,
          1 at the box max corner),
        - the box area and the height/width ratio.
    """
    # Box coordinates are truncated to integers before any feature is computed.
    boxes = det["boxes"].int()
    masks = det["masks"]

    overlap_pools = get_occlusion_inside_obj_boxes(boxes, masks)
    overlap_features = overlap_pools.flatten(start_dim=1)

    # keypoint scores are a good predictor for the visibility of a keypoint
    keypoint_score_features = det["keypoints_scores"]

    # confidence features (score, entropy) are not included; add them here if
    # object classification is wanted as well

    # absolute size and aspect ratio of the box
    areas = torchvision.ops.box_area(boxes).unsqueeze(1)
    _, _, w, h = torchvision.ops.box_convert(boxes, "xyxy", "xywh").T
    ratios = (h / w).unsqueeze(1)
    box_features = torch.cat([areas, ratios], dim=1)

    # relative positions of certain keypoints; the head (= both ears) is used
    # because its position says something about the box height
    keypoint_names = ['left_ear', 'right_ear']
    keypoint_idxs = keypoint_convert(keypoint_names, "name", "idx")
    keypoint_pos = det["keypoints"][:, keypoint_idxs, :2]
    # [N, 1, 2] corners broadcast against the [N, num_keypoints, 2] positions.
    box_min = boxes[:, None, :2]
    box_extent = boxes[:, None, 2:] - box_min
    rel_keypoint_pos = (keypoint_pos - box_min) / box_extent
    keypoint_pos_features = rel_keypoint_pos.flatten(start_dim=1)

    features = torch.cat(
        [overlap_features, keypoint_score_features, keypoint_pos_features, box_features],
        dim=1,
    )
    return features


In [None]:
# TODO : use keypoint detection to avoid false positives
# Safe improvement, low risk: if an object has no overlap with other objects and is sufficiently large, it must have at least 90% of the keypoints
# High potential, high risk: analyse which keypoints are visible for which object and come up with something ...
# e.g. NMS on heads: if two boxes overlap, check whether their heads overlap

In [None]:
# TODO : I have already tested how many misses we get because of NMS, but not yet how many misses we get because of score_thresh.
# Try NMS 100 with score_thresh: 0 and see how many low-score detections we can recognise through movement / keypoints or similar. And do so without Byte, i.e. from the inherent state alone.