model/utils/creator_tool.py文件
这个脚本实现了三个Creator类，分别是：ProposalCreator、AnchorTargetCreator、ProposalTargetCreator
前两个都在RPN网络里实现，第三个在RoIHead网络里实现。

In [None]:
class ProposalCreator:
    """Generate object detection proposals (RoIs) from RPN outputs.

    Calling an instance applies the predicted bounding box offsets to a set
    of anchors, clips the resulting boxes to the image, discards boxes that
    are too small, keeps the top scoring boxes, and finally runs non-maximum
    suppression (NMS) on them.

    None of this needs back-propagation, so the work is done on numpy
    arrays; tensors are only created for the NMS call.

    Per the original notes, a feature map yields roughly
    (H/16) x (W/16) x 9 (about 20000) anchors. With the default settings,
    training keeps the 12000 best scored boxes before NMS and 2000 after
    it; evaluation keeps 6000 before and 300 after.

    If a "number to keep" parameter is not positive, no truncation is
    applied at that stage.

    This class is used for Region Proposal Networks introduced in
    Faster R-CNN:
    Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun.
    Faster R-CNN: Towards Real-Time Object Detection with
    Region Proposal Networks. NIPS 2015.

    Args:
        parent_model: Object whose ``training`` attribute selects between
            the train-mode and test-mode limits on every call.
        nms_thresh (float): Threshold value used when calling NMS.
        n_train_pre_nms (int): Number of top scored bounding boxes
            to keep before passing to NMS in train mode.
        n_train_post_nms (int): Number of top scored bounding boxes
            to keep after passing to NMS in train mode.
        n_test_pre_nms (int): Number of top scored bounding boxes
            to keep before passing to NMS in test mode.
        n_test_post_nms (int): Number of top scored bounding boxes
            to keep after passing to NMS in test mode.
        min_size (int): A parameter to determine the threshold on
            discarding bounding boxes based on their sizes.
    """

    def __init__(self,
                 parent_model,
                 nms_thresh=0.7,
                 n_train_pre_nms=12000,
                 n_train_post_nms=2000,
                 n_test_pre_nms=6000,
                 n_test_post_nms=300,
                 min_size=16
                 ):
        self.parent_model = parent_model
        self.nms_thresh = nms_thresh
        self.n_train_pre_nms = n_train_pre_nms
        self.n_train_post_nms = n_train_post_nms
        self.n_test_pre_nms = n_test_pre_nms
        self.n_test_post_nms = n_test_post_nms
        self.min_size = min_size

    def __call__(self, loc, score,
                 anchor, img_size, scale=1.):
        """Produce the proposals for one image.

        Args:
            loc: Predicted offsets for the anchors; passed to ``loc2bbox``
                together with ``anchor``. Per the original note, ``loc``
                and ``score`` come from the 1x1 convolution heads of the
                region proposal network.
            score: Predicted foreground scores, one per anchor.
            anchor: Anchor coordinates.
            img_size: Indexable pair; ``img_size[0]`` bounds columns 0 and
                2 of the boxes and ``img_size[1]`` bounds columns 1 and 3
                (height and width per the original notes).
            scale (float): Multiplier applied to ``min_size`` to obtain the
                minimum side length a box must have to be kept.

        Returns:
            numpy array holding the proposals that survive NMS.
        """
        # Pick the limits that match the current mode of the parent model.
        if self.parent_model.training:
            n_pre_nms = self.n_train_pre_nms    # 12000 by default
            n_post_nms = self.n_train_post_nms  # 2000 by default
        else:
            n_pre_nms = self.n_test_pre_nms     # 6000 by default
            n_post_nms = self.n_test_post_nms   # 300 by default

        # Convert anchors into proposals (RoIs) via the predicted offsets.
        roi = loc2bbox(anchor, loc)

        # Clip predicted boxes to the image: columns 0/2 (ymin, ymax per
        # the original notes) to [0, H], columns 1/3 (xmin, xmax) to [0, W].
        roi[:, slice(0, 4, 2)] = np.clip(
            roi[:, slice(0, 4, 2)], 0, img_size[0])
        roi[:, slice(1, 4, 2)] = np.clip(
            roi[:, slice(1, 4, 2)], 0, img_size[1])

        # Remove predicted boxes whose height or width is below threshold.
        min_size = self.min_size * scale
        hs = roi[:, 2] - roi[:, 0]  # box heights
        ws = roi[:, 3] - roi[:, 1]  # box widths
        keep = np.where((hs >= min_size) & (ws >= min_size))[0]
        roi = roi[keep, :]
        score = score[keep]

        # Sort all (proposal, score) pairs by score, highest first, and
        # keep at most n_pre_nms of them.
        order = score.ravel().argsort()[::-1]
        if n_pre_nms > 0:
            order = order[:n_pre_nms]
        roi = roi[order, :]
        score = score[order]

        # Run NMS. The tensors are moved to the GPU when CUDA is available
        # (the original behaviour); otherwise they stay on the CPU instead
        # of failing on the unconditional ``.cuda()`` call.
        # NOTE(review): ``nms`` is defined outside this block; the CPU path
        # assumes it accepts CPU tensors (e.g. torchvision.ops.nms) - confirm.
        roi_tensor = torch.from_numpy(roi)
        score_tensor = torch.from_numpy(score)
        if torch.cuda.is_available():
            roi_tensor = roi_tensor.cuda()
            score_tensor = score_tensor.cuda()
        keep = nms(roi_tensor, score_tensor, self.nms_thresh)

        # Keep at most n_post_nms of the surviving proposals.
        if n_post_nms > 0:
            keep = keep[:n_post_nms]
        roi = roi[keep.cpu().numpy()]
        return roi

In [None]:
class AnchorTargetCreator(object):
    """Assign ground truth targets to anchors for training the RPN.

    A subset of the anchors is labelled positive (1) or negative (0) based
    on their IoU with the ground truth boxes; every other anchor is
    labelled -1 and ignored. With the defaults, 256 anchors are sampled
    and at most half of them are positive. The labelled samples, together
    with the regression targets, are what the region proposal network is
    trained on.

    Args:
        n_sample (int): Number of anchors to label as positive or negative.
        pos_iou_thresh (float): Anchors whose best IoU is at or above this
            value are positive.
        neg_iou_thresh (float): Anchors whose best IoU is below this value
            are negative.
        pos_ratio (float): Fraction of ``n_sample`` that may be positive.
    """

    def __init__(self,
                 n_sample=256,
                 pos_iou_thresh=0.7, neg_iou_thresh=0.3,
                 pos_ratio=0.5):
        self.n_sample, self.pos_ratio = n_sample, pos_ratio
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh = neg_iou_thresh

    def __call__(self, bbox, anchor, img_size):
        """Assign ground truth supervision to sampled subset of anchors.

        Types of input arrays and output arrays are same.
        Here are notations.

        * :math:`S` is the number of anchors.
        * :math:`R` is the number of bounding boxes.

        Args:
            bbox (array): Coordinates of bounding boxes. Its shape is
                :math:`(R, 4)`.
            anchor (array): Coordinates of anchors. Its shape is
                :math:`(S, 4)`.
            img_size (tuple of ints): A tuple :obj:`H, W`, which
                is a tuple of height and width of an image.

        Returns:
            (array, array):

            * **loc**: Offsets and scales (not only offsets) to match the
              anchors to the ground truth bounding boxes. Its shape is
              :math:`(S, 4)`.
            * **label**: Labels of anchors with values
              :obj:`(1=positive, 0=negative, -1=ignore)`. Its shape
              is :math:`(S,)`.
        """
        height, width = img_size
        total = len(anchor)  # roughly 20000 per the original notes

        # Work only on the anchors selected by _get_inside_index (those
        # lying inside the image, per the original notes).
        inside = _get_inside_index(anchor, height, width)
        kept_anchor = anchor[inside]

        best_gt, kept_label = self._create_label(inside, kept_anchor, bbox)

        # Regression target of each kept anchor: the ground truth box it
        # overlaps the most.
        kept_loc = bbox2loc(kept_anchor, bbox[best_gt])

        # Map the results back onto the full anchor set; anchors that were
        # filtered out get label -1 and a zero regression target.
        full_label = _unmap(kept_label, total, inside, fill=-1)
        full_loc = _unmap(kept_loc, total, inside, fill=0)

        return full_loc, full_label

    def _create_label(self, inside_index, anchor, bbox):
        """Label the kept anchors and subsample positives and negatives.

        Returns the index of the best matching ground truth box for every
        anchor and the label array (1 positive, 0 negative, -1 ignore).
        """
        # Start with every anchor marked as "don't care".
        label = np.full((len(inside_index),), -1, dtype=np.int32)

        argmax_ious, max_ious, gt_argmax_ious = self._calc_ious(
            anchor, bbox, inside_index)

        # Negative: best IoU below the negative threshold.
        label[max_ious < self.neg_iou_thresh] = 0
        # Positive: the anchor(s) with the highest IoU for each gt box ...
        label[gt_argmax_ious] = 1
        # ... and every anchor at or above the positive threshold.
        label[max_ious >= self.pos_iou_thresh] = 1

        # Cap the positives first, then fill the rest of the sample with
        # negatives.
        pos_quota = int(self.pos_ratio * self.n_sample)
        self._drop_excess(label, 1, pos_quota)

        neg_quota = self.n_sample - np.sum(label == 1)
        self._drop_excess(label, 0, neg_quota)

        return argmax_ious, label

    @staticmethod
    def _drop_excess(label, value, quota):
        """Randomly reset entries equal to ``value`` to -1 beyond ``quota``.

        ``label`` is modified in place.
        """
        candidates = np.where(label == value)[0]
        surplus = len(candidates) - quota
        if surplus > 0:
            discarded = np.random.choice(
                candidates, size=surplus, replace=False)
            label[discarded] = -1

    def _calc_ious(self, anchor, bbox, inside_index):
        """Compute IoU statistics between anchors and ground truth boxes.

        Returns, in order: for every anchor the index of the gt box with
        the highest IoU, that highest IoU value, and the indices of all
        anchors that attain the highest IoU of some gt box.
        """
        # ious has one row per anchor and one column per gt box.
        ious = bbox_iou(anchor, bbox)
        anchor_rows = np.arange(len(inside_index))
        gt_cols = np.arange(ious.shape[1])

        # Best gt box for every anchor, and the corresponding IoU.
        argmax_ious = ious.argmax(axis=1)
        max_ious = ious[anchor_rows, argmax_ious]

        # Highest IoU reached by each gt box.
        per_gt_peak = ious[ious.argmax(axis=0), gt_cols]
        # Row indices of every anchor matching such a peak (ties included).
        gt_argmax_ious = np.where(ious == per_gt_peak)[0]

        return argmax_ious, max_ious, gt_argmax_ious


*下面是ProposalTargetCreator的代码：*

目的：为2000个rois赋予ground truth！（严格讲挑出128个赋予ground truth！）

输入：2000个rois、一个batch（一张图）中所有的bbox ground truth（R，4）、对应bbox所包含的label（R，1）（VOC2007来说20类0-19）

输出：128个sample roi（128，4）、128个gt_roi_loc（128，4）、128个gt_roi_label（128，1）


In [None]:
class ProposalTargetCreator(object):
    """Assign ground truth targets to a sample of the proposed RoIs.

    From the RoIs handed in (2000 per the original notes) up to
    ``n_sample`` are picked and given a regression target and a class
    label, to be used as training data for the RoI head.

    Args:
        n_sample (int): Number of RoIs to sample.
        pos_ratio (float): Fraction of ``n_sample`` that may be foreground.
        pos_iou_thresh (float): RoIs whose best IoU is at or above this
            value are foreground.
        neg_iou_thresh_hi (float): Upper bound (exclusive) of the IoU
            range of background RoIs.
        neg_iou_thresh_lo (float): Lower bound (inclusive) of the IoU
            range of background RoIs.
    """

    def __init__(self,
                 n_sample=128,
                 pos_ratio=0.25, pos_iou_thresh=0.5,
                 neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
                 ):
        self.n_sample = n_sample
        self.pos_ratio = pos_ratio
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh_hi = neg_iou_thresh_hi
        self.neg_iou_thresh_lo = neg_iou_thresh_lo  # NOTE:default 0.1 in py-faster-rcnn

    def __call__(self, roi, bbox, label,
                 loc_normalize_mean=(0., 0., 0., 0.),
                 loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
        """Sample RoIs and compute their training targets.

        Args:
            roi (array): Proposed regions; concatenated with ``bbox`` along
                axis 0, so both need the same number of columns.
            bbox (array): Ground truth boxes of one image, shape (R, 4)
                per the accompanying notes.
            label (array): Class index of each ground truth box; indexed
                by ground truth box index. Per the notes the values are
                0-19 for VOC2007.
            loc_normalize_mean (tuple of floats): Mean subtracted from the
                regression targets.
            loc_normalize_std (tuple of floats): Standard deviation the
                regression targets are divided by.

        Returns:
            (array, array, array):

            * **sample_roi**: The sampled RoIs, foreground first.
            * **gt_roi_loc**: Normalized regression targets of the sampled
              RoIs.
            * **gt_roi_label**: Class labels of the sampled RoIs; 0 is
              background, foreground classes start at 1.
        """
        # Add the ground truth boxes themselves to the candidate RoIs.
        roi = np.concatenate((roi, bbox), axis=0)

        # Upper limit on foreground samples: n_sample * pos_ratio, rounded
        # (32 with the defaults n_sample=128, pos_ratio=0.25).
        pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)

        # IoU of every RoI with every ground truth box; per RoI, find the
        # best matching ground truth box and the corresponding IoU.
        iou = bbox_iou(roi, bbox)
        gt_assignment = iou.argmax(axis=1)
        max_iou = iou.max(axis=1)

        # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
        # The label 0 is reserved for the background.
        gt_roi_label = label[gt_assignment] + 1

        # Foreground candidates: best IoU at or above pos_iou_thresh.
        pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
        # Keep as many as available, up to the foreground limit.
        pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
        if pos_index.size > 0:
            # Randomly drop the surplus when there are too many candidates.
            pos_index = np.random.choice(
                pos_index, size=pos_roi_per_this_image, replace=False)

        # Background candidates: best IoU within
        # [neg_iou_thresh_lo, neg_iou_thresh_hi).
        neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                             (max_iou >= self.neg_iou_thresh_lo))[0]
        # Fill the remainder of the sample with background RoIs, as far as
        # candidates are available.
        neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
        neg_roi_per_this_image = int(min(neg_roi_per_this_image,
                                         neg_index.size))
        if neg_index.size > 0:
            # Randomly drop the surplus when there are too many candidates.
            neg_index = np.random.choice(
                neg_index, size=neg_roi_per_this_image, replace=False)

        # Combine the indices: foreground samples first, then background.
        keep_index = np.append(pos_index, neg_index)
        gt_roi_label = gt_roi_label[keep_index]
        gt_roi_label[pos_roi_per_this_image:] = 0  # background label is 0
        sample_roi = roi[keep_index]

        # Regression target of each sampled RoI relative to its best
        # matching ground truth box, normalized by the given mean and std.
        # Since the targets are normalized here, predictions have to be
        # de-normalized with the same mean and std at inference time.
        gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
        gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
                       ) / np.array(loc_normalize_std, np.float32))

        return sample_roi, gt_roi_loc, gt_roi_label
