# 搭建Fast-RCNN检测网络

设置好参数

In [None]:
    ROI_SIZE = 7   # Side length of the fixed grid (7x7) that every candidate region on the feature map
                    # is divided into by the RoI Pooling / RoI Align layer.
    FAST_RCNN_NMS_IOU_THRESHOLD = 0.3  # IoU threshold used during non-maximum suppression (NMS) to decide whether boxes overlap.
    FINAL_SCORE_THRESHOLD = 0.7  # Score threshold for Fast R-CNN predictions; boxes scoring below it are discarded.
    FAST_RCNN_IOU_POSITIVE_THRESHOLD = 0.5  # IoU threshold for deciding whether a proposal is a positive sample.
    FAST_RCNN_MINIBATCH_SIZE = 200  # Size of each training mini-batch in the Fast R-CNN stage.
    FAST_RCNN_POSITIVE_RATE = 0.33  # Share of positive samples in the mini-batch; controls the positive/negative balance.
    DETECTION_MAX_INSTANCES = 200  # Maximum number of detections allowed per image in the final result.

## 初始化传入Fast-RCNN的参数

In [None]:
class FastRCNN(object):
    def __init__(self,
                 feature_pyramid,
                 rpn_proposals_boxes,
                 gtboxes_and_label,
                 origin_image,
                 reference_feature,
                 config,
                 is_training,
                 image_window):
        """
        Store the inputs of the Fast R-CNN head and derive the pyramid level range.

        :param feature_pyramid: feature pyramid; later indexed with keys such as 'P2'
        :param rpn_proposals_boxes: region proposals produced by the RPN, [batch_size, N, 4]
        :param gtboxes_and_label: ground truth, [batch_size, M, 5]
            (M ground-truth objects, four coordinates plus the class id)
        :param origin_image: the original input image(s)
        :param reference_feature: features of the reference image(s)
        :param config: configuration object; ``config.LEVEL`` must be a sequence of
            level names whose second character is the level digit (e.g. 'P2')
        :param is_training: training flag, kept as ``self.IS_TRAINING``
        :param image_window: image boundary, kept as ``self.window``
        """
        # Tensors handed over by the earlier stages of the network.
        self.feature_pyramid = feature_pyramid
        self.rpn_proposals_boxes = rpn_proposals_boxes
        self.gtboxes_and_label = gtboxes_and_label
        self.origin_image = origin_image
        self.reference_feature = reference_feature

        # Run-time settings.
        self.config = config
        self.IS_TRAINING = is_training
        self.window = image_window

        # Pyramid level names; the digit after the leading letter is the level id.
        level_names = config.LEVEL
        self.level = level_names
        lowest_name = level_names[0]
        highest_name = level_names[-1]
        self.min_level = int(lowest_name[1])
        # The highest usable level is capped at 5.
        self.max_level = min(int(highest_name[1]), 5)

将输入张量列表中的每个张量重塑，把批次维度和框维度合并为一维；输入只有一个张量时直接返回该张量，否则返回新的张量列表

In [None]:
    def merge_batch_and_bboxes_dims(self, inputs):
        """
        Fold the two leading dimensions of every tensor into a single one.

        :param inputs: list of tensor
        :return: the reshaped tensor when exactly one tensor was given,
                 otherwise a list of reshaped tensors
        """
        def _flatten_leading(tensor):
            # Everything from the third dimension onwards is kept as is.
            trailing_dims = tensor.get_shape().as_list()[2:]
            return tf.reshape(tensor, [-1] + trailing_dims)

        merged = [_flatten_leading(tensor) for tensor in inputs]
        return merged[0] if len(inputs) == 1 else merged

将张量的第一维重新拆分为批次维度（PER_GPU_IMAGE）和框维度，即 merge_batch_and_bboxes_dims 的逆操作

In [None]:
    def div_batch_and_bboxes_dims(self, inputs):
        """
        Split the leading dimension of every tensor into (PER_GPU_IMAGE, boxes).

        :param inputs: list of tensor
        :return: the reshaped tensor when exactly one tensor was given,
                 otherwise a list of reshaped tensors
        """
        def _split_leading(tensor):
            # Dimensions after the first one are carried over unchanged.
            per_box_dims = tensor.get_shape().as_list()[1:]
            return tf.reshape(tensor, [self.config.PER_GPU_IMAGE, -1] + per_box_dims)

        divided = [_split_leading(tensor) for tensor in inputs]
        if len(inputs) != 1:
            return divided
        return divided[0]

## 构建 Fast R-CNN 的训练目标

In [None]:
    @property
    def build_frcnn_target(self):
        '''
        Build the second-stage (Fast R-CNN) training targets from the RPN proposals.

        Every proposal is matched to the ground-truth box with the highest IoU:
        iou >= config.FAST_RCNN_IOU_POSITIVE_THRESHOLD is object,
        anything lower is background.
        A minibatch of positives and negatives is then sampled and zero-padded
        up to config.FAST_RCNN_MINIBATCH_SIZE rows. The per-image work is done by
        an inner function that is run through boxes_utils.batch_slice.

        NOTE: this is a property, so every attribute access builds these ops again.

        :return: (shapes assume batch_slice stacks the per-image results along a
                  new leading axis -- TODO confirm in boxes_utils)
        minibatch_reference_proboxes: (batch_size, config.FAST_RCNN_MINIBATCH_SIZE, 4)[y1, x1, y2, x2]
        minibatch_encode_gtboxes: (batch_size, config.FAST_RCNN_MINIBATCH_SIZE, 4)
            produced by encode_and_decode.encode_boxes (originally documented as [dy, dx, log(dh), log(dw)])
        object_mask: (batch_size, config.FAST_RCNN_MINIBATCH_SIZE) 1 indicates object, 0 indicates background or padding
        gt_class_ids: (batch_size, config.FAST_RCNN_MINIBATCH_SIZE) integer class ids, 0 for background or padding
        '''
        # Per-image worker: receives the ground-truth boxes with labels, the RPN proposals and the config.
        def batch_slice_build_target(gtboxes_and_label, rpn_proposals_boxes, config):

            with tf.variable_scope('build_faster_rcnn_targets'):

                # Pre-process the data and label every proposal as positive or negative.
                with tf.variable_scope('fast_rcnn_find_positive_negative_samples'):
                    # Split the ground truth into box coordinates (all but the last column)
                    # and class ids (last column).
                    gtboxes = tf.cast(
                        tf.reshape(gtboxes_and_label[:, :-1], [-1, 4]), tf.float32)
                    gt_class_ids = tf.cast(
                        tf.reshape(gtboxes_and_label[:, -1], [-1, ]), tf.int32)
                    gtboxes, non_zeros = boxes_utils.trim_zeros_graph(gtboxes, name="trim_gt_box")  # [M, 4]; removes zero-area boxes (per the original note)
                    gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros)  # keep the class ids of the boxes that survived
                    rpn_proposals_boxes, _ = boxes_utils.trim_zeros_graph(rpn_proposals_boxes,  # same trimming for the RPN proposals
                                                                          name="trim_rpn_proposal_train")

                    ious = boxes_utils.iou_calculate(rpn_proposals_boxes, gtboxes)  # [N, M] IoU between proposals and ground-truth boxes
                    matchs = tf.cast(tf.argmax(ious, axis=1), tf.int32)  # [N, ] index of the best-overlapping ground-truth box per proposal
                    max_iou_each_row = tf.reduce_max(ious, axis=1)  # best IoU per proposal
                    # Positives are the proposals whose best IoU reaches the configured threshold.
                    positives = tf.cast(tf.greater_equal(max_iou_each_row, config.FAST_RCNN_IOU_POSITIVE_THRESHOLD), tf.int32)
                    # Look up the matched ground-truth box and class for every proposal.
                    reference_boxes_mattached_gtboxes = tf.gather(gtboxes, matchs)  # [N, 4]
                    gt_class_ids = tf.gather(gt_class_ids, matchs)  # [N, ]
                    # Float version of the positive flags, used as the object mask.
                    object_mask = tf.cast(positives, tf.float32)  # [N, ]
                    # Background proposals get class id 0 (multiplying by the 0/1 positive flag).
                    gt_class_ids = gt_class_ids * positives

                # Sample a limited number of positives and negatives for training.
                with tf.variable_scope('fast_rcnn_minibatch'):
                    # choose the positive indices: at most MINIBATCH_SIZE * POSITIVE_RATE, picked at random
                    positive_indices = tf.reshape(tf.where(tf.not_equal(object_mask, 0.)), [-1])
                    num_of_positives = tf.minimum(tf.shape(positive_indices)[0],
                                                  tf.cast(config.FAST_RCNN_MINIBATCH_SIZE*config.FAST_RCNN_POSITIVE_RATE,
                                                          tf.int32))
                    positive_indices = tf.random_shuffle(positive_indices)
                    positive_indices = tf.slice(positive_indices, begin=[0], size=[num_of_positives])
                    # choose the negative indices.
                    # The cap is (int(1 / POSITIVE_RATE) - 1) * num_of_positives, so no negatives
                    # are drawn when there are no positives.
                    # NOTE(review): the original comment states a 1:3 positive:negative ratio; with
                    # POSITIVE_RATE = 0.33 this expression yields 1:2 -- confirm the intent.
                    negative_indices = tf.reshape(tf.where(tf.equal(object_mask, 0.)), [-1])
                    num_of_negatives = tf.cast(int(1. / config.FAST_RCNN_POSITIVE_RATE) * num_of_positives, tf.int32)\
                                       - num_of_positives

                    num_of_negatives = tf.minimum(tf.shape(negative_indices)[0], num_of_negatives)
                    negative_indices = tf.random_shuffle(negative_indices)
                    negative_indices = tf.slice(negative_indices, begin=[0], size=[num_of_negatives])

                    # Positives first, then negatives, in one index list.
                    minibatch_indices = tf.concat([positive_indices, negative_indices], axis=0)

                    # Gather the matched ground-truth boxes and the proposals of the sampled minibatch.
                    minibatch_reference_gtboxes = tf.gather(reference_boxes_mattached_gtboxes,
                                                            minibatch_indices)
                    minibatch_reference_proboxes = tf.gather(rpn_proposals_boxes, minibatch_indices)
                    # encode gtboxes relative to their proposals
                    minibatch_encode_gtboxes = \
                        encode_and_decode.encode_boxes(
                            unencode_boxes=minibatch_reference_gtboxes,
                            reference_boxes=minibatch_reference_proboxes,
                            dev_factors=config.BBOX_STD_DEV)
                    # Gather the object mask and class ids of the sampled minibatch.
                    object_mask = tf.gather(object_mask, minibatch_indices)
                    gt_class_ids = tf.gather(gt_class_ids, minibatch_indices)

                    # padding if necessary: fill with zeros up to FAST_RCNN_MINIBATCH_SIZE rows
                    gap = tf.cast(config.FAST_RCNN_MINIBATCH_SIZE - (num_of_positives + num_of_negatives), dtype=tf.int32)
                    bbox_padding = tf.zeros((gap, 4))
                    minibatch_reference_proboxes = tf.concat([minibatch_reference_proboxes, bbox_padding], axis=0)
                    minibatch_encode_gtboxes = tf.concat([minibatch_encode_gtboxes, bbox_padding], axis=0)
                    object_mask = tf.pad(object_mask, [(0, gap)])
                    gt_class_ids = tf.pad(gt_class_ids, [(0, gap)])

                return minibatch_reference_proboxes, minibatch_encode_gtboxes, object_mask, gt_class_ids

        # Run the per-image worker over the batch through boxes_utils.batch_slice
        # (presumably once per image, PER_GPU_IMAGE images in total -- confirm in boxes_utils)
        # to obtain the proposals, encoded ground-truth boxes, object mask and class ids.
        minibatch_reference_proboxes, minibatch_encode_gtboxes, object_mask, gt_class_ids = \
                boxes_utils.batch_slice([self.gtboxes_and_label, self.rpn_proposals_boxes],
                                        lambda x, y: batch_slice_build_target(x, y, self.config),
                                        self.config.PER_GPU_IMAGE)
        # Optional image summaries for the first image of the batch.
        if DEBUG:
            gt_vision = draw_boxes_with_categories(self.origin_image[0],
                                                   self.gtboxes_and_label[0, :, :4],
                                                   self.gtboxes_and_label[0, :, 4])
            tf.summary.image("gt_vision", gt_vision)

            draw_bbox_train = draw_boxes_with_categories(self.origin_image[0],
                                                         minibatch_reference_proboxes[0],
                                                         gt_class_ids[0])
            tf.summary.image("positive_proposal", draw_bbox_train)

        return minibatch_reference_proboxes, minibatch_encode_gtboxes, object_mask, gt_class_ids

为 Fast R-CNN 阶段的每个候选框分配一个级别，该级别将用于选择特征金字塔中相应级别的特征图来进行进一步的处理和预测

In [None]:
    def assign_level(self, minibatch_reference_proboxes):
        """
        Choose a feature-pyramid level for every proposal box from its size.

        level = round(4 + log2(sqrt(w * h) / 224)), clamped to
        [self.min_level, self.max_level].

        :param minibatch_reference_proboxes: (batch_size, num_proposals, 4)[y1, x1, y2, x2]
        :return: (batch_size, num_proposals) int32 level per box
        Zero-padded boxes are not trimmed here; they simply receive a clamped level.
        """
        with tf.name_scope('assign_levels'):
            y1, x1, y2, x2 = tf.unstack(minibatch_reference_proboxes, axis=2)

            # Never let a side length go negative.
            box_w = tf.maximum(x2 - x1, 0.)
            box_h = tf.maximum(y2 - y1, 0.)

            # The small epsilon keeps the logarithm finite for empty boxes.
            box_scale = tf.sqrt(box_w * box_h + 1e-8)
            log2_ratio = tf.log(box_scale / 224.0) / tf.log(2.)
            raw_levels = tf.round(4. + log2_ratio)

            # Clamp into the range of levels that actually exist.
            lower_bound = tf.ones_like(raw_levels) * (np.float32(self.min_level))
            clamped = tf.maximum(raw_levels, lower_bound)
            upper_bound = tf.ones_like(clamped) * (np.float32(self.max_level))
            clamped = tf.minimum(clamped, upper_bound)

            return tf.cast(clamped, tf.int32)

### 从特征图中提取候选框的特征，并将这些特征整理为适合进入后续网络层处理的形式

In [None]:
    def get_rois(self, proposal_bbox):
        '''
        Crop a fixed-size feature patch for every proposal from its pyramid level.

        1) assign each proposal to a pyramid level (self.assign_level)
        2) crop and resize the proposal from that level's feature map with
           tf.image.crop_and_resize to ROI_SIZE x ROI_SIZE
        3) restore the original (batch, box) ordering of the crops
        :param: proposal_bbox: (batch_size, num_proposal, 4)[y1, x1, y2, x2]
        :return:
        all_level_rois: [batch_size, num_proposal, 7, 7, C]
            (7 being config.ROI_SIZE; batch_size is config.PER_GPU_IMAGE after the final reshape)
        '''
        # First assign a pyramid level to every proposal via assign_level.
        levels = self.assign_level(proposal_bbox)

        with tf.variable_scope('fast_rcnn_roi'):
            pooled = []  # cropped features, one entry per pyramid level
            # this is aimed at reorder the pooling map (batch_size, num_proposal)
            box_to_level = []  # (batch index, box index) of the proposals handled at each level

            for i in range(self.min_level, self.max_level + 1):
                # Indices and coordinates of the proposals assigned to level i.
                ix = tf.where(tf.equal(levels, i))
                level_i_proposals = tf.gather_nd(proposal_bbox, ix)

                # Box indicies for crop_and_resize.
                box_indices = tf.cast(ix[:, 0], tf.int32)  # batch index each proposal belongs to

                box_to_level.append(ix)

                # No gradient flows back through the box coordinates or the indices.
                level_i_proposals = tf.stop_gradient(level_i_proposals)
                box_indices = tf.stop_gradient(box_indices)

                # Normalisation divisor built from the target image side (TARGET_SIDE - 1 for all four coordinates).
                image_shape = tf.constant([self.config.TARGET_SIDE-1, self.config.TARGET_SIDE-1,
                                           self.config.TARGET_SIDE-1, self.config.TARGET_SIDE-1], dtype=tf.float32)
                # Normalise the proposal coordinates for crop_and_resize.
                normal_level_i_proposals = level_i_proposals / image_shape

                # Crop the proposals out of this level's feature map.
                level_i_cropped_rois = tf.image.crop_and_resize(self.feature_pyramid['P%d' % i],
                                                                boxes=normal_level_i_proposals,
                                                                box_ind=box_indices,
                                                                crop_size=[self.config.ROI_SIZE, self.config.ROI_SIZE])
                pooled.append(level_i_cropped_rois)

            # Pack pooled features into one tensor
            pooled = tf.concat(pooled, axis=0)
            box_to_level = tf.concat(box_to_level, axis=0)
            # Third column: position of each crop inside the concatenated `pooled` tensor.
            box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
            box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                                     axis=1)

            # Rearrange pooled features to match the order of the original boxes
            # Sort box_to_level by batch then box index
            # TF doesn't have a way to sort by two columns, so merge them and sort.
            # The merged key is only unambiguous while the box index stays below 10000.
            sorting_tensor = box_to_level[:, 0] * 10000 + box_to_level[:, 1]
            # top_k sorts in descending order; reversing gives ascending (batch, box) order.
            ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
                box_to_level)[0]).indices[::-1]
            ix = tf.gather(box_to_level[:, 2], ix)
            pooled = tf.gather(pooled, ix)
            # Split the leading dimension back into (PER_GPU_IMAGE, boxes).
            reshape_pooled = self.div_batch_and_bboxes_dims([pooled])
            return reshape_pooled

### 通过提取特定区域的特征并计算类别距离，预测了每个提议框的类别分数和边界框编码

In [None]:
    def fast_rcnn_net(self, features, is_training):
        """
        Predict class scores and per-class box codes for every pooled proposal.

        The box branch runs two conv layers ("fc_1", "fc_2") followed by a linear
        "regressor" layer. The class branch scores each proposal against every
        reference feature through the squared feature difference and a 3-D conv
        ("params_dist").

        :param features: (batch_size, num_proposal, ROI_SIZE, ROI_SIZE, channels)
        :param is_training: forwarded to batch normalisation
        self.reference_feature: originally documented as (C-1, 7, 7, 256), the
            features of the reference images -- TODO confirm against the caller
        :return:
        fast_rcnn_encode_boxes: reshaped to [-1, num_classes, 4] per image;
            presumably (batch_size, num_proposal, num_classes, 4) once batch_slice
            stacks the images -- TODO confirm in boxes_utils
        fast_rcnn_scores: one score per (proposal, reference feature) pair
        """

        def batch_slice_fast_rcnn_net(features, reference_feature, config, is_training):
            roi_size = config.ROI_SIZE

            # Variable scope shared by every call (variables are reused).
            with tf.variable_scope('fast_rcnn_net', reuse=tf.AUTO_REUSE):
                # Batch-norm settings shared by the conv2d and conv3d layers.
                batch_norm_params = {
                    'is_training': is_training,
                    'decay': 0.997,
                    'epsilon': 1e-5,
                    'scale': True,
                    'trainable': True,
                    'updates_collections': tf.GraphKeys.UPDATE_OPS,
                }

                # Defaults for fully connected layers: linear output, glorot init, L2 decay.
                with slim.arg_scope([slim.fully_connected],
                                    activation_fn=None,
                                    weights_initializer=tf.glorot_uniform_initializer(),
                                    weights_regularizer=slim.l2_regularizer(config.WEIGHT_DECAY)):

                    # Defaults for the 2-D convs: ReLU, batch norm and L2 decay.
                    with slim.arg_scope([slim.conv2d],
                                        stride=1,
                                        padding="VALID",
                                        activation_fn=tf.nn.relu,
                                        weights_initializer=tf.glorot_uniform_initializer(),
                                        normalizer_fn=slim.batch_norm,
                                        normalizer_params=batch_norm_params,
                                        weights_regularizer=slim.l2_regularizer(config.WEIGHT_DECAY)):
                        with slim.arg_scope([slim.batch_norm],
                                            **batch_norm_params):

                            # A VALID conv spanning the whole RoI collapses it to 1x1.
                            bbox_net = slim.conv2d(inputs=features,
                                                   num_outputs=1024,
                                                   kernel_size=[roi_size, roi_size],
                                                   scope="fc_1")
                            # 1x1 conv on the collapsed features.
                            bbox_net = slim.conv2d(inputs=bbox_net,
                                                   num_outputs=1024,
                                                   kernel_size=[1, 1],
                                                   scope="fc_2")

                    # The regressor must be built INSIDE the fully_connected arg_scope.
                    # Built outside it (as before), slim.fully_connected falls back to
                    # its default ReLU activation, so the layer could never emit a
                    # negative box delta, and it was left without the L2 regularizer.
                    # Output: 4 box offsets for each of the NUM_CLASS classes.
                    fast_rcnn_encode_boxes = slim.fully_connected(bbox_net, config.NUM_CLASS * 4,
                                                                  scope='regressor')
                    # One 4-value box code per class.
                    fast_rcnn_encode_boxes = tf.reshape(fast_rcnn_encode_boxes, [-1, config.NUM_CLASS, 4])

                # Squared element-wise difference between every proposal feature and
                # every reference feature (broadcast over the two new axes).
                class_features = tf.expand_dims(features, axis=1)
                reference_feature = tf.expand_dims(reference_feature, axis=0)
                class_net = tf.square(class_features - reference_feature)

                # Defaults for the 3-D conv; VALID padding means no padding, so the
                # output shrinks with the kernel size.
                # NOTE(review): ReLU is applied to this layer, so the scores used as
                # logits downstream are non-negative -- confirm this is intended.
                with slim.arg_scope([slim.conv3d],
                                    stride=1,
                                    padding="VALID",
                                    activation_fn=tf.nn.relu,
                                    weights_initializer=tf.glorot_uniform_initializer(),
                                    normalizer_fn=slim.batch_norm,
                                    normalizer_params=batch_norm_params,
                                    weights_regularizer=slim.l2_regularizer(config.WEIGHT_DECAY)):

                    # Reduce each (proposal, reference) difference map to one score.
                    # The kernel follows ROI_SIZE (previously hard-coded to 7) so the
                    # spatial dims always collapse to 1 for the squeeze below.
                    class_net = slim.conv3d(inputs=class_net,
                                            num_outputs=1,
                                            kernel_size=[1, roi_size, roi_size],
                                            scope="params_dist")
                # Original author's note: "care about there is subtract when use weight, it don't need."
                # Drop the size-1 dimensions 2, 3 and 4.
                fast_rcnn_scores = tf.squeeze(class_net, axis=[2, 3, 4])

                return fast_rcnn_encode_boxes, fast_rcnn_scores

        # Run the per-image network through boxes_utils.batch_slice.
        fast_rcnn_encode_boxes, fast_rcnn_scores = boxes_utils.batch_slice([features],
                                           lambda x: batch_slice_fast_rcnn_net(x, self.reference_feature,
                                                                               self.config, is_training),
                                           self.config.PER_GPU_IMAGE)

        return fast_rcnn_encode_boxes, fast_rcnn_scores


### 计算fast_rcnn损失 
包括分类损失（交叉熵损失）和位置损失（Smooth L1 损失）

In [None]:
    def fast_rcnn_loss(self):
        """
        Build the Fast R-CNN training losses.

        :return:
        fast_rcnn_location_loss: smooth-L1 loss (losses.l1_smooth_losses) on the
            box codes of the ground-truth class, weighted by the object mask
        fast_rcnn_classification_loss: sparse softmax cross entropy on the class
            scores, replaced by 0.0 when it evaluates to NaN
        """
        # Training targets: sampled proposals, encoded ground-truth boxes,
        # object mask and class ids.
        (sampled_proposals, encoded_gtboxes,
         object_mask, gt_class_ids) = self.build_frcnn_target

        # Pooled features of the sampled proposals.
        roi_features = self.get_rois(sampled_proposals)

        # Per-class box codes and class scores from the head network.
        per_class_codes, class_logits = self.fast_rcnn_net(roi_features, self.IS_TRAINING)

        with tf.variable_scope("fast_rcnn_loss"):
            images_per_gpu = self.config.PER_GPU_IMAGE

            # For every proposal keep only the box code predicted for its
            # ground-truth class: gather with (proposal index, class id) pairs.
            proposal_ids = tf.expand_dims(tf.range(0, tf.shape(gt_class_ids)[1]), 0)
            proposal_ids = tf.expand_dims(tf.tile(proposal_ids, [images_per_gpu, 1]), axis=-1)
            class_ids_column = tf.expand_dims(gt_class_ids, axis=-1)
            gather_index = tf.concat([proposal_ids, class_ids_column], axis=-1)
            gt_class_codes = boxes_utils.batch_slice(
                [per_class_codes, gather_index],
                lambda codes, idx: tf.gather_nd(codes, idx),
                images_per_gpu)

            with tf.variable_scope('fast_rcnn_classification_loss'):
                raw_cls_loss = tf.losses.sparse_softmax_cross_entropy(
                    labels=gt_class_ids,
                    logits=class_logits)

                # Guard against a NaN loss value.
                fast_rcnn_classification_loss = tf.cond(
                    tf.is_nan(raw_cls_loss),
                    lambda: 0.0,
                    lambda: raw_cls_loss)

            with tf.variable_scope('fast_rcnn_location_loss'):
                fast_rcnn_location_loss = losses.l1_smooth_losses(
                    predict_boxes=gt_class_codes,
                    gtboxes=encoded_gtboxes,
                    object_weights=object_mask)

            return fast_rcnn_location_loss, fast_rcnn_classification_loss

### 对 Fast R-CNN 模型中的目标检测结果进行了一系列预处理和修剪操作，以获得最终的目标检测框的信息
- 去除面积为 0 的（填充）候选框
- 解码预测框
- 去除背景类别
- 过滤低置信度的框
- 进行非极大抑制操作 剔除重叠框

In [None]:
    def fast_rcnn_proposals(self, rpn_proposal_bbox, encode_boxes, categories, scores, image_window):
        """
        Turn the head's predictions into final detections: decode, clip, drop
        background and low-score boxes, run per-class NMS and keep the top scores.
        The result is padded with zeros to keep alignments.
        :return:
        detection_boxes_scores_labels:(batch_size, config.DETECTION_MAX_INSTANCES, 6)
            (leading batch axis assumes batch_slice stacks the per-image results -- TODO confirm)
        """

        def batch_slice_rcnn_proposals(rpn_proposal_bbox,
                                       encode_boxes,
                                       categories,
                                       scores,
                                       image_window,
                                       config):
            """
            mutilclass NMS for a single image
            :param rpn_proposal_bbox: (N, 4)
            :param encode_boxes: (N, 4)
            :param categories:(N, )
            :param scores: (N, )
            :param image_window:(y1, x1, y2, x2) the boundary of image
            :return:
            detections : (config.DETECTION_MAX_INSTANCES, 6)[y1, x1, y2, x2, class_id, score],
                zero-padded at the end
            """
            with tf.variable_scope('fast_rcnn_proposals'):
                # trim the zero graph: remove zero-area proposals and the matching rows
                # of the other inputs
                rpn_proposal_bbox, non_zeros = boxes_utils.trim_zeros_graph(rpn_proposal_bbox,
                                                                            name="trim_proposals_detection")
                encode_boxes = tf.boolean_mask(encode_boxes, non_zeros)
                categories = tf.boolean_mask(categories, non_zeros)
                scores = tf.boolean_mask(scores, non_zeros)

                # Decode the predicted codes into boxes and clip them to the image window.
                fast_rcnn_decode_boxes = encode_and_decode.decode_boxes(encode_boxes=encode_boxes,
                                                                        reference_boxes=rpn_proposal_bbox,
                                                                        dev_factors=config.BBOX_STD_DEV)
                fast_rcnn_decode_boxes = boxes_utils.clip_boxes_to_img_boundaries(fast_rcnn_decode_boxes,
                                                                                  image_window)

                # remove the background (class id 0)
                keep = tf.cast(tf.where(categories > 0)[:, 0], tf.int32)
                if DEBUG:
                    # Print the categories and scores of up to 50 top-scoring foreground boxes.
                    print_categories = tf.gather(categories, keep)
                    print_scores = tf.gather(scores, keep)
                    num_item = tf.minimum(tf.shape(print_scores)[0], 50)
                    print_scores_vision, print_index = tf.nn.top_k(print_scores, k=num_item)
                    print_categories_vision = tf.gather(print_categories, print_index)
                    print_tensors(print_categories_vision, "categories")
                    print_tensors(print_scores_vision, "scores")
                    # NOTE(review): mean_vars is not used anywhere in this function.
                    mean_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                # Filter out low confidence boxes: keep scores >= config.FINAL_SCORE_THRESHOLD
                # (skipped when the threshold is falsy)
                if config.FINAL_SCORE_THRESHOLD:
                    conf_keep = tf.cast(tf.where(scores >= config.FINAL_SCORE_THRESHOLD)[:, 0], tf.int32)
                    keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                                    tf.expand_dims(conf_keep, 0))
                    keep = tf.sparse_tensor_to_dense(keep)[0]

                pre_nms_class_ids = tf.gather(categories, keep)
                pre_nms_scores = tf.gather(scores, keep)
                pre_nms_rois = tf.gather(fast_rcnn_decode_boxes, keep)
                unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]

                # 1. Per-class non-maximum suppression.
                def nms_keep_map(class_id):
                    """Apply Non-Maximum Suppression on ROIs of the given class."""
                    # Indices of ROIs of the given class
                    ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
                    # Apply NMS
                    class_keep = tf.image.non_max_suppression(
                        tf.gather(pre_nms_rois, ixs),
                        tf.gather(pre_nms_scores, ixs),
                        max_output_size=config.DETECTION_MAX_INSTANCES,
                        iou_threshold=config.FAST_RCNN_NMS_IOU_THRESHOLD)
                    # Map indicies back to positions in the trimmed input tensors
                    class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
                    # Pad with -1 so returned tensors have the same shape
                    gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0]
                    class_keep = tf.pad(class_keep, [(0, gap)],
                                        mode='CONSTANT', constant_values=-1)
                    # Set shape so map_fn() can infer result shape
                    class_keep.set_shape([config.DETECTION_MAX_INSTANCES])
                    return class_keep
                # 2. Map over class IDs
                nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids,
                                     dtype=tf.int32)
                # 3. Merge results into one list, and remove -1 padding
                nms_keep = tf.reshape(nms_keep, [-1])
                nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
                # 4. Compute intersection between keep and nms_keep
                keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                                tf.expand_dims(nms_keep, 0))
                keep = tf.sparse_tensor_to_dense(keep)[0]
                # Keep top detections: at most DETECTION_MAX_INSTANCES, highest score first
                roi_count = config.DETECTION_MAX_INSTANCES
                class_scores_keep = tf.gather(scores, keep)
                num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
                top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
                keep = tf.gather(keep, top_ids)

                # Combine box coordinates, class ids and scores into the final result.
                # Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
                # NOTE(review): the original comment says the coordinates are normalized;
                # nothing in this function rescales them -- confirm.
                detections = tf.concat([
                    tf.gather(fast_rcnn_decode_boxes, keep),
                    tf.to_float(tf.gather(categories, keep))[..., tf.newaxis],
                    tf.gather(scores, keep)[..., tf.newaxis]
                ], axis=1)

                # Pad with zeros if detections < DETECTION_MAX_INSTANCES
                gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
                detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")

                return detections

        # Run the per-image post-processing through boxes_utils.batch_slice.
        detections = boxes_utils.batch_slice([rpn_proposal_bbox, encode_boxes, categories, scores, image_window],
                                             lambda x, y, z, u, v: batch_slice_rcnn_proposals(x, y, z, u, v,
                                             self.config), self.config.PER_GPU_IMAGE)
        return detections

### 对特征图进行目标检测 

In [None]:
    def fast_rcnn_detection(self):
        """
        Run the Fast R-CNN head on the RPN proposals and post-process the result.

        For every proposal the most probable class, its probability and the box
        code predicted for that class are selected and handed to
        fast_rcnn_proposals.
        :return:
        detections: whatever fast_rcnn_proposals returns for the selected boxes,
            classes and scores
        """
        # (batch_size, num_proposal, 7, 7, channels)
        roi_features = self.get_rois(self.rpn_proposals_boxes)
        # The head is always built in inference mode here.
        per_class_codes, class_logits = self.fast_rcnn_net(roi_features, False)

        with tf.variable_scope("fast_rcnn_detection"):
            # Softmax over the class scores.
            class_probs = slim.softmax(class_logits)

            # Most probable class of every proposal.
            best_class = tf.argmax(class_probs, axis=2, output_type=tf.int32)

            # Pair every proposal index with its best class, then gather the
            # matching box code image by image.
            images_per_gpu = self.config.PER_GPU_IMAGE
            proposal_ids = tf.expand_dims(tf.range(0, tf.shape(best_class)[1]), 0)
            proposal_ids = tf.expand_dims(tf.tile(proposal_ids, [images_per_gpu, 1]), axis=-1)
            best_class_column = tf.expand_dims(best_class, axis=-1)
            gather_index = tf.concat([proposal_ids, best_class_column], axis=-1)
            best_class_codes = boxes_utils.batch_slice(
                [per_class_codes, gather_index],
                lambda codes, idx: tf.gather_nd(codes, idx),
                images_per_gpu)

            # Probability of the most probable class of every proposal.
            best_class_probs = tf.reduce_max(class_probs, axis=2, keepdims=False)

            # Decode, filter and suppress to obtain the final detections.
            return self.fast_rcnn_proposals(self.rpn_proposals_boxes,
                                            best_class_codes,
                                            best_class,
                                            best_class_probs,
                                            self.window)