## trainer.py

In [None]:
class FasterRCNNTrainer(nn.Module):
    """Training wrapper around a Faster R-CNN model; ``forward`` returns losses.

    The losses include:

    * :obj:`rpn_loc_loss`: The localization loss for the
      Region Proposal Network (RPN).
    * :obj:`rpn_cls_loss`: The classification loss for RPN.
    * :obj:`roi_loc_loss`: The localization loss for the head module.
    * :obj:`roi_cls_loss`: The classification loss for the head module.
    * :obj:`total_loss`: The sum of the 4 losses above.

    Args:
        faster_rcnn (model.FasterRCNN):
            A Faster R-CNN model that is going to be trained.
    """

    def __init__(self, faster_rcnn):
        super(FasterRCNNTrainer, self).__init__()

        self.faster_rcnn = faster_rcnn
        # Sigma hyper-parameters handed to _fast_rcnn_loc_loss when computing
        # the RPN / RoI localization losses.
        self.rpn_sigma = opt.rpn_sigma
        self.roi_sigma = opt.roi_sigma

        # Target creators build gt_bbox / gt_label style training targets.
        # AnchorTargetCreator supplies the ground truth (offsets and labels)
        # that the RPN predictions are compared against in forward().
        self.anchor_target_creator = AnchorTargetCreator()
        # ProposalTargetCreator supplies sampled RoIs plus their ground-truth
        # offsets and labels for the head. Per the original author's notes,
        # both target creators are only needed while training.
        self.proposal_target_creator = ProposalTargetCreator()

        # Normalization statistics for the RoI regression targets, taken from
        # the model (original notes give (0., 0., 0., 0.) and
        # (0.1, 0.1, 0.2, 0.2) -- confirm against the model definition).
        self.loc_normalize_mean = faster_rcnn.loc_normalize_mean
        self.loc_normalize_std = faster_rcnn.loc_normalize_std

        # Optimizer is created by the model itself (SGD per the original notes).
        self.optimizer = self.faster_rcnn.get_optimizer()
        # visdom wrapper
        self.vis = Visualizer(env=opt.env)

        # Indicators for training status: confusion matrices for the RPN
        # (2 classes) and the RoI head (21 classes -- presumably 20 foreground
        # classes + background; confirm against the dataset), and one running
        # average per loss field.
        self.rpn_cm = ConfusionMeter(2)
        self.roi_cm = ConfusionMeter(21)
        self.meters = {k: AverageValueMeter() for k in LossTuple._fields}

    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses

        Raises:
            ValueError: If the batch size is not 1.
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        # Backbone feature map shared by the RPN and the head.
        features = self.faster_rcnn.extractor(imgs)
        # RPN predictions plus the proposals (RoIs) and the anchors used.
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and build the head's ground truth: gt_roi_loc are the
        # regression targets and gt_roi_label the class targets that the
        # head's predictions for sample_roi are compared against below.
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE it's all zero because now it only support for batch=1 now
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)

        # ------------------ RPN losses -------------------#
        # Ground-truth offsets and labels for the anchors; label -1 marks
        # anchors that are ignored by both RPN losses.
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        gt_rpn_label = at.totensor(gt_rpn_label).long()
        gt_rpn_loc = at.totensor(gt_rpn_loc)
        # Smooth-L1 between predicted and target anchor offsets.
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma)

        # Cross-entropy over the anchor scores, skipping label -1.
        rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        # Feed only the non-ignored anchors into the RPN confusion matrix.
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        # The head emits one 4-vector per class for every sampled RoI; reshape
        # to (n_sample, n_class, 4) and keep the vector of the target class.
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.totensor(gt_roi_label).long()
        gt_roi_loc = at.totensor(gt_roi_loc)

        # Smooth-L1 between the selected head offsets and their targets.
        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)


# Defined at module level (not inside the class) because
# _fast_rcnn_loc_loss looks it up by its bare name.
def _smooth_l1_loss(x, t, in_weight, sigma):
    """Return the summed smooth-L1 loss of ``in_weight * (x - t)``.

    Elements whose absolute weighted difference is below ``1 / sigma**2`` use
    the quadratic branch ``sigma**2 / 2 * diff**2``; the rest use the linear
    branch ``|diff| - 0.5 / sigma**2``.

    Args:
        x: Predicted values.
        t: Target values (this parameter shadows any module-level ``t``
            inside the function body).
        in_weight: Element-wise weight applied to the difference; zeros
            remove an element from the loss.
        sigma: Controls where the loss switches from quadratic to linear.

    Returns:
        The sum of the element-wise losses.
    """
    sigma2 = sigma ** 2
    diff = in_weight * (x - t)
    abs_diff = diff.abs()
    # Branch selector computed on .data so it carries no gradient.
    flag = (abs_diff.data < (1. / sigma2)).float()
    y = (flag * (sigma2 / 2.) * (diff ** 2) +
         (1 - flag) * (abs_diff - 0.5 / sigma2))
    return y.sum()


def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    """Return the localization loss between predicted and target offsets.

    Only rows whose label is positive (``gt_label > 0``) contribute to the
    smooth-L1 sum; the sum is then divided by the number of rows whose label
    is ``>= 0`` (rows labelled ``-1`` are ignored entirely).

    Args:
        pred_loc: Predicted offsets, one row per sample.
        gt_loc: Target offsets with the same shape as ``pred_loc``.
        gt_label: One label per row of ``gt_loc``.
        sigma: Sigma passed through to ``_smooth_l1_loss``.

    Returns:
        The normalized localization loss.
    """
    # Build the weight mask on gt_loc's device instead of forcing CUDA, so the
    # loss also works when the tensors live on the CPU.
    in_weight = t.zeros(gt_loc.shape, device=gt_loc.device)
    # Localization loss is calculated only for positive rois.
    # NOTE: unlike the original implementation, inside_weight and
    # outside_weight are not needed; they can be derived from gt_label.
    positive = (gt_label > 0).view(-1, 1).expand_as(in_weight)
    in_weight[positive.to(in_weight.device)] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)
    # Normalize by the total number of negative and positive rois
    # (gt_label == -1 is excluded).
    loc_loss /= ((gt_label >= 0).sum().float())
    return loc_loss

## train.py

In [None]:
def train(**kwargs):
    """Train Faster R-CNN, evaluating and checkpointing after every epoch.

    Keyword arguments are forwarded to ``opt._parse`` before anything else is
    built. Every ``opt.plot_every`` iterations the losses, a ground-truth
    image, a prediction image and both confusion matrices are pushed to the
    trainer's visualizer. After each epoch the model is evaluated on the test
    loader, the best-mAP checkpoint is saved, and at epoch 9 the best
    checkpoint so far is reloaded and the learning rate is scaled by
    ``opt.lr_decay``. Training stops after epoch 13 at the latest.

    Args:
        **kwargs: Configuration overrides handed to ``opt._parse``.
    """
    opt._parse(kwargs)

    # Training data, shuffled, one image per batch.
    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    # Test data, same batch size but in fixed order.
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    # Wrap the model in the trainer and move everything to the GPU.
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    # Optionally resume from a previously saved model.
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    # No checkpoint exists until some epoch beats best_map.
    best_path = None
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            # One optimization step on this image.
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                # Drop into the debugger when the debug marker file exists.
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes on the de-normalized image
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes on the same image
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())

        # End of epoch: evaluate, then plot and log the results.
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        # Keep only checkpoints that improve on the best mAP so far.
        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            # Restart from the best checkpoint (if any was saved) and decay
            # the learning rate.
            if best_path is not None:
                trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)

        if epoch == 13:
            break