In [1]:
from tensorflow.keras import layers as KL
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Conv2D, Dense, DepthwiseConv2D, add
import tensorflow as tf
import numpy as np
import math
from PIL import Image
from random import shuffle

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

2024-06-27 15:35:36.414918: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-27 15:35:36.442271: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-27 15:35:36.442912: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Anchors

In [2]:
# 1. 获取类
def get_classes(classes_path):
    """Read class names from a text file, one name per line.

    Args:
        classes_path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(class_names, num_classes)`` tuple: the list of lines with leading
        and trailing whitespace removed, and the length of that list.
    """
    with open(classes_path, encoding='utf-8') as handle:
        stripped_names = [raw_line.strip() for raw_line in handle]
    return stripped_names, len(stripped_names)

def get_img_output_length(height, width):
    """Return the spatial sizes of the five detection feature maps.

    The sizes are derived from the input size by replaying the strides of the
    `mobilenet` backbone defined in this notebook, where every strided layer
    uses 'same' padding (output = ceil(input / stride)):

    * three stride-2 layers before the first detection map, then
    * strides 2, 2, 2 and 3 between consecutive detection maps.

    For the 120 x 160 input this yields heights [15, 8, 4, 2, 1] and widths
    [20, 10, 5, 3, 1], i.e. the values that used to be hard-coded here.

    Args:
        height: Input image height in pixels.
        width: Input image width in pixels.

    Returns:
        ``(feature_heights, feature_widths)`` as two integer numpy arrays with
        one entry per detection feature map.
    """
    def ceil_div(size, stride):
        # Integer ceil division; mirrors 'same' padding without float rounding.
        return -(-size // stride)

    stem_strides = (2, 2, 2)      # strided layers before the first detection map
    head_strides = (2, 2, 2, 3)   # strided layer leading to each later map

    cur_h, cur_w = int(height), int(width)
    for stride in stem_strides:
        cur_h, cur_w = ceil_div(cur_h, stride), ceil_div(cur_w, stride)

    feature_heights = [cur_h]
    feature_widths = [cur_w]
    for stride in head_strides:
        cur_h, cur_w = ceil_div(cur_h, stride), ceil_div(cur_w, stride)
        feature_heights.append(cur_h)
        feature_widths.append(cur_w)

    return np.array(feature_heights), np.array(feature_widths)

class AnchorBox():
    """Generate normalised prior (anchor) boxes for a single feature map.

    Boxes are built in input-image pixel coordinates around the centre of every
    feature-map cell, then divided by the image width/height and clipped, so
    the returned coordinates lie in [0, 1].
    """

    def __init__(self, input_shape=[120, 160], min_size=None, max_size=None, aspect_ratios=None, flip=True):
        """Store the anchor configuration.

        Args:
            input_shape: ``[height, width]`` of the network input image.
            min_size: Side of the small square anchor; also the base size of
                the rectangular anchors.
            max_size: Combined with ``min_size`` for the large square anchor,
                whose side is ``sqrt(min_size * max_size)``. It is needed
                whenever the expanded ratio list contains 1 more than once
                (e.g. ``aspect_ratios=[1]`` with ``flip=True``).
            aspect_ratios: Width / height ratios. ``None`` means ``[1]``.
            flip: When true (default) the reciprocal of every ratio is added
                right after the ratio itself. Previously this flag was ignored
                and reciprocals were always added, so the default behaviour is
                unchanged.
        """
        self.input_shape = input_shape
        self.min_size = min_size  # minimum anchor size
        self.max_size = max_size  # maximum anchor size
        if aspect_ratios is None:
            # The old code crashed while iterating None; fall back to squares.
            aspect_ratios = [1]
        self.aspect_ratios = []
        for ratio in aspect_ratios:
            self.aspect_ratios.append(ratio)
            if flip:
                self.aspect_ratios.append(1.0 / ratio)

    def call(self, layer_shape, mask=None):
        """Compute the anchors of one feature map.

        Args:
            layer_shape: ``[height, width]`` of the feature map.
            mask: Unused; kept for interface compatibility.

        Returns:
            Array of shape ``(height * width * len(self.aspect_ratios), 4)``
            holding ``[x_min, y_min, x_max, y_max]`` rows normalised to [0, 1].
        """
        layer_height, layer_width = layer_shape[0], layer_shape[1]
        img_height, img_width = self.input_shape[0], self.input_shape[1]

        box_widths = []
        box_heights = []
        for ratio in self.aspect_ratios:
            if ratio == 1 and len(box_widths) == 0:
                # First ratio of 1: the small square.
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ratio == 1:
                # Any later ratio of 1: the large square.
                side = np.sqrt(self.min_size * self.max_size)
                box_widths.append(side)
                box_heights.append(side)
            else:
                # Rectangle with width / height == ratio.
                box_widths.append(self.min_size * np.sqrt(ratio))
                box_heights.append(self.min_size / np.sqrt(ratio))

        # Pixel distance between neighbouring cell centres, e.g. 160 / 20 = 8.
        step_x = img_width / layer_width
        step_y = img_height / layer_height
        # Evenly spaced cell centres along each axis.
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x, layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y, layer_height)

        # One (x, y) centre per feature-map cell.
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)

        # Every anchor needs the centre twice: once for the top-left corner
        # and once for the bottom-right corner.
        num_anchors_ = len(self.aspect_ratios)
        anchor_boxes = np.concatenate((centers_x, centers_y), axis=1)
        anchor_boxes = np.tile(anchor_boxes, (1, 2 * num_anchors_))

        # Shift the centres by half the box size to obtain the corners.
        half_widths = 0.5 * np.array(box_widths)
        half_heights = 0.5 * np.array(box_heights)
        anchor_boxes[:, ::4] -= half_widths
        anchor_boxes[:, 1::4] -= half_heights
        anchor_boxes[:, 2::4] += half_widths
        anchor_boxes[:, 3::4] += half_heights

        # Normalise: x columns by the image width, y columns by the height.
        anchor_boxes[:, ::2] /= img_width
        anchor_boxes[:, 1::2] /= img_height
        anchor_boxes = anchor_boxes.reshape(-1, 4)

        # Clip boxes that stick out of the image.
        anchor_boxes = np.minimum(np.maximum(anchor_boxes, 0.0), 1.0)
        return anchor_boxes
  
# 2. 获取anchors
def get_anchors(input_shape=[120, 160], anchors_size=[32, 59, 86, 113, 140]):
    """Build the anchors of every detection feature map and stack them.

    Layer ``i`` uses ``anchors_size[i]`` as its minimum size and
    ``anchors_size[i + 1]`` as its maximum size, so one more size than there
    are feature maps is required. The default list only has as many entries as
    there are feature maps, which used to raise ``IndexError`` on the last
    layer; in that case the missing upper size is now extrapolated by
    repeating the last step of the list (140 + 27 = 167 for the default).

    Args:
        input_shape: ``[height, width]`` of the network input image.
        anchors_size: Anchor sizes in input-image pixels.

    Returns:
        Array of shape ``(total_anchors, 4)`` with normalised
        ``[x_min, y_min, x_max, y_max]`` rows, ordered by feature map.
    """
    feature_heights, feature_widths = get_img_output_length(input_shape[0], input_shape[1])
    # Width / height ratios of the anchors on each feature map.
    aspect_ratios = [[1], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]
    num_layers = len(feature_heights)

    sizes = list(anchors_size)
    if len(sizes) == num_layers:
        # NOTE(review): extrapolated upper bound for the last layer - confirm
        # this matches the sizes the model was trained with.
        last_step = sizes[-1] - sizes[-2] if len(sizes) > 1 else 0
        sizes.append(sizes[-1] + last_step)

    anchors = []
    for layer_idx in range(num_layers):
        layer_anchors = AnchorBox(
            input_shape,
            sizes[layer_idx],
            max_size=sizes[layer_idx + 1],
            aspect_ratios=aspect_ratios[layer_idx],
        ).call([feature_heights[layer_idx], feature_widths[layer_idx]])
        anchors.append(layer_anchors)
    return np.concatenate(anchors, axis=0)

# Model

In [3]:
def _depthwise_conv_block(inputs, pointwise_conv_filters, depth_multiplier=1, strides=(1, 1), block_id=1, activation="relu"):
    """Residual block: activated 3x3 depthwise conv, 1x1 conv, then add the input.

    NOTE(review): a later cell defines `_depthwise_conv_block` again; when the
    cells run in order that definition replaces this one.
    NOTE(review): `depth_multiplier` is accepted but not used - the depthwise
    layer always uses a multiplier of 2.

    Args:
        inputs: Input tensor; it is also the residual branch.
        pointwise_conv_filters: Number of filters of the 1x1 convolution.
        depth_multiplier: Currently unused (see note above).
        strides: Strides of the depthwise convolution.
        block_id: Number used in the layer names ``block_<id>_conv_dw`` and
            ``block_<id>_conv_pw``.
        activation: Activation passed to the depthwise convolution.

    Returns:
        The element-wise sum of ``inputs`` and the 1x1 convolution output.
    """
    depthwise_out = DepthwiseConv2D(
        (3, 3),
        padding='same',
        depth_multiplier=2,
        strides=strides,
        use_bias=True,
        activation=activation,
        name='block_%d_conv_dw' % block_id,
    )(inputs)
    pointwise_out = Conv2D(
        pointwise_conv_filters,
        kernel_size=(1, 1),
        padding="same",
        use_bias=True,
        strides=(1, 1),
        name='block_%d_conv_pw' % block_id,
    )(depthwise_out)
    return add([inputs, pointwise_out])

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Flatten, Dense, Reshape
from keras.layers import Concatenate, Reshape, DepthwiseConv2D

### 1. 定义 MobileNet 网络
# block of DepthwiseConv2D and Conv2D
def _depthwise_conv_block(inputs, pointwise_conv_filters, depth_multiplier=1, strides=(1, 1), block_id=1):
    """Residual block: 3x3 depthwise conv -> ReLU6 -> 1x1 conv, added to the input.

    Changes compared with the previous version:
    * ``depth_multiplier`` is now passed to the depthwise layer (it used to be
      ignored and fixed to 1, which is also the default, so existing calls
      build the same layers);
    * the stride is applied by the depthwise convolution only; the 1x1
      convolution always uses stride 1 instead of striding a second time;
    * the activation fallback uses ``tf.nn.relu6`` so that it matches
      ``ReLU(max_value=6.)``, and only ``Exception`` is caught.

    Because the result is added to ``inputs``, the block presumably only makes
    sense when the output shape equals the input shape (unit strides and
    ``pointwise_conv_filters`` equal to the input channel count).

    Args:
        inputs: Input tensor; it is also the residual branch.
        pointwise_conv_filters: Number of filters of the 1x1 convolution.
        depth_multiplier: Depth multiplier of the depthwise convolution.
        strides: Strides of the depthwise convolution.
        block_id: Number used in the layer names ``block_<id>_conv_dw`` and
            ``block_<id>_conv_pw``.

    Returns:
        The element-wise sum of ``inputs`` and the 1x1 convolution output.
    """
    x = DepthwiseConv2D((3, 3), padding='same', depth_multiplier=depth_multiplier, strides=strides,
                        use_bias=True, name='block_%d_conv_dw' % block_id)(inputs)
    try:
        x = KL.ReLU(max_value=6.)(x)
    except Exception:
        # Fallback for old TensorFlow releases (the original comment mentions tf 1.13.2).
        x = tf.nn.relu6(x)
    x = Conv2D(pointwise_conv_filters, kernel_size=(1, 1), padding="same", use_bias=True,
               strides=(1, 1), name='block_%d_conv_pw' % block_id)(x)
    x = add([inputs, x])
    return x

def _depthwise_conv_block_no_relu(inputs, pointwise_conv_filters, depth_multiplier=1, strides=(1, 1), block_id=1):
    """Residual block without activation: 3x3 depthwise conv -> 1x1 conv, added to the input.

    ``depth_multiplier`` is now passed to the depthwise layer (it used to be
    ignored and fixed to 1, which is also the default), and the stride is
    applied by the depthwise convolution only instead of being applied again
    by the 1x1 convolution.

    Args:
        inputs: Input tensor; it is also the residual branch.
        pointwise_conv_filters: Number of filters of the 1x1 convolution.
        depth_multiplier: Depth multiplier of the depthwise convolution.
        strides: Strides of the depthwise convolution.
        block_id: Number used in the layer names ``block_<id>_conv_dw`` and
            ``block_<id>_conv_pw``.

    Returns:
        The element-wise sum of ``inputs`` and the 1x1 convolution output.
    """
    x = DepthwiseConv2D((3, 3), padding='same', depth_multiplier=depth_multiplier, strides=strides,
                        use_bias=True, name='block_%d_conv_dw' % block_id)(inputs)
    x = Conv2D(pointwise_conv_filters, kernel_size=(1, 1), padding="same", use_bias=True,
               strides=(1, 1), name='block_%d_conv_pw' % block_id)(x)
    x = add([inputs, x])
    return x

### 1) block of backbone
# block of backbone
def mobilenet(inputs_tensor):
    """Build the backbone and return the five feature maps used for detection.

    The network is a chain of 3x3 depthwise convolutions and 1x1 convolutions,
    each followed by ReLU6, interleaved with residual `_depthwise_conv_block`
    blocks. Layer names and the order in which layers are created are the same
    as before, so weights saved by layer name still match.

    The shape comments below assume a 120 x 160 x 1 input.

    Args:
        inputs_tensor: Input image tensor.

    Returns:
        Dict mapping ``'split_layer1'`` ... ``'split_layer5'`` to the feature
        tensors that feed the detection heads.
    """
    def relu6(tensor):
        # Prefer the Keras layer; fall back to the raw op on old TensorFlow
        # releases (the original comment mentions tf 1.13.2). The fallback is
        # relu6 so both paths clip at 6.
        try:
            return KL.ReLU(max_value=6.)(tensor)
        except Exception:
            return tf.nn.relu6(tensor)

    def depthwise(tensor, index, multiplier=1, stride=1):
        # 3x3 depthwise convolution + ReLU6, named 'DepthWiseConv2D_layer<index>'.
        tensor = DepthwiseConv2D((3, 3), padding='same', depth_multiplier=multiplier,
                                 strides=(stride, stride), use_bias=True,
                                 name='DepthWiseConv2D_layer%d' % index)(tensor)
        return relu6(tensor)

    def pointwise(tensor, index, filters, stride=1, **conv_kwargs):
        # 1x1 convolution + ReLU6, named 'Conv2D_layer<index>'.
        tensor = Conv2D(filters, (1, 1), padding='same', use_bias=True,
                        strides=(stride, stride), name='Conv2D_layer%d' % index,
                        **conv_kwargs)(tensor)
        return relu6(tensor)

    net = {}

    # ---- stem -------------------------------------------------------------
    x = depthwise(inputs_tensor, 1, multiplier=8)   # 120 x 160 x 1 -> 120 x 160 x 8
    x = depthwise(x, 2, stride=2)                   # -> 60 x 80 x 8
    x = pointwise(x, 3, 8)                          # -> 60 x 80 x 8
    x = depthwise(x, 4, multiplier=2, stride=2)     # -> 30 x 40 x 16
    x = pointwise(x, 5, 8)                          # -> 30 x 40 x 8

    # Residual blocks 1-3, 30 x 40 x 8.
    for block_id in (1, 2, 3):
        x = _depthwise_conv_block(x, 8, 1, block_id=block_id)

    x = pointwise(x, 6, 16)                         # -> 30 x 40 x 16
    x = depthwise(x, 7)                             # -> 30 x 40 x 16
    x = pointwise(x, 8, 16)                         # -> 30 x 40 x 16
    x = depthwise(x, 9, multiplier=2, stride=2)     # -> 15 x 20 x 32
    x = pointwise(x, 10, 24)                        # -> 15 x 20 x 24

    # Residual blocks 4-6, 15 x 20 x 24.
    for block_id in (4, 5, 6):
        x = _depthwise_conv_block(x, 24, 1, block_id=block_id)

    x = pointwise(x, 11, 48)                        # -> 15 x 20 x 48
    x = depthwise(x, 12)                            # -> 15 x 20 x 48
    # Only this convolution carries an L2 kernel regulariser.
    x = pointwise(x, 13, 64, kernel_regularizer='l2')   # -> 15 x 20 x 64
    net['split_layer1'] = x

    # ---- second detection map ----------------------------------------------
    x = depthwise(x, 14)                            # stride 1: still 15 x 20 x 64
    x = pointwise(x, 15, 40, stride=2)              # -> 8 x 10 x 40

    # Residual blocks 7-8, 8 x 10 x 40.
    for block_id in (7, 8):
        x = _depthwise_conv_block(x, 40, 1, block_id=block_id)

    x = pointwise(x, 16, 80)                        # -> 8 x 10 x 80
    x = depthwise(x, 17)                            # -> 8 x 10 x 80
    x = pointwise(x, 18, 80)                        # -> 8 x 10 x 80
    net['split_layer2'] = x

    # ---- third detection map -----------------------------------------------
    x = depthwise(x, 19, stride=2)                  # -> 4 x 5 x 80
    x = pointwise(x, 20, 80)                        # -> 4 x 5 x 80
    net['split_layer3'] = x

    # ---- fourth detection map ----------------------------------------------
    x = depthwise(x, 21, stride=2)                  # -> 2 x 3 x 80
    x = pointwise(x, 22, 64)                        # -> 2 x 3 x 64
    net['split_layer4'] = x

    # ---- fifth detection map -----------------------------------------------
    x = depthwise(x, 23, stride=3)                  # -> 1 x 1 x 64
    x = pointwise(x, 24, 64)                        # -> 1 x 1 x 64
    net['split_layer5'] = x

    return net

### 2) block of SSD head
def SSD300(input_shape, num_classes=2):
    """Build the detector: `mobilenet` backbone plus five SSD prediction heads.

    Every head consists of two branches on the same feature map:

    * confidence: 3x3 depthwise conv -> ReLU6 -> 1x1 conv with
      ``num_classes * 2 * num_prior`` filters;
    * location:   3x3 depthwise conv -> ReLU6 -> 1x1 conv with
      ``2 * num_prior * 4`` filters (4 box values per anchor).

    Each feature-map cell therefore predicts ``2 * num_prior`` anchors. The
    branch outputs are reshaped to ``(anchors, num_classes)`` / ``(anchors, 4)``
    and concatenated over all heads. The reshapes infer the anchor count
    (``-1``) instead of hard-coding 600/480/120/36/6 and 1242, so the result
    for a 120 x 160 input is unchanged while other static input sizes no
    longer fail in the reshape.

    Args:
        input_shape: Shape of one input image, e.g. ``[120, 160, 1]``.
        num_classes: Number of classes in the softmax output.

    Returns:
        A Keras ``Model`` whose output ``'predictions'`` has shape
        ``(batch, anchors, 4 + num_classes)``: 4 location values followed by
        the softmax class scores (1242 anchors for a 120 x 160 input).
    """
    def relu6(tensor):
        # Prefer the Keras layer; the fallback for old TensorFlow releases is
        # relu6 so both paths clip at 6.
        try:
            return KL.ReLU(max_value=6.)(tensor)
        except Exception:
            return tf.nn.relu6(tensor)

    input_tensor = Input(shape=input_shape)

    # Step 1: backbone feature maps 'split_layer1' ... 'split_layer5'.
    net = mobilenet(input_tensor)

    # Step 2: one confidence branch and one location branch per feature map.
    # The entries are the `num_prior` values of heads 1..5.
    priors_per_head = (1, 3, 3, 3, 3)
    conf_keys = []
    loc_keys = []
    for head_idx, num_prior in enumerate(priors_per_head, start=1):
        prefix = 'split_layer%d' % head_idx
        feature = net[prefix]

        # Class confidence branch.
        conf = DepthwiseConv2D((3, 3), padding='same', depth_multiplier=1, strides=(1, 1), use_bias=True,
                               name='DepthwiseConv2D_conf_DD%d_1' % head_idx)(feature)
        conf = relu6(conf)
        net[prefix + '_conf_Dep'] = conf
        conf = Conv2D(num_classes * 2 * num_prior, (1, 1), padding='same', use_bias=True, strides=(1, 1),
                      name='Conv2D_conf_DD%d_2' % head_idx)(conf)
        net[prefix + '_conf_Conv'] = conf
        net[prefix + '_conf_Reshape'] = Reshape((-1, num_classes))(conf)
        conf_keys.append(prefix + '_conf_Reshape')

        # Box location branch: 4 values per anchor.
        loc = DepthwiseConv2D((3, 3), padding='same', depth_multiplier=1, strides=(1, 1), use_bias=True,
                              name='DepthwiseConv2D_loc_DD%d_1' % head_idx)(feature)
        loc = relu6(loc)
        net[prefix + '_loc_Dep'] = loc
        loc = Conv2D(2 * num_prior * 4, (1, 1), padding='same', use_bias=True, strides=(1, 1),
                     name='Conv2D_loc_DD%d_2' % head_idx)(loc)
        net[prefix + '_loc_Conv'] = loc
        net[prefix + '_loc_Reshape'] = Reshape((-1, 4))(loc)
        loc_keys.append(prefix + '_loc_Reshape')

    # Step 3: stack the predictions of all heads along the anchor axis.
    conf_parts = []
    for key in conf_keys:
        conf_parts.append(net[key])
    loc_parts = []
    for key in loc_keys:
        loc_parts.append(net[key])
    net['cls_conf'] = Concatenate(axis=1, name='mbox_conf')(conf_parts)
    net['mbox_loc'] = Concatenate(axis=1, name='mbox_loc')(loc_parts)

    # (anchors, num_classes) class probabilities.
    net['cls_conf'] = Activation('softmax', name='cls_conf_final')(net['cls_conf'])
    # (anchors, 4) box values.
    net['mbox_loc'] = Reshape((-1, 4), name='mbox_loc_final')(net['mbox_loc'])

    # Joint training: a single output holding location values then class scores.
    net['predictions'] = Concatenate(axis=-1, name='predictions')([net['mbox_loc'], net['cls_conf']])

    model = Model(input_tensor, net['predictions'])

    return model

In [5]:
from PIL import Image
import numpy as np

def cvtColor(image, cvt2color='grey'):
    """Convert a PIL image to greyscale ('L') or to 'RGB'.

    The image is returned unchanged when it already has the requested mode.
    Previously the 'grey' branch only converted images whose mode was exactly
    'RGB', so e.g. 'RGBA' or palette images were returned unconverted; now any
    mode other than 'L' is converted, mirroring the 'rgb' branch.

    Args:
        image: A ``PIL.Image.Image`` instance.
        cvt2color: Target colour space, ``'grey'`` or ``'rgb'``.

    Returns:
        The image in mode 'L' (for 'grey') or 'RGB' (for 'rgb').

    Raises:
        TypeError: If ``image`` is not a PIL image.
        ValueError: If ``cvt2color`` is neither 'grey' nor 'rgb'.
    """
    if not isinstance(image, Image.Image):
        raise TypeError("Input should be a PIL image.")

    if cvt2color not in ['grey', 'rgb']:
        raise ValueError(f"Unsupported conversion type '{cvt2color}'. Use 'grey' or 'rgb'.")

    target_mode = 'L' if cvt2color == 'grey' else 'RGB'
    if image.mode != target_mode:
        image = image.convert(target_mode)

    return image


# Loss

In [6]:
class MultiboxLoss(object):
    """SSD multibox loss: softmax confidence loss with hard-negative mining plus smooth-L1 box loss.

    ``compute_loss`` reads the tensors as follows:

    * ``y_true``: ``(batch, anchors, 4 + num_classes + 1)`` - 4 box values,
      the class targets, and a last channel that weights the "positive"
      anchors (presumably 1 for anchors matched to an object, else 0).
    * ``y_pred``: ``(batch, anchors, 4 + num_classes)`` - 4 box values followed
      by class probabilities.
    """

    def __init__(self, num_classes, alpha=1.0, neg_pos_ratio=3.0, background_label_id=0, negatives_for_hard=100.0):
        """Store the loss configuration.

        Args:
            num_classes: Number of classes in the confidence part of the tensors.
            alpha: Weight of the location loss in the total loss.
            neg_pos_ratio: Maximum number of negatives kept per positive.
            background_label_id: Index of the background class; only 0 is supported.
            negatives_for_hard: Number of negatives used when no image of the
                batch yields any negative from the ratio rule.

        Raises:
            ValueError: If ``background_label_id`` is not 0.
        """
        self.num_classes = num_classes
        self.alpha = alpha
        self.neg_pos_ratio = neg_pos_ratio
        if background_label_id != 0:
            raise ValueError('Only 0 as background label id is supported')
        self.background_label_id = background_label_id
        self.negatives_for_hard = negatives_for_hard

    def _l1_smooth_loss(self, y_true, y_pred):
        """Smooth-L1 loss summed over the last axis."""
        abs_loss = tf.abs(y_true - y_pred)
        sq_loss = 0.5 * (y_true - y_pred) ** 2
        # Quadratic below an absolute error of 1, linear above.
        l1_loss = tf.where(tf.less(abs_loss, 1.0), sq_loss, abs_loss - 0.5)
        return tf.reduce_sum(l1_loss, -1)

    def _softmax_loss(self, y_true, y_pred):
        """Cross-entropy between targets and predicted probabilities, summed over the last axis."""
        # Clamp to avoid log(0).
        y_pred = tf.maximum(y_pred, 1e-7)
        softmax_loss = -tf.reduce_sum(y_true * tf.math.log(y_pred), axis=-1)
        return softmax_loss

    def compute_loc_loss(self, y_true, y_pred):
        """Per-anchor smooth-L1 loss divided by the size of axis 1 of ``y_true``."""
        num_boxes = tf.cast(tf.shape(y_true)[1], tf.float32)
        loc_loss = self._l1_smooth_loss(y_true, y_pred)
        loc_loss = loc_loss / num_boxes
        return loc_loss

    def compute_conf_loss(self, y_true, y_pred):
        """Per-anchor cross-entropy divided by the size of axis 1 of ``y_true``."""
        num_boxes = tf.cast(tf.shape(y_true)[1], tf.float32)
        conf_loss = self._softmax_loss(y_true, y_pred)
        conf_loss = conf_loss / num_boxes
        return conf_loss

    def compute_loss(self, y_true, y_pred):
        """Total loss over a batch (see the class docstring for the tensor layout).

        Returns:
            A scalar: (positive confidence loss + mined negative confidence
            loss + ``alpha`` * positive location loss) divided by the number
            of positives, where images without positives count as 1.
        """
        num_boxes = tf.cast(tf.shape(y_true)[1], tf.float32)
        positive_mask = y_true[:, :, -1]

        # Classification loss per anchor: (batch, anchors).
        conf_loss = self._softmax_loss(y_true[:, :, 4:-1],
                                       y_pred[:, :, 4:])
        # Box location loss per anchor: (batch, anchors).
        loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
                                        y_pred[:, :, :4])

        # Losses of the positive anchors, summed per image.
        pos_loc_loss = tf.reduce_sum(loc_loss * positive_mask, axis=1)
        pos_conf_loss = tf.reduce_sum(conf_loss * positive_mask, axis=1)

        # Number of positives per image: (batch,).
        num_pos = tf.reduce_sum(positive_mask, axis=-1)

        # Number of negatives per image: (batch,), limited by the ratio and by
        # the number of non-positive anchors.
        num_neg = tf.minimum(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
        pos_num_neg_mask = tf.greater(num_neg, 0)

        # If no image of the batch yields any negative, append
        # `negatives_for_hard` as the fallback number of negatives.
        has_min = tf.cast(tf.reduce_any(pos_num_neg_mask), tf.float32)
        num_neg = tf.concat(axis=0, values=[num_neg, [(1 - has_min) * self.negatives_for_hard]])

        # Total number of negatives to mine over the whole batch.
        num_neg_batch = tf.reduce_sum(tf.boolean_mask(num_neg, tf.greater(num_neg, 0)))
        num_neg_batch = tf.cast(num_neg_batch, tf.int32)

        # Channel range of the non-background class probabilities in y_pred.
        confs_start = 4 + self.background_label_id + 1
        confs_end = confs_start + self.num_classes - 1

        # Summed non-background probability per anchor: the larger it is on an
        # anchor without object, the harder that negative is.
        max_confs = tf.reduce_sum(y_pred[:, :, confs_start:confs_end], axis=2)

        # Zero the score of positive anchors, then pick the hardest negatives
        # over the whole batch.
        max_confs = tf.reshape(max_confs * (1 - positive_mask), [-1])
        # top_k needs k <= number of candidates; the fallback count above is
        # not bounded by the batch size, so clamp it.
        num_neg_batch = tf.minimum(num_neg_batch, tf.size(max_confs))
        _, indices = tf.nn.top_k(max_confs, k=num_neg_batch)

        neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]), indices)

        # Normalise by the number of positives; images without positives count as 1.
        num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos, tf.ones_like(num_pos))
        total_loss = tf.reduce_sum(pos_conf_loss) + tf.reduce_sum(neg_conf_loss) + tf.reduce_sum(self.alpha * pos_loc_loss)
        total_loss /= tf.reduce_sum(num_pos)
        return total_loss

# Dataset

In [7]:
# This part needs more attention.
import math
import numpy as np
from PIL import Image
from random import shuffle
import keras

class SSDDatasets(keras.utils.Sequence):
    """Keras ``Sequence`` producing batches of images and SSD training targets.

    Every annotation line has the form
    ``"<image path> x1,y1,x2,y2,cls [x1,y1,x2,y2,cls ...]"`` (whitespace
    separated, box fields comma separated, all integers).  For each sample the
    image is letter-boxed (validation) or randomly rescaled, shifted and
    flipped (training) onto a single-channel canvas of size ``input_shape``,
    and its boxes are matched against ``anchors`` to build the target tensor
    returned by ``assign_boxes``.

    Example call (from the training script):
        SSDDatasets(train_lines, input_shape, anchor, batch_size, num_cls, train=True)

    Args:
        annotation_lines: list of annotation strings, one per image.  The list
            is shuffled in place at the end of every epoch.
        input_shape: ``(height, width)`` of the network input.
        anchors: array indexed as ``[:, :2]`` (top-left) and ``[:, 2:4]``
            (bottom-right) corners, one row per prior box.  Presumably in the
            same normalised 0..1 coordinates as the ground-truth boxes --
            TODO confirm against the anchor generator.
        batch_size: number of samples per batch.
        num_classes: number of classes including the background class.
        train: when true, random augmentation is applied to every sample.
        overlap_threshold: IoU above which an anchor is assigned to a box.
        imgcolor: forwarded to ``cvtColor`` as ``cvt2color``.
    """

    def __init__(self, annotation_lines, input_shape, anchors, batch_size, num_classes, train, overlap_threshold=0.4, imgcolor='grey'):
        # Let the Keras base class initialise its own state (newer Keras
        # versions expect subclasses to call the parent constructor).
        super().__init__()
        self.annotation_lines = annotation_lines
        self.length = len(self.annotation_lines)   # number of samples
        self.input_shape = input_shape             # (height, width)
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.batch_size = batch_size
        self.num_classes = num_classes             # includes background
        self.train = train
        self.overlap_threshold = overlap_threshold
        self.imgcolor = imgcolor

    def __len__(self):
        """Return the number of batches per epoch (rounded up)."""
        return math.ceil(len(self.annotation_lines) / float(self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index``.

        Returns:
            ``(image_data, box_data)`` where ``image_data`` is a float32 array
            of shape ``(batch, height, width, 1)`` scaled to ``[-1, 1]`` and
            ``box_data`` stacks the per-image results of ``assign_boxes``,
            i.e. shape ``(batch, num_anchors, 4 + num_classes + 1)``.
        """
        image_data = []
        box_data = []
        for i in range(index * self.batch_size, (index + 1) * self.batch_size):
            # Wrap around so that the last batch is always full.
            i = i % self.length
            # Random augmentation is applied for training only.
            image, box = self.get_random_data(self.annotation_lines[i], self.input_shape, random=self.train)

            if len(box) != 0:
                boxes = np.array(box[:, :4], np.float32)
                # Normalise pixel coordinates to 0..1 (x by width, y by height).
                boxes[:, [0, 2]] = boxes[:, [0, 2]] / (np.array(self.input_shape[1], np.float32))
                boxes[:, [1, 3]] = boxes[:, [1, 3]] / (np.array(self.input_shape[0], np.float32))
                # One-hot over all num_classes entries (background included),
                # so each row becomes [x1, y1, x2, y2, onehot_0, ..., onehot_{C-1}].
                # NOTE(review): assign_boxes reads columns 5: and therefore drops
                # onehot_0.  This presumably relies on foreground class ids in the
                # annotation files starting at 1 -- TODO confirm; with 0-based ids
                # the first class would end up with an all-zero label.
                one_hot_label = np.eye(self.num_classes)[np.array(box[:, 4], np.int32)]
                box = np.concatenate([boxes, one_hot_label], axis=-1)

            box = self.assign_boxes(box)
            image_data.append(image)
            box_data.append(box)

        # Add the trailing channel axis and map uint8 pixels to [-1, 1].
        image_data = np.expand_dims(image_data, axis=-1)
        image_data = image_data.astype(np.float32) / 127.5 - 1.0
        box_data = np.array(box_data)
        return image_data, box_data

    def on_epoch_end(self):
        """Shuffle the annotation list in place after every epoch."""
        shuffle(self.annotation_lines)

    def rand(self, a=0, b=1):
        """Return a uniform random float in ``[a, b)`` drawn from ``np.random``."""
        return np.random.rand() * (b - a) + a

    def _adjust_boxes(self, box, nw, nh, iw, ih, dx, dy, w, h, flip=False):
        """Map boxes from the source image onto the ``(w, h)`` canvas.

        The boxes are shuffled, scaled by ``nw / iw`` and ``nh / ih``, shifted
        by ``(dx, dy)``, optionally mirrored horizontally, clipped to the
        canvas, and boxes that are not wider and taller than one pixel are
        dropped.  ``box`` is modified in place; because it is an integer
        array, scaled coordinates are truncated when written back.

        Returns:
            The filtered box array (unchanged if ``box`` is empty).
        """
        if len(box) > 0:
            np.random.shuffle(box)
            box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
            box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
            if flip:
                # Mirror x coordinates; swapping columns keeps x1 <= x2.
                box[:, [0, 2]] = w - box[:, [2, 0]]
            box[:, 0:2][box[:, 0:2] < 0] = 0
            box[:, 2][box[:, 2] > w] = w
            box[:, 3][box[:, 3] > h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard invalid boxes
        return box

    def get_random_data(self, annotation_line, input_shape, jitter=.3, random=True):
        """Load one sample and fit it (and its boxes) to ``input_shape``.

        Args:
            annotation_line: ``"<image path> x1,y1,x2,y2,cls ..."``.
            input_shape: ``(height, width)`` of the output canvas.
            jitter: relative range of the random aspect-ratio distortion.
            random: if false, the image is letter-boxed deterministically
                (aspect ratio kept, centred); if true, it is randomly
                distorted, rescaled, shifted and possibly flipped.

        Returns:
            ``(image_data, box)``: a uint8 array of shape ``(height, width)``
            and the adjusted integer boxes ``[x1, y1, x2, y2, cls]`` in canvas
            pixel coordinates.
        """
        line = annotation_line.split()
        image = Image.open(line[0])
        image = cvtColor(image, cvt2color=self.imgcolor)
        iw, ih = image.size
        h, w = input_shape
        box = np.array([np.array(list(map(int, item.split(',')))) for item in line[1:]])

        if not random:
            # Validation: keep the aspect ratio and centre the image.
            scale = min(w / iw, h / ih)
            # Clamp to one pixel so that extreme aspect ratios cannot produce
            # a zero-sized resize target.
            nw = max(1, int(iw * scale))
            nh = max(1, int(ih * scale))
            dx = (w - nw) // 2
            dy = (h - nh) // 2

            # Paste onto a single-channel canvas; uncovered area stays at the
            # canvas default fill value.
            image = image.resize((nw, nh), Image.BICUBIC)
            new_image = Image.new('L', (w, h))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image, np.uint8)

            box = self._adjust_boxes(box, nw, nh, iw, ih, dx, dy, w, h, flip=False)
            return image_data, box

        # Training: random aspect-ratio distortion followed by random scaling.
        # NOTE (from the original author): this ratio is suspected to be problematic.
        new_ar = iw / ih * self.rand(1-jitter, 1+jitter) / self.rand(1-jitter, 1+jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
            nh = int(scale * h)
            nw = int(nh * new_ar)
        else:
            nw = int(scale * w)
            nh = int(nw / new_ar)
        # Same zero-size guard as in the validation branch.
        nw = max(1, nw)
        nh = max(1, nh)
        image = image.resize((nw, nh), Image.BICUBIC)

        # Paste at a random offset; the offset range is negative when the
        # resized image is larger than the canvas, which crops the image.
        dx = int(self.rand(0, w - nw))
        dy = int(self.rand(0, h - nh))
        new_image = Image.new('L', (w, h))
        new_image.paste(image, (dx, dy))
        image = new_image

        # Random horizontal flip.
        flip = self.rand() < .5
        if flip:
            image = image.transpose(Image.FLIP_LEFT_RIGHT)

        image_data = np.array(image, np.uint8)
        box = self._adjust_boxes(box, nw, nh, iw, ih, dx, dy, w, h, flip=flip)
        return image_data, box

    def iou(self, box):
        """Return the IoU between one box and every anchor.

        Args:
            box: ``[x_min, y_min, x_max, y_max]`` in the same coordinates as
                ``self.anchors``.

        Returns:
            1-D array with one IoU value per anchor.
        """
        # Intersection rectangle, clamped to zero when there is no overlap.
        inter_upleft = np.maximum(self.anchors[:, :2], box[:2])
        inter_botright = np.minimum(self.anchors[:, 2:4], box[2:])
        inter_wh = inter_botright - inter_upleft
        inter_wh = np.maximum(inter_wh, 0)
        inter = inter_wh[:, 0] * inter_wh[:, 1]

        # Area of the ground-truth box and of every anchor.
        area_true = (box[2] - box[0]) * (box[3] - box[1])
        area_anchors = (self.anchors[:, 2] - self.anchors[:, 0]) * (self.anchors[:, 3] - self.anchors[:, 1])

        union = area_true + area_anchors - inter
        return inter / union

    def encode_box(self, box, return_iou=True, variances=(0.1, 0.1, 0.2, 0.2)):
        """Encode one ground-truth box relative to the anchors assigned to it.

        An anchor is assigned when its IoU with ``box`` exceeds
        ``self.overlap_threshold``; if none does, the single best anchor is
        assigned instead.  Rows of unassigned anchors stay zero.

        Args:
            box: ``[x_min, y_min, x_max, y_max]``.
            return_iou: when true, a fifth column holding the IoU of the
                assigned anchors is appended.
            variances: scale factors, first two for the centre offsets and
                last two for the log size ratios.

        Returns:
            Flattened array of length ``num_anchors * (4 + return_iou)``.
        """
        iou = self.iou(box)
        encoded_box = np.zeros((self.num_anchors, 4 + return_iou))

        # Anchors that overlap the box strongly enough to predict it.
        assign_mask = iou > self.overlap_threshold

        # Guarantee at least one positive anchor per ground-truth box.
        if not assign_mask.any():
            assign_mask[iou.argmax()] = True

        if return_iou:
            encoded_box[:, -1][assign_mask] = iou[assign_mask]

        assigned_anchors = self.anchors[assign_mask]

        # Centre and size of the ground-truth box ...
        box_center = 0.5 * (box[:2] + box[2:])
        box_wh = box[2:] - box[:2]
        # ... and of the assigned anchors.
        assigned_anchors_center = (assigned_anchors[:, 0:2] + assigned_anchors[:, 2:4]) * 0.5
        assigned_anchors_wh = (assigned_anchors[:, 2:4] - assigned_anchors[:, 0:2])

        # Centre offset relative to the anchor size, then log size ratio,
        # each divided by its variance.
        encoded_box[:, :2][assign_mask] = box_center - assigned_anchors_center
        encoded_box[:, :2][assign_mask] /= assigned_anchors_wh
        encoded_box[:, :2][assign_mask] /= np.array(variances)[:2]

        encoded_box[:, 2:4][assign_mask] = np.log(box_wh / assigned_anchors_wh)
        encoded_box[:, 2:4][assign_mask] /= np.array(variances)[2:4]
        return encoded_box.ravel()

    def assign_boxes(self, boxes):
        """Build the per-anchor training target for one image.

        Args:
            boxes: array with one row per ground-truth box, laid out as
                ``[x1, y1, x2, y2, onehot_0, ..., onehot_{num_classes-1}]``,
                or an empty array when the image has no boxes.

        Returns:
            Array of shape ``(num_anchors, 4 + num_classes + 1)``:
            columns ``:4`` are the regression targets, column ``4`` is the
            background indicator (1 by default), columns ``5:-1`` are the
            foreground class indicators and column ``-1`` flags anchors that
            were matched to an object.
        """
        assignment = np.zeros((self.num_anchors, 4 + self.num_classes + 1))
        assignment[:, 4] = 1.0  # every anchor is background by default
        if len(boxes) == 0:
            # No ground truth: all anchors stay background.
            return assignment

        # Encode every ground-truth box against all anchors; after the reshape
        # the array is (num_true_box, num_anchors, 4 + 1) with the IoU last.
        encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
        encoded_boxes = encoded_boxes.reshape(-1, self.num_anchors, 5)

        # For every anchor pick the ground-truth box with the highest IoU;
        # anchors whose best IoU is zero were assigned to no box.
        best_iou = encoded_boxes[:, :, -1].max(axis=0)
        best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
        best_iou_mask = best_iou > 0
        best_iou_idx = best_iou_idx[best_iou_mask]

        # Number of matched anchors.
        assign_num = len(best_iou_idx)

        # Keep only matched anchors, then copy their encoded coordinates.
        encoded_boxes = encoded_boxes[:, best_iou_mask, :]
        assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num), :4]
        # Matched anchors are not background.
        assignment[:, 4][best_iou_mask] = 0
        # Copy the foreground part of the one-hot label (column 4 of ``boxes``,
        # the first one-hot entry, is skipped).
        assignment[:, 5:-1][best_iou_mask] = boxes[best_iou_idx, 5:]
        # Flag the anchor as containing an object.
        assignment[:, -1][best_iou_mask] = 1
        return assignment


# Main

In [18]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger, EarlyStopping

def main():
    """Train the SSD detector: build anchors, model and data loaders, fit, and save.

    Any ``Exception`` raised during setup or training is reported (message
    plus full traceback) instead of being propagated, so the function itself
    returns ``None`` in every case.  ``KeyboardInterrupt`` is not caught.
    """
    # Local import: only needed for the error report in the handler below.
    import traceback

    try:
        # Training hyper-parameters.
        Epoch = 1000        # maximum number of epochs
        lr = 1e-4           # Adam learning rate
        beta_1 = 0.937      # Adam first-moment decay
        batch_size = 16
        imgcolor = 'grey'   # colour space passed to the data loaders

        # SSD configuration.
        cls_name_path = '/home/zhangyouan/桌面/zya/NN_net/network/SSD/IMX_681_ssd_mobilenet_git/keras/detection/VOC_dataset/voc_classes.txt'  # class list file
        input_shape = [120, 160]                    # (height, width)
        anchor_size = [32, 59, 86, 113, 141, 168]   # prior box sizes
        train_annotation_path = r'/home/zhangyouan/桌面/zya/dataset/681/hand/2007_train.txt'
        val_annotation_path = r'/home/zhangyouan/桌面/zya/dataset/681/hand/2007_test.txt'

        # 1. Load the class names; one extra class is added for background.
        class_names, num_cls = get_classes(cls_name_path)
        num_cls += 1
        print("class_names:", class_names, "num_classes:", num_cls)

        # 2. Build the anchors.
        anchor = get_anchors(input_shape, anchor_size)
        print("type:", type(anchor), "shape:", np.shape(anchor))

        # 3. Reset the Keras session and create the model (single-channel input).
        K.clear_session()
        model = SSD300((input_shape[0], input_shape[1], 1), num_cls)

        # 4. Optimizer.  ``learning_rate`` is the supported keyword; ``lr`` is
        #    a deprecated alias.
        optimizer = Adam(learning_rate=lr, beta_1=beta_1)

        # 5. Read the annotation files.
        with open(train_annotation_path, encoding='utf-8') as f:
            train_lines = f.readlines()
        with open(val_annotation_path, encoding='utf-8') as f:
            val_lines = f.readlines()
        num_train = len(train_lines)
        num_val = len(val_lines)
        epoch_step = num_train // batch_size
        epoch_step_val = num_val // batch_size
        # Fail with a clear message instead of asking Keras to run zero steps.
        if epoch_step == 0 or epoch_step_val == 0:
            raise ValueError(
                'Dataset too small for batch size {}: {} training and {} validation samples.'.format(
                    batch_size, num_train, num_val))

        # Data loaders: augmentation is enabled for training only.
        train_dataloader = SSDDatasets(train_lines, input_shape, anchor, batch_size, num_cls, train=True, imgcolor=imgcolor)
        val_dataloader = SSDDatasets(val_lines, input_shape, anchor, batch_size, num_cls, train=False, imgcolor=imgcolor)
        print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))

        # 6. Compile the model with the multibox loss.
        model.compile(optimizer=optimizer, loss=MultiboxLoss(num_cls, neg_pos_ratio=3.0).compute_loss)

        # 7. Callbacks: best-weights checkpoint, LR decay on plateau, CSV log
        #    and early stopping, all driven by the validation loss.
        checkpoint = ModelCheckpoint('ssd_weights.h5', monitor='val_loss', save_best_only=True, save_weights_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=20, verbose=1)
        csv_logger = CSVLogger('training.log')
        early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=1)

        # 8. Train.
        model.fit(
            train_dataloader,
            validation_data=val_dataloader,
            steps_per_epoch=epoch_step,
            validation_steps=epoch_step_val,
            epochs=Epoch,
            callbacks=[checkpoint, reduce_lr, csv_logger, early_stopping]
        )

        # 9. Save the final model.
        model.save("final_detection_model_det_good_hand.h5")
        print("Model training completed and saved successfully.")

    except Exception as e:
        print(f"An error occurred: {e}")
        # Keep the full traceback visible; the message alone rarely tells
        # where the failure happened.
        traceback.print_exc()

# Entry point: run the training pipeline only when this code executes as the
# top-level module, not when it is imported.
if __name__ == "__main__":
    main()


class_names: ['good'] num_classes: 2
type: <class 'numpy.ndarray'> shape: (1242, 4)




Train on 1440 samples, val on 178 samples, with batch size 16.
Epoch 1/1000


  image = image.resize((nw, nh), Image.BICUBIC)
  if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)




  image = image.resize((nw, nh), Image.BICUBIC)


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000


KeyboardInterrupt: 