<table align="left"><td>
  <a target="_blank"  href="https://github.com/Dhruv0208/Research-Project/Autoencoder.ipynb">
    <img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on github
  </a>
</td><td>
  <a target="_blank"  href="https://colab.sandbox.google.com/github/Dhruv0208/Research-Project/Autoencoder.ipynb">
    <img width=32px src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
</td></table>

In [None]:
# import sys
# !{sys.executable} -m pip install keras
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install imutils
# !{sys.executable} -m pip install opencv-python
# !{sys.executable} -m pip install sklearn
# !{sys.executable} -m pip install --upgrade pip
# !{sys.executable} -m pip install -q tfds-nightly tensorflow
# !{sys.executable} -m pip install pydot
# !{sys.executable} -m pip install graphviz
# !{sys.executable} -m pip install keras-pos-embd

In [1]:
import keras
from tensorflow.keras import layers
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import load_img 
from tensorflow.keras.preprocessing.image import img_to_array
from keras.layers import Input
from keras.preprocessing import image
from keras import backend as K
from tensorflow.python.keras.utils.vis_utils import plot_model
from typing import Union, Dict, Tuple
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import numpy as np
import time
import json
from functools import partial
import os
from imutils import paths
import cv2
import IPython.display as display
from keras_pos_embd import TrigPosEmbedding
from tensorflow.keras.layers import Dropout, Activation, LayerNormalization, Dense

In [None]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}',format(device_name))

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
IMAGE_SIZE = [512,512]
BATCH_SIZE = 4

In [None]:
file_pattern_train = f'coco/2017/1.1.0/coco-train.tfrecord*'
file_pattern_test = f'coco/2017/1.1.0/coco-test.tfrecord*'
file_pattern_valid = f'coco/2017/1.1.0/coco-validation.tfrecord*'
Training_filenames = tf.data.Dataset.list_files(file_pattern_train)
Test_filenames = tf.data.Dataset.list_files(file_pattern_test)
Valid_filenames = tf.data.Dataset.list_files(file_pattern_valid)
print("Train Tfrecords Files: ", len(Training_filenames))
print("Test Tfrecords Files: ", len(Test_filenames))
print("Valid Tfrecords Files: ", len(Valid_filenames))

In [None]:
def decode_image(image):
    image = tf.io.decode_jpeg(image, channels=3)
    image = tf.cast(image, tf.float32)
    image = tf.image.resize(image, [*IMAGE_SIZE])
    paddings = tf.constant([[1,1], [1,1], [0,0]])
    image = tf.pad(image, paddings, "CONSTANT")
    image = tf.image.resize(image, [*IMAGE_SIZE])
    return image

In [None]:
def read_tfrecord(serialized_example):
    feature_description={
        "image": tf.io.FixedLenFeature([], tf.string,)
    }
    example = tf.io.parse_single_example(
    serialized_example, feature_description
    )
    image = decode_image(example['image'])
    return image

In [None]:
def load_dataset(filenames):
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False
    dataset = tf.data.TFRecordDataset(
        filenames)
    dataset = dataset.with_options(ignore_order)
    dataset =dataset.map(
        partial(read_tfrecord), num_parallel_calls=AUTOTUNE)
    return dataset

In [None]:
def get_dataset(filenames):
    dataset = load_dataset(filenames)
    dataset = dataset.shuffle(1024)
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset

In [None]:
train_dataset = get_dataset(Training_filenames)
valid_dataset = get_dataset(Valid_filenames)
test_dataset = get_dataset(Test_filenames)
# train_dist_datset = strategy.experimental_distribute_dataset(train_dataset)
# test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)
# valid_dist_dataset = strategy.experimental_distribute_dataset(valid_dataset)
image_batch = next(iter(train_dataset))
def show_batch_original(image_batch):
    plt.figure(figsize=(10,10))
    for n in range(4):
        ax = plt.subplot(5,5, n+1)
        print(image_batch[n].shape)
        plt.imshow(image_batch[n] / 255.0)
        plt.axis("off")
        
show_batch_original(image_batch.numpy())

In [None]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

## Preparing autoencoder

In [None]:
input_img = Input(shape=(512,512,3))
x = layers.Conv2D(256, (1,1), activation='relu', padding='same')(input_img)
x = layers.MaxPooling2D((2,2), padding = 'same')(x)
x = layers.Conv2D(128, (3,3), activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2,2), padding = 'same')(x)
x = layers.Conv2D(64, (3,3), activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2,2), padding = 'same')(x)
x = layers.Conv2D(64, (3,3), activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2,2), padding = 'same')(x)
x = layers.Conv2D(32, (3,3), activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2,2), padding = 'same')(x)
x = layers.Conv2D(16, (3,3), activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2,2), padding='same')(x)
x = layers.Conv2D(16, (3,3), activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
encoded = layers.MaxPooling2D((2,2), padding='same')(x)
x = layers.Conv2D(16, (3,3), activation='relu', padding='same')(encoded)
x = layers.BatchNormalization()(x)
x = layers.UpSampling2D((2,2))(x)
x = layers.Conv2D(16, (3,3), activation = 'relu', padding = 'same')(x)
x = layers.BatchNormalization()(x)
x = layers.UpSampling2D((2,2))(x)
x = layers.Conv2D(32, (3,3), activation = 'relu', padding= 'same')(x)
x = layers.BatchNormalization()(x)
x = layers.UpSampling2D((2,2))(x)
x = layers.Conv2D(64, (3,3), activation = 'relu', padding= 'same')(x)
x = layers.BatchNormalization()(x)
x = layers.UpSampling2D((2,2))(x)
x = layers.Conv2D(64, (3,3), activation = 'relu', padding= 'same')(x)
x = layers.BatchNormalization()(x)
x = layers.UpSampling2D((2,2))(x)
x = layers.Conv2D(128, (3,3), activation = 'relu', padding= 'same')(x)
x = layers.BatchNormalization()(x)
x = layers.UpSampling2D((2,2))(x)
x = layers.Conv2D(256, (3,3), activation = 'relu', padding= 'same')(x)
x = layers.BatchNormalization()(x)
x = layers.UpSampling2D((2,2))(x)
decoded = layers.Conv2D(3, (3,3), activation='sigmoid', padding='same')(x)


autoencoder = tf.keras.Model(inputs=input_img, outputs=decoded)
middle = tf.keras.Model(inputs=input_img, outputs=encoded)

In [None]:
middle.summary()

In [None]:
autoencoder.summary()

In [None]:
optimizer = keras.optimizers.Adam(learning_rate=1e-4)
loss_fn = tfa.losses.SigmoidFocalCrossEntropy(from_logits=True)

## Bounding boxes

In [2]:
def bbox_xcycwh_to_x1y1x2y2(bbox_xcycwh : np.array):
    
    
    bbox_x1y1x2y2 = np.zeros_like((bbox_xcycwh))
    bbox_x1y1x2y2[:,0] = bbox_xcycwh[:,0] - (bbox_xcycwh[:,2] / 2)
    bbox_x1y1x2y2[:,2] = bbox_xcycwh[:,0] + (bbox_xcycwh[:,2] / 2)
    bbox_x1y1x2y2[:,1] = bbox_xcycwh[:,1] - (bbox_xcycwh[:,3] / 2)
    bbox_x1y1x2y2[:,3] = bbox_xcycwh[:,1] - (bbox_xcycwh[:,3] / 2)
    bbox_x1y1x2y2 = bbox_x1y1x2y2.astype(np.int32)
    return bbox_x1y1x2y2

def intersect(box_a : tf.Tensor, box_b : tf.Tensor) -> tf.Tensor:
    A = tf.shape(box_a)[0]
    B = tf.shape(box_b)[0]
    
    tiled_box_a_xymax = tf.tile(tf.expand_dims(box_a[:,2:], axis=1), [1, B, 1])
    tiled_box_b_xymax = tf.tile(tf.expand_dims(box_b[:,2:], axis=0), [A, 1, 1])
    
    above_right_corner = tf.math.minimum(tiled_box_a_xymax, tiled_box_b_xymax)
    
    tiled_box_a_xymin = tf.tile(tf.expand_dims(box_a[:, :2], axis=1), [1,B,1])
    tiled_box_b_xymin = tf.tile(tf.expand_dims(box_b[:, :2], axis=0), [A,1,1])
    upper_left_corner = tf.math.maximum(tiled_box_a_xymin, tiled_box_b_xymin)
    
    inter = tf.nn.relu(above_right_corner - upper_left_corner)
    inter = inter[:, :, 0] * inter[:, :, 1]
    return inter

def overlap(box_a: tf.Tensor, box_b: tf.Tensor, return_union=False) -> tf.Tensor:
    inter = intersect(box_a, box_b)
    
    area_a = (box_a[:,2] - box_a[:,0]) * (box_a[:,3] - box_a[:,1])
    area_a = tf.tile(tf.expand_dims(area_a, axis=-1), [1, tf.shape(inter)[-2], 1])
    
    area_b = (box_b[:,2] - box_b[:,0]) * (box_b[:,3] - box_b[:,1])
    area_b = tf.tile(tf.expand_dims(area_b, axis=-2), [1, tf.shape(inter)[-2], 1])
    
    union = area_a + area_b - inter
    
    
    if return_union is False:
        return intersect/union
    else:
        return inter/union, union
    
def merge(box_a: tf.Tensor, box_b: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    
    A = tf.shape(box_a)[0]
    B = tf.shape(box_b)[0]
    
    tiled_box_a = tf.tile(tf.expand_dims(box_a, axis=1), [1, B, 1])
    tiled_box_b = tf.tile(tf.expand_dims(box_b, axis=0), [A, 1, 1])
    
    return tiled_box_a, tiled_box_b


def xy_min_xy_max_to_xcycwh(bbox: tf.Tensor) -> tf.Tensor:
    """
    Convert bbox from shape [xmin, ymin, xmax, ymax] to [xc, yc, w, h]
    Args:
        bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert
    Returns:
        The converted bbox
    """
    bbox_xcycwh = tf.concat([bbox[:, :2] + ((bbox[:, 2:] - bbox[:, :2]) / 2), bbox[:, 2:] - bbox[:, :2]], axis=-1)
    return bbox_xcycwh



def xcycwh_to_xy_min_xy_max(bbox: tf.Tensor) -> tf.Tensor:
    """
    Convert bbox from shape [xc, yc, w, h] to [xmin, ymin, xmax, ymax]
    Args:
        bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert
    Returns:
        The converted bbox
    """
    bbox_xyxy = tf.concat([bbox[:, :2] - (bbox[:, 2:] / 2), bbox[:, :2] + (bbox[:, 2:] / 2)], axis=-1)
    
    bbox_xyxy = tf.clip_by_value(bbox_xyxy, 0.0, 1.0)
    return bbox_xyxy

## Preparing Transformer

## BACKBONE

In [3]:
class BackBone(tf.keras.Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.conv1 = layers.Conv2D(256, (1,1), padding='same',
                            activation='relu', name='conv1')
        self.conv2 = layers.Conv2D(128, (3,3), padding='same',
                            activation='relu', name='conv2')
        self.conv3 = layers.Conv2D(64, (3,3), padding='same',
                            activation='relu', name='conv3')
        self.conv4 = layers.Conv2D(64, (3,3), padding='same',
                            activation='relu', name='conv4')
        self.conv5 = layers.Conv2D(32, (3,3), padding='same',
                            activation='relu', name='conv4')
        self.conv6 = layers.Conv2D(16, (3,3), padding='same',
                            activation='relu', name='conv6')
        self.conv7 = layers.Conv2D(16, (3,3), padding='same',
                            activation='relu', name='conv7')
        self.bn1 = layers.BatchNormalization(name='bn1')
        self.bn2 = layers.BatchNormalization(name='bn2')
        self.bn3 = layers.BatchNormalization(name='bn3')
        self.bn4 = layers.BatchNormalization(name='bn4')
        self.bn5 = layers.BatchNormalization(name='bn5')
        self.bn6 = layers.BatchNormalization(name='bn6')
        self.maxpool = layers.MaxPooling2D((2,2), padding='same')


    def call(self, x):
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.bn1(x)
        x = self.maxpool(x)
        x = self.conv3(x)
        x = self.bn2(x)
        x = self.maxpool(x)
        x = self.conv4(x)
        x = self.bn3(x)
        x = self.maxpool(x)
        x = self.conv5(x)
        x = self.bn4(x)
        x = self.maxpool(x)
        x = self.conv6(x)
        x = self.bn5(x)
        x = self.maxpool(x)
        x = self.conv7(x)
        x = self.bn6(x)
        x = self.maxpool(x)
        return x

## 1. Encoder and Decoder layers

In [4]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, model_dim, num_heads, dropout=0.0, **kwargs):
        super().__init__(**kwargs)

        self.model_dim = model_dim
        self.num_heads = num_heads

        assert model_dim % num_heads == 0
        self.head_dim = model_dim // num_heads

        self.dropout = Dropout(rate=dropout)
        

    def build(self, input_shapes):
        in_dim = sum([shape[-1] for shape in input_shapes[:3]])

        self.in_proj_weight = self.add_weight(
            name='in_proj_kernel', shape=(in_dim, self.model_dim),
            initializer=tf.keras.initializers.GlorotUniform(), dtype=tf.float32, trainable=True
        )
        self.in_proj_bias = self.add_weight(
            name='in_proj_bias', shape=(in_dim,),
            initializer=tf.keras.initializers.GlorotUniform(), dtype=tf.float32, trainable=True
        )
        self.out_proj_weight = self.add_weight(
            name='out_proj_kernel', shape=(self.model_dim, self.model_dim),
            initializer=tf.keras.initializers.GlorotUniform(), dtype=tf.float32, trainable=True
        )
        self.out_proj_bias = self.add_weight(
            name='out_proj_bias', shape=(self.model_dim,),
            initializer=tf.keras.initializers.GlorotUniform(), dtype=tf.float32, trainable=True
        )




        #self.in_proj_weight = tf.Variable(
        #    tf.zeros((in_dim, self.model_dim), dtype=tf.float32), name='in_proj_kernel')
        #self.in_proj_bias = tf.Variable(tf.zeros((in_dim,), dtype=tf.float32),
        #                                name='in_proj_bias')

        #self.out_proj_weight = tf.Variable(
        #    tf.zeros((self.model_dim, self.model_dim), dtype=tf.float32), name='out_proj_kernel')
        #self.out_proj_bias = tf.Variable(
        #    tf.zeros((self.model_dim,), dtype=tf.float32), name='out_proj_bias')



    def call(self, inputs, attn_mask=None, key_padding_mask=None,
             need_weights=True, training=False):

        query, key, value = inputs

        batch_size = tf.shape(query)[1]
        target_len = tf.shape(query)[0]
        source_len = tf.shape(key)[0]

        W = self.in_proj_weight[:self.model_dim, :]
        b = self.in_proj_bias[:self.model_dim]

        WQ = tf.matmul(query, W, transpose_b=True) + b

        W = self.in_proj_weight[self.model_dim:2*self.model_dim, :]
        b = self.in_proj_bias[self.model_dim:2*self.model_dim]
        WK = tf.matmul(key, W, transpose_b=True) + b

        W = self.in_proj_weight[2*self.model_dim:, :]
        b = self.in_proj_bias[2*self.model_dim:]
        WV = tf.matmul(value, W, transpose_b=True) + b

        WQ *= float(self.head_dim) ** -0.5
        WQ = tf.reshape(WQ, [target_len, batch_size * self.num_heads, self.head_dim])
        WQ = tf.transpose(WQ, [1, 0, 2])
        
        WK = tf.reshape(WK, [source_len, batch_size * self.num_heads, self.head_dim])
        WK = tf.transpose(WK, [1, 0, 2])

        WV = tf.reshape(WV, [source_len, batch_size * self.num_heads, self.head_dim])
        WV = tf.transpose(WV, [1, 0, 2])
        
        attn_output_weights = tf.matmul(WQ, WK, transpose_b=True)

        if attn_mask is not None:
            attn_output_weights += attn_mask

        """
        if key_padding_mask is not None:
            attn_output_weights = tf.reshape(attn_output_weights,
                                [batch_size, self.num_heads, target_len, source_len])
            key_padding_mask = tf.expand_dims(key_padding_mask, 1)
            key_padding_mask = tf.expand_dims(key_padding_mask, 2)
            key_padding_mask = tf.tile(key_padding_mask, [1, self.num_heads, target_len, 1])
            #print("before attn_output_weights", attn_output_weights.shape)
            attn_output_weights = tf.where(key_padding_mask,
                                           tf.zeros_like(attn_output_weights) + float('-inf'),
                                           attn_output_weights)
            attn_output_weights = tf.reshape(attn_output_weights,
                                [batch_size * self.num_heads, target_len, source_len])
        """


        attn_output_weights = tf.nn.softmax(attn_output_weights, axis=-1)
        attn_output_weights = self.dropout(attn_output_weights, training=training)

        attn_output = tf.matmul(attn_output_weights, WV)
        attn_output = tf.transpose(attn_output, [1, 0, 2])
        attn_output = tf.reshape(attn_output, [target_len, batch_size, self.model_dim])
        attn_output = tf.matmul(attn_output, self.out_proj_weight,
                                transpose_b=True) + self.out_proj_bias

        if need_weights:
            attn_output_weights = tf.reshape(attn_output_weights,
                            [batch_size, self.num_heads, target_len, source_len])
            # Retrun the average weight over the heads
            avg_weights = tf.reduce_mean(attn_output_weights, axis=1)
            return attn_output, avg_weights
        
        return attn_output

In [5]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, model_dim=512, num_heads=8, dim_feedforward=512,
                dropout=0.1, activation='relu', normalize_before=False,
                **kwargs):
        super().__init__(**kwargs)
        self.attention = MultiHeadAttention(model_dim, num_heads, dropout=dropout,
                                           name='self_attention')
        self.dropout = Dropout(dropout)
        self.activation = Activation(activation)
        self.linear1 = Dense(dim_feedforward, name = 'linear1')
        self.linear2 = Dense(dim_feedforward, name = 'linear2')
        self.norm1 = LayerNormalization(epsilon=1e-5, name = 'norm1')
        self.norm2 = LayerNormalization(epsilon=1e-5, name = 'norm2')
        self.normalize_before = normalize_before
        
    def call(self, source, source_mask=None, source_key_padding_mask=None,
            pos_encoding=None, training=False):
        if pos_encoding is None:
            query = key = source
        else:
            query = key = source + pos_encoding
            
        attn_source = self.attention((query, key, source), attn_mask=source_mask,
                                    key_padding_mask=source_key_padding_mask, need_weights=False)
        source += self.dropout(attn_source, training=training)
        source = self.norm1(source)
        
        x = self.linear1(source)
        x = self.activation(x)
        x = self.dropout(x, training=training)
        x = self.linear2(x)
        source += self.dropout(x, training=training)
        source = self.norm2(source)
        
        return source
    
    
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, model_dim=512, num_heads=8, dim_feedforward=512,
                 dropout=0.1, activation='relu', normalize_before=False,
                 **kwargs):
        super().__init__(**kwargs)

        self.self_attn = MultiHeadAttention(model_dim, num_heads, dropout=dropout,
                                            name='self_attn')
        self.multihead_attn = MultiHeadAttention(model_dim, num_heads, dropout=dropout,
                                                 name='multihead_attn')

        self.dropout = Dropout(dropout)
        self.activation = Activation(activation)

        self.linear1 = Dense(dim_feedforward, name='linear1')
        self.linear2 = Dense(model_dim, name='linear2')

        self.norm1 = LayerNormalization(epsilon=1e-5, name='norm1')
        self.norm2 = LayerNormalization(epsilon=1e-5, name='norm2')
        self.norm3 = LayerNormalization(epsilon=1e-5, name='norm3')

        self.normalize_before = normalize_before


    def call(self, target, memory, target_mask=None, memory_mask=None,
             target_key_padding_mask=None, memory_key_padding_mask=None,
             pos_encoding=None, query_encoding=None, training=False):

        query_tgt = key_tgt = target + query_encoding
        attn_target = self.self_attn((query_tgt, key_tgt, target), attn_mask=target_mask,
                                    key_padding_mask=target_key_padding_mask,
                                    need_weights=False)
        target += self.dropout(attn_target, training=training)
        target = self.norm1(target)

        query_tgt = target + query_encoding
        key_mem = memory + pos_encoding
        
        attn_target2 = self.multihead_attn((query_tgt, key_mem, memory), attn_mask=memory_mask,
                                           key_padding_mask=memory_key_padding_mask,
                                           need_weights=False)
        target += self.dropout(attn_target2, training=training)
        target = self.norm2(target)

        x = self.linear1(target)
        x = self.activation(x)
        x = self.dropout(x, training=training)
        x = self.linear2(x)
        target += self.dropout(x, training=training)
        target = self.norm3(target)
        
        return target

## 2. Transformer Encoder and Decoder

In [6]:
class TransformerEncoder(tf.keras.Model):
    def __init__(self, model_dim=512, num_heads=8, dim_feedforward=512,
                dropout=0.1, activation='relu', normalize_before=False, norm=None,
                num_encoder_layers=6, **kwargs):
        super().__init__(**kwargs)
        
        self.encoder_layers = [EncoderLayer(model_dim, num_heads, dim_feedforward,
                                           dropout, activation, normalize_before,
                                           name='layer_%d'%i)
                              for i in range(num_encoder_layers)]
        self.norm = norm
        
        
    def call(self, source, mask=None, source_key_padding_mask=None,
            pos_encoding=None, training=False):
        
        x = source
        
        for layer in self.encoder_layers:
            x = layer(x, source_mask=mask, source_key_padding_mask=source_key_padding_mask,
                     pos_encoding=pos_encoding, training=training)
            
        if self.norm:
            x = self.norm(x)
            
            
        return x
    
class TransformerDecoder(tf.keras.Model):
    def __init__(self, model_dim=512, num_heads=8, dim_feedforward=2048,
                dropout=0.1, activation='relu', normalize_before=False, norm=None,
                num_decoder_layers=6, **kwargs):
        super().__init__(**kwargs)
        
        self.decoder_layers = [DecoderLayer(model_dim, num_heads, dim_feedforward,
                                          dropout, activation, normalize_before,
                                          name='layer_%d'%i)
                             for i in range(num_decoder_layers)]
        self.norm = norm
        
    def call(self, target, memory, target_mask=None, memory_mask=None,
            target_key_padding_mask=None, memory_key_padding_mask=None,
            pos_encoding=None, query_encoding=None, training=False):
        
        x = target
        intermediate = []
        
        for layer in self.decoder_layers:
            x = layer(x, memory, target_mask=target_mask,
                     memory_mask=memory_mask,
                     target_key_padding_mask=target_key_padding_mask,
                     memory_key_padding_mask=memory_key_padding_mask,
                     pos_encoding=pos_encoding,
                     query_encoding=query_encoding)
            
            if self.norm:
                x = self.norm(x)
                
                
            return x

## 3. Embeddings

In [7]:
class PositionEmbeddingSine(tf.keras.Model):
    def __init__(self, num_pos_features=64, temperature=10000,
                normalize=False, scale=None, eps=1e-6, **kwargs):
        super().__init__(**kwargs)
        
        self.num_pos_features = num_pos_features
        self.temperature = temperature
        self.normalize = normalize
        
        
        if scale is not None and normalize is False:
            raise ValueError('normalize should be True if scale is passed')
        if scale is None:
            scale = 2 * np.pi
        self.scale = scale
        self.eps = eps
        
        
    def call(self, mask):
        not_mask = tf.cast(-mask, tf.float32)
        y_embed = tf.math.cumsum(not_mask, axis=0)
        x_embed = tf.math.cumsum(not_mask, axis=1)
        
        if self.normalize:
            y_embed = y_embed / (y_embed[:, -1, :] + self.eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1] + self.eps) * self.scale
            
            
        dim_t = tf.range(self.num_pos_features, dtype=tf.float32)
        dim_t = self.temperature ** (2 * (dim_t //2 ) / self.num_pos_features)
        
        pos_x = x_embed[..., tf.newaxis] / dim_t
        pos_y = y_embed[..., tf.newaxis] / dim_t
        
        pos_x = tf.stack([tf.math.sin(pos_x[..., 0::2]),
                         tf.math.cos(pos_x[..., 1::2])], axis=3)
        pos_y = tf.stack([tf.math.sin(pos_y[...,0::2]),
                          tf.math.cos(pos_y[...,1::2])], axis=3)
        
        shape = [tf.shape(pos_x) for i in range(3)] + [-1]
        pos_x = tf.reshape(pos_x, shape)
        pos_y = tf.reshape(pos_y, shape)
        
        pos_emb = tf.concat([pos_y, pos_x], axis=2)
        return pos_emb
    
class FixedEmbedding(tf.keras.layers.Layer):
    def __init__(self, embed_shape, **kwargs):
        super().__init__(**kwargs)
        self.embed_shape = embed_shape

    def build(self, input_shape):
        self.w = self.add_weight(name='kernel', shape=self.embed_shape,
                                 initializer=tf.keras.initializers.GlorotUniform(), trainable=True)

    def call(self, x=None):
        return self.w

## Optimizers

In [8]:
nlayers = []
def add_nlayers(layers):
        nlayers = [l.name for l in layers]
def disable_batchnorm_training(model):
    for l in model.layers:
        if hasattr(l, "layers"):
            disable_batchnorm_training(l)
        elif isinstance(l, tf.keras.layers.BatchNormalization):
            l.trainable = False

def get_transformers_trainable_variables(model, exclude=[]):
    transformers_variables = []

    # Transformers variables
    transformers_variables = model.get_layer("detr").get_layer("transformer").trainable_variables

    for layer in model.layers[2:]:
        if layer.name not in exclude:
            transformers_variables += layer.trainable_variables
        else:
            pass

    return transformers_variables


def get_backbone_trainable_variables(model):
    backbone_variables = []
    # layer [1] is the detr model including the backbone and the transformers

    detr = model.get_layer("detr")
    tr_index = [l.name for l in detr.layers].index('transformer')

    for l, layer in enumerate(detr.layers):
        if l != tr_index:
            backbone_variables += layer.trainable_variables

    return backbone_variables


def get_nlayers_trainables_variables(model, nlayers_names):
    nlayers_variables = []
    for nlayer_name in nlayers_names:
        nlayers_variables += model.get_layer(nlayer_name).trainable_variables
    return nlayers_variables


def get_trainable_variables(model):

    disable_batchnorm_training(model)

    backbone_variables = []
    transformers_variables = []
    nlayers_variables = []


    # Retrieve the gradient for each trainable variables
    backbone_variables = get_backbone_trainable_variables(model)
    transformers_variables = get_transformers_trainable_variables(model, exclude=nlayers)
    nlayers_variables = get_nlayers_trainables_variables(model, nlayers)

    
    return backbone_variables, transformers_variables, nlayers_variables


def setup_optimizers(model):
    """ Method call by the Scheduler to init user data
    """
    @tf.function
    def get_backbone_learning_rate():
        return 1e-5

    @tf.function
    def get_transformers_learning_rate():
        return 1e-4

    @tf.function
    def get_nlayers_learning_rate():
        return 1e-4

    # Disable batch norm on the backbone
    disable_batchnorm_training(model)

    # Optimizers
    backbone_optimizer = tf.keras.optimizers.Adam(learning_rate=get_backbone_learning_rate, clipnorm=0.001)
    transformers_optimizer = tf.keras.optimizers.Adam(learning_rate=get_transformers_learning_rate, clipnorm=0.001)
    nlayers_optimizer = tf.keras.optimizers.Adam(learning_rate=get_nlayers_learning_rate, clipnorm=0.001)

    # Set trainable variables

    backbone_variables, transformers_variables, nlayers_variables = [], [], []

    backbone_variables = get_backbone_trainable_variables(model)
    transformers_variables = get_transformers_trainable_variables(model, exclude=nlayers)
    nlayers_variables = get_nlayers_trainables_variables(model, nlayers)


    return {
        "backbone_optimizer": backbone_optimizer,
        "transformers_optimizer": transformers_optimizer,
        "nlayers_optimizer": nlayers_optimizer,

        "backbone_variables": backbone_variables,
        "transformers_variables": transformers_variables,
        "nlayers_variables": nlayers_variables,
    }


def gather_gradient(model, optimizers, total_loss, tape):

    backbone_variables, transformers_variables, nlayers_variables = get_trainable_variables(model)
    trainables_variables = backbone_variables + transformers_variables + nlayers_variables

    gradients = tape.gradient(total_loss, trainables_variables)

    # Retrieve the gradients from the tap
    backbone_gradients = gradients[:len(optimizers["backbone_variables"])]
    transformers_gradients = gradients[len(optimizers["backbone_variables"]):len(optimizers["backbone_variables"])+len(optimizers["transformers_variables"])]
    nlayers_gradients = gradients[len(optimizers["backbone_variables"])+len(optimizers["transformers_variables"]):]

    gradient_steps = {}

    gradient_steps["backbone"] = {"gradients": backbone_gradients}
    gradient_steps["transformers"] = {"gradients": transformers_gradients}
    gradient_steps["nlayers"] = {"gradients": nlayers_gradients}

    

    return gradient_steps



def aggregate_grad_and_apply(name, optimizers, gradients, step):

    gradient_aggregate = None
    if target_batch is not None:
        gradient_aggregate = int(target_batch // batch_size)

    gradient_name = "{}_gradients".format(name)
    optimizer_name = "{}_optimizer".format(name)
    variables_name = "{}_variables".format(name)
    train_part_name = "train_{}".format(name)

    if getattr(train_part_name):

        # Init the aggregate gradient
        if gradient_aggregate is not None and step % gradient_aggregate == 0:
            optimizers[gradient_name] = [tf.zeros_like(tv) for tv in optimizers[variables_name]]


        if gradient_aggregate is not None:
            # Aggregate the gradient
            optimizers[gradient_name] = [(gradient+n_gradient) if n_gradient is not None else None for gradient, n_gradient in zip(optimizers[gradient_name], gradients) ]
        else:
            optimizers[gradient_name] = gradients

        # Apply gradient if no gradient aggregate or if we finished gathering gradient oversteps
        if gradient_aggregate is None or (step+1) %  gradient_aggregate == 0:
            optimizers[optimizer_name].apply_gradients(zip(optimizers[gradient_name], optimizers[variables_name]))

## Losses

1. Hungarian Matching

In [9]:
def hungarian_matching(t_bbox, t_class, p_bbox, p_class, fcost_class=1,
                      fcost_bbox=5, fcost_giou=2, slices_pred=True):
    if slice_preds:
        size = tf.cast(t_bbox[0][0], tf.int32)
        t_bbox = tf.slice(t_bbo, [1, 0], [size, 4])
        t_class = tf.slice(t_class, [1, 0], [size, -1])
        t_class = tf.squeeze(t_class, axis=-1)
        
        
    p_bbox_xy = xcycwh_to_xy_min_xy_max(p_bbox)
    t_bbox_xy = xcycwh_to_xy_min_xy_max(t_bbox)
    
    softmax = tf.nn.softmax(p_class)
    
    cost_class = -tf.gather(softmax, t_class, axis=1)
    _p_bbox, _t_bbox = merge(p_bbox, t_bbox)
    cost_bbox = tf.norm(_p_bbox * _t_bbox, ord=1, axis=-1)
    
    
    iou, union = overlap(p_bbox_xy, t_bbox_xy, return_union=True)
    _p_bbox_xy, _t_bbox_xy = merge(p_bbox_xy, t_bbox_xy)
    top_left = tf.math.minimum(_p_bbox_xy[:, :, :2], _t_bbox_xy[:, :, :2])
    bottom_right = tf.math.maximum(_p_bbox_xy[:, :, 2:], _t_bbox_xy[:, :, 2:])
    size = tf.nn.relu(bottom_right*top_left)
    area = size[:,:,0]*size[:,:,1]
    cost_giou = -(iou - (area-union) / area)
    
    cost_matrix = fcost_bbox * cost_bbox + fcost_class * cost_class + fcost_giou * cost_giou
    
    
    selectors = tf.numpy_function(np_tf_linear_sum_assignment, [cost_matrix],
                                 [tf.int64, tf.int64, tf.bool, tf.bool])
    target_indices = selectors[0]
    pred_indices = selectors[1]
    target_selector = selectors[2]
    pred_selector = selectors[3]
    
    return pred_indices, target_indices, pred_selector, target_selector, t_bbox, t_class

2. Functions for getting losses

In [10]:
def get_total_losss(losses):
    """
    Get model total losss including auxiliary loss
    """
    train_loss = ["label_cost", "giou_loss", "l1_loss"]
    loss_weights = [1, 2, 5]

    total_loss = 0
    for key in losses:
        selector = [l for l, loss_name in enumerate(train_loss) if loss_name in key]
        if len(selector) == 1:
            #print("Add to the total loss", key, losses[key], loss_weights[selector[0]])
            total_loss += losses[key]*loss_weights[selector[0]]
    return total_loss


def get_losses(m_outputs, t_bbox, t_class):
    losses = get_detr_losses(m_outputs, t_bbox, t_class)

    # Get auxiliary loss for each auxiliary output
    if "aux" in m_outputs:
        for a, aux_m_outputs in enumerate(m_outputs["aux"]):
            aux_losses = get_detr_losses(aux_m_outputs, t_bbox, t_class, suffix="_{}".format(a))
            losses.update(aux_losses)
    
    # Compute the total loss
    total_loss = get_total_losss(losses)

    return total_loss, losses


def loss_labels(p_bbox, p_class, t_bbox, t_class, t_indices, p_indices, t_selector, p_selector, background_class=0):

    neg_indices = tf.squeeze(tf.where(p_selector == False), axis=-1)
    neg_p_class = tf.gather(p_class, neg_indices)
    neg_t_class = tf.zeros((tf.shape(neg_p_class)[0],), tf.int64) + background_class
    
    neg_weights = tf.zeros((tf.shape(neg_indices)[0],)) + 0.1
    pos_weights = tf.zeros((tf.shape(t_indices)[0],)) + 1.0
    weights = tf.concat([neg_weights, pos_weights], axis=0)
    
    pos_p_class = tf.gather(p_class, p_indices)
    pos_t_class = tf.gather(t_class, t_indices)

    #############
    # Metrics
    #############
    # True negative
    cls_neg_p_class = tf.argmax(neg_p_class, axis=-1)
    true_neg  = tf.reduce_mean(tf.cast(cls_neg_p_class == background_class, tf.float32))
    # True positive
    cls_pos_p_class = tf.argmax(pos_p_class, axis=-1)
    true_pos = tf.reduce_mean(tf.cast(cls_pos_p_class != background_class, tf.float32))
    # True accuracy
    cls_pos_p_class = tf.argmax(pos_p_class, axis=-1)
    pos_accuracy = tf.reduce_mean(tf.cast(cls_pos_p_class == pos_t_class, tf.float32))

    targets = tf.concat([neg_t_class, pos_t_class], axis=0)
    preds = tf.concat([neg_p_class, pos_p_class], axis=0)

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, preds)
    loss = tf.reduce_sum(loss * weights) / tf.reduce_sum(weights)

    return loss, true_neg, true_pos, pos_accuracy


def loss_boxes(p_bbox, p_class, t_bbox, t_class, t_indices, p_indices, t_selector, p_selector):
    #print("------")
    p_bbox = tf.gather(p_bbox, p_indices)
    t_bbox = tf.gather(t_bbox, t_indices)


    p_bbox_xy = bbox.xcycwh_to_xy_min_xy_max(p_bbox)
    t_bbox_xy = bbox.xcycwh_to_xy_min_xy_max(t_bbox)

    l1_loss = tf.abs(p_bbox-t_bbox)
    l1_loss = tf.reduce_sum(l1_loss) / tf.cast(tf.shape(p_bbox)[0], tf.float32)

    iou, union = bbox.jaccard(p_bbox_xy, t_bbox_xy, return_union=True)

    _p_bbox_xy, _t_bbox_xy = bbox.merge(p_bbox_xy, t_bbox_xy)
    top_left = tf.math.minimum(_p_bbox_xy[:,:,:2], _t_bbox_xy[:,:,:2])
    bottom_right =  tf.math.maximum(_p_bbox_xy[:,:,2:], _t_bbox_xy[:,:,2:])
    size = tf.nn.relu(bottom_right - top_left)
    area = size[:,:,0] * size[:,:,1]
    giou = (iou - (area - union) / area)
    loss_giou = 1 - tf.linalg.diag_part(giou)

    loss_giou = tf.reduce_sum(loss_giou) / tf.cast(tf.shape(p_bbox)[0], tf.float32)

    return loss_giou, l1_loss

def get_detr_losses(m_outputs, target_bbox, target_label, suffix=""):

    predicted_bbox = m_outputs["pred_boxes"]
    predicted_label = m_outputs["pred_logits"]

    all_target_bbox = []
    all_target_class = []
    all_predicted_bbox = []
    all_predicted_class = []
    all_target_indices = []
    all_predcted_indices = []
    all_target_selector = []
    all_predcted_selector = []

    t_offset = 0
    p_offset = 0

    for b in range(predicted_bbox.shape[0]):

        p_bbox, p_class, t_bbox, t_class = predicted_bbox[b], predicted_label[b], target_bbox[b], target_label[b]
        t_indices, p_indices, t_selector, p_selector, t_bbox, t_class = hungarian_matching(t_bbox, t_class, p_bbox, p_class, slice_preds=True)

        t_indices = t_indices + tf.cast(t_offset, tf.int64)
        p_indices = p_indices + tf.cast(p_offset, tf.int64)

        all_target_bbox.append(t_bbox)
        all_target_class.append(t_class)
        all_predicted_bbox.append(p_bbox)
        all_predicted_class.append(p_class)
        all_target_indices.append(t_indices)
        all_predcted_indices.append(p_indices)
        all_target_selector.append(t_selector)
        all_predcted_selector.append(p_selector)

        t_offset += tf.shape(t_bbox)[0]
        p_offset += tf.shape(p_bbox)[0]

    all_target_bbox = tf.concat(all_target_bbox, axis=0)
    all_target_class = tf.concat(all_target_class, axis=0)
    all_predicted_bbox = tf.concat(all_predicted_bbox, axis=0)
    all_predicted_class = tf.concat(all_predicted_class, axis=0)
    all_target_indices = tf.concat(all_target_indices, axis=0)
    all_predcted_indices = tf.concat(all_predcted_indices, axis=0)
    all_target_selector = tf.concat(all_target_selector, axis=0)
    all_predcted_selector = tf.concat(all_predcted_selector, axis=0)


    label_cost, true_neg, true_pos, pos_accuracy = loss_labels(
        all_predicted_bbox,
        all_predicted_class,
        all_target_bbox,
        all_target_class,
        all_target_indices,
        all_predcted_indices,
        all_target_selector,
        all_predcted_selector,
        background_class=0,
    )

    giou_loss, l1_loss = loss_boxes(
        all_predicted_bbox,
        all_predicted_class,
        all_target_bbox,
        all_target_class,
        all_target_indices,
        all_predcted_indices,
        all_target_selector,
        all_predcted_selector
    )

    label_cost = label_cost
    giou_loss = giou_loss
    l1_loss = l1_loss

    return {
        "label_cost{}".format(suffix): label_cost,
        "true_neg{}".format(suffix): true_neg,
        "true_pos{}".format(suffix): true_pos,
        "pos_accuracy{}".format(suffix): pos_accuracy,
        "giou_loss{}".format(suffix): giou_loss,
        "l1_loss{}".format(suffix): l1_loss
    }

## Transformer

In [11]:
class Transformer(tf.keras.Model):
    def __init__(self, model_dim=512, num_heads=8, num_encoder_layers=6,
                num_decoder_layers=6, dim_feedforward=512, dropout=0.1,
                activation='relu', normalize_before=False, **kwargs):
        super().__init__(*kwargs)
        
        self.model_dim = model_dim

        self.num_heads = num_heads
        
        encoder_norm = LayerNormalization(epsilon=1e-5, name='norm_pre') if normalize_before else None
        self.encoder = TransformerEncoder(model_dim, num_heads, dim_feedforward,
                                         dropout, activation, normalize_before, encoder_norm,
                                         num_encoder_layers, name='encoder')
        decoder_norm = LayerNormalization(epsilon=1e-5, name='norm')
        self.decoder = TransformerDecoder(model_dim, num_heads, dim_feedforward,
                                         dropout, activation, normalize_before, decoder_norm,
                                         num_decoder_layers, name='decoder')
        
    def call(self, source, mask, query_encoding, pos_encoding, training=False):
        batch_size, rows, cols = [tf.shape(source)[i] for i in range(3)]
        source = tf.reshape(source, [batch_size, -1, self.model_dim])
        source = tf.transpose(source, [1,0,2])
            
        pos_encoding = tf.reshape(pos_encoding, [batch_size, -1, self.model_dim])
        pos_encoding = tf.transpose(source, [1,0,2])
            
        query_encoding = tf.expand_dims(query_encoding, axis=1)
        query_encoding = tf.tile(query_encoding, [1, batch_size, 1])
            
        mask = tf.reshape(mask, [batch_size, -1])
            
        target = tf.zeros_like(query_encoding)
        memory = self.encoder(source, source_key_padding_mask=mask,
                                 pos_encoding=pos_encoding, training=training)
        hs = self.decoder(source, memory, memory_key_padding_mask=mask,
                             pos_encoding=pos_encoding, query_encoding=query_encoding,
                             training=training)
            
        hs = tf.transpose(hs)
        memory = tf.transpose(memory, [1,0,2])
        memory = tf.reshape(memory, [batch_size, rows, cols, self.model_dim])
            
        return hs, memory

In [12]:
class DetectionTransformer(tf.keras.Model):
    def __init__(self, num_classes=91, num_queries=100,
                backbone=None,
                pos_encoder=None,
                transformer=None,
                num_encoder_layers=6,
                num_decoder_layers=6,
                **kwargs):
        super().__init__(**kwargs)
        
        self.num_queries = num_queries
        self.backbone = BackBone(name='backbone')
        self.transformer = Transformer(name = 'transformer', num_encoder_layers=num_encoder_layers,
                                      num_decoder_layers=num_decoder_layers)
        
        self.model_dim = self.transformer.model_dim
        
        self.pos_encoder = tf.keras.layers.Embedding(input_dim=self.model_dim // 2, output_dim=64, name="position_embedding_sine")
        
        self.input_proj = tf.keras.layers.Conv2D(self.model_dim, kernel_size=1,
                                                name='input_proj')
        
        self.query_embed = FixedEmbedding((num_queries, self.model_dim), 
                                                     name='query_embed')
        self.class_embed = Dense(num_classes, name='classes_embed')
        self.bbox_embed_linear1 = Dense(self.model_dim, name='bbox_embed_0')
        self.bbox_embed_linear2 = Dense(self.model_dim, name='bbox_embed_1')
        self.bbox_embed_linear3 = Dense(4, name='bbox_embed_2')
        
        self.activation = tf.keras.layers.ReLU()
        
    def downsample_masks(self, mask, x):
        masks = tf.cast(masks, tf.int32)
        masks = tf.expand_dims(mask, -1)
        masks = tf.compat.v1.image.resize_nearest_neighbour(masks, 
                                                            tf.shape(x)[1:3],
                                                           align_corners=False,
                                                           half_pixel_centers=False)
        masks = tf.squeeze(masks, -1)
        masks = tf.cast(masks, tf.bool)
        return masks
    
    def call(self, inp, training=True, post_process=False):
        x, masks = inp
        x = self.backbone(x, training=training)
        masks = self.downsample_masks(masks, x)
        
        pos_encoding = self.pos_encoder(masks)
        
        hs = self.transformer(self.input_proj(x), masks, self.query_embed(None),
                             pos_encoding, training=training)[0]
        
        outputs_class = self.class_embed(hs)
        
        
        box_ftmps = self.activation(self.bbox_embed_linear1(hs))
        box_ftmps = self.activation(self.bbox_embed_linear2(box_ftmps))
        outputs_coord = tf.sigmoid(self.bbox_embed_linear3(box_ftmps))
        
        output = {'pred_logits' : outputs_class[-1],
                  'pred_boxes' : outputs_coord[-1]}
        
        if post_process:
            output = self.post_process(output)
        return output
    
    def build(self, input_shape=None, **kwargs):
        if input_shape is None:
            input_shape = [(None, None, None, 3), (None, None, None)]
        super().build(input_shape, **kwargs)

In [13]:
def add_heads_nlayers(DetectionTransformer, nb_classes=91):
    image_input = tf.keras.layers.Input(shape=(512, 512, 3))
    class_layer = tf.keras.layers.Dense(91, activation='relu', name="class_layer")
    position_layer = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation = "relu"),
            tf.keras.layers.Dense(256, activation = "relu"),
            tf.keras.layers.Dense(4, activation = "sigmoid"),
        ], name="postion_layer")
        
    
    add_nlayers([class_layer, position_layer])
        
    transformer_output = DetectionTransformer(image_input)
    class_preds =  class_layer(transformer_output)
    position_preds = position_layer(transformer_output)
        
    outputs = {'preds_logits' : class_preds[-1],
                   'pred_boxes' : position_preds[-1]}
    outputs["aux"] = [{"pred_logits" : class_preds[i],
                          "pred_boxes" : position_preds[i]} 
                         for i in range(0,5)]
        
    n_DetectionTransformer = tf.keras.Model(image_input, outputs,
                                               name="transformer_fine_tuning")
        
    return n_DetectionTransformer

##  MODEL

In [14]:
def get_model(nb_classes=91, num_decoder_layers=6, num_encoder_layers=6):
    image_input = tf.keras.Input((512, 512, 3))
    detr = DetectionTransformer(num_encoder_layers=6, num_decoder_layers=6)
    
    backbone = detr.get_layer("backbone")
    transformer = detr.get_layer("transformer")
    position_embedding_sine = detr.get_layer("position_embedding_sine")
    input_proj = detr.get_layer('input_proj')
    query_embed = detr.get_layer('query_embed')
    class_embed = detr.get_layer('query_embed')
    bbox_embed_linear1 = detr.get_layer('bbox_embed_0')
    bbox_embed_linear2 = detr.get_layer('bbox_embed_1')
    bbox_embed_linear3 = detr.get_layer('bbox_embed_2')
    activation = detr.get_layer("re_lu")
    x = backbone(image_input)
    masks = tf.zeros((tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]))
    pos_encoding = position_embedding_sine(masks)
    hs = transformer(input_proj(x), masks, query_embed(None), pos_encoding)[0]
    detr = tf.keras.Model(image_input, hs, name="detr")
    add_heads_nlayers(detr, nb_classes)
    transformer_output = detr(image_input)
    outputs_class = class_embed(tranformer_output)
    box_ftmps = activation(bbox_embed_linear1(tranformer_output))
    box_ftmps = activation(bbox_embed_linear2(box_ftmps))
    outputs_coord = tf.sigmoid(bbox_embed_linear3(box_ftmps))
    outputs={}
    output = {'pred_logits': outputs_class[-1],
             'pred_boxes': outputs_coord[-1]}
    
    output["aux"] = []
    for i in range(0, num_decoder_layers - 1):
        out_class = outputs_class[i]
        pred_boxes = outputs_coord[i]
        output["aux"].append({
            "pred_logits": out_class,
            "pred_boxes": pred_boxes
        })
        
    return tf.keras.Model(image_input, output, name="detr_finetuning")


## TRAINING 

In [15]:
@tf.function
def train_step(model, images, t_bbox, t_class, optimizers):
    gradient_aggregate=1
    
    with tf.GradientTape() as Tape:
        m_outputs = model(images, training=True)
        total_loss = get_losses(m_outputs, t_bbox, t_class)
        total_loss = total_loss/gradient_aggregate
        
    gradient_steps = gather_gradient(model, optimizers, total_loss, tape)
    return m_outputs, total_loss, gradient_steps

def fit(model, train_dt, optimizers, epoch_nb, class_names):
    gradient_aggregate = None
    t = None
    for epoch_step, (images, t_bbox, t_class) in enumerate(train_dt):
        m_outputs, total_loss, gradient_steps = train_step(model, images, t_bbox, t_class, optimizers)
        for name in gradient_steps:
            aggregate_grad_and_apple(name, optimizers, gradient_steps[name]["gradients"], epoch_step)
            
        if epoch_step %100 == 0:
            t = t if t is not None else time.time()
            print(f"Epoch: [{epoch_nb}], \t Step: [{epoch_steps}], \t giou: [{giou_loss}], \t l1 : [{l1_loss}], \t time: [{elapsed: .2f}]")

## Loading DATA

In [16]:
from pycocotools.coco import COCO
import tensorflow as tf
import numpy as np
import imageio
from skimage.color import gray2rgb
from random import sample, shuffle
import os

In [17]:
COCO_CLASS_NAME = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush', "back"
]

In [18]:
def pad_labels(images: tf.Tensor, t_bbox: tf.Tensor, t_class: tf.Tensor):
    """ Pad the bbox by adding [0, 0, 0, 0] at the end
    and one header to indicate how maby bbox are set.
    Do the same with the labels. 
    """
    nb_bbox = tf.shape(t_bbox)[0]

    bbox_header = tf.expand_dims(nb_bbox, axis=0)
    bbox_header = tf.expand_dims(bbox_header, axis=0)
    bbox_header = tf.pad(bbox_header, [[0, 0], [0, 3]])
    bbox_header = tf.cast(bbox_header, tf.float32)
    cls_header = tf.constant([[0]], dtype=tf.int64)

    # Padd bbox and class
    t_bbox = tf.pad(t_bbox, [[0, 100 - 1 - nb_bbox], [0, 0]], mode='CONSTANT', constant_values=0)
    t_class = tf.pad(t_class, [[0, 100 - 1 - nb_bbox], [0, 0]], mode='CONSTANT', constant_values=0)

    t_bbox = tf.concat([bbox_header, t_bbox], axis=0)
    t_class = tf.concat([cls_header, t_class], axis=0)

    return images, t_bbox, t_class
def get_coco_labels(coco, img_id, image_shape, augmentation):
    ann_ids = coco.getAnnIds(imgIds=img_id)
    anns = coco.loadAnns(ann_ids)
    
    bbox = []
    t_class = []
    crowd_bbox = 0
    
    for a, ann in enumerate(anns):
        bbox_x, bbox_y, bbox_w, bbox_h = ann['bbox']
        
        t_cls = ann["category_id"]
        if ann["iscrowd"]:
            crowd_bbox = 1
            
        x_center = bbox_x + float(bbox_w / 2)
        y_center = bbox_y + float(bbox_h / 2)
        x_center = x_center / float(image_shape[1])
        y_center = y_center / float(image_shape[0])
        
        bbox.append([x_center, y_center, bbox_w, bbox_h])
        t_class.append([t_cls])
        
    bbox = np.array(bbox)
    t_class = np.array(t_class)
    
    return bbox.astype(np.float32), t_class.astype(np.int32), crowd_bbox

def get_coco_from_id(coco_id, coco, augmentation, config, img_dir):
    img = coco.loadImgs([coco_id])[0]
    filne_name = img['file_name']
    image_path = os.path.join(img_dir, filne_name)
    image = imageio.imread(image_path)
    if len(image.shape) == 2: image = gray2rgb(image)
    t_bbox, t_class, is_crowd = get_coco_labels(coco, img['id'], image.shape, augmentation)
    image = processing.normalized_images(image, config)   
    image = image.astype(np.float32)
    t_bbox = t_bbox.astype(np.float32)
    t_class = t_class.astype(np.int64)
    is_crowd = np.array(is_crowd, dtype=np.int64)
    return image, t_bbox, t_class, is_crowd

def load_coco_dataset(batch_size, ann_dir, ann_file, img_dir, augmentation=False):
    ann_dir = ann_dir
    ann_file = ann_file
    img_dir = img_dir
    background_class = 0

    coco = COCO(ann_file)
    cats = coco.loadCats(coco.getCatIds())
    # Get the max class ID
    max_id = np.array([cat["id"] for cat in cats]).max()
    class_names = ["N/A"] * (max_id + 2) # + 2 for the background class
    # Add the backgrund class at the end
    class_names[-1] = "back"
    background_class = max_id + 1
    for cat in cats:
        class_names[cat["id"]] = cat["name"]
        
    def numpy_fc(idx, fc, outputs_types=(tf.float32, tf.float32, tf.int64), **params):
        def _np_function(_idx):
            return fc(_idx, **params)
        return tf.numpy_function(_np_function, [idx], outputs_types)

    # Setup the data pipeline
    img_ids = coco.getImgIds()
    shuffle(img_ids)
    dataset = tf.data.Dataset.from_tensor_slices(img_ids)
    dataset = dataset.shuffle(1000)
    outputs_types=(tf.float32, tf.float32, tf.int64, tf.int64)
    dataset = dataset.map(lambda idx: numpy_fc(
        idx, get_coco_from_id, outputs_types=outputs_types, coco=coco, augmentation=augmentation, img_dir=img_dir)
    , num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.filter(lambda imgs, tbbox, tclass, iscrowd: tf.shape(tbbox)[0] > 0 and iscrowd != 1)
    dataset = dataset.map(lambda imgs, tbbox, tclass, iscrowd: (imgs, tbbox, tclass), num_parallel_calls=tf.data.experimental.AUTOTUNE)
    
    dataset = dataset.map(pad_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(32)
    
    return dataset, class_names

In [19]:
# train_dt, coco_class_names = load_coco_dataset(batch_size=1, ann_dir='./annotations_trainval2017/annotations',
#                                                 ann_file='./annotations_trainval2017/annotations/instances_train2017.json', img_dir='./train2017')

In [20]:
detr = get_model(nb_classes=91, num_decoder_layers=6, num_encoder_layers=6)

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

In [None]:
for layer in detr.layers:
    print(layer.name)

## Training and testing the model

In [None]:
# checkpoint = tf.train.Checkpoint(optimizer=optimizer, model = autoencoder)
# epochs = 2
# for epoch in range(epochs):
#     print("\n Start of epoch %d" % (epoch,))
#     for step, x_batch_train in enumerate(train_dataset):
#         with tf.GradientTape() as tape:
#             predictions = autoencoder(x_batch_train, training=True)
#             loss_value = loss_fn(predictions, x_batch_train)
            
#         grads = tape.gradient(loss_value, autoencoder.trainable_weights)
#         optimizer.apply_gradients(zip(grads, autoencoder.trainable_weights))
# #         if step&10000 == 0:
# #             checkpoint.save(checkpoint_prefix)
#         if step%100 == 0:
#             print("Seen so far: %s samples" % ((step+1)*4))
            

In [None]:
# middle.save('middle.h5')

In [None]:
# autoencoder.save('model.h5')