In [1]:
from tensorflow.keras.layers import Input, Flatten, concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Conv2D, UpSampling2D
from tensorflow.keras.layers import Reshape
import os
from os.path import join
import sys
import cv2
import numpy as np

In [2]:
# Fixed network dimensions shared by the model builder and the prediction loop
# (no need to edit).
t = 16        # number of frames in each input videoclip
c = 3         # channels per frame
h = w = 112   # height and width of the frames fed to the coarse network
upsample = 4  # factor applied to h and w for the fine branch (112 x 4 = 448)

In [3]:
def getCoarse2FineModel(summary=True):
    """Build the coarse-to-fine saliency model (channels-last layout).

    A single 3D-convolutional "coarse" network is shared by two clips
    (`input1`, the cropped clip, and `input2`, the original clip). Its
    prediction on the original clip is upsampled by `upsample`, concatenated
    channel-wise with the enlarged last RGB frame (`input3`) and refined by a
    stack of 2D convolutions.

    Relies on the module-level constants `t`, `c`, `h`, `w` and `upsample`.

    Parameters
    ----------
    summary : bool
        When True, print the Keras summary of the assembled model.

    Returns
    -------
    Model
        Keras model with inputs `[input1, input2, input3]` and the flattened
        outputs `[cropped_output, full_fine_output]`.
    """
    # defined input (all channels-last)
    videoclip_cropped = Input((t, h, w, c), name='input1')
    videoclip_original = Input((t, h, w, c), name='input2')
    last_frame_bigger = Input((h * upsample, w * upsample, c), name='input3')

    conv3d_args = dict(activation='relu', padding='same', strides=(1, 1, 1),
                       data_format='channels_last')
    pool3d_args = dict(padding='valid', data_format='channels_last')
    conv2d_args = dict(padding='same', data_format='channels_last')

    # coarse saliency model: 3D-conv encoder
    coarse_saliency_model = Sequential()
    coarse_saliency_model.add(Conv3D(64, [3, 3, 3], name='conv1', **conv3d_args))
    coarse_saliency_model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='pool1', **pool3d_args))
    coarse_saliency_model.add(Conv3D(128, [3, 3, 3], name='conv2', **conv3d_args))
    coarse_saliency_model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), name='pool2', **pool3d_args))
    coarse_saliency_model.add(Conv3D(256, [3, 3, 3], name='conv3a', **conv3d_args))
    coarse_saliency_model.add(Conv3D(256, [3, 3, 3], name='conv3b', **conv3d_args))
    coarse_saliency_model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), name='pool3', **pool3d_args))
    coarse_saliency_model.add(Conv3D(512, [3, 3, 3], name='conv4a', **conv3d_args))
    coarse_saliency_model.add(Conv3D(512, [3, 3, 3], name='conv4b', **conv3d_args))
    coarse_saliency_model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(4, 2, 2), name='pool4', **pool3d_args))

    # The four pooling layers halve h and w four times and (for t = 16) collapse
    # the temporal axis to 1, so the encoder output is (1, h/16, w/16, 512).
    # Drop the temporal axis while keeping channels last; reshaping to
    # (512, 7, 7) would scramble the features for the channels-last 2D layers.
    coarse_saliency_model.add(Reshape((h // 16, w // 16, 512)))

    # 2D decoder: four (BN -> conv -> LeakyReLU -> x2 upsampling) stages ...
    for n_filters in (256, 128, 64, 32):
        coarse_saliency_model.add(BatchNormalization(axis=-1))
        coarse_saliency_model.add(Conv2D(n_filters, [3, 3], kernel_initializer='glorot_uniform', **conv2d_args))
        coarse_saliency_model.add(LeakyReLU(alpha=.001))
        coarse_saliency_model.add(UpSampling2D(size=(2, 2), data_format='channels_last'))
    # ... followed by two (BN -> conv -> LeakyReLU) stages at full resolution.
    for n_filters in (16, 1):
        coarse_saliency_model.add(BatchNormalization(axis=-1))
        coarse_saliency_model.add(Conv2D(n_filters, [3, 3], kernel_initializer='glorot_uniform', **conv2d_args))
        coarse_saliency_model.add(LeakyReLU(alpha=.001))

    # loss on cropped image
    coarse_saliency_cropped = coarse_saliency_model(videoclip_cropped)
    cropped_output = Flatten(name='cropped_output')(coarse_saliency_cropped)

    # coarse-to-fine saliency model and loss (same coarse weights, original clip)
    coarse_saliency_original = coarse_saliency_model(videoclip_original)

    x = UpSampling2D((upsample, upsample), name='coarse_saliency_upsampled',
                     data_format='channels_last')(coarse_saliency_original)  # 112 x 4 = 448

    # Merge the last RGB frame along the channel axis: (H, W, 1) + (H, W, 3).
    # With channels-last tensors the channel axis is the last one, not axis 1.
    x = concatenate([x, last_frame_bigger], axis=-1)

    # NOTE(review): the first refinement convolution has no activation before
    # the next one, unlike the following stages - confirm this is intended.
    x = Conv2D(32, [3, 3], kernel_initializer='he_normal', **conv2d_args)(x)
    for n_filters in (64, 32, 32, 16, 4):
        x = Conv2D(n_filters, [3, 3], kernel_initializer='he_normal', **conv2d_args)(x)
        x = LeakyReLU(alpha=.001)(x)

    fine_saliency_model = Conv2D(1, [3, 3], activation='relu', **conv2d_args)(x)

    # loss on full image
    full_fine_output = Flatten(name='full_fine_output')(fine_saliency_model)

    final_model = Model(inputs=[videoclip_cropped, videoclip_original, last_frame_bigger],
                        outputs=[cropped_output, full_fine_output])

    if summary:
        # summary() prints by itself and returns None; wrapping it in print()
        # would emit a stray "None" line.
        final_model.summary()

    return final_model

In [4]:
def predict_video(model, folder_in, output_path, mean_frame_path):
    """Predict a fine attentional map for each sliding clip of `t` frames.

    Every image in `folder_in` is loaded (in sorted filename order) and has the
    mean frame subtracted. For each index `i` in `range(t, n_frames)` the clip
    `frames[i - t:i]` is resized to (h, w) and fed to the model as both the
    "cropped" and the "original" clip, together with the clip's last frame
    resized to (h * upsample, w * upsample). The second model output is
    clipped to [0, 255], rescaled so its maximum is 255 and written to
    `output_path/<i as 6 digits>.png`.

    Arrays are passed channels-last, matching the inputs declared by
    `getCoarse2FineModel` in this notebook: (1, t, h, w, c) and (1, H, W, c).
    Relies on the module-level constants `t`, `h`, `w` and `upsample`.

    Parameters
    ----------
    model : keras Model
        Model taking `[clip, clip, last_frame_bigger]` and returning
        `[cropped_output, full_fine_output]`.
    folder_in : str
        Directory containing the video frames as image files.
    output_path : str
        Directory where predictions are written (created if missing).
    mean_frame_path : str
        Path of the mean frame image subtracted from every frame.

    Raises
    ------
    IOError
        If the mean frame cannot be read.
    """
    mean_frame = cv2.imread(mean_frame_path)
    if mean_frame is None:
        # cv2.imread signals failure by returning None instead of raising
        raise IOError('Could not read mean frame: {0}'.format(mean_frame_path))
    print(mean_frame.shape)

    # cv2.imwrite fails silently when the target directory does not exist
    os.makedirs(output_path, exist_ok=True)

    # load frames to predict; os.listdir order is arbitrary, so sort the names
    # to keep the clips temporally ordered
    frames = []
    for frame_name in sorted(os.listdir(folder_in)):
        frame = cv2.imread(join(folder_in, frame_name))
        if frame is None:
            continue  # not a readable image (e.g. a stray non-image file)
        frames.append(frame.astype(np.float32) - mean_frame)
    print('Done loading frames.')

    # start of prediction
    for i in range(t, len(frames)):

        sys.stdout.write('\r{0}: predicting on frame {1:06d}...'.format(folder_in, i))

        # videoclip of t frames: indices i - t .. i - 1
        clip = frames[i - t: i]

        # cv2.resize expects dsize as (width, height)
        x_last_bigger = cv2.resize(clip[-1], (w * upsample, h * upsample))
        x_last_bigger = x_last_bigger[None, :].astype(np.float32)  # (1, H, W, c)

        x = np.array([cv2.resize(f, (w, h)) for f in clip], dtype=np.float32)
        x = x[None, :]  # (1, t, h, w, c)

        # predict attentional map on last frame of the videoclip
        res = model.predict_on_batch([x, x, x_last_bigger])
        res = np.asarray(res[1])  # keep only fine output
        res = np.clip(res, a_min=0, a_max=255)

        # normalize attentional map to the 0..255 range; an all-zero map would
        # otherwise divide by zero and turn into NaN garbage
        peak = res.max()
        if peak > 0:
            res_norm = ((res / peak) * 255).astype(np.uint8)
        else:
            res_norm = np.zeros(res.shape, dtype=np.uint8)
        res_norm = np.reshape(res_norm, (h * upsample, w * upsample))

        cv2.imwrite(join(output_path, '{0:06d}.png'.format(i)), res_norm)

In [5]:
from utils import getCoarse2FineModel, predict_video
from tensorflow.keras.optimizers import Adam

# NOTE(review): the `from utils import getCoarse2FineModel, predict_video` line
# above rebinds both names, shadowing the definitions from the earlier cells.
# The captured summary shows channels-first shapes, which suggests the `utils`
# versions were the ones executed - confirm which implementation is intended.
if __name__ == '__main__':

    mean_frame_path = 'data_sample/dreyeve_mean_frame.png'
    mean_frame = cv2.imread(mean_frame_path)
    if mean_frame is None:
        # cv2.imread returns None on failure instead of raising
        raise IOError('Could not read mean frame: {0}'.format(mean_frame_path))
    print(mean_frame.shape)

    output_dir_root = 'out'
    weights_file = 'weights/model_weights.h5'
    dreyeve_data_dir = 'data_sample/54'

    # load model for prediction
    model = getCoarse2FineModel(summary=True)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and is ignored or rejected by newer Keras optimizers.
    opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=opt,
                  loss={'cropped_output': 'mse', 'full_fine_output': 'mse'},
                  loss_weights={'cropped_output': 1.0, 'full_fine_output': 1.0})

    # load pre-trained weights
    #model.load_weights(weights_file)

    # predict on sample data (first 200 frames of run 54 from DR(eye)VE)
    #predict_video(model, dreyeve_data_dir,
    #              output_path=output_dir_root,
    #              mean_frame_path='data_sample/dreyeve_mean_frame.png')


(1080, 1920, 3)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input1 (InputLayer)             [(None, 3, 16, 112,  0                                            
__________________________________________________________________________________________________
input2 (InputLayer)             [(None, 3, 16, 112,  0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 1, 112, 112)  15072421    input1[0][0]                     
                                                                 input2[0][0]                     
__________________________________________________________________________________________________
coarse_saliency_upsampled (UpSa (None, 1, 448, 448)  0           sequential[1]