# Ernesto Antonio Reyes Ramírez




# Spatially Supervised Recurrent Convolutional Neural Networks for Visual Object Tracking

# Guanghan Ning∗, Zhi Zhang, Chen Huang, Zhihai He
# Department of Electrical and Computer Engineering, University of Missouri

*Parte del código y herramientas propiedad del autor Guanghan Ning.

## Introducción



En este artículo los autores proponen un nuevo modelo de tracking que supera a los mejores modelos de tracking en su momento. El modelo consiste en utilizar el modelo YOLO junto con una red recurrente (Recurrent YOLO) para así poder obtener un modelo que logre predecir la ubicación de un objeto a lo largo de un video. 

### Modelo YOLO

![alt text](yolo.png "YOLO")

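Las predicciones en YOLO se codifican como un tensor de tamaño $S\times S \times (B \times 5 + C)$, donde $S \times S$ es la cuadrícula de celdas en la que se divide la imagen, $B$ es el número de cajas que predice cada celda (cada una con 5 valores: $x, y, w, h$ y su confianza) y $C$ es el número de clases. En este artículo se utiliza $S=7, B=2, C=20$.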

### Modelo ROLO (Recurrent YOLO)

![alt text](img1.png "ROLO")

![alt text](rnn.png "ROLO")

![alt text](img2.png "ROLO")

# Entrenamiento

### El entrenamiento se hace en tres fases:

1. Se entrena la red convolucional con ImageNet (1000 clases).
2. Se toma un modelo pre-entrenado de YOLO.
3. Finalmente, se entrena el modelo recurrente completo con los videos de la base de datos OTB-30 utilizando la siguiente función:

![alt text](entreno.png "ROLO")

# Implementación de ROLO

In [4]:
#Librerias importantes
import cv2
import os
import numpy as np
import sys
import os.path
import time
import random
import ROLO_utils as utils
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell

In [5]:
# Switch TensorFlow to graph mode: the tf.compat.v1 placeholders and sessions
# used by ROLO_TF below are graph-mode constructs.
tf.compat.v1.disable_eager_execution()

### Modelo ROLO

In [8]:
class ROLO_TF:
    """ROLO (Recurrent YOLO) tracker: an LSTM run over per-frame YOLO outputs.

    The graph inputs (placeholders and output-layer variables) are created as
    class attributes, i.e. once, in whatever TensorFlow graph is the default
    when this class statement executes. Instantiating the class immediately
    runs :meth:`ROLO`, which parses ``argvs`` and dispatches to the requested
    mode (the default mode evaluates the pre-trained model on 30 sequences).
    """

    disp_console = True       # print progress messages while building the graph
    restore_weights = True    # load `rolo_weights_file` before testing

    # YOLO parameters
    fromfile = None
    tofile_img = 'test/output.jpg'
    tofile_txt = 'test/output.txt'
    imshow = True
    filewrite_img = False
    filewrite_txt = False
    yolo_weights_file = 'weights/YOLO_small.ckpt'
    alpha = 0.1
    threshold = 0.2
    iou_threshold = 0.5
    num_class = 20
    num_box = 2
    grid_size = 7
    classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train","tvmonitor"]
    w_img, h_img = [352, 240]

    # ROLO network parameters
    rolo_weights_file = 'model_demo.ckpt'
    lstm_depth = 3
    num_steps = 3       # number of frames fed to the LSTM as one input sequence
    num_feat = 4096
    num_predict = 6     # size of the location block appended to the features
    num_gt = 4          # ground-truth box has 4 values
    num_input = num_feat + num_predict  # per-frame input width: 4096 + 6 = 4102

    # ROLO run parameters
    batch_size = 1
    display_step = 1    # compute/print the loss every `display_step` iterations

    # Graph inputs
    x = tf.compat.v1.placeholder("float32", [batch_size, num_steps, num_input])
    print(x.shape)
    # Concatenated LSTM state (cell state + hidden state), hence 2 * num_input.
    # This must match the zero state fed in `testing`, which is
    # (batch_size, 2 * num_input).
    istate = tf.compat.v1.placeholder("float32", [batch_size, 2 * num_input])
    y = tf.compat.v1.placeholder("float32", [batch_size, num_gt])

    # Output-layer parameters. They are passed to LSTM_single, which does not
    # currently use them.
    weights = {
        'out': tf.Variable(tf.random.normal([num_input, num_predict]))
    }
    biases = {
        'out': tf.Variable(tf.random.normal([num_predict]))
    }

    def __init__(self, argvs=None):
        """Create the tracker and immediately run the mode selected by *argvs*.

        Args:
            argvs: command-line style argument list forwarded to
                ``ROLO_utils.argv_parser``; ``None`` means "no arguments".
        """
        print("ROLO init")
        # Avoid a shared mutable default: build a fresh list per instance.
        self.ROLO([] if argvs is None else argvs)

    def LSTM_single(self, name, _X, _istate, weights, biases):
        """Build a single-layer LSTM over the input sequence.

        The method only adds ops to the current graph (plain TF1-style graph
        construction, no ``tf.function`` tracing) so that variable-scope reuse
        works across repeated calls.

        Args:
            name: label for the call site; not used by the implementation.
            _X: tensor of shape (batch_size, num_steps, num_input).
            _istate: initial concatenated LSTM state,
                shape (batch_size, 2 * num_input).
            weights: output-layer weights dict; not used by the implementation.
            biases: output-layer biases dict; not used by the implementation.

        Returns:
            The ``outputs`` list of the last ``static_rnn`` call, i.e. a
            one-element list holding the LSTM output for the final step.
        """
        # (batch, steps, input) -> (steps, batch, input)
        _X = tf.transpose(_X, perm=[1, 0, 2])
        # Flatten to (num_steps * batch_size, num_input) ...
        _X = tf.reshape(_X, [self.num_steps * self.batch_size, self.num_input])
        # ... and split into num_steps tensors of (batch_size, num_input),
        # because static_rnn takes a Python list of per-step inputs.
        _X = tf.split(_X, self.num_steps, axis=0)

        # Only num_units is passed positionally: the second positional slot of
        # the compat.v1 LSTMCell is `use_peepholes`, not an input size.
        # state_is_tuple=False makes the cell accept the single concatenated
        # state tensor supplied through `istate`.
        # NOTE(review): confirm the checkpoint was saved from a cell without
        # peepholes.
        cell = tf.compat.v1.nn.rnn_cell.LSTMCell(self.num_input, state_is_tuple=False)
        state = _istate
        for step in range(self.num_steps):
            outputs, state = tf.compat.v1.nn.static_rnn(cell, inputs=[_X[step]], initial_state=state)
            # Later steps (and later calls) must reuse the LSTM variables.
            tf.compat.v1.get_variable_scope().reuse_variables()
        return outputs

    def build_networks(self):
        """Build the ROLO graph and create the session and saver on ``self``."""
        if self.disp_console:
            print("Building ROLO graph...")

        # Build ROLO layers
        self.lstm_module = self.LSTM_single('lstm_test', self.x, self.istate, self.weights, self.biases)
        self.ious = tf.Variable(tf.zeros([self.batch_size]), name="ious")
        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.saver = tf.compat.v1.train.Saver()
        if self.disp_console:
            print("Loading complete!" + '\n')

    def testing(self, x_path, y_path):
        """Run the model over one video sequence and save its predictions.

        Reads ``self.testing_iters``, ``self.output_path``, ``self.w_img`` and
        ``self.h_img`` (set by :meth:`ROLO`) and ``self.saver`` (set by
        :meth:`build_networks`).

        Args:
            x_path: folder holding the per-frame YOLO outputs of the sequence.
            y_path: path of the ground-truth file of the sequence.

        Returns:
            None. Predictions are written via ``utils.save_rolo_output_test``
            and progress/loss is printed.
        """
        total_loss = 0
        num_losses = 0  # how many minibatch losses were added to total_loss

        print("TESTING ROLO...")
        # NOTE(review): every call adds a fresh copy of these ops to the
        # graph; when testing many sequences the graph keeps growing.
        pred = self.LSTM_single('lstm_train', self.x, self.istate, self.weights, self.biases)
        print("pred: ", pred)
        # Columns 4097..4100 of the final-step output; presumably the
        # normalised (x, y, w, h) of the box, since they are scaled by
        # w_img/h_img below - confirm against the YOLO output layout.
        self.pred_location = pred[0][:, 4097:4101]
        print("pred_location: ", self.pred_location)
        print("self.y: ", self.y)

        # Squared error against the ground truth; `accuracy` is really a loss
        # (mean squared error * 100), the attribute names are kept as they are.
        self.correct_prediction = tf.square(self.pred_location - self.y)
        print("self.correct_prediction: ", self.correct_prediction)
        self.accuracy = tf.reduce_mean(self.correct_prediction) * 100
        print("self.accuracy: ", self.accuracy)

        # Initializing the variables
        init = tf.compat.v1.global_variables_initializer()

        # Launch the graph
        with tf.compat.v1.Session() as sess:
            sess.run(init)
            if self.restore_weights:
                self.saver.restore(sess, self.rolo_weights_file)
                print("Loading complete!" + '\n')

            # The LSTM always starts each window from an all-zero state.
            zero_state = np.zeros((self.batch_size, 2 * self.num_input))

            frame_idx = 0
            while frame_idx < self.testing_iters - self.num_steps:
                # Load inputs and ground truth for the window starting here
                batch_xs = self.rolo_utils.load_yolo_output_test(x_path, self.batch_size, self.num_steps, frame_idx)
                batch_ys = self.rolo_utils.load_rolo_gt_test(y_path, self.batch_size, self.num_steps, frame_idx)
                print("Batch_ys_initial: ", batch_ys)
                batch_ys = utils.locations_from_0_to_1(self.w_img, self.h_img, batch_ys)

                # Reshape to the placeholder shapes
                batch_xs = np.reshape(batch_xs, [self.batch_size, self.num_steps, self.num_input])
                batch_ys = np.reshape(batch_ys, [self.batch_size, self.num_gt])
                print("Batch_ys: ", batch_ys)

                feed = {self.x: batch_xs, self.y: batch_ys, self.istate: zero_state}

                pred_location = sess.run(self.pred_location, feed_dict=feed)
                print("ROLO Pred: ", pred_location)
                print("ROLO Pred in pixel: ", pred_location[0][0]*self.w_img, pred_location[0][1]*self.h_img, pred_location[0][2]*self.w_img, pred_location[0][3]*self.h_img)

                # Save pred_location to file
                utils.save_rolo_output_test(self.output_path, pred_location, frame_idx, self.num_steps, self.batch_size)

                if frame_idx % self.display_step == 0:
                    # Calculate batch loss
                    loss = sess.run(self.accuracy, feed_dict=feed)
                    print("Iter " + str(frame_idx*self.batch_size) + ", Minibatch Loss= " + "{:.6f}".format(loss))
                    total_loss += loss
                    num_losses += 1
                frame_idx += 1
                print(frame_idx)

            print("Testing Finished!")
            # Average over the losses actually accumulated; a sequence too
            # short to yield any window gives NaN instead of dividing by zero.
            avg_loss = total_loss / num_losses if num_losses else float('nan')
            print("Avg loss: " + str(avg_loss))

        return None

    def ROLO(self, argvs):
        """Parse *argvs* and run the selected mode.

        With no train/track/detect flag set by ``rolo_utils``, the pre-trained
        model is evaluated on video sequences 0..29.

        Args:
            argvs: argument list forwarded to ``ROLO_utils.argv_parser``.
        """
        self.rolo_utils = utils.ROLO_utils()
        self.rolo_utils.loadCfg()
        self.params = self.rolo_utils.params

        self.rolo_utils.argv_parser(argvs)

        # NOTE(review): `training`, `track_from_file` and `detect_from_file`
        # are not defined in this class as shown; these branches raise
        # AttributeError unless the methods are provided elsewhere.
        if self.rolo_utils.flag_train is True:
            self.training(utils.x_path, utils.y_path)
        elif self.rolo_utils.flag_track is True:
            self.build_networks()
            self.track_from_file(utils.file_in_path)
        elif self.rolo_utils.flag_detect is True:
            self.build_networks()
            self.detect_from_file(utils.file_in_path)
        else:
            print("Default: running ROLO test.")
            self.build_networks()

            evaluate_st = 0
            evaluate_ed = 29

            for test in range(evaluate_st, evaluate_ed + 1):

                [self.w_img, self.h_img, sequence_name, dummy_1, self.testing_iters] = utils.choose_video_sequence(test)

                # sequence_name is a bare folder name, so the separators
                # must be added explicitly (DATA/<sequence>/...).
                x_path = 'DATA/' + sequence_name + '/yolo_out/'
                y_path = 'DATA/' + sequence_name + '/groundtruth_rect.txt'
                self.output_path = 'DATA/' + sequence_name + '/rolo_out_test/'
                utils.createFolder(self.output_path)

                self.rolo_weights_file = 'model_demo.ckpt'

                self.num_steps = 3  # number of frames as an input sequence
                print("TESTING ROLO on video sequence: ", sequence_name)
                self.testing(x_path, y_path)

(1, 3, 4102)


In [9]:
# Discard the current default graph and start from an empty one.
# NOTE(review): ROLO_TF creates its placeholders and variables in the class
# body, i.e. in the graph that was the default when the class cell ran. If this
# reset runs after that cell, those tensors stay attached to the discarded
# graph - re-run the class cell afterwards, or confirm the intended cell order.
tf.compat.v1.reset_default_graph()

In [None]:
# Instantiating the class runs it: __init__ calls ROLO(), which, when
# rolo_utils sets none of the train/track/detect flags, evaluates the model on
# sequences 0..29 and writes the predictions to DATA/.../rolo_out_test/.
ROLO_TF()

# Prueba del modelo con los videos de OTB-30

In [187]:
# Window length used by the evaluation loop below to offset frame indices.
# NOTE(review): the ROLO_TF class in this file uses num_steps = 3 when it
# writes rolo_out_test; confirm that 6 is the intended value here.
num_steps= 6
test = 5 # index of the video sequence to evaluate

[wid, ht, sequence_name, dummy_1, dummy_2] = utils.choose_video_sequence(test) # video metadata; wid/ht are used below as the output frame size

In [188]:
# Locations of the selected sequence's data under DATA/<sequence_name>/:
# input frames, ground-truth file, YOLO outputs and ROLO test outputs.
img_fold_path = f"DATA/{sequence_name}/img/"
gt_file_path = f"DATA/{sequence_name}/groundtruth_rect.txt"
yolo_out_path = f"DATA/{sequence_name}/yolo_out/"
rolo_out_path = f"DATA/{sequence_name}/rolo_out_test/"

In [189]:
# Load the data for the selected sequence.
# paths_imgs is indexed by frame id below and each entry is passed to
# utils.file_to_img; paths_rolo is only used for its length (number of ROLO
# outputs); lines is handed to utils.find_gt_location to look up ground truth.
paths_imgs = utils.load_folder( img_fold_path)
paths_rolo= utils.load_folder( rolo_out_path)
lines = utils.load_dataset_gt( gt_file_path)

In [190]:
# Create the VideoWriter that collects the annotated frames (DIVX, 20 fps,
# frame size taken from the sequence metadata).
fourcc = cv2.VideoWriter_fourcc(*'DIVX')
video_name = sequence_name + '_test.avi'
video_dir = "output/videos/"
# cv2.VideoWriter does not create missing directories; without this the writer
# silently fails to open and every video.write() call is dropped.
os.makedirs(video_dir, exist_ok=True)
video_path = video_dir + video_name
video = cv2.VideoWriter(video_path, fourcc, 20, (wid, ht))
if not video.isOpened():
    # Keep going (frames are still saved as images), but say why there will
    # be no video file.
    print("Warning: could not open video writer for " + video_path)

### Notación

1. $[x,y,w,h]$ es como se va a representar la caja que encierra (posiblemente) al objeto, donde $(x,y)$ es el centro de la caja, $w$ su ancho y $h$ su altura.

2. Ground Truth = gt = "La localización real de la caja que encierra al objeto".

3. Utilizamos la medida Intersection over Union, denotada por IOU.

In [191]:
# Compare YOLO and ROLO against the ground truth on the selected sequence:
# for every frame, print the three boxes, draw them on the frame, append the
# frame to the output video, save it as an image and accumulate the IOUs.
total = 0
rolo_avgloss = 0  # running sum, then mean, of the ROLO IOU (name kept)
yolo_avgloss = 0  # running sum, then mean, of the YOLO IOU (name kept)


def _format_location(location):
    """Return 'x = ..,y = ..,w = ..,h = ..' plus a newline for a 4-value box."""
    x, y, w, h = (str(location[k]) for k in range(4))
    return "x = " + x + "," + "y = " + y + "," + "w = " + w + "," + "h = " + h + "\n"


print(sequence_name + "\n")

# The frames folder does not depend on the frame, so create it once.
frames_dir = 'output/frames/' + sequence_name
utils.createFolder(frames_dir)

try:
    for i in range(len(paths_rolo) - num_steps):
        # Frame id evaluated in this iteration (same value as the original
        # `(i + 1) + num_steps - 2`).
        test_id = i + num_steps - 1

        path = paths_imgs[test_id]
        img = utils.file_to_img(path)

        if img is None:
            break

        print('gt: ' + str(test_id) + "\n")

        # YOLO
        yolo_location = utils.find_yolo_location(yolo_out_path, test_id)
        yolo_location = utils.locations_normal(wid, ht, yolo_location)
        print("YOLO")
        print(_format_location(yolo_location))

        # ROLO
        rolo_location = utils.find_rolo_location(rolo_out_path, test_id)
        rolo_location = utils.locations_normal(wid, ht, rolo_location)
        print("ROLO")
        print(_format_location(rolo_location))

        # Ground truth (its lines are looked up with index test_id - 1)
        gt_location = utils.find_gt_location(lines, test_id - 1)

        print("Ground Truth")
        print(_format_location(gt_location))
        print(100*"-")

        frame = utils.debug_3_locations(img, gt_location, yolo_location, rolo_location)
        video.write(frame)

        # Save the annotated frame as an image
        frame_name = frames_dir + "/" + str(test_id) + '.jpg'
        cv2.imwrite(frame_name, frame)

        rolo_loss = utils.cal_rolo_IOU(rolo_location, gt_location)
        rolo_avgloss += rolo_loss
        yolo_loss = utils.cal_yolo_IOU(yolo_location, gt_location)
        yolo_avgloss += yolo_loss
        total += 1
finally:
    # Always finalise the video file, even if a frame fails to process.
    video.release()
    cv2.destroyAllWindows()

# Guard against an empty run (no ROLO outputs, or the first image missing),
# which would otherwise divide by zero.
if total:
    rolo_avgloss /= total
    yolo_avgloss /= total
print("Yolo average IOU = ", yolo_avgloss)
print("Rolo average IOU = ", rolo_avgloss)

Dog

gt: 5

YOLO
x = 83.4523696899414,y = 116.72435760498047,w = 71.2067642211914,h = 67.12134552001953

ROLO
x = 100.607605,y = 111.24709,w = 59.79282,h = 43.320858

Ground Truth
x = 73,y = 91,w = 49,h = 35

----------------------------------------------------------------------------------------------------
gt: 6

YOLO
x = 85.21809387207031,y = 116.63056945800781,w = 73.57920837402344,h = 71.19841766357422

ROLO
x = 102.30512,y = 110.253456,w = 58.556835,h = 42.26599

Ground Truth
x = 74,y = 93,w = 49,h = 32

----------------------------------------------------------------------------------------------------
gt: 7

YOLO
x = 85.37262725830078,y = 117.5562973022461,w = 76.59344482421875,h = 69.40432739257812

ROLO
x = 100.52646,y = 110.711205,w = 56.65702,h = 41.018684

Ground Truth
x = 75,y = 93,w = 47,h = 34

----------------------------------------------------------------------------------------------------
gt: 8

YOLO
x = 84.2188949584961,y = 119.24789428710938,w = 71.442138671875,h

gt: 33

YOLO
x = 81.50782012939453,y = 123.03397369384766,w = 63.95811462402344,h = 66.85452270507812

ROLO
x = 77.11859,y = 118.52984,w = 49.366272,h = 38.54873

Ground Truth
x = 62,y = 100,w = 40,h = 31

----------------------------------------------------------------------------------------------------
gt: 34

YOLO
x = 83.7504653930664,y = 124.32530975341797,w = 57.605777740478516,h = 63.19535446166992

ROLO
x = 69.635284,y = 117.8116,w = 48.059265,h = 39.175972

Ground Truth
x = 62,y = 103,w = 36,h = 30

----------------------------------------------------------------------------------------------------
gt: 35

YOLO
x = 85.52249145507812,y = 126.81935119628905,w = 54.29682922363281,h = 58.456356048583984

ROLO
x = 73.09582,y = 119.29919,w = 46.030838,h = 40.358994

Ground Truth
x = 62,y = 101,w = 38,h = 35

----------------------------------------------------------------------------------------------------
gt: 36

YOLO
x = 88.48023986816405,y = 128.84613037109375,w = 61.74460601806

gt: 61

YOLO
x = 114.2789077758789,y = 138.7799835205078,w = 38.18206787109375,h = 52.664005279541016

ROLO
x = 108.53152,y = 123.415764,w = 39.40199,h = 31.475094

Ground Truth
x = 96,y = 113,w = 30,h = 24

----------------------------------------------------------------------------------------------------
gt: 62

YOLO
x = 116.60482025146486,y = 139.48690795898438,w = 39.8046760559082,h = 53.18143081665039

ROLO
x = 109.24338,y = 123.30365,w = 38.94488,h = 32.43729

Ground Truth
x = 98,y = 113,w = 29,h = 26

----------------------------------------------------------------------------------------------------
gt: 63

YOLO
x = 118.03151702880861,y = 138.7025146484375,w = 33.2087287902832,h = 53.72489547729492

ROLO
x = 109.475655,y = 123.744835,w = 39.487488,h = 32.70274

Ground Truth
x = 98,y = 112,w = 30,h = 27

----------------------------------------------------------------------------------------------------
gt: 64

YOLO
x = 120.65818786621094,y = 139.8528289794922,w = 37.1115684509

gt: 89

YOLO
x = 190.19937133789062,y = 118.87259674072266,w = 74.16229248046875,h = 65.0979232788086

ROLO
x = 180.04422,y = 119.20127,w = 26.243694,h = 32.21131

Ground Truth
x = 165,y = 108,w = 25,h = 27

----------------------------------------------------------------------------------------------------
gt: 90

YOLO
x = 191.92199707031247,y = 114.938720703125,w = 72.56987762451172,h = 68.8593978881836

ROLO
x = 183.08955,y = 119.5381,w = 27.728777,h = 32.378162

Ground Truth
x = 166,y = 107,w = 26,h = 27

----------------------------------------------------------------------------------------------------
gt: 91

YOLO
x = 192.72476196289062,y = 117.30052947998047,w = 77.41567993164062,h = 67.60790252685547

ROLO
x = 188.15997,y = 116.04431,w = 27.37529,h = 31.62453

Ground Truth
x = 169,y = 104,w = 28,h = 30

----------------------------------------------------------------------------------------------------
gt: 92

YOLO
x = 194.7666015625,y = 117.34569549560547,w = 69.6097183227539

gt: 117

YOLO
x = 261.7933349609375,y = 96.11685943603516,w = 44.231658935546875,h = 73.49740600585938

ROLO
x = 240.23517,y = 121.54474,w = 46.32375,h = 30.821922

Ground Truth
x = 237,y = 107,w = 30,h = 25

----------------------------------------------------------------------------------------------------
gt: 118

YOLO
x = 263.96246337890625,y = 95.98726654052734,w = 45.023895263671875,h = 74.84310913085938

ROLO
x = 240.37683,y = 121.47251,w = 45.030575,h = 31.015327

Ground Truth
x = 237,y = 109,w = 32,h = 25

----------------------------------------------------------------------------------------------------
gt: 119

YOLO
x = 265.2312927246094,y = 95.50464630126953,w = 44.522674560546875,h = 74.01004028320312

ROLO
x = 243.91638,y = 122.36093,w = 46.040066,h = 30.968063

Ground Truth
x = 241,y = 109,w = 33,h = 26

----------------------------------------------------------------------------------------------------
gt: 120

YOLO
x = 266.2236328125,y = 95.92918395996094,w = 44.75238