In [None]:
from __future__ import division
from six.moves import xrange
from sklearn.metrics import average_precision_score

import tensorflow as tf
import numpy as np
import time
import pickle
import operator
import os
import io
import random
import re
from tensorflow.python.framework import dtypes
from tensorflow.python.ops.nn import dropout as drop
#from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib import rnn
from math import sqrt
import random
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
#keras
import heapq
import random
from tqdm import tqdm,trange
import pandas as pd
import gensim
import keras.backend.tensorflow_backend as KTF
import keras.backend as K
from keras.layers import Conv2D, MaxPooling2D, Flatten,concatenate,multiply,RepeatVector,Dot,Activation,Reshape
from keras.layers import Input, LSTM, Embedding, Dense,TimeDistributed,CuDNNGRU,CuDNNLSTM,GRU,LSTM,Lambda,Dropout,Bidirectional
from keras.regularizers import l2
from keras.models import Model, Sequential
from keras.preprocessing import image
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping,Callback
from keras_tqdm import TQDMNotebookCallback
from keras.optimizers import Adam,Adadelta

from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.utils import multi_gpu_model

In [None]:
#shape=(length, 256)，tf.float = 32
def htanh(input_tensor,units=256):
    """Gated tanh unit built from two parallel Dense projections.

    ``input_tensor`` is fed to a tanh-activated Dense layer and to a
    sigmoid-activated Dense layer, each with ``units`` outputs; the
    element-wise product of the two results is returned.
    """
    # The tanh branch is created and applied first, then the sigmoid branch,
    # so Keras layer creation order stays the same.
    candidate_layer = Dense(units, activation='tanh')
    gate_layer = Dense(units, activation='sigmoid')
    candidate = candidate_layer(input_tensor)
    gate = gate_layer(input_tensor)
    return multiply([candidate, gate])

In [None]:
#top_down_attention
def top_down_attention(image_vec,encoded_vec):
    """Attention-pool the region vectors of ``image_vec`` guided by ``encoded_vec``.

    ``encoded_vec`` is repeated once per region, concatenated with the region
    vectors, passed through ``htanh`` (512 units) and a 1-unit Dense layer to
    obtain one score per region.  The scores are normalised across regions and
    used to form a weighted sum of the region vectors.

    Args:
        image_vec: Keras/TF1 tensor indexed as (batch, K, image_dim); K and
            image_dim are read from its static shape.
        encoded_vec: 2-D tensor (batch, dim) accepted by ``RepeatVector``.

    Returns:
        Tensor of shape (batch, image_dim).
    """
    # Local import keeps this cell self-contained; keras is already a
    # dependency of the notebook.
    from keras.activations import softmax

    K_size,image_dim = image_vec.shape[1].value,image_vec.shape[-1].value
    repet_vec = RepeatVector(K_size)(encoded_vec)
    concat_vec = concatenate([image_vec,repet_vec])
    attention_temp = htanh(concat_vec,512)
    attention_a = Dense(1, activation=None)(attention_temp)  # (batch, K, 1)
    # Softmax must run over the K regions (axis 1).  Activation('softmax')
    # normalises the last axis, which has size 1 here, so every weight would
    # be exactly 1 and the output would be an unweighted sum of the regions.
    attention_weight = Lambda(lambda scores: softmax(scores, axis=1))(attention_a)
    attention_output = Dot(axes=1)([attention_weight, image_vec])  # (batch, 1, image_dim)
    attention_output = Reshape([image_dim])(attention_output)
    return attention_output

In [None]:
def calculate_IoU(i0, i1):
    """Temporal intersection-over-union of two ``(start, end)`` intervals.

    The overlap length is not clamped at zero, so disjoint intervals produce a
    negative value.
    """
    overlap_start = max(i0[0], i1[0])
    overlap_end = min(i0[1], i1[1])
    span_start = min(i0[0], i1[0])
    span_end = max(i0[1], i1[1])
    return 1.0 * (overlap_end - overlap_start) / (span_end - span_start)

def calculate_nIoL(base, sliding_clip):
    """Fraction of ``sliding_clip`` that lies outside ``base``.

    Both arguments are ``(start, end)`` intervals.  The overlap is not clamped,
    so a clip that does not touch ``base`` yields a value above 1.
    """
    clip_start, clip_end = sliding_clip[0], sliding_clip[1]
    clip_length = clip_end - clip_start
    covered = min(base[1], clip_end) - max(base[0], clip_start)
    return 1.0 * (clip_length - covered) / clip_length

In [None]:
#word embedding
# Token whose index is used to left-pad sentences shorter than T words.
PAD_IDENTIFIER = '<pad>'
UNK_IDENTIFIER = '<unk>' # token substituted for words missing from the vocabulary
# Splits on runs of non-word characters; the capturing group keeps those
# separator runs in the split result.
SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
# Word-embedding matrix, loaded eagerly when this cell runs.
# presumably one row per vocabulary entry -- TODO confirm shape against the vocabulary file.
wordembed_params = './word_embedding/embed_matrix.npy'
embedding_mat = np.load(wordembed_params)
# Vocabulary list read by load_vocab_dict_from_file (one word per line).
vocab_file = './word_embedding/vocabulary_72700.txt'
T=10# number of word slots every sentence is truncated/padded to (MAX_WORDS = 10)

def load_vocab_dict_from_file(dict_file, pad_at_first=True):
    """Read a one-word-per-line vocabulary file into a ``word -> index`` dict.

    Args:
        dict_file: path of a UTF-8 text file; each line is stripped and taken
            as one word, its zero-based line number becoming its index.
        pad_at_first: when true, the first word must be ``'<pad>'``.

    Returns:
        dict mapping every word to its line index (for repeated words the
        last occurrence wins).

    Raises:
        Exception: if ``pad_at_first`` is true and the file is empty or does
            not start with ``'<pad>'``.
    """
    with io.open(dict_file, encoding='utf-8') as f:
        words = [line.strip() for line in f]
    # An empty file used to fail with an IndexError on words[0]; report it
    # with the same error as any other vocabulary that lacks a leading <pad>.
    if pad_at_first and (not words or words[0] != '<pad>'):
        raise Exception("The first word needs to be <pad> in the word list.")
    return {word: index for index, word in enumerate(words)}

def sentence2vocab_indices(sentence, vocab_dict):
    """Tokenise ``sentence`` and map each token to its vocabulary index.

    Bytes input is decoded first.  The stripped sentence is split with
    ``SENTENCE_SPLIT_REGEX``; pieces that are only whitespace are dropped and
    the rest are lower-cased.  A final ``'.'`` or ``'?'`` token is removed.
    Tokens missing from ``vocab_dict`` map to the ``UNK_IDENTIFIER`` index.
    """
    text = sentence.decode() if isinstance(sentence, bytes) else sentence

    tokens = []
    for piece in SENTENCE_SPLIT_REGEX.split(text.strip()):
        if piece.strip():
            tokens.append(piece.lower())

    # Drop one trailing sentence terminator, if present.
    if tokens and tokens[-1] in ('.', '?'):
        tokens.pop()

    indices = []
    for token in tokens:
        # The unknown-word entry is only looked up when it is actually needed.
        key = token if token in vocab_dict else UNK_IDENTIFIER
        indices.append(vocab_dict[key])
    return indices

def preprocess_vocab_indices(vocab_indices, vocab_dict, T):
    """Force an index list to exactly ``T`` entries.

    Lists longer than ``T`` keep their first ``T`` entries; shorter lists are
    padded at the front with the index of ``PAD_IDENTIFIER``.
    """
    fitted = vocab_indices[:T] if len(vocab_indices) > T else vocab_indices
    missing = T - len(fitted)
    if missing > 0:
        # Padding goes in front so the real words end at the last position.
        fitted = [vocab_dict[PAD_IDENTIFIER]] * missing + fitted
    return fitted

def preprocess_sentence(sentence, vocab_dict, T):
    """Convert ``sentence`` to a list of exactly ``T`` vocabulary indices.

    Chains ``sentence2vocab_indices`` (tokenise and look up) with
    ``preprocess_vocab_indices`` (truncate or left-pad to ``T``).
    """
    return preprocess_vocab_indices(
        sentence2vocab_indices(sentence, vocab_dict), vocab_dict, T)

# Module-level word -> index table built once from vocab_file; the data-set
# classes read it as a global when they call preprocess_sentence.
vocab_dict = load_vocab_dict_from_file(vocab_file)

In [None]:
class TrainingDataSet(object):
    """Random mini-batch provider over pickled clip/sentence training pairs.

    Every entry of the pickled list is read positionally:
    ``[0]`` clip name (only used to keep clips unique inside a batch),
    ``[1]`` query sentence, ``[2]`` sliding-clip feature file name
    (``<movie>_<start>_<end>.npy``), ``[3]`` and ``[4]`` the two regression
    offsets, ``[5]`` the noun text.
    """

    # Number of per-frame object-feature rows returned for every clip.
    FAST_RCNN_SEQ_LEN = 64
    # Width of one per-frame object-feature row.
    FAST_RCNN_FEAT_DIM = 2048

    def __init__(self, it_path, batch_size,sliding_dir,train_fast_rcnn_path):
        """Load the pair list and remember where the feature files live.

        Args:
            it_path: path of the pickle holding the list of training pairs.
            batch_size: number of samples produced by ``next_batch_iou``.
            sliding_dir: prefix of the per-clip ``.npy`` feature files.  It is
                concatenated directly with file names, so it must end with a
                path separator.
            train_fast_rcnn_path: prefix (same convention) of the per-movie
                ``<movie>.pkl`` object-feature files.
        """
        self.batch_size = batch_size
        self.context_size = 128
        self.context_num = 1
        self.visual_feature_dim=4096
        # NOTE(security): pickle can execute arbitrary code; only load files
        # from a trusted source.
        with open(it_path, 'rb') as pairs_file:
            self.clip_sentence_pairs_iou = pickle.load(pairs_file)
        self.sliding_clip_path=sliding_dir
        self.train_fast_rcnn_path = train_fast_rcnn_path

        self.num_samples_iou=len(self.clip_sentence_pairs_iou)
        # A batch never contains the same clip name twice, so the number of
        # distinct names bounds the batch size that can be served.
        self.num_unique_clips = len(set(pair[0] for pair in self.clip_sentence_pairs_iou))
        print (str(len(self.clip_sentence_pairs_iou))+" iou clip-sentence pairs are readed")

    @staticmethod
    def _strip_suffix(text, suffix):
        """Return ``text`` without a trailing ``suffix`` (unchanged if absent)."""
        if suffix and text.endswith(suffix):
            return text[:-len(suffix)]
        return text

    def get_context_window(self, clip_name, win_length):
        """Collect ``win_length`` context features on each side of a clip.

        Neighbouring windows are ``self.context_size`` frames long and are
        looked up by file name (``<movie>_<start>_<end>.npy``).  When a
        neighbour file does not exist, the closest feature already found on
        that side is reused, starting from the clip's own feature.

        Returns:
            ``(left, right)``: two float32 arrays of shape
            ``(win_length, self.visual_feature_dim)``.
        """
        parts = clip_name.split("_")
        movie_name = parts[0]
        start = int(parts[1])
        end = int(parts[2].split(".")[0])
        clip_length = self.context_size
        left_context_feats = np.zeros([win_length, self.visual_feature_dim], dtype=np.float32)
        right_context_feats = np.zeros([win_length, self.visual_feature_dim], dtype=np.float32)
        # The clip's own feature is the initial fallback for both sides; it is
        # never modified, so a single load can be shared.
        own_feat = np.load(self.sliding_clip_path+clip_name)
        last_left_feat = own_feat
        last_right_feat = own_feat
        for k in range(win_length):
            left_context_name = "%s_%d_%d.npy" % (
                movie_name, start-clip_length*(k+1), start-clip_length*k)
            right_context_name = "%s_%d_%d.npy" % (
                movie_name, end+clip_length*k, end+clip_length*(k+1))
            if os.path.exists(self.sliding_clip_path+left_context_name):
                last_left_feat = np.load(self.sliding_clip_path+left_context_name)
            if os.path.exists(self.sliding_clip_path+right_context_name):
                last_right_feat = np.load(self.sliding_clip_path+right_context_name)
            left_context_feats[k] = last_left_feat
            right_context_feats[k] = last_right_feat
        return left_context_feats, right_context_feats

    def get_fast_rcnn_vector(self, clip_name):
        """Return a fixed-length window of per-frame object features for a clip.

        ``clip_name`` has the form ``<movie>[.avi]_<start>_<end>[.npy]``.  The
        movie's features are read from ``<train_fast_rcnn_path><movie>.pkl``
        and rows ``start-1`` onwards are taken (at most 64).  Whenever fewer
        than 64 rows are available -- a short clip, or a window reaching past
        the end of the movie -- the result is zero-padded at the end.

        Returns:
            Array of shape ``(64, 2048)``.
        """
        parts = clip_name.split("_")
        # str.strip('.avi') removes any of the characters '.', 'a', 'v', 'i'
        # from both ends and can eat part of the movie name; only drop the
        # literal extension.
        movie_name = self._strip_suffix(parts[0], '.avi')
        start = int(parts[1])
        end = int(self._strip_suffix(parts[2], '.npy'))
        seq_len = self.FAST_RCNN_SEQ_LEN
        # NOTE(security): pickle can execute arbitrary code; trusted files only.
        with open(self.train_fast_rcnn_path+movie_name+'.pkl', 'rb') as feat_file:
            vecs = pickle.load(feat_file)
        stop = end-1 if end-start < seq_len else start-1+seq_len
        clip_vec = np.array(vecs[start-1:stop])
        if clip_vec.size == 0:
            clip_vec = np.zeros([0, self.FAST_RCNN_FEAT_DIM])
        # Pad by the number of rows actually missing so the result always fits
        # the (64, 2048) batch slot, even when the movie ends early.
        missing = seq_len - clip_vec.shape[0]
        if missing > 0:
            clip_vec = np.vstack((clip_vec, np.zeros([missing, self.FAST_RCNN_FEAT_DIM])))
        return clip_vec

    def next_batch_iou(self):
        """Draw a random batch in which every clip name occurs at most once.

        Returns:
            ``(image_batch, text_seq_batch, offset_batch, fast_rcnn_vec_batch,
            noun_seq_batch)`` where ``image_batch`` is
            ``(batch, visual_feature_dim, 2*context_num+1)`` holding
            left-context / clip / right-context columns, ``text_seq_batch`` and
            ``noun_seq_batch`` are lists of ``T`` vocabulary indices per
            sample, ``offset_batch`` is ``(batch, 2)`` float32 and
            ``fast_rcnn_vec_batch`` is ``(batch, 64, 2048)``.

        Raises:
            ValueError: if the data set has fewer distinct clip names than
                ``batch_size`` (the sampling loop could never finish).
        """
        if self.num_unique_clips < self.batch_size:
            raise ValueError(
                "batch_size (%d) exceeds the number of distinct clip names (%d)"
                % (self.batch_size, self.num_unique_clips))

        random_batch_index = random.sample(range(self.num_samples_iou), self.batch_size)
        fast_rcnn_vec_batch = np.zeros(
            [self.batch_size, self.FAST_RCNN_SEQ_LEN, self.FAST_RCNN_FEAT_DIM])
        image_batch = np.zeros([self.batch_size, self.visual_feature_dim, 2 * self.context_num + 1])
        text_seq_batch = []
        noun_seq_batch = []
        offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32)

        index = 0
        clip_set = set()
        while index < self.batch_size:
            pair = self.clip_sentence_pairs_iou[random_batch_index[index]]
            clip_name = pair[0]
            if clip_name in clip_set:
                # Clip already in this batch: redraw this slot and try again.
                random_batch_index[index] = random.randrange(self.num_samples_iou)
                continue
            clip_set.add(clip_name)
            feat_name = pair[2]

            fast_rcnn_vec_batch[index, :, :] = self.get_fast_rcnn_vector(feat_name)
            noun_seq_batch.append(preprocess_sentence(pair[5], vocab_dict, T))

            featmap = np.load(self.sliding_clip_path+feat_name)
            # The reshape to a flat vector only works for context_num == 1.
            left_context_feat, right_context_feat = self.get_context_window(feat_name, self.context_num)
            left_context_feat = np.reshape(left_context_feat, [self.visual_feature_dim])
            right_context_feat = np.reshape(right_context_feat, [self.visual_feature_dim])
            image_batch[index, :, :] = np.column_stack((left_context_feat, featmap, right_context_feat))
            text_seq_batch.append(preprocess_sentence(pair[1], vocab_dict, T))

            # Regression targets stored with the pair.
            offset_batch[index, 0] = pair[3]
            offset_batch[index, 1] = pair[4]
            index += 1

        return image_batch, text_seq_batch, offset_batch,fast_rcnn_vec_batch,noun_seq_batch

In [None]:
class TestingDataSet(object):
    """Test-time access to sliding-clip features and their query sentences.

    Every entry of the pickled list is read positionally: ``[0]`` clip name
    (``<movie>_...``), ``[1]`` query sentence, ``[2]`` noun text.
    """

    # Number of per-frame object-feature rows returned for every clip.
    FAST_RCNN_SEQ_LEN = 64
    # Width of one per-frame object-feature row.
    FAST_RCNN_FEAT_DIM = 2048

    def __init__(self, csv_path, batch_size,img_dir,test_fast_rcnn_path):
        """Load the test pairs and index them by movie.

        Args:
            csv_path: path of the pickle holding the list of test pairs.
            batch_size: stored on the instance; not used by this class itself.
            img_dir: prefix of the sliding-clip ``.npy`` feature files.  It is
                concatenated directly with file names, so it must end with a
                path separator.
            test_fast_rcnn_path: prefix (same convention) of the per-movie
                ``<movie>.pkl`` object-feature files.
        """
        self.batch_size = batch_size
        self.image_dir = img_dir
        self.visual_feature_dim=4096
        # The message used to stop after "from " without naming the file.
        print ("Reading testing data list from "+str(csv_path))
        self.sliding_clip_path = img_dir
        self.test_fast_rcnn_path = test_fast_rcnn_path

        # NOTE(security): pickle can execute arbitrary code; only load files
        # from a trusted source.
        with open(csv_path, 'rb') as pairs_file:
            self.clip_sentence_pairs = pickle.load(pairs_file)

        # movie name -> indices of its entries in clip_sentence_pairs
        self.movie_clip_names = {}
        for k, pair in enumerate(self.clip_sentence_pairs):
            movie_name = pair[0].split("_")[0]
            self.movie_clip_names.setdefault(movie_name, []).append(k)
        self.movie_names = list(self.movie_clip_names)

        self.clip_num_per_movie_max = max(
            [len(indices) for indices in self.movie_clip_names.values()] or [0])
        print ("Max number of clips in a movie is "+str(self.clip_num_per_movie_max))

    @staticmethod
    def _strip_suffix(text, suffix):
        """Return ``text`` without a trailing ``suffix`` (unchanged if absent)."""
        if suffix and text.endswith(suffix):
            return text[:-len(suffix)]
        return text

    def get_fast_rcnn_vector(self, clip_name, cache=None):
        """Return a fixed-length window of per-frame object features for a clip.

        ``clip_name`` has the form ``<movie>[.avi]_<start>_<end>[.npy]``.  The
        movie's features are read from ``<test_fast_rcnn_path><movie>.pkl``
        and rows ``start-1`` onwards are taken (at most 64).  Whenever fewer
        than 64 rows are available the result is zero-padded at the end.

        Args:
            clip_name: sliding-clip file name.
            cache: optional dict used to remember already loaded movie files
                (keyed by path) across calls; ``None`` disables caching.

        Returns:
            Array of shape ``(64, 2048)``; always a copy of the stored data.
        """
        parts = clip_name.split("_")
        # str.strip('.avi') removes any of the characters '.', 'a', 'v', 'i'
        # from both ends and can eat part of the movie name; only drop the
        # literal extension.
        movie_name = self._strip_suffix(parts[0], '.avi')
        start = int(parts[1])
        end = int(self._strip_suffix(parts[2], '.npy'))
        seq_len = self.FAST_RCNN_SEQ_LEN

        path = self.test_fast_rcnn_path+movie_name+'.pkl'
        if cache is not None and path in cache:
            vecs = cache[path]
        else:
            # NOTE(security): pickle can execute arbitrary code; trusted files only.
            with open(path, 'rb') as feat_file:
                vecs = pickle.load(feat_file)
            if cache is not None:
                cache[path] = vecs

        stop = end-1 if end-start < seq_len else start-1+seq_len
        clip_vec = np.array(vecs[start-1:stop])
        if clip_vec.size == 0:
            clip_vec = np.zeros([0, self.FAST_RCNN_FEAT_DIM])
        # Pad by the number of rows actually missing so the result always has
        # 64 rows, even when the movie ends early.
        missing = seq_len - clip_vec.shape[0]
        if missing > 0:
            clip_vec = np.vstack((clip_vec, np.zeros([missing, self.FAST_RCNN_FEAT_DIM])))
        return clip_vec

    def get_context_window(self, clip_name, win_length):
        """Collect ``win_length`` context features on each side of a clip.

        Neighbouring windows are 128 frames long and are looked up by file
        name (``<movie>_<start>_<end>.npy``).  When a neighbour file does not
        exist, the closest feature already found on that side is reused,
        starting from the clip's own feature.

        Returns:
            ``(left, right)``: two float32 arrays of shape
            ``(win_length, self.visual_feature_dim)``.
        """
        parts = clip_name.split("_")
        movie_name = parts[0]
        start = int(parts[1])
        end = int(parts[2].split('.')[0])
        clip_length = 128
        left_context_feats = np.zeros([win_length, self.visual_feature_dim], dtype=np.float32)
        right_context_feats = np.zeros([win_length, self.visual_feature_dim], dtype=np.float32)
        # The clip's own feature is the initial fallback for both sides; it is
        # never modified, so a single load can be shared.
        own_feat = np.load(self.sliding_clip_path+clip_name)
        last_left_feat = own_feat
        last_right_feat = own_feat
        for k in range(win_length):
            left_context_name = "%s_%d_%d.npy" % (
                movie_name, start-clip_length*(k+1), start-clip_length*k)
            right_context_name = "%s_%d_%d.npy" % (
                movie_name, end+clip_length*k, end+clip_length*(k+1))
            if os.path.exists(self.sliding_clip_path+left_context_name):
                last_left_feat = np.load(self.sliding_clip_path+left_context_name)
            if os.path.exists(self.sliding_clip_path+right_context_name):
                last_right_feat = np.load(self.sliding_clip_path+right_context_name)
            left_context_feats[k] = last_left_feat
            right_context_feats[k] = last_right_feat

        return left_context_feats,right_context_feats

    def load_movie_slidingclip(self, movie_name, sample_num):
        """Gather every sliding clip and every query belonging to one movie.

        Membership is decided by substring match of ``movie_name`` against the
        pair's clip name and against the file names in the sliding-clip
        directory (only names containing ``"npy"`` are considered).
        NOTE(review): substring matching also accepts names that merely
        contain ``movie_name``; confirm movie names cannot be infixes of
        one another.

        Args:
            movie_name: movie identifier to look for.
            sample_num: unused; kept for interface compatibility.

        Returns:
            ``(movie_clip_featmap, movie_clip_sentences)`` where the first is
            a list of ``(file_name, comb_feat, fast_rcnn_vec)`` with
            ``comb_feat`` stacking left-context / clip / right-context as
            columns, and the second a list of
            ``(clip_name, sentence, noun_text)`` tuples.
        """
        movie_clip_sentences = [
            (pair[0], pair[1], pair[2])
            for pair in self.clip_sentence_pairs
            if movie_name in pair[0]]

        movie_clip_featmap = []
        # All clips of a movie share one object-feature pickle; load it once
        # per call instead of once per clip.
        rcnn_cache = {}
        for file_name in os.listdir(self.sliding_clip_path):
            if "npy" not in file_name or movie_name not in file_name:
                continue
            left_context_feat,right_context_feat = self.get_context_window(file_name,1)
            feature_data = np.load(self.sliding_clip_path+file_name)
            left_context_feat=np.reshape(left_context_feat,[self.visual_feature_dim])
            right_context_feat=np.reshape(right_context_feat,[self.visual_feature_dim])

            fast_rcnn_vec = self.get_fast_rcnn_vector(file_name, rcnn_cache)

            comb_feat = np.column_stack((left_context_feat,feature_data,right_context_feat))
            movie_clip_featmap.append((file_name, comb_feat,fast_rcnn_vec))
        return movie_clip_featmap, movie_clip_sentences
    
        

In [None]:
def conv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
               bias_term=True, weights_initializer=None, biases_initializer=None):
    """2-D convolution (optionally with bias) whose variables live in scope ``name``.

    Args:
        name: variable scope in which "weights" and "biases" are created or
            fetched.
        bottom: input tensor; its last static dimension is used as the number
            of input channels (layout [batch, height, width, channels]).
        kernel_size: height and width of the square filter.
        stride: stride applied to both spatial dimensions.
        output_dim: number of output channels.
        padding: padding mode forwarded to ``tf.nn.conv2d``.
        bias_term: when true, a bias vector is added to the convolution.
        weights_initializer: value used to initialise "weights".  It is passed
            with ``shape=None`` and must therefore carry its own shape (e.g.
            pre-trained parameters).  ``None`` selects a random normal
            initializer (mean 0, stddev 0.01) with the filter shape.
        biases_initializer: same convention for "biases"; ``None`` selects a
            zero initializer of length ``output_dim``.

    Returns:
        The convolution output tensor.
    """
    # input has shape [batch, in_height, in_width, in_channels]
    input_dim = bottom.get_shape().as_list()[-1]

    # Each variable is handled on its own.  Previously, supplying only one of
    # the two initializers sent both variables down the "pre-trained" path, so
    # the other one was requested with shape=None and initializer=None and
    # could not be created.
    with tf.variable_scope(name):
        if weights_initializer is None:
            print ("input_dim"+str(input_dim))
            # filter has shape [filter_height, filter_width, in_channels, out_channels]
            weights = tf.get_variable("weights",
                [kernel_size, kernel_size, input_dim, output_dim],
                initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01))
            print (str(weights.name)+" initialized as random or retrieved from graph")
        else:
            weights = tf.get_variable("weights",
                shape=None,
                initializer=weights_initializer)
            print (weights.name+" initialized from pre-trained parameters or retrieved from graph")

        if bias_term:
            if biases_initializer is None:
                biases = tf.get_variable("biases", output_dim,
                    initializer=tf.constant_initializer(0.))
                print (biases.name+" initialized as random or retrieved from graph")
            else:
                biases = tf.get_variable("biases", shape=None,
                    initializer=biases_initializer)
                print (biases.name+" initialized from pre-trained parameters or retrieved from graph")

    conv = tf.nn.conv2d(bottom, filter=weights,
        strides=[1, stride, stride, 1], padding=padding)
    if bias_term:
        conv = tf.nn.bias_add(conv, biases)
    return conv

def conv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
                    bias_term=True, weights_initializer=None, biases_initializer=None):
    """Convolution followed by ReLU; every argument is forwarded to ``conv_layer``."""
    pre_activation = conv_layer(
        name, bottom, kernel_size, stride, output_dim, padding,
        bias_term, weights_initializer, biases_initializer)
    return tf.nn.relu(pre_activation)

def deconv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
                 bias_term=True, weights_initializer=None, biases_initializer=None):
    """Transposed 2-D convolution that upsamples ``bottom`` by ``stride``.

    The output shape is derived from the static shape of ``bottom``
    ([batch, height, width, channels]) with both spatial sizes multiplied by
    ``stride``.  "weights" (and "biases" when ``bias_term`` is true) are
    created or fetched inside variable scope ``name``; initializers left at
    ``None`` default to a random normal initializer for the weights and zeros
    for the biases.
    """
    batch_size, input_height, input_width, input_dim = bottom.get_shape().as_list()
    step = [1, stride, stride, 1]
    target_shape = [batch_size, input_height*stride, input_width*stride, output_dim]

    with tf.variable_scope(name):
        w_init = weights_initializer
        if w_init is None:
            w_init = tf.random_normal_initializer()
        b_init = biases_initializer
        if bias_term and b_init is None:
            b_init = tf.constant_initializer(0.)

        # filter layout: [filter_height, filter_width, out_channels, in_channels]
        weights = tf.get_variable("weights",
            [kernel_size, kernel_size, output_dim, input_dim],
            initializer=w_init)
        if bias_term:
            biases = tf.get_variable("biases", output_dim, initializer=b_init)

    upsampled = tf.nn.conv2d_transpose(bottom, filter=weights,
        output_shape=target_shape, strides=step, padding=padding)
    if not bias_term:
        return upsampled
    return tf.nn.bias_add(upsampled, biases)


def deconv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
                      bias_term=True, weights_initializer=None, biases_initializer=None):
    """Transposed convolution followed by ReLU; arguments go to ``deconv_layer``."""
    pre_activation = deconv_layer(
        name, bottom, kernel_size, stride, output_dim, padding,
        bias_term, weights_initializer, biases_initializer)
    return tf.nn.relu(pre_activation)

def pooling_layer(name, bottom, kernel_size, stride):
    """Max-pool ``bottom`` with a square window, 'SAME' padding, op name ``name``."""
    window = [1, kernel_size, kernel_size, 1]
    step = [1, stride, stride, 1]
    return tf.nn.max_pool(bottom, ksize=window, strides=step,
                          padding='SAME', name=name)

def fc(name, bottom, output_dim, bias_term=True, weights_initializer=None,
             biases_initializer=None):
    """Fully connected layer whose variables live in scope ``name``.

    ``bottom`` is flattened to ``[-1, input_dim]`` where ``input_dim`` is the
    product of all its static dimensions except the first.

    Args:
        name: variable scope in which "weights" and "biases" are created or
            fetched.
        bottom: input tensor with fully defined non-batch dimensions.
        output_dim: number of output units.
        bias_term: when true, a bias vector is added.
        weights_initializer: value used to initialise "weights".  It is passed
            with ``shape=None`` and must therefore carry its own shape (e.g.
            pre-trained parameters).  ``None`` selects a random normal
            initializer with shape ``[input_dim, output_dim]``.
        biases_initializer: same convention for "biases"; ``None`` selects a
            zero initializer of length ``output_dim``.

    Returns:
        Tensor of shape ``[-1, output_dim]``.
    """
    shape = bottom.get_shape().as_list()
    input_dim = 1
    for d in shape[1:]:
        input_dim *= d
    flat_bottom = tf.reshape(bottom, [-1, input_dim])

    # Each variable is handled on its own.  Previously, supplying only one of
    # the two initializers sent both variables down the "pre-trained" path, so
    # the other one was requested with shape=None and initializer=None and
    # could not be created.
    with tf.variable_scope(name):
        if weights_initializer is None:
            # weights has shape [input_dim, output_dim]
            weights = tf.get_variable("weights", [input_dim, output_dim],
                initializer=tf.random_normal_initializer())
            print (weights.name+" initialized as random or retrieved from graph")
        else:
            weights = tf.get_variable("weights", shape=None,
                initializer=weights_initializer)
            print (weights.name+" initialized from pre-trained parameters or retrieved from graph")

        if bias_term:
            if biases_initializer is None:
                biases = tf.get_variable("biases", output_dim,
                    initializer=tf.constant_initializer(0.))
                print (biases.name+" initialized as random or retrieved from graph")
            else:
                biases = tf.get_variable("biases", shape=None,
                    initializer=biases_initializer)
                print (biases.name+" initialized from pre-trained parameters or retrieved from graph")

    if bias_term:
        output = tf.nn.xw_plus_b(flat_bottom, weights, biases)
    else:
        output = tf.matmul(flat_bottom, weights)
    return output

def fc_relu_layer(name, bottom, output_dim, bias_term=True,
                  weights_initializer=None, biases_initializer=None):
    """Fully connected layer followed by ReLU; arguments are forwarded to ``fc``."""
    # The result must not be bound to a local called ``fc``: assigning to that
    # name makes it local to this function, so the call itself would fail
    # with UnboundLocalError before ``fc`` is ever invoked.
    pre_activation = fc(name, bottom, output_dim, bias_term, weights_initializer,
                        biases_initializer)
    return tf.nn.relu(pre_activation)

def softmax_loss_layer(name, score_bottom, label_bottom):
    """Summed softmax cross-entropy between class scores and integer labels.

    The previous body referenced the undefined names ``labels``, ``FLAGS`` and
    ``NUM_CLASSES`` and returned nothing, so it could not be called.  This
    version derives every size from the inputs and returns the loss.

    NOTE(review): the loss is summed over all labelled positions and is not
    divided by the batch size -- confirm that this is the intended reduction.

    Args:
        name: name given to the final reduction op.
        score_bottom: scores with static shape ``label_shape + [num_classes]``.
        label_bottom: class indices; cast to int32 before one-hot encoding.

    Returns:
        Scalar tensor holding the summed cross-entropy.
    """
    # Check shape
    score_shape = score_bottom.get_shape().as_list()
    label_shape = label_bottom.get_shape().as_list()
    assert len(score_shape) == len(label_shape) + 1
    assert score_shape[:-1] == label_shape

    # Number of classes is the innermost score dimension.
    inner_dim = score_shape[-1]

    # Flatten so that every row is one labelled position; -1 also copes with
    # a batch dimension that is unknown at graph-construction time.
    flat_score = tf.reshape(score_bottom, [-1, inner_dim])
    flat_label = tf.cast(tf.reshape(label_bottom, [-1]), tf.int32)

    # Dense one-hot labels of shape [positions, num_classes].
    dense_labels = tf.one_hot(flat_label, inner_dim, on_value=1.0, off_value=0.0)
    per_position = tf.nn.softmax_cross_entropy_with_logits(
        labels=dense_labels, logits=flat_score)
    return tf.reduce_sum(per_position, name=name)

In [None]:
def vs_multilayer(input_batch,name,middle_layer_dim=1000,reuse=False):
    """Two stacked 1x1 convolutions producing 3 output channels per position.

    'layer1' is a 1x1 convolution + ReLU with ``middle_layer_dim`` channels and
    'layer2' a plain 1x1 convolution with 3 channels, both created inside
    variable scope ``name``.  When ``reuse`` equals True the scope is switched
    to variable reuse before the layers are built.
    """
    with tf.variable_scope(name):
        reusing = (reuse == True)
        print (name + (" reuse variables" if reusing else " doesn't reuse variables"))
        if reusing:
            tf.get_variable_scope().reuse_variables()

        hidden = conv_relu_layer('layer1', input_batch,
                                 kernel_size=1, stride=1, output_dim=middle_layer_dim)
        return conv_layer('layer2', hidden,
                          kernel_size=1, stride=1, output_dim=3)

In [None]:
# NOTE(review): these two statements repeat, with the same path, the load
# already performed in the word-embedding cell earlier in the notebook, so the
# matrix is read from disk a second time when this cell runs.
wordembed_params = './word_embedding/embed_matrix.npy'
embedding_mat = np.load(wordembed_params)# shape noted by the author as (72704, 300)
class SLAT_Model(object):
    #T=10,train_feature_dir,test_feature_dir,semantic_size=1024,mpl_hidden=1000)
    def __init__(self, batch_size, train_visual_feature_dir, test_visual_feature_dir, lr, n_input_visual, n_input_text,
                   n_hidden_text, n_step_text,train_sliding_dir,test_sliding_dir,semantic_size,mpl_hidden,train_fast_rcnn_path,test_fast_rcnn_path):
        self.batch_size = batch_size#30
        self.test_batch_size = 1
        self.vs_lr = lr#0.01
        self.n_input_visual = n_input_visual #4096
        self.n_input_text = n_input_text#300
        self.n_step_text = n_step_text#T=10
        self.n_hidden_text=n_hidden_text
        self.context_size = 128
        self.context_num = 1
        self.visual_feature_dim=4096
        self.lambda_regression = 0.01#self.lambda_regression*loss_reg+loss_aln
        self.alpha = 1.0 / batch_size
        self.train_set=TrainingDataSet(train_visual_feature_dir, self.batch_size,train_sliding_dir,train_fast_rcnn_path) 
        self.test_set=TestingDataSet(test_visual_feature_dir, self.test_batch_size,test_sliding_dir,test_fast_rcnn_path)
        self.semantic=semantic_size#1024
        self.mpl_hidden=mpl_hidden#1000
    '''
    used in training alignment model, ROLE(aln)
    '''
    def fill_feed_dict_train(self):
        image_batch,sentence_batch,offset_batch,fast_rcnn_vec_batch,noun_seq_batch  = self.train_set.next_batch()
        input_feed = {
                self.visual_featmap_ph_train: image_batch,#(batchsize,4096,3)
                self.sentence_ph_train: sentence_batch,#(batchsize,T)
                self.offset_ph: offset_batch,#(batchsize,2)
            
                self.fast_rcnn_vec_train: fast_rcnn_vec_batch,#(batchsize,64,2048)
                self.noun_word_ph_train: noun_seq_batch#(batchsize,T)
                       
            
        }

        return input_feed
    
    '''
    used in training alignment+regression model, ROLE(reg)
    
    '''
    def fill_feed_dict_train_reg(self):
        image_batch, sentence_batch, offset_batch,fast_rcnn_vec_batch,noun_seq_batch = self.train_set.next_batch_iou()
        input_feed = {
                self.visual_featmap_ph_train: image_batch,  #batch_size,4096,3
                self.sentence_ph_train: sentence_batch,  #batch_size*(10)
                self.offset_ph: offset_batch,   #(batchsize,2)
                self.fast_rcnn_vec_train: fast_rcnn_vec_batch,  #(batchsize,64,2048)
                self.noun_word_ph_train: noun_seq_batch    #(batchsize,T)        
        }

        return input_feed

    def bilstm(self,x):
        """Encode an embedded word sequence with a one-layer bidirectional LSTM.

        Args:
            x: tensor indexed as [N, T, D] (batch, time step, embedding size),
                with T == self.n_step_text and D == self.n_input_text.

        Returns:
            Per-time-step outputs of the forward and backward LSTMs concatenated
            on the last axis, in time-major layout [T, N, 2 * self.n_hidden_text].
        """
        # x arrives as [N, T, D] (batch_size, 10, 300 in the author's setup)
        x=tf.transpose(x,[1,0,2])# now time-major: [T, N, D]
        fw_x = tf.reshape(x, [-1, self.n_input_text]) # [T*N, D]
        # split into a list of n_step_text tensors, one [N, D] slice per time step
        fw_x = tf.split(fw_x, self.n_step_text,0)
        with tf.variable_scope('bilstm_lt'):
            # one forward and one backward cell, each with n_hidden_text units
            lstm_fw_cell = rnn.BasicLSTMCell(self.n_hidden_text, forget_bias=1.0, state_is_tuple=True)
            lstm_bw_cell = rnn.BasicLSTMCell(self.n_hidden_text, forget_bias=1.0, state_is_tuple=True)
            # dropout (disabled)
            #lstm_fw_cell = rnn_cell.DropoutWrapper(cell=lstm_fw_cell, input_keep_prob=1.0, output_keep_prob=keep_prob)
            #lstm_bw_cell = rnn_cell.DropoutWrapper(cell=lstm_bw_cell, input_keep_prob=1.0, output_keep_prob=keep_prob)
            with tf.variable_scope('fw_lt'):
                (output_fw, state_fw) = rnn.static_rnn(lstm_fw_cell,fw_x,dtype=tf.float32)
                # debug only: stack the output list to print its static shape
                t=tf.convert_to_tensor(output_fw)
                print (t.get_shape().as_list())# [T, N, n_hidden_text]
            # backward direction
            with tf.variable_scope('bw_lt'):
                bw_x = tf.reverse(x, [0])# reverse the time axis
                bw_x = tf.reshape(bw_x, [-1, self.n_input_text])  # [T*N, D]
                bw_x = tf.split(bw_x, self.n_step_text, 0)
                (output_bw, state_bw) = rnn.static_rnn(lstm_bw_cell,bw_x,dtype=tf.float32)
            # output_bw is [T, N, n_hidden_text]; flip it back to forward time order
            output_bw = tf.reverse(output_bw, [0])
            output = tf.concat([output_fw, output_bw],2)# [T, N, 2*n_hidden_text]
        return output
    
    
    def cross_modal_comb(self, input_vision_obj1,BoW_obj1,video_output_norm,batch_size):
        #[batch_size, batch_size, 4096*3=12288]
        vv_feature= tf.reshape(tf.tile(input_vision_obj1, [batch_size, 1]),[batch_size, batch_size, self.visual_feature_dim*(2*self.context_num+1)])
        ss_feature= tf.reshape(tf.tile(BoW_obj1,[1, batch_size]),[batch_size, batch_size, self.n_input_text])
        fast_rcnn_feature = tf.reshape(tf.tile(video_output_norm,[1, batch_size]),[batch_size, batch_size, 512])
        #[1,batch_size, batch_size, 12288+300+512]
        concat_feature1 = tf.concat([fast_rcnn_feature, ss_feature],2)
        concat_feature = tf.reshape(tf.concat([vv_feature, concat_feature1],2),[1,batch_size, batch_size, self.visual_feature_dim*(2*self.context_num+1)+self.n_input_text+512])
        return concat_feature

    '''
    visual semantic inference, including visual semantic alignment and clip location regression
    '''
    
    
    def visual_semantic_infer(self,visual_feature_train,text_feature_train,fast_rcnn_feature_train,noun_word_feature_train,visual_feature_test,text_feature_test,fast_rcnn_feature_test,noun_word_feature_test):
        name="CTRL_Model"
        
       
        with tf.variable_scope(name):
            
            # text_feature_train [N,T] shape word index matrix
            # 0. Word embedding
            # text_seq has shape [N, T] and embedded_seq has shape [N, T, D].
            #tex_seq [batch_size,10],embedded_seq[batch_size,10,300]
            #embedding_mat(72704, 300)
            embedded_seq_train = tf.nn.embedding_lookup(embedding_mat, text_feature_train)#input:N,T  output:N,T,300
            
            
            # 1. Encode the sentence into a vector representation, using the final hidden states in a one-layer bidirectional LSTM network
            q_reshape=self.bilstm(embedded_seq_train)
            print (q_reshape.get_shape().as_list())
            
            
            # 1.1 noun_word(batch,10)  embedding(batch,10,300)
            embedded_noun_word_train = tf.nn.embedding_lookup(embedding_mat,noun_word_feature_train)
            #GRU encode 
            #embedded_noun_word_train = tf.convert_to_tensor(embedded_noun_word_train,dtype = tf.float32)
            encoded_vector = CuDNNGRU(512,return_state=False)(embedded_noun_word_train)#30,512
            embedded_noun_word = encoded_vector #(30,512)
            
            
            # 1.2 fast_rcnn_vec(batch,64,2048)
            #fast_rcnn_feature_train = tf.convert_to_tensor(fast_rcnn_feature_train,dtype = tf.float32)
            #video_vector = CuDNNGRU(512,return_state=False,return_sequences=True)(fast_rcnn_feature_train)#(batch, 64, 512)
            video_vector = CuDNNGRU(512,return_state=False)(fast_rcnn_feature_train)#(batch,512)
            # 1.3 top_down_attention
            video_output = video_vector#(30,512)
            #video_output = top_down_attention(video_vector,embedded_noun_word)#(30, 512)
            #print (video_output.get_shape().as_list())

             
            # 2. attention units over the words in each sentence fc(fc(Q+C+PRE+POST))
            q_reshape_flat = tf.reshape(q_reshape, [self.n_step_text * self.batch_size, self.n_hidden_text * 2])#30*10,1000*2
            visual_feature_train=tf.transpose(visual_feature_train, [0, 2, 1])  # batch ctx fea  30,3,4096
            visual_train=tf.reshape(visual_feature_train,[self.batch_size*(2*self.context_num+1),self.visual_feature_dim])#30*3，4096
            query_term=fc('q2s_lt', q_reshape_flat, output_dim=self.semantic)#30*10,1024
            moment_term=fc('c2s_lt', visual_train, output_dim=self.semantic)#30*3,1024

            query_term=tf.reshape(query_term,[self.batch_size,self.n_step_text,self.semantic])#(30,10,1024)
            moment_term=tf.reshape( moment_term,[self.batch_size,2*self.context_num+1,self.semantic])#(30,3,1024)
            
            term2=tf.reduce_sum(moment_term,1,keep_dims=True)
            term2=tf.reshape(term2,[self.batch_size,self.semantic])#(30, 1024)
            
        
            term2=tf.reshape(tf.tile(term2,[1,self.n_step_text]),[self.batch_size,self.n_step_text,self.semantic])
            #(30,10,1024)add(30,10,1024),reshape(300,1024)
            term=tf.nn.relu(tf.reshape(tf.add(query_term,term2), [self.n_step_text * self.batch_size, self.semantic]))#(300, 1024)
            scores_obj1 = fc('fc_scores_query_lt',term, output_dim=1)
            scores_obj1_train=tf.reshape(scores_obj1,[self.batch_size,self.n_step_text])#(30,10)
            is_not_pad=tf.cast(tf.not_equal(text_feature_train,0),tf.float32)
            #probs_obj1=tf.nn.softmax(scores_obj1_train)
            #probs_obj1=tf.multiply(probs_obj1,is_not_pad)
            probs_obj1 = tf.multiply(scores_obj1_train,is_not_pad)#(30,10)
            probs_obj1 = probs_obj1 / tf.reduce_sum(probs_obj1, 1, keep_dims=True)#(30,10)/(30,1)
            
            #[N,T,embed_dim] (30,10,300)
            temp1= tf.transpose(tf.reshape(tf.tile(probs_obj1, [1, self.n_input_text]),[self.batch_size, self.n_input_text, self.n_step_text]),[0,2,1])
            #(30,300)
            BoW_obj1 = tf.reduce_sum(tf.multiply(temp1,embedded_seq_train), reduction_indices=1)
            print (BoW_obj1.get_shape().as_list())#(30,300)
            
            
            
            #3.0 visual attention part: x_i=Wv_i+b then softmax(x_i.q_j) output: [N, visual_feature_size] 
            input_vision_obj1=tf.reshape(visual_feature_train,[self.batch_size,-1])#(30,12288)
            # cross-modal part
            transformed_clip_train_norm = tf.nn.l2_normalize(input_vision_obj1 , dim=1)
            transformed_obj1_sent_train_norm = tf.nn.l2_normalize(BoW_obj1, dim=1)
            transformed_video_output_norm = tf.nn.l2_normalize(video_output, dim=1)
            
            
            #(1,30,30,12588)
            cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm,transformed_obj1_sent_train_norm,transformed_video_output_norm,self.batch_size)  # batch batch 2*conmmon_space_dim
            sim_score_mat_train = vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt",self.mpl_hidden)
            sim_score_mat_train = tf.reshape(sim_score_mat_train, [self.batch_size,self.batch_size,3])#(30,30,3)
            tf.get_variable_scope().reuse_variables()
            
            
            print ("Building test network...............................\n")
            
            
            # text_seq has shape [T, N] and embedded_seq has shape [self.test_batch_size, T, D].[1,10,300]
            embedded_seq_test = tf.nn.embedding_lookup(embedding_mat, text_feature_test)#[1,10,300]
            # 1. Encode the sentence into a vector representation, using the final
            # hidden states in a one-layer bidirectional LSTM network
            q_reshape_test = self.bilstm(embedded_seq_test)
            
            
            # 1.1 noun_word(1,10)  embedding(1,10,300)
            embedded_noun_word_test = tf.nn.embedding_lookup(embedding_mat,noun_word_feature_test)
            #GRU encode 
            #embedded_noun_word_test = tf.convert_to_tensor(embedded_noun_word_test,dtype = tf.float32)
            encoded_vector_test = CuDNNGRU(512,return_state=False)(embedded_noun_word_test)#1,512
            embedded_noun_word_test = encoded_vector_test #(1,512)
            # 1.2 fast_rcnn_vec(batch,64,2048)
            #fast_rcnn_feature_train = tf.convert_to_tensor(fast_rcnn_feature_train,dtype = tf.float32)
            #video_vector_test = CuDNNGRU(512,return_state=False,return_sequences=True)(fast_rcnn_feature_test)#(1, 64, 512)
            video_vector_test = CuDNNGRU(512,return_state=False)(fast_rcnn_feature_test)
            # 1.3 top_down_attention
            #video_output_test = top_down_attention(video_vector_test,embedded_noun_word_test)#(1, 512)
            video_output_test = video_vector_test
            print (video_output_test.get_shape().as_list())
                  
            
            # 2. three attention units over the words in each sentence
            # 2. attention units over the words in each sentence fc(fc(Q+C+PRE+POST))
            q_reshape_flat = tf.reshape(q_reshape_test, [self.n_step_text * self.test_batch_size, self.n_hidden_text * 2])#(10,2000)
            visual_feature_test=tf.transpose(visual_feature_test, [0, 2, 1])  # batch ctx fea   (1,3,4096)
            visual_test=tf.reshape(visual_feature_test,[self.test_batch_size*(2*self.context_num+1),self.visual_feature_dim]) #(3,4096)
            query_term=fc('q2s_lt', q_reshape_flat, output_dim=self.semantic) #(10,2024)
            moment_term=fc('c2s_lt', visual_test, output_dim=self.semantic)  #(3,1024)
            
            query_term=tf.reshape(query_term,[self.test_batch_size,self.n_step_text,self.semantic])  #(1,10,1024)
            moment_term=tf.reshape( moment_term,[self.test_batch_size,2*self.context_num+1,self.semantic])#(1,3,1024)
            term2=tf.reduce_sum(moment_term,1,keep_dims=True) #(1,1,1024)
            term2=tf.reshape(term2,[self.test_batch_size,self.semantic]) #(1,1024)
            term2=tf.reshape(tf.tile(term2,[1,self.n_step_text]),[self.test_batch_size,self.n_step_text,self.semantic]) #(1,10,1024)
            term=tf.nn.relu(tf.reshape(tf.add(query_term,term2), [self.n_step_text * self.test_batch_size, self.semantic])) #(10,1024)
            scores_obj1 = fc('fc_scores_query_lt',term, output_dim=1) #(10,1)
            scores_obj1=tf.reshape(scores_obj1,[self.test_batch_size,self.n_step_text]) #(1,10)
            is_not_pad=tf.cast(tf.not_equal(text_feature_test,0),tf.float32) 
            #probs_obj1=tf.nn.softmax(scores_obj1)
            #probs_obj1=tf.multiply(probs_obj1,is_not_pad)
            probs_obj1 = tf.multiply(scores_obj1,is_not_pad)
            probs_obj1 = probs_obj1 / tf.reduce_sum(probs_obj1, 1, keep_dims=True)
            temp1= tf.transpose(tf.reshape(tf.tile(probs_obj1, [1, self.n_input_text]),[self.test_batch_size, self.n_input_text, self.n_step_text]),[0,2,1])  # [N,T,embed_dim]
            BoW_obj1 = tf.reduce_sum(tf.multiply(temp1,embedded_seq_test), reduction_indices=1)
            print (BoW_obj1.get_shape().as_list())  #(1,300)
            
            input_vision_obj1=tf.reshape(visual_feature_test,[self.test_batch_size,-1]) #(1,12280)
            # cross-modal part
            transformed_clip_test_norm = tf.nn.l2_normalize(input_vision_obj1 , dim=1)
            transformed_obj1_sent_test_norm = tf.nn.l2_normalize(BoW_obj1, dim=1)
            transformed_video_output_test_norm = tf.nn.l2_normalize(video_output_test, dim=1)#(1,512)
            
            
            
            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm,transformed_obj1_sent_test_norm,transformed_video_output_test_norm,self.test_batch_size)  # batch batch 2*conmmon_space_dim
            sim_score_mat_test =vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt",self.mpl_hidden,reuse=True)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])
            return sim_score_mat_train, sim_score_mat_test #(30,30,3)  (1,1,3)

        
    '''
    compute alignment and regression loss
    
    '''
    def compute_loss_reg(self, sim_reg_mat, offset_label):
        """Build the combined alignment + offset-regression loss.

        Args:
            sim_reg_mat: tensor that is split into three channels along axis 2,
                each reshaped to (batch_size, batch_size): channel 0 is the
                alignment score, channels 1 and 2 are the two predicted offsets.
                Diagonal entries are treated as the matched pairs: they receive
                the -1 mask and are the only entries used for regression.
            offset_label: ground-truth offsets compared element-wise against the
                (batch_size, 2) diagonal predictions.

        Returns:
            (loss, offset_pred, loss_reg) where
            loss = loss_align + self.lambda_regression * loss_reg,
            offset_pred is the (batch_size, 2) concat of the channel-1 and
            channel-2 diagonals, and loss_reg is the mean absolute offset error.
        """
        n = self.batch_size

        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
        sim_score_mat = tf.reshape(sim_score_mat, [n, n])
        l_reg_mat = tf.reshape(l_reg_mat, [n, n])
        p_reg_mat = tf.reshape(p_reg_mat, [n, n])

        all1 = tf.constant(1.0, shape=[n, n])
        I = tf.diag(tf.constant(1.0, shape=[n]))

        # mask_mat = ones - 2*I, i.e. -1 on the diagonal and +1 everywhere else:
        #               | -1   1   1 ... |
        #   mask_mat =  |  1  -1   1 ... |
        #               |  1   1  -1 ... |
        mask_mat = tf.add(tf.diag(tf.constant(-2.0, shape=[n])), all1)

        # Per-entry weight: alpha everywhere, plus 1 on the diagonal.
        para_mat = tf.add(I, tf.constant(self.alpha, shape=[n, n]))

        # Alignment loss (IoU is not considered). softplus(x) == log(1 + exp(x)),
        # but unlike the literal formula it does not overflow to inf once the
        # masked score gets large.
        loss_mat = tf.nn.softplus(tf.multiply(mask_mat, sim_score_mat))
        loss_mat = tf.multiply(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)

        # Regression loss: zero the off-diagonal entries, then sum each row with
        # a ones column to pull the diagonal out as a (batch_size, 1) column.
        ones_col = tf.constant(1.0, shape=[n, 1])
        l_reg_diag = tf.matmul(tf.multiply(l_reg_mat, I), ones_col)
        p_reg_diag = tf.matmul(tf.multiply(p_reg_mat, I), ones_col)
        offset_pred = tf.concat((p_reg_diag, l_reg_diag), 1)
        loss_reg = tf.reduce_mean(tf.abs(tf.subtract(offset_pred, offset_label)))

        # L = L(align) + lambda * L(location)
        loss = tf.add(tf.multiply(self.lambda_regression, loss_reg), loss_align)
        return loss, offset_pred, loss_reg


    def init_placeholder(self):
        """Create the train and test input placeholders.

        Returns:
            A 9-tuple, in this order:
            train clip features   float32 (batch_size, visual_feature_dim, 2*context_num+1)
            train sentence ids    int32   (batch_size, n_step_text)
            train offset labels   float32 (batch_size, 2)
            train region features float32 (batch_size, 64, 2048)
            train noun-word ids   int32   (batch_size, n_step_text)
            test clip features    float32 (test_batch_size, visual_feature_dim, 2*context_num+1)
            test sentence ids     int32   (test_batch_size, n_step_text)
            test region features  float32 (test_batch_size, 64, 2048)
            test noun-word ids    int32   (test_batch_size, n_step_text)
        """
        n_train = self.batch_size
        n_test = self.test_batch_size

        # Per-example shapes shared by the train and test inputs.
        clip_shape = (self.visual_feature_dim, 2 * self.context_num + 1)  # current clip + pre/post context
        token_shape = (self.n_step_text,)
        region_shape = (64, 2048)

        # Training inputs.
        train_clip = tf.placeholder(tf.float32, shape=(n_train,) + clip_shape)
        train_sentence = tf.placeholder(tf.int32, shape=(n_train,) + token_shape)
        train_offset = tf.placeholder(tf.float32, shape=(n_train, 2))
        train_region = tf.placeholder(tf.float32, shape=(n_train,) + region_shape)
        train_noun = tf.placeholder(tf.int32, shape=(n_train,) + token_shape)

        # Test inputs (no offset label at test time).
        test_clip = tf.placeholder(tf.float32, shape=(n_test,) + clip_shape)
        test_sentence = tf.placeholder(tf.int32, shape=(n_test,) + token_shape)
        test_region = tf.placeholder(tf.float32, shape=(n_test,) + region_shape)
        test_noun = tf.placeholder(tf.int32, shape=(n_test,) + token_shape)

        return (train_clip, train_sentence, train_offset, train_region, train_noun,
                test_clip, test_sentence, test_region, test_noun)

    
    def get_variables_by_name(self,name_list):
        """Group the graph's trainable variables by name substring.

        Args:
            name_list: substrings to look for in each variable's ``name``.

        Returns:
            dict mapping every entry of name_list to the list of trainable
            variables whose name contains it (graph order). The groups are also
            printed.
        """
        trainable = tf.trainable_variables()
        v_dict = dict((name, []) for name in name_list)

        for var in trainable:
            matching = [name for name in name_list if name in var.name]
            for name in matching:
                v_dict[name].append(var)

        for name in name_list:
            print ("Variables of <"+name+">")
            for var in v_dict[name]:
                print ("    "+var.name)
        return v_dict

    def training(self, loss):
        """Return the Adam op that minimises ``loss``.

        Only trainable variables whose name contains "lt" are updated. The
        optimizer is created with the current value of ``self.vs_lr`` and the
        op name 'vs_adam'.
        """
        lt_variables = self.get_variables_by_name(["lt"])["lt"]
        adam = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        return adam.minimize(loss, var_list=lt_variables)


    def construct_model(self):
        """Create placeholders, build the train/test networks, the loss and the train op.

        Side effects: stores the nine placeholders, ``loss_align_reg`` and
        ``vs_train_op`` on ``self``.

        Returns:
            (loss_align_reg, vs_train_op, sim_reg_mat_test, offset_pred, loss_reg)
        """
        # Placeholders, in the order init_placeholder returns them.
        (self.visual_featmap_ph_train,
         self.sentence_ph_train,
         self.offset_ph,
         self.fast_rcnn_vec_train,
         self.noun_word_ph_train,
         self.visual_featmap_ph_test,
         self.sentence_ph_test,
         self.fast_rcnn_vec_test,
         self.noun_word_ph_test) = self.init_placeholder()

        # Inference network: four train inputs followed by four test inputs.
        train_inputs = (self.visual_featmap_ph_train, self.sentence_ph_train,
                        self.fast_rcnn_vec_train, self.noun_word_ph_train)
        test_inputs = (self.visual_featmap_ph_test, self.sentence_ph_test,
                       self.fast_rcnn_vec_test, self.noun_word_ph_test)
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(*(train_inputs + test_inputs))

        # Loss on the training branch only.
        loss_outputs = self.compute_loss_reg(sim_reg_mat, self.offset_ph)
        self.loss_align_reg, offset_pred, loss_reg = loss_outputs

        # Optimisation op.
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg


In [None]:
def dense_to_one_hot(labels_dense, num_classes):
    """Turn an array of integer class labels into a one-hot matrix.

    Returns a float array of shape (labels_dense.shape[0], num_classes) with a
    single 1 per row, in the column given by that row's label.
    """
    row_count = labels_dense.shape[0]
    # Work on the flattened matrix: row r starts at position r * num_classes.
    flat_one_hot = np.zeros(row_count * num_classes)
    row_starts = np.arange(row_count) * num_classes
    flat_one_hot[row_starts + labels_dense.ravel()] = 1
    return flat_one_hot.reshape(row_count, num_classes)


def calculate_IoU(i0,i1):
    """Temporal IoU of two (start, end) intervals.

    The denominator is the hull of the two intervals (earliest start to latest
    end), which equals their union whenever they overlap.

    Args:
        i0, i1: indexable pairs (start, end).

    Returns:
        A float in [0, 1]. Disjoint intervals give 0.0 (never a negative value),
        and an empty or inverted hull gives 0.0 instead of dividing by zero.
    """
    hull_length = max(i0[1], i1[1]) - min(i0[0], i1[0])
    if hull_length <= 0:
        # Degenerate input (e.g. two identical zero-length intervals).
        return 0.0
    # A negative difference means the intervals do not overlap at all.
    inter_length = max(0.0, min(i0[1], i1[1]) - max(i0[0], i1[0]))
    return 1.0 * inter_length / hull_length

def nms_temporal(x1,x2,s, overlap):
    """Greedy temporal non-maximum suppression.

    Args:
        x1, x2: segment starts and ends (same length as ``s``).
        s: segment scores.
        overlap: a remaining segment is dropped when its IoU with the segment
            just picked is greater than this value.

    Returns:
        Indices of the kept segments, highest score first.
    """
    pick = []
    assert len(x1)==len(s)
    assert len(x2)==len(s)
    if len(x1)==0:
        return pick

    lengths = [end - start for start, end in zip(x1, x2)]
    # Candidate indices in ascending score order, so the best is always last.
    order = [idx for idx, _ in sorted(enumerate(s), key=operator.itemgetter(1))]

    while order:
        best = order[-1]
        pick.append(best)
        survivors = []
        for other in order[:-1]:
            latest_start = max(x1[best], x1[other])
            earliest_end = min(x2[best], x2[other])
            shared = max(0.0, earliest_end - latest_start)
            ratio = shared / (lengths[best] + lengths[other] - shared)
            if ratio <= overlap:
                survivors.append(other)
        order = survivors
    return pick

'''
compute recall at certain IoU
'''
def compute_IoU_recall_top_n_forreg(top_n, iou_thresh, sentence_image_mat, sentence_image_reg_mat, sclips, iclips,movie_clip_sentences):
    """Count the sentences localised correctly within their top-n predictions.

    Args:
        top_n: number of predictions kept per sentence after NMS.
        iou_thresh: IoU needed for a hit; NMS runs with iou_thresh - 0.05.
        sentence_image_mat: (sentences, clips) scores.
        sentence_image_reg_mat: (sentences, clips, 2) predicted start/end.
        sclips: one name per sentence; the ground-truth start and end are the
            2nd and 3rd "_"-separated fields (anything after "." is dropped).
        iclips, movie_clip_sentences: not used.

    Returns:
        Number of hits, as a float.
    """
    hits = 0.0
    for row in range(sentence_image_mat.shape[0]):
        name_fields = sclips[row].split("_")
        gt_span = (float(name_fields[1]), float(name_fields[2].split('.')[0]))

        scores = list(sentence_image_mat[row])
        starts = list(sentence_image_reg_mat[row,:,0])
        ends = list(sentence_image_reg_mat[row,:,1])
        # nms_temporal returns the best-scoring survivors first.
        kept = nms_temporal(starts, ends, scores, iou_thresh-0.05)[:top_n]

        # One hit per sentence at most; stop at the first good prediction.
        if any(
            calculate_IoU(gt_span, (sentence_image_reg_mat[row, idx, 0], sentence_image_reg_mat[row, idx, 1])) >= iou_thresh
            for idx in kept
        ):
            hits += 1
    return hits

'''
evaluate the model
'''
def do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, iter_step,context_num):
    """Evaluate on the test set and print Recall@{10,5,1} at several IoU thresholds.

    For every test movie, each (sentence, sliding clip) pair is scored with one
    ``sess.run(vs_eval_op)`` call. Element 0 of the result is stored as the
    alignment score; elements 1 and 2 are added to the clip's start and end to
    form the predicted segment.

    Args:
        sess: session used to run ``vs_eval_op``.
        vs_eval_op: op returning [score, start offset, end offset] for one pair.
        model: provides ``test_set`` (``movie_names``, ``load_movie_slidingclip``)
            and the four test placeholders that are fed.
        movie_length_info: not used; kept so existing callers keep working.
        iter_step: not used; kept so existing callers keep working.
        context_num: number of context clips on each side; the clip feature map
            is reshaped to 2*context_num+1 columns.

    Relies on the module-level ``preprocess_sentence``, ``vocab_dict`` and ``T``.
    Returns None; results are printed.
    """
    IoU_thresh = [0.1,0.3, 0.5,0.7,0.9]
    top_ns = (10, 5, 1)
    # all_correct[top_n][i] = hits at IoU_thresh[i], summed over all movies.
    all_correct = dict((top_n, [0.0]*len(IoU_thresh)) for top_n in top_ns)
    all_retrievd = 0.0

    for movie_name in model.test_set.movie_names:
        print ("Test movie: "+movie_name+"....loading movie data")
        movie_clip_featmaps, movie_clip_sentences=model.test_set.load_movie_slidingclip(movie_name, 16)#sample = 16
        print ("sentences: "+ str(len(movie_clip_sentences)))
        print ("clips: "+ str(len(movie_clip_featmaps)))
        sentence_image_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps)])
        sentence_image_reg_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps), 2])

        # The clip-side inputs do not depend on the sentence, so build them once
        # per movie instead of once per (sentence, clip) pair.
        clip_inputs = []
        if len(movie_clip_sentences) > 0:
            for clip in movie_clip_featmaps:
                name_fields = clip[0].split("_")
                start = float(name_fields[1])
                end = float(name_fields[2].split(".")[0])
                featmap = np.reshape(clip[1], [1, clip[1].shape[0], 2*context_num+1])
                fast_rcnn_fea = np.reshape(clip[2], [1,64,2048])
                clip_inputs.append((start, end, featmap, fast_rcnn_fea))

        for k, sentence in enumerate(movie_clip_sentences):
            sent_vec = np.reshape(preprocess_sentence(sentence[1], vocab_dict, T), [1,T])
            noun_word_vec = np.reshape(preprocess_sentence(sentence[2], vocab_dict, T), [1,T])

            for t, (start, end, featmap, fast_rcnn_fea) in enumerate(clip_inputs):
                feed_dict = {
                    model.visual_featmap_ph_test: featmap,
                    model.sentence_ph_test: sent_vec,
                    model.fast_rcnn_vec_test: fast_rcnn_fea,
                    model.noun_word_ph_test: noun_word_vec,
                }
                outputs = sess.run(vs_eval_op,feed_dict=feed_dict)
                sentence_image_mat[k,t] = outputs[0]
                sentence_image_reg_mat[k,t,0] = start+outputs[1]
                sentence_image_reg_mat[k,t,1] = end+outputs[2]

        iclips = [b[0] for b in movie_clip_featmaps]
        sclips = [b[0] for b in movie_clip_sentences]

        # Recall@m at IoU=n for this movie.
        for idx, IoU in enumerate(IoU_thresh):
            for top_n in top_ns:
                all_correct[top_n][idx] += compute_IoU_recall_top_n_forreg(top_n, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips,movie_clip_sentences)
        all_retrievd+=len(sclips)

    if all_retrievd == 0:
        # Nothing was evaluated: the ratios below would divide by zero.
        print ("No test sentences were evaluated; recall is undefined.")
        return

    for idx, IoU in enumerate(IoU_thresh):
        print (" IoU="+str(IoU)+", R@10: "+str(all_correct[10][idx]/all_retrievd)+"; IoU="+str(IoU)+", R@5: "+str(all_correct[5][idx]/all_retrievd)+"; IoU="+str(IoU)+", R@1: "+str(all_correct[1][idx]/all_retrievd))
        

In [None]:
def run_training():
    """Build the model, train it for ``max_steps`` steps and evaluate periodically.

    The loss is printed every ``display_step`` steps and the sliding-clip
    evaluation runs every ``test_iter`` steps. All hyper-parameters and data
    paths are the local constants below. Relies on the module-level
    ``SLAT_Model``, ``T`` and ``do_eval_slidingclips``.
    """
    max_steps=50000
    lr=0.01
    current_lr=lr
    train_batch_size=30
    display_step=5
    test_iter=5000
    semantic_size=1024
    mpl_hidden=1000
    context_num=1
    visual_feature_dim=4096
    embed_dim = 300
    lstm_dim = 1000
    train_csv_path = "./charades_300/clip_sentence_pairs_iou_charades_noun_word.pkl"
    test_csv_path = "./charades_300/charades_clip_sentence_test_new_noun_word.pkl"
    train_feature_dir = "./charades_C3D/charades_16_32_48_64_overlap0.8_c3d_fc6_train_pool/"
    test_feature_dir="./charades_C3D/charades_16_32_overlap0.8_c3d_fc6_test_pool/"
    train_fast_rcnn_path='./charades_fastrcnn/Charades_train_video_pics_vec_pool/'
    test_fast_rcnn_path='./charades_fastrcnn/Charades_test_video_pics_vec_pool/'

    model = SLAT_Model(train_batch_size, train_csv_path, test_csv_path , lr, visual_feature_dim, embed_dim,lstm_dim, T,train_feature_dir,test_feature_dir,semantic_size,mpl_hidden,train_fast_rcnn_path,test_fast_rcnn_path)

    with tf.Graph().as_default():

        loss_align_reg, vs_train_op, vs_eval_op, offset_pred, loss_reg = model.construct_model()
        # The session is used as a context manager so that it is closed (and its
        # resources released) when training finishes or raises.
        with tf.Session() as sess:
            # Run the Op to initialize the variables.
            init = tf.global_variables_initializer()
            sess.run(init)
            for step in xrange(max_steps):
                start_time = time.time()
                feed_dict = model.fill_feed_dict_train_reg()
                _, loss_value, offset_pred_v, loss_reg_v = sess.run([vs_train_op, loss_align_reg, offset_pred, loss_reg], feed_dict=feed_dict)
                duration = time.time() - start_time

                if step % display_step == 0:
                    # Print status to stdout.
                    print('Step %d: loss = %.3f (%.3f sec)' % (step, loss_value, duration))
                if (step+1)%50000==0:
                    # NOTE(review): vs_train_op was already built by
                    # construct_model(); reassigning model.vs_lr here looks like
                    # it does not change the rate used by that op -- confirm.
                    current_lr=current_lr*0.1
                    model.vs_lr=current_lr
                    print (model.vs_lr)
                if (step+1) % test_iter == 0:
                    print ("Start to test:-----------------\n")
                    # pickle.load can execute arbitrary code: only point this at a
                    # trusted file. The handle is closed as soon as it is read.
                    with open("./video_allframes_info_charades_new.pkl",'rb') as info_file:
                        movie_length_info=pickle.load(info_file)
                    do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, step+1,context_num)

def main(_):
    """Entry point invoked by ``tf.app.run()``; its single argument is ignored."""
    run_training()


# When executed as a script, hand control to tf.app.run(), which calls main().
if __name__ == '__main__':
    tf.app.run()