In [1]:
import time
import os
import sys
from __future__ import print_function, division
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle
import json
from pprint import pprint
from sklearn import cross_validation
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import words
from nltk import bigrams, trigrams
import math

In [3]:
from spellchecker import SpellChecker

# Load the image features

In [4]:
train_salient_image_features = pickle.load(open('../salient_img_feats_train_key_value_fc2', 'rb'))

In [5]:
train_image_features = pickle.load(open('../img_feats_train_key_value_all_fc2', 'rb'))

In [6]:
len(train_image_features)

167391

In [1]:
#salient_validation_image_features = pickle.load(open('../salient_img_feats_validation_key_value_fc2', 'rb'))

In [2]:
#validation_image_features = pickle.load(open('../img_feats_validation_key_value_fc2', 'rb'))

# Load the image description metadata file

In [8]:
train_story_seq = json.load(open('../train.story-in-sequence.json'))

In [3]:
#val_story_seq = json.load(open('../val.story-in-sequence.json'))

# Sort the images with stories

In [9]:
def get_story(annotations):
    story = dict()
    annotations_len = len(annotations)
    previous_story_id = annotations[0][0]['story_id']
    story[previous_story_id] = list()
    for i in range(annotations_len):
        for j in range(len(annotations[i])):
            if previous_story_id != annotations[i][j]['story_id']:
                previous_story_id = annotations[i][j]['story_id']
                story[previous_story_id] = list()
                
            image_des = {'photo_id':annotations[i][j]['photo_flickr_id'], 
                         'description' :annotations[i][j]['text']}
            story[previous_story_id].append(image_des)
            
    return story

In [10]:
def get_unique_story(annotations):
    story_collection = dict()
    annotations_len = len(annotations)
    previous_story_id = annotations[0][0]['story_id']
    #story[previous_story_id] = list()
    story = []
    used_photo = dict()
    used = False
    for i in range(annotations_len):
        for j in range(len(annotations[i])):
            if previous_story_id != annotations[i][j]['story_id']:
                if used == False:
                    story_collection[previous_story_id] = story
                previous_story_id = annotations[i][j]['story_id']
                story = list()
                used = False
            
            photo_id = annotations[i][j]['photo_flickr_id']
            if photo_id in used_photo:
                used = True
            image_des = {'photo_id':photo_id, 
                         'description' :annotations[i][j]['text']}
            used_photo[photo_id] = 1
            story.append(image_des)
            #print(image_des)
            
    return story_collection

In [11]:
train_story = get_unique_story(train_story_seq['annotations'])

In [12]:
#train_story = get_story(train_story_seq['annotations'])

In [13]:
len(train_story)

8831

In [14]:
train_story

{'30355': [{'description': 'our landmark tree in town was about to be destroyed and cleared for a new mall .',
   'photo_id': '2627795780'},
  {'description': 'so we decided to take the day to go out and enjoy its beauty .',
   'photo_id': '2626979987'},
  {'description': 'to see the final glimpse of the roots , extending out into the depths of the hill .',
   'photo_id': '2626982337'},
  {'description': 'and its magnificent trunk , larger than life itself .',
   'photo_id': '2626983575'},
  {'description': 'one last picture of its beauty so we could capture it forever .',
   'photo_id': '2626985925'}],
 '30360': [{'description': 'our trip to location last year was filled with beauty . sculptures could be found everywhere we went .',
   'photo_id': '205866755'},
  {'description': 'the architecture was old and interesting .',
   'photo_id': '205862590'},
  {'description': 'since it was spring , beautiful flowers lined the streets . this pink bloom was particularly lovely .',
   'photo_i

# Validation Data

In [15]:
val_story = get_story(val_story_seq['annotations'])

NameError: name 'val_story_seq' is not defined

In [None]:
len(val_story)

# Get the image features of the stories

In [21]:
english_dictionary = dict.fromkeys(words.words(), None)

In [22]:
g_spellChecker = SpellChecker()

In [23]:
def is_english_word(word):
    try:
        x = english_dictionary[word]
        return True
    except KeyError:
        return False

In [24]:
def preprocess_sentence(sent):
    #print("sent",sent)
    sent = sent.lower()
    sent = sent.strip()
    sent = sent.replace(',', '')
    sent = sent.replace('.', '')
    sent = sent.replace('"', '')
    sent = sent.replace('[', '')
    sent = sent.replace(']', '')
    sent = sent.replace('?', '')
    sent = sent.replace('!', '')
    sent = sent.replace(':', '')
    sent = sent.replace(';', '')
    sent = sent.replace('-', ' ')
    sent = sent.strip()
    
    tokens = sent.split(" ")
    #print("token", tokens)
    words = []
    for token in tokens:
        syn = wordnet.synsets(token)
        if is_english_word(token) or len(syn) > 0:
            words.append(token)
        else:
            c_token = g_spellChecker.correction(token)
            words.append(c_token)
            #c_syn = wordnet.synsets(c_token)
            #if is_english_word(c_token) or len(c_syn) > 0:
            #    words.append(c_token)
    
    pos = nltk.pos_tag(words)
    pos_words = []
    for p in pos:
        #if p[1] != "TO" and p[1] !="IN" and p[1] != "CD" and p[1] != "DT":
        if p[1] != "CD":
            pos_words.append(p[0])
    lema = WordNetLemmatizer()

    new_sent = ""
    for word in pos_words:
        new_word = lema.lemmatize(word)
        #new_sent = new_sent + " "+ new_word
        if is_english_word(new_word):
            new_sent = new_sent + " "+ new_word
        else:
            new_sent = new_sent + " "+ word
    
    #print(new_sent)
    new_sent = new_sent.strip()
    
    return new_sent

In [25]:
def story_image_features(story_data, image_features, salient_image_features):
    image_features_len = len(image_features)
    story_features = dict()
    idx = 0
    for key, value in story_data.items():
        if idx == 1000:
            break
        print(idx)
        idx +=1
        
        exist = True
        for i in range(len(value)):
            photo_id = value[i]['photo_id']
            if photo_id in image_features and photo_id in salient_image_features:
                value[i]['features'] = image_features[photo_id]
                value[i]['description'] = preprocess_sentence(value[i]['description'])
            else:
                exist = False
                break
        if exist == True:
            story_features[key] = value
            
    return story_features

In [26]:
#train_story_features = story_image_features(train_story, train_image_features, train_salient_image_features)

In [29]:
#pickle.dump(train_story_features, open("train_story_features_1000", 'wb'))

In [30]:
train_story_features = pickle.load(open('train_story_features_1000', 'rb'))

In [31]:
#train_story_features = pickle.load(open('train_unique_story_features', 'rb'))

# Validation Story Feature

In [32]:
#val_story_features = story_image_features(val_story, validation_image_features, salient_validation_image_features)

In [33]:
#pickle.dump(val_story_features, open("val_story_features", 'wb'))

In [34]:
#val_story_features = pickle.load(open('val_story_features', 'rb'))

In [35]:
#len(val_story_features)

# Vocabulary Builder

In [36]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=3):
    # borrowed this function from NeuralTalk
    print ('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, ))

    word_counts = {}
    nsents = 0

    for sent in sentence_iterator:
        nsents += 1
        tmp_sent = sent.lower().split(' ')
        if '' in tmp_sent:
            tmp_sent.remove('')
        
        # bigram
        #for w1,w2 in bigrams(tmp_sent):
        #    w = w1 + " "+ w2
        #    if w !='':
        #        word_counts[w] = word_counts.get(w, 0) + 1

        for w in tmp_sent:
            if w !='':
                word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print ('filtered words from %d to %d' % (len(word_counts), len(vocab)))

    ixtoword = {}
    ixtoword[0] = '<bos>'
    ixtoword[1] = '<eos>'
    ixtoword[2] = '<pad>'
    ixtoword[3] = '<unk>'

    wordtoix = {}
    wordtoix['<bos>'] = 0
    wordtoix['<eos>'] = 1
    wordtoix['<pad>'] = 2
    wordtoix['<unk>'] = 3

    for idx, w in enumerate(vocab):
        wordtoix[w] = idx + 4
        ixtoword[idx+4] = w

    word_counts['<eos>'] = nsents
    word_counts['<bos>'] = nsents
    word_counts['<pad>'] = nsents
    word_counts['<unk>'] = nsents


    return wordtoix, ixtoword

In [37]:
def get_all_sentences(data):
    all_sentences = []
    for key, value in data.items():
        length = len(value)
        for i in range(length):
            sent = value[i]['description']
            #sent = preprocess_sentence(sent)
            all_sentences.append(sent)
            
    return all_sentences


In [38]:
sentences = get_all_sentences(train_story_features)

In [39]:
sentences

['our landmark tree in town wa about to be destroyed and cleared for a new mall',
 'so we decided to take the day to go out and enjoy it beauty',
 'to see the final glimpse of the root a extending out into the depth of the hill',
 'and it magnificent trunk a larger than life itself',
 'last picture of it beauty so we could capture it forever',
 'our trip to location last year wa filled with beauty a sculpture could be found everywhere we went',
 'the architecture wa old and interesting',
 'since it wa spring a beautiful flower lined the street a this pink bloom wa particularly lovely',
 'we went to a museum and saw this odd collection of hanging face',
 "i d have to say my favorite part of the trip would have to be the food though a we ca n't wait to go back again",
 'male and i were excited to be in location location during the of july',
 'there wa a huge crowd of people already awaiting the firework show',
 'we were lucky to find a nice spot on the grass to watch the show',
 'a the e

In [40]:
word2idx, idx2word = preProBuildWordVocab(sentences, 2)

preprocessing word counts and creating vocab based on word count threshold 2
filtered words from 4082 to 2151


In [41]:
word2idx

{'<bos>': 0,
 '<eos>': 1,
 '<pad>': 2,
 '<unk>': 3,
 'our': 4,
 'landmark': 5,
 'tree': 6,
 'in': 7,
 'town': 8,
 'wa': 9,
 'about': 10,
 'to': 11,
 'be': 12,
 'and': 13,
 'cleared': 14,
 'for': 15,
 'a': 16,
 'new': 17,
 'mall': 18,
 'so': 19,
 'we': 20,
 'decided': 21,
 'take': 22,
 'the': 23,
 'day': 24,
 'go': 25,
 'out': 26,
 'enjoy': 27,
 'it': 28,
 'beauty': 29,
 'see': 30,
 'final': 31,
 'glimpse': 32,
 'of': 33,
 'root': 34,
 'into': 35,
 'hill': 36,
 'magnificent': 37,
 'trunk': 38,
 'larger': 39,
 'than': 40,
 'life': 41,
 'itself': 42,
 'last': 43,
 'picture': 44,
 'could': 45,
 'capture': 46,
 'forever': 47,
 'trip': 48,
 'location': 49,
 'year': 50,
 'filled': 51,
 'with': 52,
 'sculpture': 53,
 'found': 54,
 'everywhere': 55,
 'went': 56,
 'architecture': 57,
 'old': 58,
 'interesting': 59,
 'since': 60,
 'spring': 61,
 'beautiful': 62,
 'flower': 63,
 'lined': 64,
 'street': 65,
 'this': 66,
 'pink': 67,
 'bloom': 68,
 'lovely': 69,
 'museum': 70,
 'saw': 71,
 'odd': 72

In [42]:
vocab_size = len(word2idx)

In [43]:
vocab_size

2155

# Global Variable

In [44]:
per_image_vector_dimension = 4096
feature_dimension = per_image_vector_dimension
#state_size = 512
n_words = len(word2idx)
wordRNN_lstm_dim = 512
word_embed_dim = 512
batch_size = 24
learning_rate = 0.01
n_epochs = 1501
project_dim = 1024
image_fc_dim = 512
sentRNN_lstm_dim = 1024
sentRNN_FC_dim = 1024
N_max_word = 30
n_image = 5
beam_width = 10

# Extract training features and label

In [45]:
def get_caption_matrix(img_sent):
    img_captions_matrix = np.ones([ N_max_word+1], dtype=np.int32) * 1
    img_captions_matrix[0] = 0
    #img_sent = preprocess_sentence(img_sent)
    for idx, word in enumerate(img_sent.lower().split(' ')):
        # because the biggest number of words in a sentence is N_max, here is 50
        if idx == N_max_word:
            break
            
        if word in word2idx:
            img_captions_matrix[ idx+1] = word2idx[word]
        else:
            img_captions_matrix[ idx+1] = word2idx['<unk>']
    
    return img_captions_matrix

In [46]:
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

# Variable

In [47]:
regionPooling_W1 = tf.Variable(tf.random_uniform([feature_dimension, image_fc_dim], -0.1, 0.1))
regionPooling_b1 = tf.Variable(tf.zeros([image_fc_dim]))

regionPooling_W2 = tf.Variable(tf.random_uniform([feature_dimension, image_fc_dim], -0.1, 0.1))
regionPooling_b2 = tf.Variable(tf.zeros([image_fc_dim]))

img_embedding = tf.Variable(tf.random_uniform([feature_dimension, wordRNN_lstm_dim], -0.1, 0.1))
img_embedding_bias = tf.Variable(tf.zeros([wordRNN_lstm_dim])) 

fc1_W = tf.Variable(tf.random_uniform([sentRNN_lstm_dim, sentRNN_FC_dim], -0.1, 0.1))
fc1_b = tf.Variable(tf.zeros(sentRNN_FC_dim))
fc2_W = tf.Variable(tf.random_uniform([sentRNN_FC_dim, 1024], -0.1, 0.1))
fc2_b = tf.Variable(tf.zeros(1024))

embed_word_W = tf.Variable(tf.random_uniform([wordRNN_lstm_dim, n_words], -0.1,0.1))
embed_word_b = tf.Variable(tf.zeros([n_words]))

W_embeddings = tf.Variable(tf.random_uniform([n_words, word_embed_dim], -0.1, 0.1))

# Placeholder

In [48]:

    #tmp_feats = tf.reshape(feats, [-1, self.feats_dim])


# Topic RNN

In [49]:
def lstm_cell(size):
    return tf.nn.rnn_cell.BasicLSTMCell(size, reuse=tf.AUTO_REUSE)

In [51]:
sent_LSTM = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(sentRNN_lstm_dim) for _ in range(2)])

# Word RNN

In [53]:
word_LSTM_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(wordRNN_lstm_dim) for _ in range(2)])

In [1]:
def build_model():
    loss_word = 0.0
    acc_sent = 0.0
    batch_feats = tf.placeholder(tf.float32, [batch_size, n_image, feature_dimension])
    batch_salient_feats = tf.placeholder(tf.float32, [batch_size, n_image, feature_dimension])
    batch_caption = tf.placeholder(tf.int32, [batch_size, n_image, N_max_word + 1])
    #batch_captions_masks = tf.placeholder(tf.float32, [batch_size, n_image, N_max_word + 1])
    
    sent_state = sent_LSTM.zero_state(batch_size=batch_size, dtype=tf.float32)
    for i in range(n_image):
        # topic RNN
        feats = batch_feats[:,i:i+1,:]
        salient_feats = batch_salient_feats[:,i:i+1,:]
        caption = batch_caption[:,i:i+1,:]
        #caption_mask = batch_captions_masks[:,i:i+1,:]
        
        feats = tf.reshape(feats, [batch_size, -1])
        salient_feats = tf.reshape(salient_feats, [batch_size, -1])
        caption = tf.reshape(caption, [batch_size, -1])
        #caption_mask = tf.reshape(caption_mask, [batch_size, -1])
        
        with tf.variable_scope('dense'):
            project_vec1 = tf.nn.relu(tf.matmul(feats, regionPooling_W1) + regionPooling_b1)
            project_vec2 = tf.nn.relu(tf.matmul(salient_feats, regionPooling_W2) + regionPooling_b2)
            project_vec = tf.concat([project_vec1, project_vec2], 1)
    
        with tf.variable_scope('image_context'):
            sent_output, sent_state = sent_LSTM(project_vec, sent_state)
        with tf.name_scope('fc1'):
            hidden1 = tf.nn.relu( tf.matmul(sent_output, fc1_W) + fc1_b)
        with tf.name_scope('fc2'):
            sent_topic_vec = tf.nn.relu( tf.matmul(hidden1, fc2_W) + fc2_b)


        state = tf.nn.rnn_cell.LSTMStateTuple(sent_topic_vec[:, 0:512], sent_topic_vec[:, 512:])
        word_state = (state, state)

        #image_embedding = tf.matmul(feats, img_embedding) + img_embedding_bias


        acc_word = 0.0
        
        for j in range(0, N_max_word):
            if j > 0:
                tf.get_variable_scope().reuse_variables()                
                
            current_embed = tf.nn.embedding_lookup(W_embeddings, caption[:, j])
            with tf.variable_scope('word'):
                word_output, word_state = word_LSTM_cell(current_embed, word_state)
            
            indices = caption[:,j+1]
            onehot_labels = tf.one_hot(indices, depth = n_words)
            logit_words = tf.nn.xw_plus_b(word_output[:], embed_word_W, embed_word_b)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = logit_words, labels = onehot_labels)
            #cross_entropy = cross_entropy * caption_mask[:, j]

            prediction = tf.nn.softmax(logit_words)

            correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(onehot_labels, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            acc_word += accuracy
            loss_wordRNN = tf.reduce_sum(cross_entropy) / batch_size
            loss_word += loss_wordRNN
        
        acc_word = acc_word/N_max_word
        acc_sent += acc_word
    acc_sent /= n_image
    loss_word = loss_word/n_image
    return batch_feats, batch_salient_feats, batch_caption, loss_word, acc_sent


In [55]:
model_path = '../model_batch_v2/'

In [55]:
train_story_features

{'0': [{'description': 'male and i were excited to be in location location during the of july',
   'features': array([[0.       , 2.249732 , 0.       , ..., 0.       , 1.7725334,
           0.       ]], dtype=float32),
   'photo_id': '997622638'},
  {'description': 'there wa a huge crowd of people already awaiting the firework show',
   'features': array([[3.5998712 , 1.1743027 , 0.        , ..., 0.        , 0.16532627,
           0.        ]], dtype=float32),
   'photo_id': '997623170'},
  {'description': 'we were lucky to find a nice spot on the grass to watch the show',
   'features': array([[1.6341993, 0.9670204, 0.       , ..., 0.       , 0.       ,
           0.       ]], dtype=float32),
   'photo_id': '997624052'},
  {'description': 'a the evening grew darker the crowd wa gearing up to enjoy the show a with a great view of the location location',
   'features': array([[0.23788875, 0.8058404 , 0.02856672, ..., 0.        , 0.        ,
           0.8887316 ]], dtype=float32),
   'p

# Train the model

In [56]:
def train_model():
    tf_feats, tf_salient_feats, tf_caption, tf_loss, tf_acc = build_model()
    sess = tf.InteractiveSession()
    
    saver = tf.train.Saver(max_to_keep=100, write_version=1)
    
    #pre_model_path = './models_batch/_model_partials_out_imgemb-175'

    #saver.restore(sess, pre_model_path)
    
    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(tf_loss)
    
    tf.global_variables_initializer().run()
    
    data_length = len(train_story_features)
    for epoch in range(0, n_epochs):
            #train_story_features_keys = list(train_story_features.items())
            #np.random.shuffle(train_story_features_keys)
            #shuffled_train_story_features = dict(train_story_features_keys)
        
            loss = 0;
            acc = 0;
            total_pass = 0
            start_time = time.time()
            n_item = 0
            features = []
            salient_features = []
            caption_matrix = []
            passed_data = 0
            for key, value in train_story_features.items():
                if n_item < batch_size:
                    n_item += 1
                    for i in range(n_image):
                        description = value[i]['description']
                        original_features = value[i]['features']
                        original_features = original_features.flatten()
                        
                        _salient_features = train_salient_image_features[value[i]['photo_id']]
                        _salient_features = _salient_features.flatten()
                        
                        #_features = np.concatenate([original_features, _salient_features])
                        
                        _caption_matrix = get_caption_matrix(description)
                        features.append(original_features)
                        salient_features.append(_salient_features)
                        caption_matrix.append(_caption_matrix)
                    
                if n_item == batch_size:    
                    features = np.array(features)
                    salient_features = np.array(salient_features)
                    caption_matrix = np.array(caption_matrix)
                    
                    features = np.asarray(np.reshape(features,(batch_size, n_image, -1)))
                    salient_features = np.asarray(np.reshape(salient_features,(batch_size, n_image, -1)))
                    
                    caption_matrix = np.asarray(np.reshape(caption_matrix, (batch_size, n_image, -1)))
                    captions_masks = np.zeros( (caption_matrix.shape[0], caption_matrix.shape[1], caption_matrix.shape[2]) )

                    _, loss_word, acc_word = sess.run(
                        [train_op, tf_loss, tf_acc],
                        feed_dict={
                            tf_feats: features,
                            tf_salient_feats: salient_features,
                            tf_caption: caption_matrix
                        })

                    loss = loss + loss_word
                    acc = acc+acc_word
                    total_pass = total_pass + 1
                    n_item = 0
                    features = []
                    salient_features = []
                    caption_matrix = []
                    # running information
            
            loss = loss/total_pass
            acc = acc/total_pass
            
            print ('Epoch: ', epoch, ' loss: ', loss, " accuracy: ", acc, ' Time cost: ', str((time.time() - start_time)))
            
            if np.mod(epoch, 100) == 0:
                lossFileName = "../result/base_model_v2_loss"+str(epoch)+".txt"
                test_fd = open(lossFileName, 'w')
                test_fd.write('Epoch: '+ str(epoch) + ' loss: ' + str(loss)+ " accuracy: "+ str(acc)+"\n")
                test_fd.close()
                print ("Epoch ", epoch, " is done. Saving the model ...")
                saver.save(sess, os.path.join(model_path, 'base_model_v2'), global_step=epoch)
    
    

In [57]:
train_model()

Epoch:  0  loss:  93.69537048339843  accuracy:  0.6327638814982492  Time cost:  16.705910444259644
Epoch  0  is done. Saving the model ...
Epoch:  1  loss:  74.26164112091064  accuracy:  0.6536041498184204  Time cost:  11.147481918334961
Epoch:  2  loss:  69.35957298278808  accuracy:  0.6545277625322342  Time cost:  11.542245149612427
Epoch:  3  loss:  66.96503591537476  accuracy:  0.6553055480122566  Time cost:  11.435081005096436
Epoch:  4  loss:  65.56556453704835  accuracy:  0.6558194383978844  Time cost:  11.173795938491821
Epoch:  5  loss:  64.65470333099366  accuracy:  0.6561111062765121  Time cost:  11.657464504241943
Epoch:  6  loss:  64.01962051391601  accuracy:  0.6567291617393494  Time cost:  11.541757583618164
Epoch:  7  loss:  63.56396398544312  accuracy:  0.6570972099900245  Time cost:  11.283647060394287
Epoch:  8  loss:  63.23500995635986  accuracy:  0.6575902596116066  Time cost:  11.699789762496948
Epoch:  9  loss:  62.991220760345456  accuracy:  0.6578124895691871  

Epoch:  78  loss:  56.45160779953003  accuracy:  0.6805208310484886  Time cost:  10.953658103942871
Epoch:  79  loss:  56.271884536743165  accuracy:  0.6815416529774666  Time cost:  10.95514965057373
Epoch:  80  loss:  56.24244899749756  accuracy:  0.6814722210168839  Time cost:  10.916445970535278
Epoch:  81  loss:  55.93648509979248  accuracy:  0.6823055505752563  Time cost:  11.039831161499023
Epoch:  82  loss:  55.81977109909057  accuracy:  0.6822708263993263  Time cost:  11.475056171417236
Epoch:  83  loss:  55.70872869491577  accuracy:  0.6827499955892563  Time cost:  11.43795394897461
Epoch:  84  loss:  55.63353261947632  accuracy:  0.6834444314241409  Time cost:  11.7081937789917
Epoch:  85  loss:  55.37062120437622  accuracy:  0.684486098587513  Time cost:  11.321894884109497
Epoch:  86  loss:  55.17069501876831  accuracy:  0.6851041540503502  Time cost:  11.28459906578064
Epoch:  87  loss:  55.14987373352051  accuracy:  0.6852916702628136  Time cost:  11.47421145439148
Epoch:

Epoch:  156  loss:  37.968877983093265  accuracy:  0.7524861201643944  Time cost:  11.603657245635986
Epoch:  157  loss:  37.95950975418091  accuracy:  0.7521666675806046  Time cost:  11.585432529449463
Epoch:  158  loss:  37.3506751537323  accuracy:  0.7552153021097183  Time cost:  11.208277940750122
Epoch:  159  loss:  37.08076543807984  accuracy:  0.7575347363948822  Time cost:  11.275441646575928
Epoch:  160  loss:  36.52598900794983  accuracy:  0.7604861244559288  Time cost:  11.343596935272217
Epoch:  161  loss:  36.664989948272705  accuracy:  0.7589166760444641  Time cost:  11.651145935058594
Epoch:  162  loss:  36.04447412490845  accuracy:  0.7624652832746506  Time cost:  11.299216270446777
Epoch:  163  loss:  35.51041965484619  accuracy:  0.7655277907848358  Time cost:  11.24261999130249
Epoch:  164  loss:  35.691876363754275  accuracy:  0.7630277961492539  Time cost:  11.3781578540802
Epoch:  165  loss:  35.25334568023682  accuracy:  0.7652847230434418  Time cost:  11.4297623

Epoch:  233  loss:  14.876135396957398  accuracy:  0.8952986359596252  Time cost:  11.540312051773071
Epoch:  234  loss:  14.76489794254303  accuracy:  0.8956250205636025  Time cost:  10.95331072807312
Epoch:  235  loss:  14.753729248046875  accuracy:  0.8950347557663918  Time cost:  10.969216108322144
Epoch:  236  loss:  14.222083854675294  accuracy:  0.8998611301183701  Time cost:  10.919017314910889
Epoch:  237  loss:  14.08916461467743  accuracy:  0.9016319707036018  Time cost:  10.94241976737976
Epoch:  238  loss:  13.968853425979614  accuracy:  0.9015694677829742  Time cost:  11.516650438308716
Epoch:  239  loss:  14.025086379051208  accuracy:  0.9006527870893478  Time cost:  11.749682664871216
Epoch:  240  loss:  13.499540781974792  accuracy:  0.9055764243006706  Time cost:  11.403688907623291
Epoch:  241  loss:  12.557040214538574  accuracy:  0.9147639244794845  Time cost:  11.4489266872406
Epoch:  242  loss:  12.100983333587646  accuracy:  0.918868075311184  Time cost:  11.468

Epoch:  310  loss:  4.190629327297211  accuracy:  0.9868888989090919  Time cost:  11.359064817428589
Epoch:  311  loss:  4.1556078553199765  accuracy:  0.9870208367705345  Time cost:  11.33843207359314
Epoch:  312  loss:  4.139774912595749  accuracy:  0.9868055701255798  Time cost:  11.365544080734253
Epoch:  313  loss:  4.109691005945206  accuracy:  0.9871180668473244  Time cost:  11.56591796875
Epoch:  314  loss:  4.054301941394806  accuracy:  0.9877222314476967  Time cost:  11.379438877105713
Epoch:  315  loss:  3.975873500108719  accuracy:  0.9883819475769997  Time cost:  11.387728691101074
Epoch:  316  loss:  3.914079910516739  accuracy:  0.9888888984918595  Time cost:  11.319272994995117
Epoch:  317  loss:  3.8661791145801545  accuracy:  0.9892361223697662  Time cost:  11.480429410934448
Epoch:  318  loss:  3.8449325025081635  accuracy:  0.9890833333134651  Time cost:  11.471980571746826
Epoch:  319  loss:  3.8624820291996  accuracy:  0.9890069648623466  Time cost:  11.4090201854

Epoch:  391  loss:  1.845092198252678  accuracy:  0.9990416586399078  Time cost:  10.753101348876953
Epoch:  392  loss:  1.8744569689035415  accuracy:  0.9987222194671631  Time cost:  10.90900206565857
Epoch:  393  loss:  2.038019222021103  accuracy:  0.9973819389939308  Time cost:  11.042577266693115
Epoch:  394  loss:  7.7642228424549105  accuracy:  0.9478541746735573  Time cost:  10.694814682006836
Epoch:  395  loss:  12.841163432598114  accuracy:  0.8946111515164376  Time cost:  11.455444574356079
Epoch:  396  loss:  7.377690106630325  accuracy:  0.9426180645823479  Time cost:  11.562642335891724
Epoch:  397  loss:  4.614638912677765  accuracy:  0.9730208501219749  Time cost:  11.528618097305298
Epoch:  398  loss:  3.2084993422031403  accuracy:  0.9887569576501847  Time cost:  11.70371389389038
Epoch:  399  loss:  2.5556455314159394  accuracy:  0.9956389024853707  Time cost:  11.242316484451294
Epoch:  400  loss:  2.2253456354141234  accuracy:  0.9979722395539283  Time cost:  11.45

Epoch:  468  loss:  14.541648872196674  accuracy:  0.8971736237406731  Time cost:  11.279856204986572
Epoch:  469  loss:  15.102859401702881  accuracy:  0.8748819753527641  Time cost:  11.692969560623169
Epoch:  470  loss:  8.309198063611984  accuracy:  0.9322014287114143  Time cost:  11.501603603363037
Epoch:  471  loss:  4.753639042377472  accuracy:  0.968493077158928  Time cost:  11.158877611160278
Epoch:  472  loss:  2.9802793085575106  accuracy:  0.9883611097931861  Time cost:  11.179734230041504
Epoch:  473  loss:  2.2084812372922897  accuracy:  0.9955069556832313  Time cost:  11.169175624847412
Epoch:  474  loss:  1.8296441793441773  accuracy:  0.9982638895511627  Time cost:  11.323978185653687
Epoch:  475  loss:  1.6223147869110108  accuracy:  0.9991944462060929  Time cost:  11.416124820709229
Epoch:  476  loss:  1.4938540875911712  accuracy:  0.9996319442987442  Time cost:  11.285335540771484
Epoch:  477  loss:  1.412571868300438  accuracy:  0.999812500178814  Time cost:  11.3

Epoch:  545  loss:  0.7920555487275124  accuracy:  0.9999930545687675  Time cost:  11.284476280212402
Epoch:  546  loss:  0.7892311230301857  accuracy:  1.0  Time cost:  11.441829919815063
Epoch:  547  loss:  0.7869292855262756  accuracy:  0.9999930545687675  Time cost:  11.250295162200928
Epoch:  548  loss:  0.7843574777245521  accuracy:  0.9999930575489998  Time cost:  10.76186466217041
Epoch:  549  loss:  0.7809192061424255  accuracy:  0.9999930545687675  Time cost:  10.910930156707764
Epoch:  550  loss:  0.7777432486414909  accuracy:  1.0  Time cost:  10.791407346725464
Epoch:  551  loss:  0.7744545593857766  accuracy:  1.0  Time cost:  10.732656955718994
Epoch:  552  loss:  0.7712356582283973  accuracy:  1.0  Time cost:  11.196471452713013
Epoch:  553  loss:  0.7675632283091545  accuracy:  1.0  Time cost:  11.297125577926636
Epoch:  554  loss:  0.7646282210946083  accuracy:  0.9999861121177673  Time cost:  11.489701747894287
Epoch:  555  loss:  0.763864167034626  accuracy:  0.9999

Epoch:  626  loss:  0.7380023628473282  accuracy:  1.0  Time cost:  11.312575340270996
Epoch:  627  loss:  0.7269412577152252  accuracy:  1.0  Time cost:  11.536757469177246
Epoch:  628  loss:  0.7170542493462563  accuracy:  1.0  Time cost:  11.225510358810425
Epoch:  629  loss:  0.7078375115990638  accuracy:  1.0  Time cost:  11.129649877548218
Epoch:  630  loss:  0.6993855133652687  accuracy:  1.0  Time cost:  11.232449769973755
Epoch:  631  loss:  0.6917202696204185  accuracy:  0.9999930545687675  Time cost:  11.273803949356079
Epoch:  632  loss:  0.6847028329968452  accuracy:  1.0  Time cost:  11.434580326080322
Epoch:  633  loss:  0.6778605997562408  accuracy:  0.9999930545687675  Time cost:  11.080597162246704
Epoch:  634  loss:  0.6713814288377762  accuracy:  1.0  Time cost:  11.423563241958618
Epoch:  635  loss:  0.6651303544640541  accuracy:  1.0  Time cost:  11.417576313018799
Epoch:  636  loss:  0.6592438697814942  accuracy:  1.0  Time cost:  11.274325847625732
Epoch:  637  

Epoch:  713  loss:  0.48293874189257624  accuracy:  1.0  Time cost:  11.414988040924072
Epoch:  714  loss:  0.48172525092959406  accuracy:  0.9999930575489998  Time cost:  11.498636245727539
Epoch:  715  loss:  0.4804996713995934  accuracy:  1.0  Time cost:  11.445386171340942
Epoch:  716  loss:  0.47922908663749697  accuracy:  1.0  Time cost:  10.961040258407593
Epoch:  717  loss:  0.4779812604188919  accuracy:  1.0  Time cost:  11.335299491882324
Epoch:  718  loss:  0.4765664882957935  accuracy:  1.0  Time cost:  11.277356386184692
Epoch:  719  loss:  0.4752585582435131  accuracy:  1.0  Time cost:  11.4427330493927
Epoch:  720  loss:  0.47399466410279273  accuracy:  1.0  Time cost:  11.407306671142578
Epoch:  721  loss:  0.47284567505121233  accuracy:  1.0  Time cost:  11.491624593734741
Epoch:  722  loss:  0.47166754603385924  accuracy:  1.0  Time cost:  11.31515908241272
Epoch:  723  loss:  0.47057979479432105  accuracy:  1.0  Time cost:  11.158564805984497
Epoch:  724  loss:  0.46

Epoch:  801  loss:  0.4009224086999893  accuracy:  0.9999791666865349  Time cost:  11.46064019203186
Epoch:  802  loss:  0.4064404658973217  accuracy:  0.9999444514513016  Time cost:  11.207006216049194
Epoch:  803  loss:  0.4226831212639809  accuracy:  0.9998680561780929  Time cost:  11.207888841629028
Epoch:  804  loss:  0.426497495919466  accuracy:  0.9998541653156281  Time cost:  11.344192028045654
Epoch:  805  loss:  0.4704729713499546  accuracy:  0.9996319472789764  Time cost:  11.22529935836792
Epoch:  806  loss:  24.13776591718197  accuracy:  0.8398611232638359  Time cost:  11.653404712677002
Epoch:  807  loss:  10.17131096124649  accuracy:  0.9107361525297165  Time cost:  11.280617237091064
Epoch:  808  loss:  5.745305424928665  accuracy:  0.9513958677649498  Time cost:  11.350697994232178
Epoch:  809  loss:  3.2913307547569275  accuracy:  0.9775625258684159  Time cost:  11.346336841583252
Epoch:  810  loss:  2.0669576823711395  accuracy:  0.9904236271977425  Time cost:  11.38

Epoch:  891  loss:  0.3556245841085911  accuracy:  1.0  Time cost:  11.189001083374023
Epoch:  892  loss:  0.3546211116015911  accuracy:  1.0  Time cost:  11.317133903503418
Epoch:  893  loss:  0.3536445744335651  accuracy:  1.0  Time cost:  11.087568998336792
Epoch:  894  loss:  0.3526694841682911  accuracy:  1.0  Time cost:  11.258339405059814
Epoch:  895  loss:  0.35169595032930373  accuracy:  1.0  Time cost:  11.163751363754272
Epoch:  896  loss:  0.3507525369524956  accuracy:  1.0  Time cost:  11.15533709526062
Epoch:  897  loss:  0.3498383104801178  accuracy:  1.0  Time cost:  11.3710298538208
Epoch:  898  loss:  0.3489571936428547  accuracy:  1.0  Time cost:  11.567574501037598
Epoch:  899  loss:  0.3483177602291107  accuracy:  1.0  Time cost:  11.391560792922974
Epoch:  900  loss:  0.34981354251503943  accuracy:  0.9999930575489998  Time cost:  11.117319345474243
Epoch  900  is done. Saving the model ...
Epoch:  901  loss:  0.3520865745842457  accuracy:  0.9999722212553024  Tim

Epoch:  975  loss:  0.3868561886250973  accuracy:  1.0  Time cost:  11.249455213546753
Epoch:  976  loss:  0.3798186704516411  accuracy:  1.0  Time cost:  11.153075218200684
Epoch:  977  loss:  0.37367745861411095  accuracy:  1.0  Time cost:  11.175544023513794
Epoch:  978  loss:  0.36820119693875314  accuracy:  1.0  Time cost:  11.179431915283203
Epoch:  979  loss:  0.36326286718249323  accuracy:  1.0  Time cost:  11.326072931289673
Epoch:  980  loss:  0.35879027023911475  accuracy:  1.0  Time cost:  11.1874520778656
Epoch:  981  loss:  0.35471118465065954  accuracy:  1.0  Time cost:  11.441522121429443
Epoch:  982  loss:  0.35096682980656624  accuracy:  1.0  Time cost:  11.520817041397095
Epoch:  983  loss:  0.34750352129340173  accuracy:  1.0  Time cost:  11.154169082641602
Epoch:  984  loss:  0.3442887492477894  accuracy:  1.0  Time cost:  11.119759321212769
Epoch:  985  loss:  0.3413091726601124  accuracy:  1.0  Time cost:  11.176517248153687
Epoch:  986  loss:  0.338523855060339 

Epoch:  1064  loss:  0.2638943076133728  accuracy:  1.0  Time cost:  11.644572257995605
Epoch:  1065  loss:  0.2634118732064962  accuracy:  1.0  Time cost:  11.351595401763916
Epoch:  1066  loss:  0.2629224516451359  accuracy:  1.0  Time cost:  11.207430124282837
Epoch:  1067  loss:  0.2624463677406311  accuracy:  1.0  Time cost:  11.113624572753906
Epoch:  1068  loss:  0.261964288726449  accuracy:  1.0  Time cost:  11.339890956878662
Epoch:  1069  loss:  0.2614953961223364  accuracy:  1.0  Time cost:  11.501258850097656
Epoch:  1070  loss:  0.26101820841431617  accuracy:  1.0  Time cost:  11.596021175384521
Epoch:  1071  loss:  0.2605559717863798  accuracy:  1.0  Time cost:  11.306095838546753
Epoch:  1072  loss:  0.2600863929837942  accuracy:  1.0  Time cost:  11.476927757263184
Epoch:  1073  loss:  0.25963200628757477  accuracy:  1.0  Time cost:  11.297392845153809
Epoch:  1074  loss:  0.25916868522763253  accuracy:  1.0  Time cost:  11.336559534072876
Epoch:  1075  loss:  0.2587205

Epoch:  1152  loss:  0.23017035461962224  accuracy:  1.0  Time cost:  11.22416377067566
Epoch:  1153  loss:  0.2298598326742649  accuracy:  1.0  Time cost:  11.132009267807007
Epoch:  1154  loss:  0.22954909466207027  accuracy:  1.0  Time cost:  11.352656364440918
Epoch:  1155  loss:  0.22924202382564546  accuracy:  1.0  Time cost:  11.150780200958252
Epoch:  1156  loss:  0.2289319109171629  accuracy:  1.0  Time cost:  11.134042739868164
Epoch:  1157  loss:  0.22862989120185376  accuracy:  1.0  Time cost:  11.321591854095459
Epoch:  1158  loss:  0.22831957861781121  accuracy:  1.0  Time cost:  11.242326736450195
Epoch:  1159  loss:  0.2280202552676201  accuracy:  1.0  Time cost:  11.213241338729858
Epoch:  1160  loss:  0.22770866118371486  accuracy:  1.0  Time cost:  11.402744770050049
Epoch:  1161  loss:  0.2274121817201376  accuracy:  1.0  Time cost:  11.361698150634766
Epoch:  1162  loss:  0.22709839306771756  accuracy:  1.0  Time cost:  11.345197439193726
Epoch:  1163  loss:  0.226

Epoch:  1240  loss:  0.20613686367869377  accuracy:  1.0  Time cost:  11.18151068687439
Epoch:  1241  loss:  0.20587800443172455  accuracy:  1.0  Time cost:  11.401054620742798
Epoch:  1242  loss:  0.20563308112323284  accuracy:  1.0  Time cost:  11.401770114898682
Epoch:  1243  loss:  0.2053846050053835  accuracy:  1.0  Time cost:  11.346813440322876
Epoch:  1244  loss:  0.20514977499842643  accuracy:  1.0  Time cost:  11.494219303131104
Epoch:  1245  loss:  0.20490427687764168  accuracy:  1.0  Time cost:  11.43490743637085
Epoch:  1246  loss:  0.20467232093214988  accuracy:  1.0  Time cost:  11.375746726989746
Epoch:  1247  loss:  0.20443523079156875  accuracy:  1.0  Time cost:  11.397153615951538
Epoch:  1248  loss:  0.20421020835638046  accuracy:  1.0  Time cost:  11.514979839324951
Epoch:  1249  loss:  0.20397300347685815  accuracy:  1.0  Time cost:  11.557866096496582
Epoch:  1250  loss:  0.2037486147135496  accuracy:  1.0  Time cost:  11.446372985839844
Epoch:  1251  loss:  0.20

Epoch:  1328  loss:  0.187117987498641  accuracy:  1.0  Time cost:  11.421359539031982
Epoch:  1329  loss:  0.18692852146923541  accuracy:  1.0  Time cost:  11.000552415847778
Epoch:  1330  loss:  0.18672543168067932  accuracy:  1.0  Time cost:  11.286333560943604
Epoch:  1331  loss:  0.1865343250334263  accuracy:  1.0  Time cost:  11.258042335510254
Epoch:  1332  loss:  0.18634050861001014  accuracy:  1.0  Time cost:  11.377825260162354
Epoch:  1333  loss:  0.1861615192145109  accuracy:  1.0  Time cost:  11.75465202331543
Epoch:  1334  loss:  0.18597186915576458  accuracy:  1.0  Time cost:  12.56294298171997
Epoch:  1335  loss:  0.18578815199434756  accuracy:  1.0  Time cost:  12.659972667694092
Epoch:  1336  loss:  0.1855973556637764  accuracy:  1.0  Time cost:  12.698182106018066
Epoch:  1337  loss:  0.18540979996323587  accuracy:  1.0  Time cost:  12.6948983669281
Epoch:  1338  loss:  0.18522572480142116  accuracy:  1.0  Time cost:  12.72257661819458
Epoch:  1339  loss:  0.18504253

Epoch:  1412  loss:  0.23200639374554158  accuracy:  1.0  Time cost:  11.202764511108398
Epoch:  1413  loss:  0.2303909007459879  accuracy:  1.0  Time cost:  11.277398586273193
Epoch:  1414  loss:  0.2288014031946659  accuracy:  1.0  Time cost:  11.365798950195312
Epoch:  1415  loss:  0.22729178406298162  accuracy:  1.0  Time cost:  11.381251811981201
Epoch:  1416  loss:  0.22583226077258586  accuracy:  1.0  Time cost:  12.462479829788208
Epoch:  1417  loss:  0.22444156259298326  accuracy:  1.0  Time cost:  12.602348327636719
Epoch:  1418  loss:  0.22309618890285493  accuracy:  1.0  Time cost:  12.608394145965576
Epoch:  1419  loss:  0.22179938219487666  accuracy:  1.0  Time cost:  12.714116334915161
Epoch:  1420  loss:  0.22051727958023548  accuracy:  1.0  Time cost:  12.638622283935547
Epoch:  1421  loss:  0.21928985081613064  accuracy:  1.0  Time cost:  12.676924467086792
Epoch:  1422  loss:  0.21810617335140706  accuracy:  1.0  Time cost:  12.7319016456604
Epoch:  1423  loss:  0.21



# Test the Model

In [51]:
def generate_model():
    batch_feats = tf.placeholder(tf.float32, [1, n_image, feature_dimension])
    batch_salient_feats = tf.placeholder(tf.float32, [1, n_image, feature_dimension])
    sent_state = sent_LSTM.zero_state(batch_size=1, dtype=tf.float32)
    paragraph = []
    for i in range(n_image):
        feats = batch_feats[:,i:i+1,:]
        salient_feats = batch_salient_feats[:,i:i+1,:]
        
        feats = tf.reshape(feats, [1, -1])
        salient_feats = tf.reshape(salient_feats, [1, -1])
        
        with tf.variable_scope('dense'):
            project_vec1 = tf.nn.relu(tf.matmul(feats, regionPooling_W1) + regionPooling_b1)
            project_vec2 = tf.nn.relu(tf.matmul(salient_feats, regionPooling_W2) + regionPooling_b2)
            #project_vec = tf.nn.tanh(tf.matmul(feats, regionPooling_W) + regionPooling_b)
            project_vec = tf.concat([project_vec1, project_vec2], 1)
        
        with tf.variable_scope('image_context'):
            sent_output, sent_state = sent_LSTM(project_vec, sent_state)
        with tf.name_scope('fc1'):
            hidden1 = tf.nn.relu( tf.matmul(sent_output, fc1_W) + fc1_b)
        with tf.name_scope('fc2'):
            sent_topic_vec = tf.nn.relu( tf.matmul(hidden1, fc2_W) + fc2_b)


        state = tf.nn.rnn_cell.LSTMStateTuple(sent_topic_vec[:, 0:512], sent_topic_vec[:, 512:])
        
        word_state = (state, state)
        
        current_embed = tf.nn.embedding_lookup(W_embeddings, tf.zeros([1], dtype=tf.int64))
        word_output, init_word_state = word_LSTM_cell(current_embed, word_state)
        logit_words = tf.nn.xw_plus_b(word_output, embed_word_W, embed_word_b)
        #flatten_logit_words = tf.reshape(logit_words, [-1])
    
        prediction = tf.nn.softmax(logit_words)
        flatten_prediction = tf.reshape(prediction, [-1])
    
        top_k_index = tf.nn.top_k(flatten_prediction, beam_width)
    
        k_sequences = []
        normalize_tensor = tf.constant(10, dtype=tf.float32)
        
        for i in range(beam_width):
            candidate = [[top_k_index[1][i]], tf.log(top_k_index[0][i])]
            k_sequences.append(candidate)
    
        generated_sent = []

        for j in range(1, N_max_word):
            if j > 0:
                tf.get_variable_scope().reuse_variables()
            
            temp_sequences = []
            candidate_words_list = []
            candidate_prob_list = []
            
            for sequence in k_sequences:
                prior_words = sequence[0]
                prior_probability = sequence[1]
                k = 0

                for word_index in prior_words:
                
                    current_embed = tf.nn.embedding_lookup(W_embeddings, word_index)
                    current_embed = tf.expand_dims(current_embed, 0)
                    if k == 0:
                        word_output, word_state = word_LSTM_cell(current_embed, init_word_state)
                        k = 1
                    else:
                        word_output, word_state = word_LSTM_cell(current_embed, word_state)
            
                logit_words = tf.nn.xw_plus_b(word_output, embed_word_W, embed_word_b)
                #flatten_logit_words = tf.reshape(logit_words, [-1])
            
                prediction = tf.nn.softmax(logit_words)
                flatten_prediction = tf.reshape(prediction, [-1])
            
                top_k_index = tf.nn.top_k(flatten_prediction, beam_width)
            
            
                for i in range(beam_width):
                    candidate_words = []
                    for word_index in prior_words:
                        candidate_words.append(word_index)
                
                    candidate_words.append(top_k_index[1][i])
                    candidate_prob = tf.add(prior_probability, tf.log(top_k_index[0][i]))
                
                    candidate_words_list.append(candidate_words)
                    candidate_prob_list.append(candidate_prob)
                
        
        
            top_k_prob_index = tf.nn.top_k(candidate_prob_list, beam_width)
            sorted_sequences = []
        
            for i in range(beam_width):
                word_list = tf.gather(candidate_words_list, top_k_prob_index[1][i])
                prob = tf.gather(candidate_prob_list, top_k_prob_index[1][i])
                word_list = tf.unstack(word_list)
                sorted_sequences.append([word_list, prob])
        
            k_sequences = sorted_sequences
        
        for s_sequence in k_sequences:
            paragraph.append(s_sequence)   
        #paragraph.append(generated_sent)

    return batch_feats, batch_salient_feats, paragraph
    

In [70]:
def generate_model():
    batch_feats = tf.placeholder(tf.float32, [1, n_image, feature_dimension])
    batch_salient_feats = tf.placeholder(tf.float32, [1, n_image, feature_dimension])
    sent_state = sent_LSTM.zero_state(batch_size=1, dtype=tf.float32)
    
    paragraph = []
    for i in range(n_image):
        feats = batch_feats[:,i:i+1,:]
        salient_feats = batch_salient_feats[:,i:i+1,:]
        
        feats = tf.reshape(feats, [1, -1])
        salient_feats = tf.reshape(salient_feats, [1, -1])
        
        with tf.variable_scope('dense'):
            project_vec1 = tf.nn.relu(tf.matmul(feats, regionPooling_W1) + regionPooling_b1)
            project_vec2 = tf.nn.relu(tf.matmul(salient_feats, regionPooling_W2) + regionPooling_b2)
            project_vec = tf.concat([project_vec1, project_vec2], 1)
        
        with tf.variable_scope('image_context'):
            sent_output, sent_state = sent_LSTM(project_vec, sent_state)
        with tf.name_scope('fc1'):
            hidden1 = tf.nn.relu( tf.matmul(sent_output, fc1_W) + fc1_b)
        with tf.name_scope('fc2'):
            sent_topic_vec = tf.nn.relu( tf.matmul(hidden1, fc2_W) + fc2_b)

        state = tf.nn.rnn_cell.LSTMStateTuple(sent_topic_vec[:, 0:512], sent_topic_vec[:, 512:])
        word_state = (state, state)

        #image_embedding = tf.matmul(feats[i:i+1, :], img_embedding) + img_embedding_bias

        #word_output, word_state = word_LSTM_cell(image_embedding, word_state)
        #current_embed = tf.nn.embedding_lookup(W_embeddings, tf.zeros([1], dtype=tf.int64))
        generated_sent = []

        for j in range(0, N_max_word):
            if j>0:
                tf.get_variable_scope().reuse_variables()
            if j == 0:
                current_embed = tf.nn.embedding_lookup(W_embeddings, tf.zeros([1], dtype=tf.int64))
            with tf.variable_scope('word'):
                word_output, word_state = word_LSTM_cell(current_embed, word_state)

            logit_words = tf.nn.xw_plus_b(word_output, embed_word_W, embed_word_b)
            #prediction = tf.nn.softmax(logit_words)
            max_prob_index = tf.argmax(logit_words, 1)[0]

            generated_sent.append(max_prob_index)
            with tf.device('/cpu:0'):
                current_embed = tf.nn.embedding_lookup(W_embeddings, max_prob_index)
                current_embed = tf.expand_dims(current_embed, 0)
        
        paragraph.append(generated_sent)

    return batch_feats, batch_salient_feats, paragraph

In [56]:
def generate_perplexity_model():
    batch_feats = tf.placeholder(tf.float32, [1, n_image, feature_dimension])
    batch_salient_feats = tf.placeholder(tf.float32, [1, n_image, feature_dimension])
    batch_caption = tf.placeholder(tf.int32, [1, n_image, N_max_word + 1])
    
    sent_state = sent_LSTM.zero_state(batch_size=1, dtype=tf.float32)
    
    paragraph = []
    paragraph_probabilities = []
    
    for i in range(n_image):
        feats = batch_feats[:,i:i+1,:]
        salient_feats = batch_salient_feats[:,i:i+1,:]
        
        feats = tf.reshape(feats, [1, -1])
        salient_feats = tf.reshape(salient_feats, [1, -1])
        
        caption = batch_caption[:,i:i+1,:]
        caption = tf.reshape(caption, [1, -1])
        
        with tf.variable_scope('dense'):
            project_vec1 = tf.nn.relu(tf.matmul(feats, regionPooling_W1) + regionPooling_b1)
            project_vec2 = tf.nn.relu(tf.matmul(salient_feats, regionPooling_W2) + regionPooling_b2)
            project_vec = tf.concat([project_vec1, project_vec2], 1)
        
        with tf.variable_scope('image_context'):
            sent_output, sent_state = sent_LSTM(project_vec, sent_state)
        with tf.name_scope('fc1'):
            hidden1 = tf.nn.relu( tf.matmul(sent_output, fc1_W) + fc1_b)
        with tf.name_scope('fc2'):
            sent_topic_vec = tf.nn.relu( tf.matmul(hidden1, fc2_W) + fc2_b)

        state = tf.nn.rnn_cell.LSTMStateTuple(sent_topic_vec[:, 0:512], sent_topic_vec[:, 512:])
        word_state = (state, state)

        #image_embedding = tf.matmul(feats[i:i+1, :], img_embedding) + img_embedding_bias

        #word_output, word_state = word_LSTM_cell(image_embedding, word_state)
        #current_embed = tf.nn.embedding_lookup(W_embeddings, tf.zeros([1], dtype=tf.int64))
        generated_sent = []
        probabilities = []
        for j in range(0, N_max_word):
            if j>0:
                tf.get_variable_scope().reuse_variables()
            if j == 0:
                current_embed = tf.nn.embedding_lookup(W_embeddings, caption[:,j])
            with tf.variable_scope('word'):
                word_output, word_state = word_LSTM_cell(current_embed, word_state)

            logit_words = tf.nn.xw_plus_b(word_output, embed_word_W, embed_word_b)
            prediction = tf.nn.softmax(logit_words)
            max_prob_index = tf.argmax(prediction, 1)[0]

            generated_sent.append(max_prob_index)
            probabilities.append(prediction)
            with tf.device('/cpu:0'):
                current_embed = tf.nn.embedding_lookup(W_embeddings, caption[:,j])
                #current_embed = tf.expand_dims(current_embed, 0)
        
        paragraph.append(generated_sent)
        paragraph_probabilities.append(probabilities)

    return batch_feats, batch_salient_feats, batch_caption, paragraph, paragraph_probabilities

# Test Data

In [57]:
def perplexity(descriptions, probabilities):
    
    idx = 0
    story_probability = 0
    N = 0
    for description in descriptions:
        caption_matrix = get_caption_matrix(description)
        j = 0
        for word_id in caption_matrix[1:]:
            probability_arr = probabilities[idx][j][0]
            probability = probability_arr[word_id]
            j +=1
            
            story_probability += math.log(probability,2)
            N +=1
        
        idx +=1
        
    story_probability /= N
    perplexity = math.pow(2,-story_probability)
    print("perplexity",perplexity)
    return perplexity

In [58]:
import random
def get_randomtext():
    sent = ""
    for i in range(N_max_word):
        word = random.randint(4,vocab_size-1)
        text = idx2word[word]
        sent = sent + text + " "
    sent = sent.strip()
    return sent

In [59]:
def format_sentance(generated_sentence):
        each_sent = []
        for sent in generated_sentence:
            for word_indices in sent:
                #print(word_indices)
                #print(word_indices)
                #for word_index in word_indices:
                    #print(idx2word[word_index])
                each_sent.append(idx2word[word_indices])
            
        current_sent = ''
        for each_word in each_sent:
            current_sent += each_word + ' '
        
        current_sent = current_sent.replace('<eos> ', '')
        current_sent = current_sent.replace('<pad> ', '')
        current_sent = current_sent + '.'
        current_sent = current_sent.replace(' .', '.')
        current_sent = current_sent.replace(' ,', ',')
        return current_sent
    

In [61]:

def test():
    #tf_feats, tf_generated_sent, tf_k_top = generate_model()
    start_time = time.time()

    model_path = '../model_batch_v2/base_model_v2-1500'

    #tf_feats, tf_salient_feats, tf_generated_sent = generate_model()
    tf_feats, tf_salient_feats, tf_caption, tf_generated_sent, tf_generated_probabilities = generate_perplexity_model()
    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    saver.restore(sess, model_path)
    
    test_fd = open('random_Base_HRNN_results_1500.txt', 'w')
    output_sentences = dict()
    part = 0
    for key, value in train_story_features.items():
        if part == 50:
            break
        part +=1
        features = []
        salient_features = []
        photo_ids = []
        descriptions = []
        caption_matrix = []
        for i in range(n_image):
            #description = value[i]['description']
            description = get_randomtext()
            org_features = value[i]['features']
            org_features = org_features.flatten()
            
            photo_id = value[i]['photo_id']
            
            #_salient_features = salient_validation_image_features[photo_id]
            _salient_features = train_salient_image_features[photo_id]
            _salient_features = _salient_features.flatten()
            
            _caption_matrix = get_caption_matrix(description)
            #_caption_matrix = get_randomtext()
            
            #_features = np.concatenate([org_features, _salient_features])
            features.append(org_features)
            salient_features.append(_salient_features)
            caption_matrix.append(_caption_matrix)
            
            photo_ids.append(photo_id)
            descriptions.append(description)
            
        features = np.array(features)
        features = np.asarray(np.reshape(features, (1, n_image, -1)))
        salient_features = np.array(salient_features)
        salient_features = np.asarray(np.reshape(salient_features, (1, n_image, -1)))
        caption_matrix = np.array(caption_matrix)
        caption_matrix = np.asarray(np.reshape(caption_matrix, (1, n_image, -1)))
        
        test_fd.write("story: " + key + '\n')
        test_fd.write("photo_ids: ")
        for ids in photo_ids:
            test_fd.write(ids + " ")
        test_fd.write("\n")
            
        
        each_paragraph = []
        current_paragraph = ""
        
        #generated_sentence_indexes = sess.run([tf_generated_sent], feed_dict={
        #    tf_feats: features,
        #    tf_salient_feats: salient_features
        #})
        
        generated_sentence_indexes, probabilites = sess.run([tf_generated_sent, tf_generated_probabilities], feed_dict={
            tf_feats: features,
            tf_salient_feats: salient_features,
            tf_caption: caption_matrix
        })
            
        
        for sentence_list in generated_sentence_indexes:
            each_sent = []
            for word_index in sentence_list:
                each_sent.append(idx2word[word_index])
            each_paragraph.append(each_sent)
            
        current_sent = ''
        for each_sent in each_paragraph:
            for each_word in each_sent:
                current_sent += each_word + ' '
                
            current_sent = current_sent.replace('<eos> ', '')
            current_sent = current_sent.replace('<pad> ', '')
            current_sent = current_sent + '.'
            current_sent = current_sent.replace(' .', '.')
            current_sent = current_sent.replace(' ,', ',')
            current_sent += '\n'
            
            
        test_fd.write(" generated text: " + current_sent + '\n')
        test_fd.write(" original text: " + '\n')
        for description in descriptions:
            test_fd.write(description + '\n')
        test_fd.write("\n")
        per = perplexity(descriptions, probabilites)
        test_fd.write("perplexity: " + str(per) +'\n')
        test_fd.write("\n")
        
        print(current_sent)
        print(descriptions)
        output_sentences[key] = current_sent
        
    test_fd.close()
    print ("Time cost: " + str(time.time()-start_time))
    return output_sentences

In [62]:
output = test()

INFO:tensorflow:Restoring parameters from ../model_batch_v2/base_model_v2-1500
perplexity 82991671.47123319
our landmark in in wa wa about to to to the in in a a <unk> <unk>.
so we decided to to to to to to the the the the day to to to.
to see the the of of the a a a the the <unk> male i are.
and it magnificent a a a a life day day.
last picture of it beauty beauty could could capture it forever sky i <unk>.

['causing impressed smaller known running spotted graffiti festival asian detail bigger between putt back elaborate set knowing town happened heading organization viewing celebration reminded itself myself curious fortune built castle', 'while marathon split ocean couple party sushi laughed returned tradition goodbye goofing reception course feast laying your flame image visiting step brought shooting congratulate display waiting pleased naked sudden daughter', 'littlest cry water interested asian bad changing photographer morning excellent gift strip court once loving volunteer w

perplexity 1364262970.4413407
the the our a a <unk> <unk> <unk> <unk> <unk> <unk> <unk>.
our year a a a a baby is <unk> male <unk>.
we rented a a to to to to <unk> <unk> <unk> to to to.
our <unk> daughter did to to to a to <unk> <unk>.
the loving the the life in u u <unk> of of the <unk> <unk>.

['melt full plane fan sell stuffed setting # promise heading patron king stand god trek donation swing night rpg snapped successful congratulate gang dive written chess mean vegetable wanted figured', 'newly charm stronger sporting pole wake community father seems challenge mandate entertainment so style colorful skyline tram rodeo genius construction gear probably heritage edge something people stop produce very lip', 'full hillside finishing furnished chair purple change aisle path girlfriend checked got during heading lose daddy decorating guest receive blast girl club curious public fell serving thanked spring laughed cake', 'haunted couple perfectly arrives father cloudy shiny available pi

perplexity 159784250.05101034
the the a to to to the <unk> <unk> <unk> <unk>.
first came the the the the the the the the the the <unk> <unk> <unk>.
then came the the the the the the the the the went to <unk>.
some of a the the member on there on on make their <unk> <unk>.
next in line came flag team the.

['gon hearing either firework vendor cow wild candle single gut involved screen jack cookout deep slow ear helped cause test tea magnificent between boardwalk normal breakfast fear spot love proud', 'look talking pit volleyball took one flame somewhere flying fabulous confessed route call groom in done feel unlike reminded will dead spy backyard you underway made rooting fund sign fish', 'split stuck ask laugh snowdon previous happening yellow course panel unlike drove pond behold tip seen entertainment into done naked experience toured comfort toast when absolutely ordinary who skilled root', 'lady spends grandmother boot fascinating past alright entranced school unique toast eating 

perplexity 156236260.2287305
the the the of the <unk> a <unk> <unk>.
a the the a on they bigger sky flower it they.
the the the more more <unk> park good park to.
soon multiple firework were launched to create <unk> <unk> the <unk> to to to.
the the the the the the massive of firework the <unk> <unk> <unk> to <unk>.

['ever topic asian subway chica hang trying shuttle number wired worked feed real float price peace artisan batmitzva student obstacle fascinating stop quick use ride putting rested woke personal smile', 'disappoint bride cute called cemetery opened initial flying instruction capital table museum kiki crazy happier entertain pole to pathway entrance celebrated might mission patch statue breathtaking fellow next newly challenge', 'tourist sound happening program energy juggler round day being wear picked happier remember opponent playing because coworkers worth square anyone my sale gasp gloomy president makeup opened nearly strip food', 'wildlife horse spends road fabulous

perplexity 413107301.5286729
it wa wa the a a a a a a begin <unk> <unk>.
patron wa very very the the the the the <unk> <unk> a.
female wa <unk> of male after after had too many many beverage the the.
there wa something in the the the people their to their to their <unk> <unk> <unk>.
the played a a a a of of with d' d' out go the the.

['kayak more safely earned alive already slow see river making choice knowing vehicle slowly song desert fish co wonder morning jumped sitting moving easy vibrant grandmother perfectly vibrant riding role', 'messing warming themed heart political break star cut catch chalk slowly sunset again rollercoaster feed luc day police sort gloomy weird & others birthday wonderful slice maintenance centerpiece stolen ruin', 'artist strange hot base weather bug mother reading comfort bicycle coin want cheesecake gloomy skiing mad surrounding nephew with helicopter shot trooper skiing entrance tank remain felt refreshing opportunity manager', 'stuffed between real mi

perplexity 504628951.872855
i decided to to with my my my the the the <unk> <unk> <unk> <unk> to to to <unk>.
we stopped at a a to to to <unk> <unk> <unk> <unk> to to to.
across the a a a a a a a time to to night.
when we arrived a a a a with with chicken for <unk> <unk> <unk> to to.
a firework show could in in in the at at.

['amazed became meet older bubble woman autograph site health dock attended above stand third street leading lady funnel inviting nut covering prison fellow sort monkey written animal pretty rollercoaster right', 'begun preparing see happened loudly select mitzvah courtyard bag pull any telling shot grandpa dog indoor mimic themselves vegetable giant okay and board laid agenda scene first polar sparkling fascinating', 'kind beyond sadly square friday think alright japanese spy businessman portion train wild zoo wedding speak cruise sparkling expect should jeep instantly thing helping kids performing historic nut dive going', 'maintenance picnic chess enjoyment tog

perplexity 1126264426.2255316
we had family over for <unk> <unk> <unk> the the the <unk>.
the colored with <unk> <unk> <unk> <unk> <unk>.
some of of the the their the <unk>.
i i of into into into into a a a a.
here is what i really a a a a the the <unk> <unk> <unk> <unk> <unk>.

['boss chair opening time screaming grilled season round blue cotton afar her wa camping badly congratulated massive silly dinner neighborhood race wa concert passing fund standing fellow painting funnel longer', 'patriot birthday wa show money ear realized intricate exchanged lay happening photograph structure welcome aunt speech young fast warmth stop blast time cupcake staying chinese rounded at goer charm journey', 'sell tradition leader stadium known photograph traveling scared which pizza neighbor funnel badge thin opera scenery point writer nighttime actor kiss helicopter cloth fancy excited ceo awe catch green riding', 'male viewed see music nursery inspiring anyways expensive grilled attending creek ce

perplexity 819944245.617853
i went to the the the a a a a a a to at i i i i.
mom and and we we we we we we we we talk they.
i walked into an an <unk> to to a a their their.
my said my my my <unk> in my wa wa want to to to.
i did n't a because wa so so so wa wa so wearing for.

['path wow patriot cream earlier heat gon tasty contestant series year cute pumpkin type dry enjoys warming finale smoke exciting nearby musician want magnificent taking sporting landed finish mommy although', 'blew recognized police cause wear army learned equipment taking toy panel ancient popular talked because colette done wait snow squirrel rally pagoda spotted japan singer vampire red song temple hotel', 'array hot problem pretty shape topped car world rally decoration woke darker money luckily receives skyline however hill independence always competition eagerly zoo danger happy hotel man skyline guess wedding', 'you porch hard taking carnival motorcycle mascot moving funny anyway llama definitely explodin

In [46]:
output

{'40470': "the old car wa great design \n.we took the new one together to dinner and wa out \n.but said what wa going to be the next thing <unk> i could n't believe them and did \n.the food wa out of food and looked for food \n.after the food wa fun <unk> it wa time to go and me the local food to have lunch next day \n.",
 '40471': 'today wa an annual birthday at the annual bar bar \n.they were so excited for this new new new new cake \n.this wa a nice view he wa about this old time <unk> he had many interesting on his car \n.i had to take this picture in this for picture today <unk> i had to <unk> \n.in the end to the beach <unk> what it all day \n.',
 '40472': 'today wa an annual birthday at the annual bar bar \n.they were so excited for this new new new new cake \n.this wa a nice view he wa about this old time <unk> he had many interesting on his car \n.i had to take this picture in this for picture today <unk> i had to <unk> \n.in the end to the beach <unk> what it all day \n.',
 '

In [97]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /u/spa-d2/grad/mna245/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True