In [1]:
import time
import os
import tensorflow as tf
import json
import pickle
import copy
import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
import nltk
from nltk.corpus import wordnet
from nltk.corpus import words
from spellchecker import SpellChecker
from nltk.corpus import stopwords 
import math

In [3]:
train_story_seq = json.load(open('../train.story-in-sequence.json'))

In [4]:
train_description_seq = json.load(open('../train.description-in-isolation.json'))

In [5]:
train_image_features = pickle.load(open('../img_feats_train_key_value_all_fc2', 'rb'))

In [6]:
def get_story(annotations):
    story = dict()
    annotations_len = len(annotations)
    previous_story_id = annotations[0][0]['story_id']
    story[previous_story_id] = list()
    for i in range(annotations_len):
        for j in range(len(annotations[i])):
            if previous_story_id != annotations[i][j]['story_id']:
                previous_story_id = annotations[i][j]['story_id']
                story[previous_story_id] = list()
                
            image_des = {'photo_id':annotations[i][j]['photo_flickr_id'], 
                         'description' :annotations[i][j]['text']}
            story[previous_story_id].append(image_des)
            
    return story

In [7]:
story = get_story(train_story_seq['annotations'])

In [8]:
def get_standalone_description(annotations):
    descriptions = dict()
    for items in annotations:
        for item in items:
            descriptions[item['photo_flickr_id']] = item['original_text']
    return descriptions

In [9]:
standalone_description = get_standalone_description(train_description_seq['annotations'])

In [10]:
english_dictionary = dict.fromkeys(words.words(), None)
g_spellChecker = SpellChecker()

In [11]:
def is_english_word(word):
    try:
        x = english_dictionary[word]
        return True
    except KeyError:
        return False

In [12]:
def preprocess_story_sentence(sent):
    #print("sent",sent)
    sent = sent.lower()
    sent = sent.strip()
    sent = sent.replace(',', '')
    sent = sent.replace('.', '')
    sent = sent.replace('"', '')
    sent = sent.replace('[', '')
    sent = sent.replace(']', '')
    sent = sent.replace('?', '')
    sent = sent.replace('!', '')
    sent = sent.replace(':', '')
    sent = sent.replace(';', '')
    sent = sent.replace('-', ' ')
    sent = sent.strip()
    
    tokens = sent.split(" ")
    #print("token", tokens)
    words = []
    for token in tokens:
        syn = wordnet.synsets(token)
        if is_english_word(token) or len(syn) > 0:
            words.append(token)
        elif token != "":
            c_token = g_spellChecker.correction(token)
            words.append(c_token)
            #c_syn = wordnet.synsets(c_token)
            #if is_english_word(c_token) or len(c_syn) > 0:
            #    words.append(c_token)
    
    new_sent = ""
    for word in words:
        new_sent = new_sent + " "+ word
    
    new_sent = new_sent.strip()
    return new_sent


In [13]:
def preprocess_standalone_description(description):
    print(description)
    new_sent = preprocess_story_sentence(description)
    words = new_sent.split(" ")
    stop_words = set(stopwords.words('english')) 
    filtered_sentence = [w for w in words if not w in stop_words]
    preprocessed_sent = ""
    for word in filtered_sentence:
        preprocessed_sent = preprocessed_sent + " " + word
    preprocessed_sent.strip()
    print("--",preprocessed_sent)
    return preprocessed_sent

In [14]:
def story_image_features(story_data, description_data, image_features):
    image_features_len = len(image_features)
    story_features = dict()
    idx = 0
    for key, value in story_data.items():
        print(idx)
        idx +=1
        
        exist = True
        for i in range(len(value)):
            photo_id = value[i]['photo_id']
            if photo_id in image_features and photo_id in description_data:
                value[i]['features'] = image_features[photo_id]
                value[i]['description'] = preprocess_story_sentence(value[i]['description'])
                value[i]['stndalone_description'] = preprocess_standalone_description(description_data[photo_id])
            else:
                exist = False
                break
        if exist == True:
            story_features[key] = value
            
    return story_features

In [15]:
#copy_story = copy.deepcopy(story)
#train_data = story_image_features(copy_story, standalone_description, train_image_features)

In [16]:
#pickle.dump(train_data, open("train_data", 'wb'))

In [17]:
train_data = pickle.load(open('train_data', 'rb'))

In [18]:
story

{'30355': [{'description': 'our landmark tree in town was about to be destroyed and cleared for a new mall .',
   'photo_id': '2627795780'},
  {'description': 'so we decided to take the day to go out and enjoy its beauty .',
   'photo_id': '2626979987'},
  {'description': 'to see the final glimpse of the roots , extending out into the depths of the hill .',
   'photo_id': '2626982337'},
  {'description': 'and its magnificent trunk , larger than life itself .',
   'photo_id': '2626983575'},
  {'description': 'one last picture of its beauty so we could capture it forever .',
   'photo_id': '2626985925'}],
 '30356': [{'description': 'we found this tree when we were walking in a nearby town .',
   'photo_id': '2701863545'},
  {'description': 'it turns out it is a popular attraction here .',
   'photo_id': '2626977325'},
  {'description': 'the tree is very unusual , with its roots exposed .',
   'photo_id': '2627795780'},
  {'description': 'the trunk was really wide , as much as 12 feet !',

In [19]:
train_data

{'30356': [{'description': 'we found this tree when we were walking in a nearby town',
   'features': array([[1.1157882, 0.       , 0.       , ..., 0.       , 0.       ,
           0.       ]], dtype=float32),
   'photo_id': '2701863545',
   'stndalone_description': ' tree long dated branches'},
  {'description': 'it turns out it is a popular attraction here',
   'features': array([[0.90474606, 1.5048395 , 0.        , ..., 0.        , 1.9958305 ,
           0.        ]], dtype=float32),
   'photo_id': '2626977325',
   'stndalone_description': ' plaque stand surround died leaves tree'},
  {'description': 'the tree is very unusual a with its roots exposed',
   'features': array([[0.        , 0.        , 0.89841056, ..., 0.        , 0.        ,
           0.        ]], dtype=float32),
   'photo_id': '2627795780',
   'stndalone_description': ' huge tree sits outside several large roots stemming trunk'},
  {'description': 'the trunk was really wide a as much as 12 feet',
   'features': arra

# Creating train and validation data

In [49]:
import itertools
n = len(train_data)//1
i = iter(train_data.items())

In [50]:
splitted_data = dict(itertools.islice(i,n))

In [51]:
train_data_len = (5*len(splitted_data))//6

In [52]:
start = iter(splitted_data.items())

In [53]:
splitted_train_data = dict(itertools.islice(start, train_data_len))

In [54]:
len(splitted_train_data)

22420

In [55]:
splitted_validation_data = dict(start)

In [56]:
len(splitted_validation_data)

4485

In [68]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=3):
    # borrowed this function from NeuralTalk
    print ('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, ))

    word_counts = {}
    nsents = 0

    for sent in sentence_iterator:
        nsents += 1
        tmp_sent = sent.lower().split(' ')
        if '' in tmp_sent:
            tmp_sent.remove('')
        
        # bigram
        #for w1,w2 in bigrams(tmp_sent):
        #    w = w1 + " "+ w2
        #    if w !='':
        #        word_counts[w] = word_counts.get(w, 0) + 1

        for w in tmp_sent:
            if w !='':
                word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print ('filtered words from %d to %d' % (len(word_counts), len(vocab)))

    #ixtoword = {}
    #ixtoword[0] = '<bos>'
    #ixtoword[1] = '<eos>'
    #ixtoword[2] = '<pad>'
    #ixtoword[3] = '<unk>'

    #wordtoix = {}
    #wordtoix['<bos>'] = 0
    #wordtoix['<eos>'] = 1
    #wordtoix['<pad>'] = 2
    #wordtoix['<unk>'] = 3

    #for idx, w in enumerate(vocab):
    #    wordtoix[w] = idx + 4
    #    ixtoword[idx+4] = w

    #word_counts['<eos>'] = nsents
    #word_counts['<bos>'] = nsents
    #word_counts['<pad>'] = nsents
    #word_counts['<unk>'] = nsents
    ixtoword = {}
    ixtoword[0] = 'bos'
    ixtoword[1] = 'eos'
    ixtoword[2] = 'pad'
    ixtoword[3] = 'unk'

    wordtoix = {}
    wordtoix['bos'] = 0
    wordtoix['eos'] = 1
    wordtoix['pad'] = 2
    wordtoix['unk'] = 3

    for idx, w in enumerate(vocab):
        wordtoix[w] = idx + 4
        ixtoword[idx+4] = w

    word_counts['eos'] = nsents
    word_counts['bos'] = nsents
    word_counts['pad'] = nsents
    word_counts['unk'] = nsents

    return wordtoix, ixtoword

In [69]:
def get_all_sentences(data):
    all_sentences = []
    for key, value in data.items():
        length = len(value)
        for i in range(length):
            sent = value[i]['description']
            #sent = preprocess_sentence(sent)
            all_sentences.append(sent)
            
    return all_sentences


In [70]:
sentences = get_all_sentences(splitted_data)

In [71]:
sentences

['we found this tree when we were walking in a nearby town',
 'it turns out it is a popular attraction here',
 'the tree is very unusual a with its roots exposed',
 'the trunk was really wide a as much as 12 feet',
 'you can see how big these roots are a a pretty amazing',
 'they went to the botanic gardens specifically to see the large tree',
 'there was an informational sign posted near it',
 'the roots were huge and spread out over the ground',
 'the trunk was incredibly thick and rigid',
 'the large roots were almost as thick as the trunk',
 'we went to see the largest tree in the country',
 'the plack under it had some interesting information',
 'there was a good view of the water from the tree as well',
 'i was dwarfed by the tree is size',
 'in the end we went over to the visitor center',
 'our trip to location last year was filled with beauty a sculptures could be found everywhere we went',
 'the architecture was old and interesting',
 'since it was spring a beautiful flowers l

In [72]:
word2idx, idx2word = preProBuildWordVocab(sentences, 2)

preprocessing word counts and creating vocab based on word count threshold 2
filtered words from 20927 to 13608


In [73]:
train_data

{'30356': [{'description': 'we found this tree when we were walking in a nearby town',
   'features': array([[1.1157882, 0.       , 0.       , ..., 0.       , 0.       ,
           0.       ]], dtype=float32),
   'photo_id': '2701863545',
   'stndalone_description': ' tree long dated branches'},
  {'description': 'it turns out it is a popular attraction here',
   'features': array([[0.90474606, 1.5048395 , 0.        , ..., 0.        , 1.9958305 ,
           0.        ]], dtype=float32),
   'photo_id': '2626977325',
   'stndalone_description': ' plaque stand surround died leaves tree'},
  {'description': 'the tree is very unusual a with its roots exposed',
   'features': array([[0.        , 0.        , 0.89841056, ..., 0.        , 0.        ,
           0.        ]], dtype=float32),
   'photo_id': '2627795780',
   'stndalone_description': ' huge tree sits outside several large roots stemming trunk'},
  {'description': 'the trunk was really wide a as much as 12 feet',
   'features': arra

# Load Pretrained Glove Word embedding

In [74]:
def load_pretrained_word_vectors(word_embedding_filepath, word_dict):
    count = 0
    glove_vocab = []
    glove_embd=[]
    embedding_dict = {}
    file = open(word_embedding_filepath,'r',encoding='UTF-8')
    embeddings_tmp=[]
    for line in file.readlines():
        row = line.strip().split(' ')
        vocab_word = row[0]
        glove_vocab.append(vocab_word)
        embed_vector = [float(i) for i in row[1:]] # convert to list of float
        embedding_dict[vocab_word]=embed_vector
    file.close()
    embedding_dim = len(embed_vector)
    for key in word_dict:
        if key in glove_vocab:
            embeddings_tmp.append(embedding_dict[key])
        else:
            print(key)
            count += 1
            rand_num = np.random.uniform(low=-0.2, high=0.2,size=embedding_dim)
            embeddings_tmp.append(rand_num)
    embedding = np.asarray(embeddings_tmp)
    print(count)
    return embedding

In [75]:
word_embedding = load_pretrained_word_vectors("glove.6B.300d.txt", word2idx)

granpa
grandads
preforming
firelit
cheesing
carousal
ingvold
werent
barbequing
r/2
150000
sunsetting
bicylists
bicycler
piãƒæ’ã‚â±ata
techsmith
photobomb
bicyclers
knelle
photobombing
quinceneria
hashbrowns
103e9
gramgram
tree-house
took-off
waled
4541
200000
snipper
arachnophobes
heeeyyyy
buggys
cafãƒæ’ã‚â©
swiming
scenary
sautãƒæ’ã‚â©ed
ziplining
trolly
snowglobes
photobombed
ãƒâ¢ã¢â€šâ¬ã¢â‚¬å“
wabadodo
raybans
18004
portapotties
wifes
5995
3995
aargh
buffest
rotties
joyed
unicyclists
endlifecom
mackembe
grumpiest
bellringers
58


In [76]:
per_image_vector_dimension = 4096
feature_dimension = per_image_vector_dimension
#state_size = 512
n_words = len(word2idx)
wordRNN_lstm_dim = 512
word_embed_dim = 300
batch_size = 24
learning_rate = 0.01
n_epochs = 1001
#project_dim = 1024
image_fc_dim = 1024
sentRNN_lstm_dim = 1024
#sentRNN_FC_dim = 1024
N_max_word = 30
n_image = 5
beam_width = 10

description_lstm_dim = 512
img_lstm_dim = 300
vocab_size = len(word2idx)

In [77]:
def get_caption_matrix(img_sent):
    img_captions_matrix = np.ones([ N_max_word+1], dtype=np.int32) * 1
    img_captions_matrix[0] = 0
    #img_sent = preprocess_sentence(img_sent)
    idx = 1
    for k, word in enumerate(img_sent.lower().split(' ')):
        if idx == (N_max_word + 1):
            break
        if word in word2idx:
            img_captions_matrix[ idx] = word2idx[word]
        else:
            img_captions_matrix[ idx] = word2idx['unk']
        idx +=1
    if idx < (N_max_word +1):
        img_captions_matrix[ idx] = 1
        idx += 1
    if idx < (N_max_word +1):
        for i in range(idx, N_max_word +1):
            img_captions_matrix[i] = 2
    
    return img_captions_matrix

In [78]:
def get_caption_mask(caption_matrix):
    captions_matrix_mask = np.zeros([ N_max_word+1], dtype=np.int32)
    for i in range(N_max_word + 1):
        if caption_matrix[i] != 2:
            captions_matrix_mask[i] = 1
            
    return captions_matrix_mask
    

In [79]:
def get_sequence_length(sent):
    sent = sent.strip()
    words = sent.lower().split(' ')
    return len(words) + 1

In [80]:
word2idx

{'bos': 0,
 'eos': 1,
 'pad': 2324,
 'unk': 3,
 'we': 4,
 'found': 5,
 'this': 6,
 'tree': 7,
 'when': 8,
 'were': 9,
 'walking': 10,
 'in': 11,
 'a': 12,
 'nearby': 13,
 'town': 14,
 'it': 15,
 'turns': 16,
 'out': 17,
 'is': 18,
 'popular': 19,
 'attraction': 20,
 'here': 21,
 'the': 22,
 'very': 23,
 'unusual': 24,
 'with': 25,
 'its': 26,
 'roots': 27,
 'exposed': 28,
 'trunk': 29,
 'was': 30,
 'really': 31,
 'wide': 32,
 'as': 33,
 'much': 34,
 '12': 35,
 'feet': 36,
 'you': 37,
 'can': 38,
 'see': 39,
 'how': 40,
 'big': 41,
 'these': 42,
 'are': 43,
 'pretty': 44,
 'amazing': 45,
 'they': 46,
 'went': 47,
 'to': 48,
 'gardens': 49,
 'specifically': 50,
 'large': 51,
 'there': 52,
 'an': 53,
 'informational': 54,
 'sign': 55,
 'posted': 56,
 'near': 57,
 'huge': 58,
 'and': 59,
 'spread': 60,
 'over': 61,
 'ground': 62,
 'incredibly': 63,
 'thick': 64,
 'rigid': 65,
 'almost': 66,
 'largest': 67,
 'country': 68,
 'under': 69,
 'had': 70,
 'some': 71,
 'interesting': 72,
 'informa

In [81]:
regionPooling_W1 = tf.Variable(tf.random_uniform([feature_dimension, image_fc_dim], -0.1, 0.1), name = 'regionPooling_w1')
regionPooling_b1 = tf.Variable(tf.zeros([image_fc_dim]))

#regionPooling_W2 = tf.Variable(tf.random_uniform([feature_dimension, image_fc_dim], -0.1, 0.1), name = 'regionPooling_w2')
#regionPooling_b2 = tf.Variable(tf.zeros([image_fc_dim]))

img_embedding = tf.Variable(tf.random_uniform([feature_dimension, wordRNN_lstm_dim], -0.1, 0.1), name = 'img_embedding')
img_embedding_bias = tf.Variable(tf.zeros([wordRNN_lstm_dim])) 

#fc1_W = tf.Variable(tf.random_uniform([sentRNN_lstm_dim, sentRNN_FC_dim], -0.1, 0.1), name = 'fc1_w')
#fc1_b = tf.Variable(tf.zeros(sentRNN_FC_dim), name = 'fc1_b')
#fc2_W = tf.Variable(tf.random_uniform([sentRNN_FC_dim, 1024], -0.1, 0.1), name = 'fc2_w')
#fc2_b = tf.Variable(tf.zeros(1024), name = 'fc2_b')

embed_word_W = tf.Variable(tf.random_uniform([wordRNN_lstm_dim, n_words], -0.1,0.1), name = 'embed_word_w')
embed_word_b = tf.Variable(tf.zeros([n_words]), name = 'embed_word_b')

#W_embeddings = tf.Variable(tf.random_uniform([n_words, word_embed_dim], -0.1, 0.1), name = 'wemb')
W_embeddings = tf.Variable(tf.constant(0.0, shape=[n_words, word_embed_dim]), trainable=False, name="wemb")

In [82]:
def lstm_cell(size):
    return tf.contrib.rnn.BasicLSTMCell(size, reuse=tf.AUTO_REUSE)

In [83]:
description_lstm = tf.contrib.rnn.MultiRNNCell([lstm_cell(description_lstm_dim) for _ in range(2)])

In [84]:
img_lstm = tf.contrib.rnn.MultiRNNCell([lstm_cell(img_lstm_dim) for _ in range(2)])

In [85]:
word_lstm= tf.contrib.rnn.MultiRNNCell([lstm_cell(wordRNN_lstm_dim) for _ in range(2)])

In [86]:
description_context_lstm = tf.contrib.rnn.MultiRNNCell([lstm_cell(wordRNN_lstm_dim) for _ in range(2)])


In [87]:
def build_pretrained_word_embeddings():
    embedding_placeholder = tf.placeholder(tf.float32, [n_words, word_embed_dim])
    embedding_init = W_embeddings.assign(embedding_placeholder)
    return embedding_init, embedding_placeholder

In [88]:
def build_model():
    loss_word = 0.0
    acc_sent = 0.0
    batch_feats = tf.placeholder(tf.float32, [batch_size, n_image, feature_dimension])
    batch_caption = tf.placeholder(tf.int32, [batch_size, n_image, N_max_word + 1])
    batch_description = tf.placeholder(tf.int32, [batch_size, n_image, N_max_word + 1])
    batch_sequence_length = tf.placeholder(tf.int32, [batch_size, n_image, 1])
    batch_captions_masks = tf.placeholder(tf.float32, [batch_size, n_image, N_max_word+1])
    #eos = tf.constant(2, dtype=tf.int32)
    desc_state = description_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)
    img_state = img_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)
    word_state = word_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)
    
    for i in range(n_image):
        # topic RNN
        feats = batch_feats[:,i:i+1,:]
        caption = batch_caption[:,i:i+1,:]
        description = batch_description[:,i:i+1,:]
        sequence_length = batch_sequence_length[:,i:i+1,:]
        caption_masks = batch_captions_masks[:,i:i+1,:]
        
        feats = tf.reshape(feats, [batch_size, -1])
        caption = tf.reshape(caption, [batch_size, -1])
        description = tf.reshape(caption, [batch_size, -1])
        sequence_length = tf.reshape(sequence_length, [-1])
        caption_masks = tf.reshape(caption_masks, [batch_size, -1])
        
        description_embed = tf.nn.embedding_lookup(W_embeddings, description)
        
        with tf.variable_scope('description_context'):
            desc_outputs, desc_state = tf.nn.dynamic_rnn(description_lstm, inputs = description_embed, sequence_length= sequence_length,
                                                         initial_state=desc_state,
                                                        dtype=tf.float32)
        
        image_vec = tf.nn.relu(tf.matmul(feats, regionPooling_W1) + regionPooling_b1)
        with tf.variable_scope('image_context'):
            img_output, img_state = img_lstm(image_vec, img_state)
        
        decoder_input = img_state[1].h
        with tf.variable_scope('description_encoder'):
            desc_encoder_output, desc_encoder_state = description_context_lstm(desc_state[1].h, word_state)
        
        state = tf.nn.rnn_cell.LSTMStateTuple(desc_encoder_state[1].h, desc_encoder_state[1].h)
        word_state = (state, state)
        acc_word = 0.0
        
        for j in range(0, N_max_word+1):
            if j > 0:
                tf.get_variable_scope().reuse_variables()                
            
            if j == 0:
                current_embed = decoder_input
            else:
                current_embed = tf.nn.embedding_lookup(W_embeddings, caption[:, j-1])
            
            with tf.variable_scope('word'):
                word_output, word_state = word_lstm(current_embed, word_state)
            indices = caption[:,j]
            
            onehot_labels = tf.one_hot(indices, depth = n_words)
            logit_words = tf.nn.xw_plus_b(word_output[:], embed_word_W, embed_word_b)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = logit_words, labels = onehot_labels)
            cross_entropy = cross_entropy * caption_masks[:, j]

            prediction = tf.nn.softmax(logit_words)
            correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(onehot_labels, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            acc_word += accuracy
            loss_wordRNN = tf.reduce_sum(cross_entropy) / batch_size
            loss_word += loss_wordRNN
        
        acc_word = acc_word/(N_max_word+1)
        acc_sent += acc_word
    
    acc_sent /= n_image
    loss_word = loss_word/n_image
    return batch_feats, batch_caption, batch_captions_masks, batch_description, batch_sequence_length, loss_word, acc_sent


In [89]:
#build_model()

In [90]:
model_path = './models_batch/'

In [91]:
def train_model():
    tf_feats, tf_caption, tf_caption_masks, tf_description, tf_desc_seq_length, tf_loss, tf_acc = build_model()
    tf_embedding_init, tf_embedding_placeholder = build_pretrained_word_embeddings()
    sess = tf.InteractiveSession()
    
    saver = tf.train.Saver(max_to_keep=100, write_version=1)
    
    #pre_model_path = '../model_batch_v2/base_model_v2-1500'

    #saver.restore(sess, pre_model_path)
    
    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(tf_loss)
    #train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    
    tf.global_variables_initializer().run()
    sess.run([tf_embedding_init], feed_dict={
                            tf_embedding_placeholder : word_embedding
                        })
    
    data_length = len(splitted_train_data)
    for epoch in range(0, n_epochs):
            #train_story_features_keys = list(train_story_features.items())
            #np.random.shuffle(train_story_features_keys)
            #shuffled_train_story_features = dict(train_story_features_keys)
        
            loss = 0;
            acc = 0;
            total_pass = 0
            start_time = time.time()
            n_item = 0
            features = []
            #salient_features = []
            caption_matrix = []
            standalone_caption_matrix = []
            standalone_caption_seq_length = []
            caption_masks = []
            passed_data = 0
            for key, value in splitted_train_data.items():
                if n_item < batch_size:
                    n_item += 1
                    for i in range(n_image):
                        description = value[i]['description']
                        standalone_description = value[i]['stndalone_description']
                        original_features = value[i]['features']
                        seq_length = get_sequence_length(standalone_description)
                        original_features = original_features.flatten()
                        
                        #_salient_features = train_salient_image_features[value[i]['photo_id']]
                        #_salient_features = _salient_features.flatten()
                        
                        #_features = np.concatenate([original_features, _salient_features])
                        
                        _caption_matrix = get_caption_matrix(description)
                        _caption_mask = get_caption_mask(_caption_matrix)
                        _standalone_description_matrix = get_caption_matrix(standalone_description)
                        features.append(original_features)
                        #salient_features.append(_salient_features)
                        caption_matrix.append(_caption_matrix)
                        standalone_caption_matrix.append(_standalone_description_matrix)
                        standalone_caption_seq_length.append(seq_length)
                        caption_masks.append(_caption_mask)
                    
                if n_item == batch_size:    
                    features = np.array(features)
                    #salient_features = np.array(salient_features)
                    caption_matrix = np.array(caption_matrix)
                    standalone_caption_matrix = np.array(standalone_caption_matrix)
                    standalone_caption_seq_length = np.array(standalone_caption_seq_length)
                    
                    features = np.asarray(np.reshape(features,(batch_size, n_image, -1)))
                    caption_masks = np.array(caption_masks)
                    #salient_features = np.asarray(np.reshape(salient_features,(batch_size, n_image, -1)))
                    
                    caption_matrix = np.asarray(np.reshape(caption_matrix, (batch_size, n_image, -1)))
                    standalone_caption_matrix = np.asarray(np.reshape(standalone_caption_matrix, (batch_size, n_image, -1)))
                    standalone_caption_seq_length = np.asarray(np.reshape(standalone_caption_seq_length, 
                                                                          (batch_size, n_image, -1)))
                    caption_masks = np.asarray(np.reshape(caption_masks, (batch_size, n_image, -1)))
                    #captions_masks = np.zeros( (caption_matrix.shape[0], caption_matrix.shape[1], caption_matrix.shape[2]) )
                    
                    _, loss_word, acc_word = sess.run(
                        [train_op, tf_loss, tf_acc],
                        feed_dict={
                            tf_feats: features,
                            tf_caption: caption_matrix,
                            tf_caption_masks:caption_masks,
                            tf_description: standalone_caption_matrix, 
                            tf_desc_seq_length: standalone_caption_seq_length
                        })

                    loss = loss + loss_word
                    acc = acc+acc_word
                    total_pass = total_pass + 1
                    n_item = 0
                    features = []
                    salient_features = []
                    caption_matrix = []
                    standalone_caption_matrix = []
                    standalone_caption_seq_length = []
                    caption_masks = []
                    # running information
            
            loss = loss/total_pass
            acc = acc/total_pass
            
            print ('Epoch: ', epoch, ' loss: ', loss, " accuracy: ", acc, ' Time cost: ', str((time.time() - start_time)))
            
            if np.mod(epoch, 50) == 0:
                lossFileName = "result/base_model_v2_loss"+str(epoch)+".txt"
                test_fd = open(lossFileName, 'w')
                test_fd.write('Epoch: '+ str(epoch) + ' loss: ' + str(loss)+ " accuracy: "+ str(acc)+"\n")
                test_fd.close()
                print ("Epoch ", epoch, " is done. Saving the model ...")
                saver.save(sess, os.path.join(model_path, 'mask_base_model_v2'), global_step=epoch)
    
    

In [None]:
train_model()

Epoch:  0  loss:  73.88367115813098  accuracy:  0.07083074492851085  Time cost:  738.592780828476
Epoch  0  is done. Saving the model ...
Epoch:  1  loss:  67.39690966473401  accuracy:  0.07465692939799143  Time cost:  732.4979486465454
Epoch:  2  loss:  65.73373013237048  accuracy:  0.0809482317537516  Time cost:  732.2590456008911
Epoch:  3  loss:  64.02773556923714  accuracy:  0.08837207774502051  Time cost:  732.6200723648071
Epoch:  4  loss:  62.43272736445229  accuracy:  0.09399853152177064  Time cost:  732.6246428489685
Epoch:  5  loss:  61.00336930481218  accuracy:  0.0981344019383894  Time cost:  732.6160655021667
Epoch:  6  loss:  59.84262761314115  accuracy:  0.10111182544388403  Time cost:  732.6410663127899
Epoch:  7  loss:  58.896631261266016  accuracy:  0.10350959401027293  Time cost:  732.5374803543091
Epoch:  8  loss:  58.112417029109196  accuracy:  0.10550816653882666  Time cost:  733.0070638656616
Epoch:  9  loss:  57.449177356041986  accuracy:  0.10715474329521875  

In [49]:
def generate_perplexity_model():
    batch_feats = tf.placeholder(tf.float32, [1, n_image, feature_dimension])
    batch_caption = tf.placeholder(tf.int32, [1, n_image, N_max_word + 1])
    batch_description = tf.placeholder(tf.int32, [1, n_image, N_max_word + 1])
    batch_sequence_length = tf.placeholder(tf.int32, [1, n_image, 1])
    
    desc_state = description_lstm.zero_state(batch_size=1, dtype=tf.float32)
    img_state = img_lstm.zero_state(batch_size=1, dtype=tf.float32)
    word_state = word_lstm.zero_state(batch_size=1, dtype=tf.float32)
    
    paragraph = []
    paragraph_probabilities = []
    
    for i in range(n_image):
        feats = batch_feats[:,i:i+1,:]
        caption = batch_caption[:,i:i+1,:]
        description = batch_description[:,i:i+1,:]
        sequence_length = batch_sequence_length[:,i:i+1,:]
        
        
        feats = tf.reshape(feats, [1, -1])
        caption = tf.reshape(caption, [1, -1])
        description = tf.reshape(caption, [1, -1])
        sequence_length = tf.reshape(sequence_length, [-1])
        
        description_embed = tf.nn.embedding_lookup(W_embeddings, description)
        
        with tf.variable_scope('description_context'):
            desc_outputs, desc_state = tf.nn.dynamic_rnn(description_lstm, inputs = description_embed, sequence_length= sequence_length,
                                                         initial_state=desc_state,
                                                        dtype=tf.float32)
        
        image_vec = tf.nn.relu(tf.matmul(feats, regionPooling_W1) + regionPooling_b1)
        with tf.variable_scope('image_context'):
            img_output, img_state = img_lstm(image_vec, img_state)
        
        decoder_input = img_state[1].h
        
        with tf.variable_scope('description_encoder'):
            desc_encoder_output, desc_encoder_state = description_context_lstm(desc_state[1].h, word_state)
        
        state = tf.nn.rnn_cell.LSTMStateTuple(desc_encoder_state[1].h, desc_encoder_state[1].h)
        word_state = (state, state)
        
        generated_sent = []
        probabilities = []
        for j in range(0, N_max_word):
            if j>0:
                tf.get_variable_scope().reuse_variables()
            if j == 0:
                current_embed = decoder_input
                
            with tf.variable_scope('word'):
                word_output, word_state = word_lstm(current_embed, word_state)

            logit_words = tf.nn.xw_plus_b(word_output, embed_word_W, embed_word_b)
            prediction = tf.nn.softmax(logit_words)
            max_prob_index = tf.argmax(prediction, 1)[0]

            generated_sent.append(max_prob_index)
            probabilities.append(prediction)
            with tf.device('/cpu:0'):
                current_embed = tf.nn.embedding_lookup(W_embeddings, caption[:,j])
                #current_embed = tf.expand_dims(current_embed, 0)
        
        paragraph.append(generated_sent)
        paragraph_probabilities.append(probabilities)

    return batch_feats, batch_caption, batch_description, batch_sequence_length, paragraph, paragraph_probabilities

In [50]:
def perplexity(descriptions, probabilities):
    
    idx = 0
    story_probability = 0
    N = 0
    for description in descriptions:
        caption_matrix = get_caption_matrix(description)
        j = 0
        for word_id in caption_matrix[1:]:
            probability_arr = probabilities[idx][j][0]
            probability = probability_arr[word_id]
            j +=1
            
            story_probability += math.log(probability,2)
            N +=1
        
        idx +=1
        
    story_probability /= N
    perplexity = math.pow(2,-story_probability)
    print("perplexity",perplexity)
    return perplexity

In [51]:
import random
def get_randomtext():
    sent = ""
    for i in range(N_max_word):
        word = random.randint(4,vocab_size-1)
        text = idx2word[word]
        sent = sent + text + " "
    sent = sent.strip()
    return sent

In [52]:
def format_sentance(generated_sentence):
        each_sent = []
        for sent in generated_sentence:
            for word_indices in sent:
                #print(word_indices)
                #print(word_indices)
                #for word_index in word_indices:
                    #print(idx2word[word_index])
                each_sent.append(idx2word[word_indices])
            
        current_sent = ''
        for each_word in each_sent:
            current_sent += each_word + ' '
        
        current_sent = current_sent.replace('eos', '')
        current_sent = current_sent.replace('pad', '')
        current_sent = current_sent + '.'
        current_sent = current_sent.replace(' .', '.')
        current_sent = current_sent.replace(' ,', ',')
        return current_sent
    

In [54]:

def test():
    start_time = time.time()
    model_path = 'models_batch/mask_base_model_v2-500'

    tf_feats, tf_caption, tf_description, tf_desc_seq_length, tf_generated_sent, tf_generated_probabilities = generate_perplexity_model()
    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    #saver = tf.train.import_meta_graph('../model_batch_v2/base_model_v2-1500.meta')
    saver.restore(sess, model_path)
    
    test_fd = open('HRNN_results_mask.txt', 'w')
    output_sentences = dict()
    part = 0
    for key, value in splitted_validation_data.items():
        if part == 50:
            break
        part +=1
        features = []
        photo_ids = []
        descriptions = []
        caption_matrix = []
        standalone_caption_matrix = []
        standalone_caption_seq_length = []
        for i in range(n_image):
            description = value[i]['description']
            standalone_description = value[i]['stndalone_description']
            #description = get_randomtext()
            org_features = value[i]['features']
            seq_length = get_sequence_length(standalone_description)
            
            org_features = org_features.flatten()
            
            photo_id = value[i]['photo_id']
            
            _caption_matrix = get_caption_matrix(description)
            _standalone_description_matrix = get_caption_matrix(standalone_description)
            
            #_features = np.concatenate([org_features, _salient_features])
            features.append(org_features)
            caption_matrix.append(_caption_matrix)
            standalone_caption_matrix.append(_standalone_description_matrix)
            standalone_caption_seq_length.append(seq_length)
            
            photo_ids.append(photo_id)
            descriptions.append(description)
            
        features = np.array(features)
        features = np.asarray(np.reshape(features, (1, n_image, -1)))
        caption_matrix = np.array(caption_matrix)
        caption_matrix = np.asarray(np.reshape(caption_matrix, (1, n_image, -1)))
        
        standalone_caption_matrix = np.array(standalone_caption_matrix)
        standalone_caption_seq_length = np.array(standalone_caption_seq_length)
        standalone_caption_matrix = np.asarray(np.reshape(standalone_caption_matrix, (1, n_image, -1)))
        standalone_caption_seq_length = np.asarray(np.reshape(standalone_caption_seq_length, 
                                                                          (1, n_image, -1)))
        
        test_fd.write("story: " + key + '\n')
        test_fd.write("photo_ids: ")
        for ids in photo_ids:
            test_fd.write(ids + " ")
        test_fd.write("\n")
            
        
        each_paragraph = []
        current_paragraph = ""
        
        
        generated_sentence_indexes, probabilites = sess.run([tf_generated_sent, tf_generated_probabilities], feed_dict={
            tf_feats: features,
            tf_caption: caption_matrix,
            tf_description: standalone_caption_matrix, 
            tf_desc_seq_length: standalone_caption_seq_length
        })
            
        #print(generated_sentence_indexes)
        #for paragraph in generated_sentence_indexes:
         #   for sentence_list in paragraph:
          #      for word_indices in sentence_list[0]:
           #         each_sent = []
            #        for word_index in word_indices:
             #           each_sent.append(idx2word[word_index])
              #      each_paragraph.append(each_sent)
        for sentence_list in generated_sentence_indexes:
            each_sent = []
            for word_index in sentence_list:
                each_sent.append(idx2word[word_index])
            each_paragraph.append(each_sent)
            
        current_sent = ''
        for each_sent in each_paragraph:
            for each_word in each_sent:
                current_sent += each_word + ' '
                
            current_sent = current_sent.replace('eos', '')
            current_sent = current_sent.replace('pad', '')
            current_sent = current_sent + '.'
            current_sent = current_sent.replace(' .', '.')
            current_sent = current_sent.replace(' ,', ',')
            current_sent += '\n'
            
            
        test_fd.write(" generated text: " + current_sent + '\n')
        test_fd.write(" original text: " + '\n')
        for description in descriptions:
            test_fd.write(description + '\n')
        test_fd.write("\n")
        per = perplexity(descriptions, probabilites)
        test_fd.write("perplexity: " + str(per) +'\n')
        test_fd.write("\n")
        
        print(current_sent)
        print(descriptions)
        output_sentences[key] = current_sent
        
    test_fd.close()
    print ("Time cost: " + str(time.time()-start_time))
    return output_sentences

In [55]:
output = test()

INFO:tensorflow:Restoring parameters from models_batch/mask_base_model_v2-500
perplexity 1738067454.8667233
bos bos family group was all  to  local    the the the bos bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos bos met rode to to unk the of    the the unk unk bos bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos we were kids were outside with that the first is a   the the by by by bos bos bos bos bos bos bos bos bos bos bos.
bos a a unk unk the was heads of around of their their long them them bos bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos after kids the with a few photo and   each by by by bos bos bos bos bos bos bos bos bos bos bos bos bos bos bos.

['a sunny day in location began at the glenlivet factory', 'we then set out to see some gaelic ruins', 'and the kids stood in awe of the beautiful highland views', 'even more beautiful views along the nearby streams', 'the day ended with a family horseback outing']
perplexity 95927493668.98123
bos bos was

perplexity 576629145.9871615
bos got bought a daughter to the weekend of  the   the with the bos bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos so bought to the the friend and and and nice and and    this this bos bos bos bos bos bos bos bos bos bos bos bos.
bos when unk unk for his for the one as  a  so bos bos bos bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos they saw at the beautiful building  went fun of entire from    end bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos the then some most before so fireworks the park  a  the first first they they they bos bos bos bos bos bos bos bos bos bos bos.

['i took my family to a holiday carnival at halloween', 'i clowned around getting my picture taken with a painted wooden model', 'my daughter posed with people in superhero costumes', 'we walked through a costume carnival and saw all the people', 'we finished the night by looking at awesomely carved pumpkins']
perplexity 3281582367.4923306
bos celebrating best

perplexity 8133953.666839764
bos i we we decided a way to location museum at  location the of of kid days went    a week his his bos bos bos bos.
bos i is of a field of a i i of it or or of to  to  to  bos bos bos bos bos bos bos bos bos.
bos ` eve a a statue statue unk thought it spent a a a a a the great spot but  the the the bos bos bos bos bos bos.
bos i i i brought to to a historical to it next is at unk unk of    we we bos bos bos bos bos bos bos bos.
bos next of find to way to to to relax and are to the at the at but bos bos bos bos bos bos bos bos bos bos bos bos.

['today a we took the train to a random station to see what kind of adventures we could find', 'it turns out a it was a pretty rural area a but it sure was pretty', 'we came to one little town a and we had fun driving around it for a bit', 'then a we went back to another station a but it turned out it was empty', 'so we started our long drive back to where we came from']
perplexity 3834032.5669613676
bos i took to go

perplexity 848164221.8806938
bos i is my little is my   have have the  the house in in in in in bos bos bos bos bos bos bos bos bos bos.
bos the and location unk a we us a picture a a with week male male bos bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos our of to a river to huge decided out at remember again   it it bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos here up are are to and this this their his bos bos bos bos bos bos bos bos bos bos bos bos bos bos bos bos bos bos bos.
bos we we a we we the which was very a have a such dog have me dog that   it with with bos bos bos bos bos bos.

['today is a day for skiing if we can just dig out the car', 'finally on the road and head for the slopes', 'just ahead are the lift a i cant wait to get on', 'mountain here i come', 'that was quite a run a it is time to refuel so i can hit the slopes again']
perplexity 9836971461.24617
bos i i had to a little male went a go up up mountains  unk   the  the bos bos bos bos bos bo

perplexity 5804584.53669686
bos i my my with female female he go for unk for for snacks unk and and  and and and bos bos bos bos bos bos bos bos bos.
bos the son had to little i not going first in his were in were      in in bos bos bos bos bos bos bos bos.
bos we is always nice a be as be this where a a  a a a the   it held wanted find for good would  but.
bos we we my after a visit we town is some a a a the can unk unk   the the unk a there as  have there for.
bos there at at and so so you of to unk a you is got unk unk for for it do  a a fun a    .

['snowed in again a no way to get the car out of the driveway today', 'my neighbor did a good job on their driveway but i think they had some help', "what is a mother to do to on a snowed in day to keep all the kids happy so they do n't fight a i got a bright idea", "clicked on 'youtube ' and found the recipe for homemade play dough a i have some green pool aid in the cabinets so green it will be", "came out pretty good a looks good enou