First, make sure you can inport all the required packages before proceeding any further.

In [51]:
import collections
import random
import bz2
import getopt
import sys
import os
import math
import numpy as np
from io import open 
import tensorflow as tf
from BatchReader import BatchReader

Caption files are extracted from MsCOCO using the provided API and compressed in bz2.
The visual features are extracted from the fc6 layer of the hybrid CNN (if you don't want/can extrat your own features, we will be happy to share ours!)

In [52]:
tr_captions_file = "mscoco/train2014.sentences.txt.bz2"
val_captions_file = "mscoco/val2014.sentences.txt.bz2"
tr_visual_embeddings_file = "visualembeddings/COCO_hybridCNN_fc6.dat.txt"
val_visual_embeddings_file = "visualembeddings/COCO_val2014_hybridCNN_fc6.dat.txt"
outpath = "Text2VisPredictions"
modelpath = "ModelParameters"

batch_size=100
use_dropout=False
keep_prob_val = 0.5 #only when use_dropout=True
valid_size=5000 
test_size =20000 
prob_visual_loss=0.5
hidden_size=1024
output_dim=4096 

l2factor=0.00000001

if not os.path.exists(outpath):
    os.makedirs(outpath)
if not os.path.exists(modelpath):
    os.makedirs(modelpath)

Init the BatchReader. This will instantiate one reader for the captions and other for the images.
Whenever a new batch is requested, the BatchReader object samples 'batch_size' images and returns it.

In [53]:
#The training file (captions and visual embeddings) are used as training data, whereas the validation file (captions and visual embeddings) is split into validation and test

print("Instantiating the batch-reader")
batch = BatchReader(tr_captions_file, val_captions_file, tr_visual_embeddings_file, val_visual_embeddings_file, batch_size=batch_size, random_caption_samples_from_image=1, valid_size=valid_size, test_size=test_size)

Instantiating the batch-reader
Reading captions file <mscoco/train2014.sentences.txt.bz2>


[Done] Read 82783 images-ids, 414113 captions 4339907 words, 5.00 captions/image, 10.48 words/caption
Reading captions file <mscoco/val2014.sentences.txt.bz2>


[Done] Read 123287 images-ids, 616767 captions 6462616 words, 5.00 captions/image, 10.48 words/caption
Building captions indexes


Vocabulary (min_word_occurrences>=5) has length 10358


In [54]:
#get the validation and test sets
valid_input_cap, valid_out_cap, valid_out_visual, inputCaptionOffset, _ , valid_img_labels = batch.getValidationSet() 
test_input_cap, _, _, test_caption_offsets, _, test_img_labels = batch.getTestSet()
print("[Done!]")

Reading the validation set [500 images]


Reading the test set [200 images]


[Done!]


In [55]:
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
# GRAPH
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------

graph=tf.Graph()
with graph.as_default():
  # Input/Output data.
  #-------------------------------------------------------
  caption_input  = tf.placeholder(tf.float32, shape=[batch.batch_size, batch.vocabulary_size])
  caption_output = tf.placeholder(tf.float32, shape=[batch.batch_size, batch.vocabulary_size])
  visual_embedding_output = tf.placeholder(tf.float32, shape=[batch.batch_size, output_dim])
  
  caption_validation_input = tf.constant(valid_input_cap, tf.float32, shape=[valid_size, batch.vocabulary_size])
  caption_validation_output = tf.constant(valid_out_cap, tf.float32, shape=[valid_size, batch.vocabulary_size])
  visual_validation_output = tf.constant([x.tolist() for x in valid_out_visual], tf.float32, shape=[valid_size, output_dim])
  
  caption_test_input = tf.constant(test_input_cap, tf.float32, shape=[test_size, batch.vocabulary_size])
  
  global_step = tf.placeholder(tf.float32) #training iteration    
  keep_prob =  tf.placeholder(tf.float32) #dropout keep-probability
  
  # Model parameters
  #-------------------------------------------------------
  #caption-embedding
  cap2vec_weights = tf.Variable(tf.truncated_normal([batch.vocabulary_size, hidden_size], stddev=1.0 / math.sqrt(hidden_size)), name="cap2vec_weights") 
  cap2vec_biases = tf.Variable(tf.zeros([hidden_size]), name="cap2vec_biases")
  
  #embedding-caption
  vec2cap_weights = tf.Variable(tf.truncated_normal([hidden_size, batch.vocabulary_size], stddev=1.0 / math.sqrt(batch.vocabulary_size)))
  vec2cap_biases = tf.Variable(tf.zeros([batch.vocabulary_size]))
  
  #embedding-visual
  cemb2vemb_weights = tf.Variable(tf.truncated_normal([hidden_size, output_dim], stddev=1.0 / math.sqrt(output_dim)), name="cemb2vemb_weights")
  cemb2vemb_biases = tf.Variable(tf.zeros([output_dim]), name="cemb2vemb_biases")
  
  # Add ops to save and restore all the variables.
  saver = tf.train.Saver({"cap2vec_weights":cap2vec_weights, "cap2vec_biases":cap2vec_biases, "cemb2vemb_weights":cemb2vemb_weights, "cemb2vemb_biases":cemb2vemb_biases})
  
  # NNet 
  #-------------------------------------------------------
  def drop(tensor):
	return tf.nn.dropout(tensor, keep_prob) if use_dropout else tensor

  def nnet(cap_input):
	caption_embeddings = tf.nn.relu(tf.matmul(cap_input, cap2vec_weights) + cap2vec_biases)
	caption_reconstruc = tf.nn.relu(tf.matmul(caption_embeddings, vec2cap_weights) + vec2cap_biases)
	visual_prediction  = tf.nn.relu(tf.matmul(drop(caption_embeddings), cemb2vemb_weights) + cemb2vemb_biases) #+ (tf.matmul(caption_input, raw2vemb_weights) + raw2vemb_biases)
	return caption_reconstruc, visual_prediction
	
  caption_reconstruc, visual_prediction = nnet(caption_input) 
  
  # Losses functions
  #-------------------------------------------------------
  l2loss = l2factor*(tf.nn.l2_loss(cemb2vemb_weights) + tf.nn.l2_loss(cemb2vemb_biases)) if l2factor>=0 else 0.0
  visual_loss = tf.reduce_mean(tf.square(visual_prediction - visual_embedding_output)) + l2loss
  caption_loss = tf.reduce_mean(tf.square((caption_output - caption_reconstruc)))
  loss = visual_loss+caption_loss
  
  # Optimizers
  #-------------------------------------------------------
  visual_optimizer = tf.train.AdamOptimizer().minimize(visual_loss)
  caption_optimizer = tf.train.AdamOptimizer().minimize(caption_loss)
  
  
  # Validation graph
  #-------------------------------------------------------
  caption_validation_reconstruc, visual_validation_prediction = nnet(caption_validation_input) 
  validation_caption_loss = tf.reduce_mean(tf.square((caption_validation_input - caption_validation_reconstruc)))
  validation_visual_loss = tf.reduce_mean(tf.square(visual_validation_prediction - visual_validation_output)) + l2loss
   
  # Test graph
  #-------------------------------------------------------
  caption_test_reconstruc, visual_test_prediction = nnet(caption_test_input) 
  
  print("Graph built!")

Graph built!


In [56]:
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
# TRAINING
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------

num_steps = 300001
valid_step=100
save_step=1000

def predictionsFileName(outpath):
	global step
	losstype = "_StochasticLoss"+str(prob_visual_loss) if prob_visual_loss < 1.0 else "_VisualLoss"
	visualfeatsfile = tr_visual_embeddings_file[tr_visual_embeddings_file.rfind('/')+1:]
	hidden_dim="_H"+str(hidden_size)
	drop_info= "_Drop"+str(keep_prob_val) if use_dropout else ""
	return outpath+"/Text2Vis"+hidden_dim+losstype+drop_info+"_"+visualfeatsfile
	
def saveResults(outpath):
  filename = predictionsFileName(outpath)
  with open(filename+".val", 'w') as val_file:
    val_file.write(u'It\tTrVL\tTrCL\tVaVL\tVaCL\n')
    for (t,tvl,tcl,vvl,vcl) in valid_values:
		val_file.write(u'%d\t%f\t%f\t%f\t%f\n' % (t,tvl,tcl,vvl,vcl))

best_valid_cap_loss, best_valid_visual_loss, last_saved_visual_loss = 1000,1000,1000
valid_values = []
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  loss_ave, l2_ave, tr_vl_ave, tr_cl_ave = 0, 0, 0 , 0
  
  for step in range(num_steps):
   
    input_captions, output_captions, output_visual,_,_,_ = batch.nextBatch()    
    fd = {caption_input : input_captions, caption_output : output_captions, visual_embedding_output : output_visual, global_step : step, keep_prob : keep_prob_val}
	
    to_optimize = visual_optimizer if random.random() < prob_visual_loss else caption_optimizer
	
    _, l, l2, cl, vl  = session.run([to_optimize, loss, l2loss, caption_loss, visual_loss], feed_dict=fd)
    
    l2_ave += l2 
    loss_ave += l
    tr_vl_ave += vl
    tr_cl_ave += cl
    
    if step % valid_step == 0:
		loss_ave /= valid_step
		tr_vl_ave /= valid_step
		tr_cl_ave /= valid_step
		va_vl_ave  = validation_visual_loss.eval({keep_prob : 1.0})
		va_cl_ave = validation_caption_loss.eval({keep_prob : 1.0})
		best_valid_cap_loss = min (best_valid_cap_loss, va_cl_ave)
		best_valid_visual_loss = min (best_valid_visual_loss, va_vl_ave)
		saveResults(outpath)
		if step % save_step == 0 and best_valid_visual_loss < last_saved_visual_loss:
			last_saved_visual_loss=best_valid_visual_loss
			model_file = predictionsFileName(modelpath)+".ckpt"
			save_path = saver.save(session, model_file)
			print("Model saved in file: %s" % save_path)
		print('Average loss at step %d: Loss=%f Training [vl=%f cl=%f l2=%f dr=%r]\tValidation[vl=%f (b=%f) cl=%f (b=%f)]' % (step, loss_ave, tr_vl_ave, tr_cl_ave, l2_ave, use_dropout, va_vl_ave, best_valid_visual_loss, va_cl_ave, best_valid_cap_loss))
		valid_values.append((step,tr_vl_ave,tr_cl_ave,va_vl_ave,va_cl_ave))
		loss_ave, l2_ave, tr_vl_ave, tr_cl_ave = 0, 0, 0, 0
  
  print("[Done!]")

Initialized


Model saved in file: ModelParameters/Text2Vis_H256_StochasticLoss0.5_COCO_hybridCNN_fc6.dat.txt.ckpt
Average loss at step 0: Loss=1.739591 Training [vl=1.739486 cl=0.000105 l2=0.000001 dr=False]	Validation[vl=17.431971 (b=17.431971) cl=0.001038 (b=0.001038)]


Average loss at step 10: Loss=17.626478 Training [vl=17.625590 cl=0.000888 l2=0.000010 dr=False]	Validation[vl=17.336010 (b=17.336010) cl=0.000818 (b=0.000818)]


Average loss at step 20: Loss=17.560132 Training [vl=17.559314 cl=0.000818 l2=0.000011 dr=False]	Validation[vl=17.178839 (b=17.178839) cl=0.000769 (b=0.000769)]


Average loss at step 30: Loss=16.772609 Training [vl=16.771820 cl=0.000790 l2=0.000013 dr=False]	Validation[vl=16.583117 (b=16.583117) cl=0.000753 (b=0.000753)]


Average loss at step 40: Loss=16.701923 Training [vl=16.701146 cl=0.000777 l2=0.000016 dr=False]	Validation[vl=16.183655 (b=16.183655) cl=0.000731 (b=0.000731)]


Average loss at step 50: Loss=15.844191 Training [vl=15.843412 cl=0.000778 l2=0.000022 dr=False]	Validation[vl=15.509085 (b=15.509085) cl=0.000727 (b=0.000727)]


Average loss at step 60: Loss=15.656494 Training [vl=15.655717 cl=0.000776 l2=0.000029 dr=False]	Validation[vl=15.067328 (b=15.067328) cl=0.000720 (b=0.000720)]


Average loss at step 70: Loss=15.130696 Training [vl=15.129933 cl=0.000762 l2=0.000037 dr=False]	Validation[vl=14.774522 (b=14.774522) cl=0.000708 (b=0.000708)]


Average loss at step 80: Loss=15.266150 Training [vl=15.265400 cl=0.000750 l2=0.000043 dr=False]	Validation[vl=14.737125 (b=14.737125) cl=0.000684 (b=0.000684)]


Average loss at step 90: Loss=14.393732 Training [vl=14.392989 cl=0.000743 l2=0.000048 dr=False]	Validation[vl=14.601666 (b=14.601666) cl=0.000674 (b=0.000674)]


Model saved in file: ModelParameters/Text2Vis_H256_StochasticLoss0.5_COCO_hybridCNN_fc6.dat.txt.ckpt
Average loss at step 100: Loss=14.664913 Training [vl=14.664188 cl=0.000726 l2=0.000051 dr=False]	Validation[vl=14.556973 (b=14.556973) cl=0.000667 (b=0.000667)]
[Done!]


In [60]:
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
# TEST
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------

filename = predictionsFileName(outpath)+".pred"

with tf.Session(graph=graph) as session:
  # Restore variables from disk if needed!
  print('Loading model parameters from %s' % model_file)
  saver.restore(session, model_file)
  
  print('Obtaining the predictions...')
  predictions = visual_test_prediction.eval({keep_prob : 1.0})
  
  print('Saving results in %s...' % filename)
  with open(filename, 'w') as pred_file:
    test_written=0
    for i in xrange(len(predictions)):
		pred_str = (' '.join(('%.3f' % x) for x in predictions[i])).replace(" 0.000", " 0")
		img_id= test_img_labels[i]
		cap_id= test_caption_offsets[i]
		cap_txt=batch.captions_orig[img_id][cap_id]
		pred_file.write("%s\t%d\t%s\t%s\n" % (img_id, cap_id, cap_txt, pred_str))
		test_written += 1
		if test_written % (test_size/20) == 0:
			print("...%d%% completed" % int(test_written*100.0/test_size))	
  
  print("[Done!]")

Loading model parameters from ModelParameters/Text2Vis_H256_StochasticLoss0.5_COCO_hybridCNN_fc6.dat.txt.ckpt


Obtaining the predictions...
Saving results in Text2VisPredictions/Text2Vis_H256_StochasticLoss0.5_COCO_hybridCNN_fc6.dat.txt.pred...
...5% completed
...10% completed
...15% completed
...20% completed
...25% completed


...30% completed
...35% completed
...40% completed
...45% completed
...50% completed
...55% completed
...60% completed
...65% completed
...70% completed
...75% completed
...80% completed


...85% completed
...90% completed
...95% completed
...100% completed
[Done!]
