In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ""
import tensorflow as tf
import pandas as pd
import numpy as np
import _pickle as cPickle

print("This notebook uses TensorFlow version {}".format(tf.__version__))

  from ._conv import register_converters as _register_converters


This notebook uses TensorFlow version 1.11.0


In [None]:
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# `with` closes the file handle as soon as the vocabulary is read.
with open('dataset/text/vocab.pkl', 'rb') as vocab_file:
    vocab = cPickle.load(vocab_file)
print('total {} vocabularies'.format(len(vocab)))

In [None]:
def count_vocab_occurance(vocab, df):
    """Count how many times each vocabulary token occurs in the captions.

    :args
        vocab - iterable of vocabulary tokens; every token starts at count 0
        df    - DataFrame with a 'caption' column of space-separated tokens
    :return
        dict mapping each token of `vocab` to its number of occurrences
    :raises
        KeyError if a caption contains a token that is not in `vocab`
    """
    voc_cnt = {v: 0 for v in vocab}
    # Iterate the column directly: `iterrows()` would build a whole Series
    # per row only to read a single field from it.
    for caption in df['caption']:
        for w in caption.split(' '):
            voc_cnt[w] += 1
    return voc_cnt


df_train = pd.read_csv(os.path.join('dataset', 'train.csv'))

print('count vocabulary occurances...')
voc_cnt = count_vocab_occurance(vocab, df_train)

# words appearing fewer than `thrhd` times are mapped to <RARE> later on
thrhd = 50
x = np.array(list(voc_cnt.values()))
# counting the entries above the threshold needs no sorting; the message
# reports the actual threshold instead of a hard-coded number
print('{} words appear >= {} times'.format(np.sum(x >= thrhd), thrhd))

In [None]:
def build_voc_mapping(voc_cnt, thrhd):
    """
    Build the two lookup tables between tokens and ids.

    voc_cnt: token => occurrence count
    thrhd:   tokens counted fewer than `thrhd` times share the id of <RARE>

    returns (enc_map, dec_map)
    enc_map: voc --encode--> id  (rare tokens map to the <RARE> id)
    dec_map: id --decode--> voc  (special and frequent tokens only)
    """
    encoder, decoder = {}, {}

    # the special tokens take the first ids, in this order
    for special in ['<ST>', '<ED>', '<RARE>']:
        next_id = len(decoder)
        encoder[special] = next_id
        decoder[next_id] = special

    for token, occurrences in voc_cnt.items():
        if occurrences < thrhd:
            # rare tokens all collapse onto the <RARE> id
            encoder[token] = encoder['<RARE>']
            continue
        # frequent tokens get the next free id
        next_id = len(decoder)
        encoder[token] = next_id
        decoder[next_id] = token

    return encoder, decoder


enc_map, dec_map = build_voc_mapping(voc_cnt, thrhd)
# save enc/decoding map to disk; `with` closes (and flushes) each file
# deterministically instead of leaving that to garbage collection
with open('dataset/text/enc_map.pkl', 'wb') as enc_file:
    cPickle.dump(enc_map, enc_file)
with open('dataset/text/dec_map.pkl', 'wb') as dec_file:
    cPickle.dump(dec_map, dec_file)

In [None]:
def caption_to_ids(enc_map, df):
    """Encode every caption as a list of ids wrapped in <ST> ... <ED>.

    :args
        enc_map - dict mapping token => id; must contain '<ST>' and '<ED>'
        df      - DataFrame with columns 'img_id' and 'caption', where
                  'caption' is a string of space-separated tokens
    :return
        DataFrame indexed by 'img_id' with one column 'caption' that holds
        the list of ids of each caption
    :raises
        KeyError if a caption contains a token missing from `enc_map`
    """
    img_ids, caps = [], []
    # Walk the two needed columns in lockstep: `iterrows()` would build a
    # full Series per row just to read these two fields.
    for img_id, caption in zip(df['img_id'], df['caption']):
        token_ids = [enc_map[tok] for tok in caption.split(' ')]
        img_ids.append(img_id)
        caps.append([enc_map['<ST>']] + token_ids + [enc_map['<ED>']])
    return pd.DataFrame({
              'img_id': img_ids,
              'caption': caps
            }).set_index(['img_id'])


# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('dataset/text/enc_map.pkl', 'rb') as enc_file:
    enc_map = cPickle.load(enc_file)
print('[transform captions into sequences of IDs]...')
df_proc = caption_to_ids(enc_map, df_train)
df_proc.to_csv('dataset/text/train_enc_cap.csv')

In [None]:
# a dataframe - 'img_id', 'caption' (captions are read back as strings)
df_cap = pd.read_csv('dataset/text/train_enc_cap.csv')
# `with` closes each pickle file right after it has been read
with open('dataset/text/enc_map.pkl', 'rb') as enc_file:
    enc_map = cPickle.load(enc_file)  # token => id
with open('dataset/text/dec_map.pkl', 'rb') as dec_file:
    dec_map = cPickle.load(dec_file)  # id => token
vocab_size = len(dec_map)


def decode(dec_map, ids):
    """Map a sequence of ids to tokens and join them with single spaces."""
    tokens = []
    for token_id in ids:
        tokens.append(dec_map[token_id])
    return ' '.join(tokens)


import ast

print('decoding the encoded captions back...\n')
for idx, row in df_cap.iloc[:8].iterrows():
    # Captions come back from the CSV as text such as "[0, 5, 1]".
    # ast.literal_eval replaces eval here: it parses the list literal
    # without executing whatever the file happens to contain.
    print('{}: {}'.format(
        idx, decode(dec_map, ast.literal_eval(row['caption']))))

In [None]:
def create_tfrecords(df_cap, img_df, filename, num_files=5):
    ''' create tfrecords for dataset

    Every caption of an image becomes one record that stores the image
    feature under 'img' and the caption ids under 'caption'. Shard i
    (1-based) is written to '<filename>-<i>.tfrecords'; the last shard also
    takes the images left over by the integer division.

    :args
        df_cap    - DataFrame with columns 'img_id' and 'caption', where
                    'caption' is the text form of a list of ids
        img_df    - DataFrame with columns 'img_id' and 'img'
                    (presumably the 256-d image feature - TODO confirm)
        filename  - path prefix of the generated shard files
        num_files - number of shard files to create
    '''
    import ast  # local import: only needed to parse the caption literals

    def _float_feature(value):
        return tf.train.Feature(
            float_list=tf.train.FloatList(value=value))

    def _int64_feature(value):
        return tf.train.Feature(
            int64_list=tf.train.Int64List(value=value))

    num_images = img_df.shape[0]
    num_records_per_file = num_images // num_files

    # Group the captions by image once, instead of filtering the whole of
    # `df_cap` again for every image (which made the loop quadratic).
    captions_by_img = {
        img_id: group['caption'].tolist()
        for img_id, group in df_cap.groupby('img_id', sort=False)
    }

    total_count = 0

    print("create training dataset....")
    for i in range(num_files):
        count = 0

        # start point (inclusive)
        st = i * num_records_per_file
        # end point (exclusive); the last shard runs to the end of img_df
        ed = (i + 1) * num_records_per_file if i != num_files - 1 else num_images
        shard = img_df.iloc[st:ed]

        # the context manager closes the writer even if a record fails
        with tf.python_io.TFRecordWriter(
                filename + '-' + str(i + 1) + '.tfrecords') as writer:
            for img_id, img_representation in zip(shard['img_id'],
                                                  shard['img']):
                # each image has some captions describing it; images
                # without any caption produce no record
                for caption_text in captions_by_img.get(img_id, ()):
                    # ast.literal_eval replaces eval: it parses the list
                    # literal without executing arbitrary code
                    caption = ast.literal_eval(caption_text)

                    # construct 'example' object containing 'img', 'caption'
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'img': _float_feature(img_representation),
                            'caption': _int64_feature(caption)
                        }))

                    count += 1
                    writer.write(example.SerializeToString())
        print("create {}-{}.tfrecords -- contains {} records".format(
                                    filename, str(i + 1), count))
        total_count += count
    print("Total records: {}".format(total_count))

In [None]:
import glob
# glob returns matches in arbitrary order; sort them so the order in which
# the shards are read is the same on every run
training_filenames = sorted(glob.glob('dataset/tfrecords/train-*'))

def get_num_records(files):
    """Return the total number of records stored in the given tfrecord files."""
    return sum(
        1
        for path in files
        for _ in tf.python_io.tf_record_iterator(path))

num_train_records = get_num_records(training_filenames)
print('Number of training records in all training file: {}'.format(
    num_train_records))

In [None]:
def training_parser(record):
    ''' turn one serialized example of a .tfrecords file into a training record

    :args
      record - a serialized example read from a .tfrecords file
    :return
      a dictionary containing {
          'img': the 256-d image feature (float32 Tensor),
          'input_seq': the caption ids without the last one (int32 Tensor),
          'output_seq': the caption ids without the first one (int32 Tensor),
          'mask': a Tensor of ones as long as 'input_seq' (int32 Tensor)
      }
    '''
    parsed = tf.parse_single_example(
        record,
        features={
          "img": tf.FixedLenFeature([256], dtype=tf.float32),
          "caption": tf.VarLenFeature(dtype=tf.int64)
        })

    # the variable-length caption is parsed as a sparse tensor; keep its values
    token_ids = tf.cast(parsed['caption'].values, tf.int32)

    # input and output are the same caption shifted by one position
    # e.g. caption :   [0 2 5 7 9 1]
    #      input_seq:  [0 2 5 7 9]
    #      output_seq: [2 5 7 9 1]
    #      mask:       [1 1 1 1 1]
    # seq_len is a 1-element vector so it can serve as slice size and shape
    seq_len = tf.expand_dims(tf.subtract(tf.shape(token_ids)[0], 1), 0)

    return {
      'img': parsed['img'],
      'input_seq': tf.slice(token_ids, [0], seq_len),
      'output_seq': tf.slice(token_ids, [1], seq_len),
      'mask': tf.ones(seq_len, dtype=tf.int32)
    }

In [None]:
def tfrecord_iterator(filenames, batch_size, record_parser):
    ''' build the input pipeline over tfrecord files and return its iterator

    :args
        filenames     - a list of tfrecord filenames (string)
        batch_size    - batch size (positive int)
        record_parser - function mapping one serialized record to a dict
                        with the keys 'img', 'input_seq', 'output_seq', 'mask'

    :return
        iterator      - an initializable Iterator over the padded batches
        output_types  - the output types of the created dataset.
        output_shapes - the output shapes of the created dataset.
    '''
    # every component is padded up to the longest one inside its batch
    pad_to_longest = {
        'img': [None],
        'input_seq': [None],
        'output_seq': [None],
        'mask': [None]
    }
    fill_values = {
        'img': 1.0,       # needless, for completeness
        'input_seq': 1,   # 1 is the id build_voc_mapping assigns to <ED>
        'output_seq': 1,  # same padding id as the input sequence
        'mask': 0         # 0 marks a padded position (no word there)
    }

    # NOTE(review): shuffle runs after padded_batch and repeat, so it
    # reorders whole batches and can mix batches of neighbouring epochs -
    # confirm this ordering is intended.
    dataset = (
        tf.data.TFRecordDataset(filenames)
        .map(record_parser, num_parallel_calls=16)
        .padded_batch(
            batch_size=batch_size,
            padded_shapes=pad_to_longest,
            padding_values=fill_values)
        .repeat()                   # repeat dataset infinitely
        .shuffle(3 * batch_size))   # shuffle buffer of 3*batch_size elements

    iterator = dataset.make_initializable_iterator()
    return iterator, dataset.output_types, dataset.output_shapes

In [None]:
def get_seq_embeddings(input_seq, vocab_size, word_embedding_size):
    """Look up the word embedding of every id in `input_seq`.

    Creates (or fetches via tf.get_variable) the variable
    'seq_embedding/embedding_matrix' of shape
    [vocab_size, word_embedding_size], initialised uniformly in [-1, 1],
    with the ops placed on the CPU.
    """
    with tf.variable_scope('seq_embedding'):
        with tf.device("/cpu:0"):
            lookup_table = tf.get_variable(
                name='embedding_matrix',
                shape=[vocab_size, word_embedding_size],
                initializer=tf.random_uniform_initializer(minval=-1, maxval=1))
            # [batch_size, padded_length, embedding_size]
            return tf.nn.embedding_lookup(lookup_table, input_seq)

In [None]:
class ImageCaptionModel(object):
    ''' simple image caption model

    `build()` wires the tfrecord input pipeline and the word-embedding
    lookup. `_build_model`, `train` and `predict` are placeholders that
    still have to be implemented.
    '''

    def __init__(self, hparams):
        # hyperparameters; `_build_inputs` reads batch_size,
        # image_embedding_size, vocab_size and word_embedding_size from it
        self.hps = hparams

    def _build_inputs(self):
        """ construct the inputs for model

        Sets `filenames` and `handle` (placeholders), `training_iterator`,
        `image_embed`, `input_seq`, `target_seq`, `input_mask`,
        `embedding_matrix` and `seq_embeddings` on the instance.
        """
        self.filenames = tf.placeholder(tf.string,
                                        shape=[None], name='filenames')
        self.training_iterator, types, shapes = tfrecord_iterator(
          self.filenames, self.hps.batch_size, training_parser)

        # a string handle lets the caller choose at run time which iterator
        # feeds the graph
        self.handle = tf.placeholder(tf.string, shape=[], name='handle')
        iterator = tf.data.Iterator.from_string_handle(self.handle,
                                                       types, shapes)
        records = iterator.get_next()

        image_embed = records['img']
        # the batch pipeline leaves the feature dimension unknown; pin it
        image_embed.set_shape([None, self.hps.image_embedding_size])
        input_seq = records['input_seq']
        target_seq = records['output_seq']
        input_mask = records['mask']

        self.image_embed = image_embed # (batch_size, img_dim)
        self.input_seq = input_seq # (batch_size, seqlen)
        self.target_seq = target_seq # (batch_size, seqlen)
        self.input_mask = input_mask # (batch_size, seqlen)

        # convert sequence of index to sequence of embedding
        with tf.variable_scope('seq_embedding'), tf.device('/cpu:0'):
            self.embedding_matrix = tf.get_variable(
                    name='embedding_matrix',
                    shape=[self.hps.vocab_size,
                           self.hps.word_embedding_size],
                    initializer=tf.random_uniform_initializer(
                        minval=-1, maxval=1))
            # [batch_size, seqlen, embedding_size]
            # Kept on the instance so `_build_model` can consume it; it used
            # to be a local that was discarded when this method returned.
            self.seq_embeddings = tf.nn.embedding_lookup(
                self.embedding_matrix, self.input_seq)

    def _build_model(self):
        """ Build your image caption model """
        pass

    def build(self):
        """ call this function to build the inputs and model """
        self._build_inputs()
        self._build_model()

    def train(self, sess, training_filenames, num_train_records):
        """ write a training function for your model """
        pass

    def predict(self, sess, img_vec, dec_map):
        """ generate the caption given an image """
        pass

In [None]:
def get_hparams():
    """Bundle the model and training hyperparameters into an HParams object.

    `vocab_size` is taken from the notebook-level variable of the same name.
    """
    settings = dict(
        vocab_size=vocab_size,
        batch_size=64,
        rnn_units=100,
        image_embedding_size=256,
        word_embedding_size=256,
        drop_keep_prob=0.7,
        lr=1e-3,
        training_epochs=1,
        max_caption_len=15,
        ckpt_dir='model_ckpt/')
    return tf.contrib.training.HParams(**settings)

In [None]:
# get hyperparameters
hparams = get_hparams()
# create model
model = ImageCaptionModel(hparams)
model.build()

In [None]:
# start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# the context manager closes the session once training ends, so its
# resources are released before a later cell resets the default graph
with tf.Session(config=config) as sess:
    model.train(sess, training_filenames, num_train_records)

In [None]:
tf.reset_default_graph()
model = ImageCaptionModel(hparams)
model.build()

# sample one image in training data and generate caption
# NOTE(review): `img_train_df` is not defined in any cell visible here -
# confirm where it is loaded before running this cell on a fresh kernel.
testimg = np.expand_dims(img_train_df.iloc[9]['img'], axis=0)

with tf.Session(config=config) as sess:
    saver = tf.train.Saver()
    # look for a saved checkpoint before trying to restore the variables
    ckpt = tf.train.get_checkpoint_state(hparams.ckpt_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
        print("No checkpoint found.")
    else:
        saver.restore(sess,
                      tf.train.latest_checkpoint(hparams.ckpt_dir))
        caption = model.predict(sess, testimg, dec_map)
        print(caption)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import Image, display
from pretrained.cnn import PretrainedCNN
import imageio
import skimage.transform
import numpy as np
import scipy

def demo(img_path, cnn_mdl, U, dec_map, hparams, max_len=15):
    """
    displays an image and prints the caption generated for it
    -------------------------------
    img_path: path of the image to be captioned
    cnn_mdl: image feature extractor object; its get_output(sess, images)
             method is called on the preprocessed image
    U: matrix the flattened CNN output is multiplied with (np.dot),
       i.e. the transform matrix to perform PCA
    dec_map: mapping of vocabulary ID => token string
    hparams: hyperparams for model (hparams.ckpt_dir locates the checkpoint)
    max_len: not used inside this function

    Prints "No checkpoint found." when no checkpoint can be located.
    """

    def process_image(img, crop=True, submean=True):
        """
        implements the image preprocess required by VGG-16
        -------------------------------
        center-crops the image to a square on its shorter edge, resizes it
        to 224 x 224 x 3, reverses the channel axis and subtracts MEAN
        from each channel
        crop: never read - the center-crop is always applied
        submean: never read - the mean is always subtracted
        """
        MEAN = np.array([103.939, 116.779, 123.68]).astype(np.float32) # BGR
        # center crop
        short_edge = min(img.shape[:2])
        yy = int((img.shape[0] - short_edge) / 2)
        xx = int((img.shape[1] - short_edge) / 2)
        crop_img = img[yy: yy + short_edge, xx: xx + short_edge]
        # NOTE(review): skimage's resize may return floats rescaled to
        # [0, 1], while MEAN is on a 0-255 scale - confirm this matches how
        # the stored image features were extracted before changing it.
        img = skimage.transform.resize(crop_img, [224, 224, 3], mode="constant")
        img = img.reshape((224,224,1)) if len(img.shape) < 3 else img
        
        # fewer than 3 channels: append copies of the first channel until
        # there are three
        if img.shape[2] < 3:
            print('dimension insufficient')
            img = img.reshape((224*224,
                               img.shape[2])).T.reshape((img.shape[2],
                                                                 224*224))
            for i in range(img.shape[0], 3):
                img = np.vstack([img, img[0,:]])
            img = img.reshape((3,224*224)).T.reshape((224,224,3))
        img = img.astype(np.float32)
        # RGB => BGR (reverse the channel axis)
        img = img[:,:,::-1]
        # subtract the per-channel mean
        for i in range(3):
            img[:,:,i] -= MEAN[i]
        return img.reshape((224,224,3))

    display(Image(img_path))
    img = imageio.imread(img_path)
    
    # run the pretrained cnn on the preprocessed image and project its
    # flattened output with U
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        img_feature = np.dot(
            cnn_mdl.get_output(sess, [process_image(img)])[0].reshape((-1)), U)
        
    # reset graph for image caption model
    tf.reset_default_graph()  
    model = ImageCaptionModel(hparams)
    model.build()
    with tf.Session(config=config) as sess:
        saver = tf.train.Saver()
        # restore variables from disk.
        ckpt = tf.train.get_checkpoint_state(hparams.ckpt_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, tf.train.latest_checkpoint(hparams.ckpt_dir))
            # NOTE(review): img_feature is passed as a flat vector here -
            # confirm predict() accepts that shape.
            caption = model.predict(sess, img_feature, dec_map)
            # the join assumes predict() returns a sequence of token strings
            print(' '.join(caption))
        else:
            print("No checkpoint found.")

In [None]:
tf.reset_default_graph()  # reset graph for cnn model
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('dataset/U.pkl', 'rb') as pca_file:
    U = cPickle.load(pca_file)  # PCA transforming matrix
vgg = PretrainedCNN('pretrained/vgg16_mat.pkl')
demo('demo/example1.jpg', vgg, U, dec_map, hparams)

In [None]:
def generate_captions(model, dec_map, img_test, max_len=15, ckpt_dir=None):
    """Generate a caption for every test image with a restored model.

    :args
        model    - a built ImageCaptionModel; its predict() is called
                   once per image
        dec_map  - mapping of vocabulary ID => token string
        img_test - dict-like mapping img_id => image feature
        max_len  - not used inside this function
        ckpt_dir - directory holding the checkpoint; defaults to the
                   notebook-level `hparams.ckpt_dir` when None
    :return
        DataFrame indexed by 'img_id' with one column 'caption'; it is
        empty when no checkpoint is found
    """
    if ckpt_dir is None:
        # previous behaviour: read the directory from the global hparams
        ckpt_dir = hparams.ckpt_dir

    img_ids, caps = [], []

    with tf.Session() as sess:
        saver = tf.train.Saver()
        # restore variables from disk.
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))
            for img_id, img in img_test.items():
                img_ids.append(img_id)
                caps.append(model.predict(sess, img, dec_map))
        else:
            print("No checkpoint found.")

    return pd.DataFrame({
              'img_id': img_ids,
              'caption': caps
            }).set_index(['img_id'])

In [None]:
# load test image  size=20548
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('dataset/test_img256.pkl', 'rb') as test_file:
    img_test = cPickle.load(test_file)

# create model
tf.reset_default_graph()
model = ImageCaptionModel(hparams)
model.build()

# generate caption to csv file
df_predict = generate_captions(model, dec_map, img_test)
# to_csv does not create missing directories
os.makedirs('generated', exist_ok=True)
df_predict.to_csv('generated/demo.csv')