### `_process_caption_data()`

In [158]:
import json
import pandas as pd
import os
caption_file = '/root/DLResearch/show-attend-and-tell/data/annotations/captions_train2014.json'
image_dir = '/root/DLResearch/show-attend-and-tell/image/train2014_resized/'
max_length = 100

In [159]:
def _process_caption_data(caption_file, image_dir, max_length):
    with open(caption_file) as f:
        caption_data = json.load(f)
        
    # join the dataset
    imagesDF = pd.DataFrame.from_dict(caption_data['images'])
    annotationsDF = pd.DataFrame.from_dict(caption_data['annotations'])
    imagesDF['file_name'] = imagesDF['file_name'].apply(lambda x: os.path.join(image_dir, x)) # append the directory
    imagesDF = imagesDF[['file_name', 'id']] # remove unnecessary cols
    captionDF = pd.merge(left=imagesDF, right=annotationsDF, left_on='id', right_on='image_id')
    captionDF = captionDF.drop(['id_x', 'id_y'], axis=1)
    captionDF.sort_values(by='image_id', inplace=True)
    
    # process texts
    process_fn_1 = lambda x: x.replace('.','').replace(',','').replace("'","").replace('"','') \
                              .replace('&','and').replace('(','').replace(")","").replace('-',' ')
    process_fn_2 = lambda x: " ".join(process_fn_1(x).split())
    process_fn_3 = lambda x: process_fn_2(x).lower()
    
    captionDF['caption'] = captionDF['caption'].apply(process_fn_3)
    
    # cut texts length
    captionDF['text_length'] = captionDF['caption'].apply(lambda x: len(x.split()))
    captionDF = captionDF[captionDF['text_length'] <= max_length]
    captionDF = captionDF.drop(['text_length'], axis=1)
    
    captionDF = captionDF.reset_index(drop=True)
    
    return captionDF
    
    

### TEST: `_process_caption_data()`
passed

In [160]:
import importlib
import originalPrepro
importlib.reload(originalPrepro)
caption_file = '/root/DLResearch/show-attend-and-tell/data/annotations/captions_train2014.json'
image_dir = '/root/DLResearch/show-attend-and-tell/image/train2014_resized/'
max_length = 7

In [172]:
%%time 
caption_gt = originalPrepro.__process_caption_data(caption_file, image_dir, max_length)

The number of captions before deletion: 414113
The number of captions after deletion: 3251
CPU times: user 5.36 s, sys: 40 ms, total: 5.4 s
Wall time: 5.4 s


In [333]:
%%time 
caption = _process_caption_data(caption_file, image_dir, max_length)

CPU times: user 3.95 s, sys: 148 ms, total: 4.1 s
Wall time: 4.1 s


In [173]:
caption_gt.head()

Unnamed: 0,caption,file_name,image_id
0,a giraffe standing up nearby a tree,/root/DLResearch/show-attend-and-tell/image/tr...,25
1,a number of giraffes near one another,/root/DLResearch/show-attend-and-tell/image/tr...,144
2,the woman is standing with her luggage,/root/DLResearch/show-attend-and-tell/image/tr...,260
3,a woman playing a video game indoors,/root/DLResearch/show-attend-and-tell/image/tr...,446
4,a person on the ground wearing skis,/root/DLResearch/show-attend-and-tell/image/tr...,897


In [334]:
caption.head()

Unnamed: 0,file_name,caption,image_id
0,/root/DLResearch/show-attend-and-tell/image/tr...,a giraffe standing up nearby a tree,25
1,/root/DLResearch/show-attend-and-tell/image/tr...,a number of giraffes near one another,144
2,/root/DLResearch/show-attend-and-tell/image/tr...,the woman is standing with her luggage,260
3,/root/DLResearch/show-attend-and-tell/image/tr...,a woman playing a video game indoors,446
4,/root/DLResearch/show-attend-and-tell/image/tr...,a person on the ground wearing skis,897


In [335]:
# Put both frames into the same row order (by image id, then caption text)
# with a fresh 0..n-1 index so they can be compared column by column.
caption = caption.sort_values(['image_id', 'caption']).reset_index(drop=True)
caption_gt = caption_gt.sort_values(['image_id', 'caption']).reset_index(drop=True)

In [336]:
for col in caption.columns:
    print(caption[col].equals(caption_gt[col]))

True
True
True


### `_build_vocab()`

In [174]:
import numpy as np
def _build_vocab(annotations, threshold=1):
    print('The Index will be different from the original version')
    words = []
    for sentence in annotations['caption']:
        [words.append(w) for w in sentence.split()]
    word_counts = pd.value_counts(np.array(words))
    word_counts_filtered = word_counts[word_counts >= threshold]
    
    idx = 3
    word_to_idx = {u'<NULL>': 0, u'<START>': 1, u'<END>': 2} 
    for word in word_counts_filtered.keys():
        word_to_idx[word] = idx
        idx += 1
    
    return word_to_idx

### TEST: `_build_vocab()`
passed

In [175]:
%%time
import importlib
import originalPrepro
importlib.reload(originalPrepro)
#caption_gt = originalPrepro.__process_caption_data(caption_file, image_dir, max_length)
word_to_idx_gt = originalPrepro._build_vocab(caption_gt)
word_to_idx = _build_vocab(caption_gt)

Filtered 2491 words to 2491 words with word count threshold 1.
Max length of caption:  7
The Index will be different from the original version
CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 37.3 ms


In [176]:
# Both vocabularies must contain exactly the same words; the indices assigned
# to those words are allowed to differ.
assert len(word_to_idx_gt) == len(word_to_idx)
assert set(word_to_idx_gt) <= set(word_to_idx)
assert set(word_to_idx) <= set(word_to_idx_gt)

In [455]:
test_word = ['snowboarding', 'passengers', 'kids']
for w in test_word:
    print(word_to_idx.get(w))
    print(word_to_idx_gt.get(w))

283
3
683
15
167
12


### `_build_caption_vector()`

In [222]:
def _build_caption_vector(annotations, word_to_idx, max_length=15):
    # append indicators
    append_fn = lambda x: ' '.join(['<START>', x, '<END>'])
    annotations['_caption'] = annotations['caption'].apply(append_fn)
    # to vectors
    word_to_idx_fn = lambda sentence: [word_to_idx.get(word) for word in sentence.split() if word in word_to_idx]
    captions = annotations['_caption'].apply(word_to_idx_fn)

    C = np.zeros([annotations.shape[0], max_length+2], dtype=np.int32)
    for c in captions.items():
        idx = c[0]
        v = c[1]
        if(len(v) < max_length + 2):
            for _ in range(max_length + 2 - len(v)):
                v.append(word_to_idx['<NULL>'])
        C[idx, :] = v
            
    return C

### TEST: `_build_caption_vector()`
passed

In [223]:
%%time
import importlib
import originalPrepro
import numpy as np
importlib.reload(originalPrepro)
captions_vec_gt = originalPrepro._build_caption_vector(caption_gt, word_to_idx_gt, max_length=15)

Finished building caption vectors
CPU times: user 36 ms, sys: 8 ms, total: 44 ms
Wall time: 36.1 ms


In [224]:
%%time
captions_vec = _build_caption_vector(caption_gt, word_to_idx_gt, max_length=15)

CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 30 ms


In [225]:
(captions_vec_gt == captions_vec).all()

True

In [226]:
captions_vec

array([[   1, 2457, 2059, ...,    0,    0,    0],
       [   1, 2457,  682, ...,    0,    0,    0],
       [   1, 2220, 1708, ...,    0,    0,    0],
       ..., 
       [   1, 1521, 1430, ...,    0,    0,    0],
       [   1, 2319, 1521, ...,    0,    0,    0],
       [   1,  465, 1475, ...,    0,    0,    0]], dtype=int32)

In [227]:
captions_vec_gt

array([[   1, 2457, 2059, ...,    0,    0,    0],
       [   1, 2457,  682, ...,    0,    0,    0],
       [   1, 2220, 1708, ...,    0,    0,    0],
       ..., 
       [   1, 1521, 1430, ...,    0,    0,    0],
       [   1, 2319, 1521, ...,    0,    0,    0],
       [   1,  465, 1475, ...,    0,    0,    0]], dtype=int32)

### `_build_file_names()`

In [183]:
def _build_file_names(annotations):
    file_names = []
    id_to_idx = {}
    
    idx = 0
    for image_id in annotations['image_id'].unique():
        id_to_idx[image_id] = idx
        file_name_id = annotations[annotations.image_id == image_id].index[0]
        file_names.append(annotations['file_name'][file_name_id])
        idx += 1
        
    file_names = np.array(file_names)
    return file_names, id_to_idx
    

### TEST: `_build_file_names()`
passed

In [184]:
%%time
file_names, id_to_idx = _build_file_names(caption_gt)

CPU times: user 1.77 s, sys: 0 ns, total: 1.77 s
Wall time: 1.77 s


In [190]:
%%time
import importlib
import originalPrepro
import numpy as np
importlib.reload(originalPrepro)
file_names_gt, id_to_idx_gt = originalPrepro._build_file_names(caption_gt)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.57 ms


In [186]:
(file_names_gt == file_names).all()

True

In [187]:
id_to_idx_gt == id_to_idx

True

### `_build_image_idxs()`

In [207]:
def _build_image_idxs(annotations, id_to_idx):
    image_idxs = np.ndarray(len(annotations), dtype=np.int32)
    
    for item in annotations['image_id'].items():
        idx = item[0]
        id = item[1]
        image_idxs[idx] = id_to_idx.get(id)
    return image_idxs

### TEST: `_build_image_idxs()`

In [208]:
%%time
import importlib
import originalPrepro
import numpy as np
importlib.reload(originalPrepro)
image_idxs_gt = originalPrepro._build_image_idxs(caption_gt, id_to_idx_gt)
image_idxs = _build_image_idxs(caption_gt, id_to_idx_gt)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.25 ms


In [209]:
(image_idxs == image_idxs_gt).all()

True

### Copied from Original Implementation

In [6]:
# Path to the VGG-19 .mat file that is handed to Vgg19 later in the notebook.
_vgg_model_path = '/root/DLResearch/show-attend-and-tell/data/imagenet-vgg-verydeep-19.mat'


def caption_file_dir_fn(split):
    """Return the caption annotation JSON path for ``split``."""
    return '/root/DLResearch/show-attend-and-tell/data/annotations/captions_%s2014.json' % split


def image_dir_fn(split):
    """Return the resized-image directory for ``split``."""
    return '/root/DLResearch/show-attend-and-tell/image/%s2014_resized/' % split


def save_annotation_dir_fn(split):
    """Return the pickle path for the processed annotations of ``split``."""
    return '/root/DLResearch/show-attend-and-tell/data/%s/%s.annotations.pkl' % (split, split)


# Despite the ``_fn`` suffix this is a plain string, not a function: the
# vocabulary path is fixed to the 'train' split.
save_word_to_idx_fn = '/root/DLResearch/show-attend-and-tell/data/%s/word_to_idx.pkl' % 'train'


def save_captions_dir_fn(split):
    """Return the pickle path for the caption vectors of ``split``."""
    return '/root/DLResearch/show-attend-and-tell/data/%s/%s.captions.pkl' % (split, split)


def save_file_names_dir_fn(split):
    """Return the pickle path for the image file names of ``split``."""
    return '/root/DLResearch/show-attend-and-tell/data/%s/%s.file.names.pkl' % (split, split)


def save_image_idxs_dir_fn(split):
    """Return the pickle path for the per-caption image indices of ``split``."""
    return '/root/DLResearch/show-attend-and-tell/data/%s/%s.image.idxs.pkl' % (split, split)


def save_feature_to_captions_dir_fn(split):
    """Return the pickle path for the reference captions of ``split``."""
    return '/root/DLResearch/show-attend-and-tell/data/%s/%s.references.pkl' % (split, split)

In [229]:
import sys
sys.path.append('/root/DLResearch/show-attend-and-tell/')
from core.utils import *


# batch size for extracting feature vectors from vggnet.
batch_size = 100
# maximum length of caption(number of word). if caption is longer than max_length, deleted.  
max_length = 15
# if word occurs less than word_count_threshold in training dataset, the word index is special unknown token.
word_count_threshold = 1
# vgg model path 
vgg_model_path = _vgg_model_path

# about 80000 images and 400000 captions for train dataset
train_dataset = _process_caption_data(caption_file=caption_file_dir_fn('train'),
                                      image_dir=image_dir_fn('train'), 
                                      max_length=max_length)

# about 40000 images and 200000 captions
val_dataset = _process_caption_data(caption_file=caption_file_dir_fn('val'),
                                    image_dir=image_dir_fn('val'), 
                                    max_length=max_length)

# about 4000 images and 20000 captions for val / test dataset:
# the first 10% of the processed val captions become 'val', the next 10% 'test'.
val_cutoff = int(0.1 * len(val_dataset))
test_cutoff = int(0.2 * len(val_dataset))
print ('Finished processing caption data')

save_pickle(train_dataset, save_annotation_dir_fn('train'))
save_pickle(val_dataset[:val_cutoff], save_annotation_dir_fn('val'))
# the test slice does not start at row 0, so its index is reset before saving
save_pickle(val_dataset[val_cutoff:test_cutoff].reset_index(drop=True), save_annotation_dir_fn('test'))

for split in ['train', 'val', 'test']:
    annotations = load_pickle(save_annotation_dir_fn(split))

    if split == 'train':
        # 'train' is processed first, so the vocabulary built here is the one
        # reused for the 'val' and 'test' iterations below.
        word_to_idx = originalPrepro._build_vocab(annotations=annotations, threshold=word_count_threshold)
        # save_word_to_idx_fn is the path string defined in the configuration cell
        save_pickle(word_to_idx, save_word_to_idx_fn)

    captions = _build_caption_vector(annotations=annotations, word_to_idx=word_to_idx, max_length=max_length)
    save_pickle(captions, save_captions_dir_fn(split))

    file_names, id_to_idx = _build_file_names(annotations)
    save_pickle(file_names, save_file_names_dir_fn(split))

    image_idxs = _build_image_idxs(annotations, id_to_idx)
    save_pickle(image_idxs, save_image_idxs_dir_fn(split))

    # prepare reference captions to compute bleu scores later:
    # feature_to_captions[i] collects every caption of the i-th distinct image,
    # numbered in order of first appearance.
    image_ids = {}
    feature_to_captions = {}
    i = -1
    for caption, image_id in zip(annotations['caption'], annotations['image_id']):
        if image_id not in image_ids:
            image_ids[image_id] = 0
            i += 1
            feature_to_captions[i] = []
        feature_to_captions[i].append(caption.lower() + ' .')
    save_pickle(feature_to_captions, save_feature_to_captions_dir_fn(split))
    print("Finished building %s caption dataset" %split)

Finished processing caption data
Saved /root/DLResearch/show-attend-and-tell/data/train/train.annotations.pkl..
Saved /root/DLResearch/show-attend-and-tell/data/val/val.annotations.pkl..
Saved /root/DLResearch/show-attend-and-tell/data/test/test.annotations.pkl..
Loaded /root/DLResearch/show-attend-and-tell/data/train/train.annotations.pkl..
Filtered 23107 words to 23107 words with word count threshold 1.
Max length of caption:  15
Saved /root/DLResearch/show-attend-and-tell/data/train/word_to_idx.pkl..
Saved /root/DLResearch/show-attend-and-tell/data/train/train.captions.pkl..
Saved /root/DLResearch/show-attend-and-tell/data/train/train.file.names.pkl..
Saved /root/DLResearch/show-attend-and-tell/data/train/train.image.idxs.pkl..
Saved /root/DLResearch/show-attend-and-tell/data/train/train.references.pkl..
Finished building train caption dataset
Loaded /root/DLResearch/show-attend-and-tell/data/val/val.annotations.pkl..
Saved /root/DLResearch/show-attend-and-tell/data/val/val.captions

### `_vgg_features()`

In [2]:
import sys
sys.path.append('/root/DLResearch/show-attend-and-tell/')
from core.vggnet import Vgg19
import tensorflow as tf

def _vgg_features(vgg_path):
    """Construct a Vgg19 network from ``vgg_path`` and build it.

    NOTE: unfinished -- the built network is neither used further nor
    returned, so this function currently returns None.
    """
    network = Vgg19(vgg_path)
    network.build()
    
    
    

In [5]:
%%time
_vgg_model_path = '/root/DLResearch/show-attend-and-tell/data/imagenet-vgg-verydeep-19.mat'
vggnet = Vgg19(_vgg_model_path)
vggnet.build()

CPU times: user 9.29 s, sys: 612 ms, total: 9.9 s
Wall time: 11 s


In [None]:
anno_path = save_annotation_dir_fn('train')