In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import json
import argparse
import string
import os.path as osp
import operator
# non-standard dependencies:
import numpy as np
from scipy.misc import imread, imresize

In [None]:
vg_path='../data/visual_genome/'
# Vocabulary note: the final vocabulary will add __background__, COCO
# categories, <UNK>, PAD, BOS and EOS on top of the counted words.

# Scene graphs: one record per image, each carrying an 'image_id'.
sgpp_file = osp.join(vg_path, 'scene_graphs_pp.json')
with open(sgpp_file, 'r') as f:
    images = json.load(f)
print('sgpp loaded')

# Per-image metadata; the 'split' field of each entry is used below.
split_file = osp.join(vg_path, 'image_data_split1000.json')
with open(split_file, 'r') as f:
    imgs_info = json.load(f)
info_dict = dict((entry['image_id'], entry) for entry in imgs_info)

# Keep only the scene graphs whose image belongs to the train or val split,
# keyed by image id.
Images = dict()
for img in images:
    info = info_dict[img['image_id']]
    if info['split'] not in ('train', 'val'):
        continue
    Images[img['image_id']] = img

In [None]:
special = []
def replace_special(name):
    """Lower-case ``name`` and isolate every non-letter character.

    Each character that is neither ``a``-``z`` nor a space is surrounded by
    one space on each side, so that a later ``split()`` yields it as its own
    token. The first time such a character is met it is also appended to the
    module-level ``special`` list.

    Returns the rewritten string.
    """
    pieces = []
    for ch in name.lower():
        if ch == ' ' or 'a' <= ch <= 'z':
            # Plain letters and spaces pass through untouched.
            pieces.append(ch)
            continue
        if ch not in special:
            special.append(ch)
        pieces.append(' ' + ch + ' ')
    return ''.join(pieces)


def count_words_vg(source='names', Images = Images):
    """Count word occurrences in one annotation field of ``Images``.

    source: 'names' or 'attributes' reads the list stored under that key in
        every entry of ``img['objects']`` that has it; 'relationships' reads
        ``rel['predicate']`` of every entry of ``img['relationships']``.
    Images: mapping of image id -> image record (defaults to the
        module-level ``Images`` bound when this function is defined).

    Every string goes through ``replace_special`` and is split on
    whitespace before counting.

    Returns a dict word -> number of occurrences.
    Raises NotImplementedError for any other ``source``; the check happens
    while the first image is processed, so an empty ``Images`` returns {}.
    """
    word2count = {}

    def tally(text):
        # Normalise one annotation string and count each of its words.
        for token in replace_special(text).split():
            word2count[token] = word2count.get(token, 0) + 1

    for record in Images.values():
        if source in ['names', 'attributes']:
            for region in record['objects']:
                if source not in region:
                    continue
                for phrase in region[source]:
                    tally(phrase)
        elif source == 'relationships':
            for relation in record['relationships']:
                tally(relation['predicate'])
        else:
            raise NotImplementedError
    return word2count

def merge_count(count1, count2):
    """Return a new dict holding the per-word sum of two count dicts.

    Neither input is modified. The larger dict is the one copied, and the
    smaller one is folded into that copy.
    """
    if len(count1) < len(count2):
        larger, smaller = count2, count1
    else:
        larger, smaller = count1, count2
    merged = dict(larger)
    for word, n in smaller.items():
        merged[word] = merged.get(word, 0) + n
    return merged

def add_count(count1, count2):
    """Add the counts of ``count2`` into ``count1`` in place.

    Returns ``count1`` itself (the same object, now updated).
    """
    for word in count2:
        count1[word] = count1.get(word, 0) + count2[word]
    return count1

def build_vocab(word2count, count_thr):
    """Build a frequency-sorted vocabulary from a word -> count dict.

    Words whose count is strictly greater than ``count_thr`` are kept and
    ordered by decreasing count (ties keep the iteration order of
    ``word2count``). Summary statistics are printed along the way.

    Returns ``(vocab, freq)``:
        vocab: ['<PAD>', '<UNK>', '<BOS>', '<EOS>'] followed by the kept words.
        freq:  the matching counts; each of the four special tokens gets ten
               times the highest kept count, or 0 when no word is kept.

    An empty ``word2count``, or one where no word passes the threshold, is
    handled without raising: percentages are reported as 0.00 and only the
    special tokens are returned.
    """
    total_count = sum(word2count.values())
    bad_words = [wd for wd, n in word2count.items() if n <= count_thr]
    bad_count = sum(word2count[wd] for wd in bad_words)

    good_words_freq = sorted(
        (item for item in word2count.items() if item[1] > count_thr),
        key=lambda item: -item[1])
    good_words = [item[0] for item in good_words_freq]
    good_freq = [item[1] for item in good_words_freq]
    print(good_words_freq[:10])
    print(good_words[:10])
    print(good_freq[:10])
    print('number of good words: %d' % len(good_words))

    # Guard both ratios: an empty dict (or an all-zero total) would otherwise
    # raise ZeroDivisionError.
    bad_word_pct = len(bad_words) * 100.0 / len(word2count) if word2count else 0.0
    unk_pct = bad_count * 100.0 / total_count if total_count else 0.0
    print('number of bad words: %d/%d = %.2f%%'
          % (len(bad_words), len(word2count), bad_word_pct))
    print('number of UNKs in sentences: %d/%d = %.2f%%'
          % (bad_count, total_count, unk_pct))

    # add UNK, BOS, EOS, PAD
    vocab = ['<PAD>', '<UNK>', '<BOS>', '<EOS>'] + good_words
    # good_freq[0] does not exist when every word was filtered out.
    special_freq = good_freq[0] * 10 if good_freq else 0
    freq = [special_freq] * 4 + good_freq
    return vocab, freq

def build_lookup(included, count_thresh, vg_count=None, rc_count=None, fast_text_path='../data/fast_text/'):
    """Assemble a vocabulary and its embedding matrix from selected sources.

    included: names of the word sources to use. Recognised entries are
        'vg_names', 'vg_attributes', 'vg_relationships' (read from
        ``vg_count`` under 'names', 'attributes', 'relationships') and
        'refcoco', 'refcoco+', 'refcocog' (read from ``rc_count`` under the
        same keys).
    count_thresh: threshold forwarded to ``build_vocab``.
    vg_count, rc_count: dicts of word -> count dicts; only needed for the
        sources actually listed in ``included``.
    fast_text_path: directory holding 'vocabulary_ft.npy' and
        'embeddings_ft.npy'.

    Returns a dict with keys 'included', 'ix_to_word', 'freq' and
    'embeddings'. 'embeddings' has one 300-wide row per vocabulary word:
    the pretrained row when the word is in the loaded vocabulary, otherwise
    a random vector (the word is then printed with its index and count).
    """
    lookup = {}
    lookup['included'] = included

    # Accumulate counts in a fixed order: Visual Genome sources first,
    # then the refcoco family.
    count = {}
    vg_sources = (('vg_names', 'names'),
                  ('vg_attributes', 'attributes'),
                  ('vg_relationships', 'relationships'))
    for tag, key in vg_sources:
        if tag in included:
            count = add_count(count, vg_count[key])
    for tag in ('refcoco', 'refcoco+', 'refcocog'):
        if tag in included:
            count = add_count(count, rc_count[tag])

    vocab, freq = build_vocab(count, count_thresh)
    lookup['ix_to_word'] = vocab
    lookup['freq'] = freq
    ft_vocab = np.load(osp.join(fast_text_path, 'vocabulary_ft.npy'))
    ft_vocab = list(ft_vocab)
    ft_embeddings = np.load(osp.join(fast_text_path, 'embeddings_ft.npy'))

    # Index the pretrained vocabulary once instead of scanning the whole list
    # (membership test + .index) for every word. setdefault keeps the first
    # occurrence, matching what list.index returned.
    ft_index = {}
    for row, word in enumerate(ft_vocab):
        ft_index.setdefault(word, row)

    embeddings = np.empty((len(vocab), 300))
    for i, w in enumerate(vocab):
        ft_i = ft_index.get(w)
        if ft_i is not None:
            embeddings[i] = ft_embeddings[ft_i]
        else:
            # No pretrained vector: small random initialisation, and report
            # the word so missing entries are visible.
            embeddings[i] = np.random.randn(300) / 300.0
            print(i, w, freq[i])
    lookup['embeddings'] = embeddings
    return lookup

In [None]:
# Word counts for each Visual Genome annotation source, over the train/val
# images collected in `Images`.
# Vocabulary sizes noted by the author (full vg / sgpp):
#   names 26160 / 25972, attributes 20284 / 20196, relationships 7973 / 7940
vg_name_count = count_words_vg(source='names')
vg_att_count = count_words_vg(source='attributes')
vg_rel_count = count_words_vg(source='relationships')

In [None]:
# Number of distinct words per source, one per line: names, attributes,
# relationships.
print('\n'.join(str(len(counts))
                for counts in (vg_name_count, vg_att_count, vg_rel_count)))

In [None]:
import matplotlib.pyplot as plt

def plot_count(word2count):
    """Plot the counts stored in ``word2count`` in ascending order.

    word2count: dict mapping word -> count; only the values are used.

    The sorted values are handed to ``plt.plot``. Returns None.
    """
    # sorted() works on the view object that dict.values() returns on
    # Python 3; the earlier values().sort() only worked on Python 2, where
    # values() is a list.
    count = sorted(word2count.values())
    plt.plot(count)
    return

In [None]:
%matplotlib notebook
# Sorted count curves for the three individual Visual Genome sources.
plot_count(vg_name_count)
plot_count(vg_att_count)
plot_count(vg_rel_count)

print(len(vg_name_count), len(vg_att_count), len(vg_rel_count))

# Combine the sources step by step: names + attributes first, then add
# relationships. `vg_count` is the merged count over all three sources and
# is reused by later cells.
name_att_count = merge_count(vg_name_count, vg_att_count)
print(len(name_att_count))
vg_count = merge_count(name_att_count, vg_rel_count)
print(len(vg_count))

# Sorted count curves for the two merged dictionaries.
plot_count(name_att_count)
plot_count(vg_count)

In [None]:
# Vocabulary statistics for each Visual Genome word source; build_vocab
# drops words occurring `thresh` times or fewer and prints a summary.
thresh = 10
print('name')
name_vocab, _ = build_vocab(vg_name_count, thresh)
print('\natt')
att_vocab, _ = build_vocab(vg_att_count, thresh)
print('\nrel')
rel_vocab, _ = build_vocab(vg_rel_count, thresh)
print('\nname_att')
# build_vocab returns (vocab, freq); unpack it so name_att_vocab is the word
# list like the other *_vocab variables, not the whole tuple.
name_att_vocab, _ = build_vocab(name_att_count, thresh)
print('\nvg')
vg_vocab, _ = build_vocab(vg_count, thresh)

In [None]:
# Per-source Visual Genome word counts, keyed the way build_lookup reads them.
vg_counts = dict(names=vg_name_count,
                 attributes=vg_att_count,
                 relationships=vg_rel_count)

vg_lookup = build_lookup(included=['vg_names', 'vg_attributes', 'vg_relationships'],
                         count_thresh=10,
                         vg_count=vg_counts)

# np.save('../data/fast_text/wordcounts_vgpp_trainval.npy', vg_counts)
np.save('../data/fast_text/lookup_vgpp_trainval.npy', vg_lookup)

In [None]:
refer_dir = osp.join('..', 'pyutils', 'refer')
sys.path.insert(0, refer_dir)
from refer import REFER

def count_words_refcocox(data_root = '../data', dataset='refcoco', splitBy='unc'):
    """Count word occurrences in the sentences of a REFER dataset.

    data_root, dataset, splitBy: forwarded positionally to ``REFER``.

    Every token in ``refer.sentToTokens`` goes through ``replace_special``
    and is split on whitespace before counting. Afterwards each word of
    every category name in ``refer.Cats`` (plus '__background__') has its
    count overwritten with 1e5.

    Returns a dict word -> count.
    """
    refer = REFER(data_root, dataset, splitBy)
    sentToTokens = refer.sentToTokens
    # count the number of words
    word2count = {}
    for sent_id, tokens in sentToTokens.items():
        for token in tokens:
            for wd in replace_special(token).split():
                word2count[wd] = word2count.get(wd, 0) + 1

    # add category words
    # list(...) is needed on Python 3, where dict.values() is a view and
    # cannot be concatenated with a list.
    category_names = list(refer.Cats.values()) + ['__background__']
    for cat_name in category_names:
        for wd in cat_name.split():
            # Fixed large count, overriding whatever was tallied above.
            word2count[wd] = 1e5
    return word2count

# Word counts for the three refcoco-family datasets; refcocog uses the
# 'google' split, the other two the default 'unc' split.
rc_count = count_words_refcocox(dataset='refcoco', splitBy='unc')
rcp_count = count_words_refcocox(dataset='refcoco+', splitBy='unc')
rcg_count = count_words_refcocox(dataset='refcocog', splitBy='google')

print(len(rc_count), len(rcp_count), len(rcg_count))


In [None]:
%matplotlib notebook
# Sorted count curves for refcoco, refcoco+ and refcocog, in that order.
plot_count(rc_count)
plot_count(rcp_count)
plot_count(rcg_count)

In [None]:
thresh = 10

# Merged count dictionaries. merge_count prints nothing, so building them up
# front leaves the printed report below unchanged.
vg_rc_count = merge_count(vg_count, rc_count)
vg_rcp_count = merge_count(vg_count, rcp_count)
vg_rcg_count = merge_count(vg_count, rcg_count)
vg_rc_rcp_count = merge_count(vg_rc_count, rcp_count)
vg_rcall_count = merge_count(vg_rc_rcp_count, rcg_count)

# Vocabulary statistics for each refcoco dataset on its own ...
print('rc')
rc_vocab, _ = build_vocab(rc_count, thresh)
print('\nrc+')
rcp_vocab, _ = build_vocab(rcp_count, thresh)
print('\nrcg')
rcg_vocab, _ = build_vocab(rcg_count, thresh)

# ... and combined with the Visual Genome counts.
print('\nvg_rc')
vg_rc_vocab, _ = build_vocab(vg_rc_count, thresh)
print('\nvg_rc+')
vg_rcp_vocab, _ = build_vocab(vg_rcp_count, thresh)
print('\nvg_rcg')
vg_rcg_vocab, _ = build_vocab(vg_rcg_count, thresh)
print('\nvg_rc_rc+')
vg_rc_rcp_vocab, _ = build_vocab(vg_rc_rcp_count, thresh)
print('\nvg_rcall')
vg_rcall_vocab, _ = build_vocab(vg_rcall_count, thresh)


In [None]:
# 'rca' means ['refcoco', 'refcoco+', 'refcocog']
rca_counts = dict([('refcoco', rc_count),
                   ('refcoco+', rcp_count),
                   ('refcocog', rcg_count)])

# Lookup built from the three refcoco datasets only.
rca_lookup = build_lookup(included=['refcoco', 'refcoco+', 'refcocog'],
                          count_thresh=10,
                          rc_count=rca_counts)

# np.save('../data/fast_text/wordcounts_rca.npy', rca_counts)
np.save('../data/fast_text/lookup_rca.npy', rca_lookup)

# Lookup built from Visual Genome and the refcoco datasets together.
vg_rca_lookup = build_lookup(
    included=['vg_names', 'vg_attributes', 'vg_relationships', 'refcoco', 'refcoco+', 'refcocog'],
    count_thresh=10,
    vg_count=vg_counts,
    rc_count=rca_counts)

np.save('../data/fast_text/lookup_vgpp_tv_rca.npy', vg_rca_lookup)



In [None]:
# The lookup was written by np.save from a plain dict, so the file holds a
# pickled 0-d object array. Recent NumPy refuses to unpickle unless
# allow_pickle=True is passed. Only do this for files you produced yourself:
# unpickling untrusted data can execute arbitrary code.
lookup = np.load('../data/fast_text/lookup_vgpp_tv_rca.npy', allow_pickle=True)
# .item() unwraps the 0-d array back into the original dict.
lookup.item()

In [None]:
# Put ../pyutils/refer at the front of sys.path so `refer` can be imported
# (this repeats the path setup done before count_words_refcocox).
refer_dir = osp.join('..', 'pyutils', 'refer')
sys.path.insert(0, refer_dir)
from refer import REFER

# Load refcoco with the 'unc' split and show the values of refer.Cats,
# i.e. the category names that count_words_refcocox adds to its counts.
refer = REFER(data_root='../data', dataset='refcoco', splitBy='unc')
category_names = refer.Cats.values()
print(category_names)

In [None]:
# Counts stored on disk; later cells read its 'name' entry.
count_info_file = '../data/visual_genome/name_att_rel_count_pp.json'
with open(count_info_file, 'r') as f:
    count_info = json.load(f)


In [None]:
# (name, count) pairs ordered by decreasing count. A reversed sort is stable,
# so pairs with equal counts keep their original relative order.
c = sorted(count_info['name'].items(), key=operator.itemgetter(1), reverse=True)
# Show the 100 most frequent names with their counts.
print(c[:100])


In [None]:
# Counts only (second element of each pair) for the first 500 entries of `c`.
print(list(map(operator.itemgetter(1), c[:500])))