# Visual Genome Dataset Cleansing

Loading dataset
-----------

- To read data from the JSON file and remove the annotations with wrong spelling.


In [2]:
import numpy as np
import math
import sys

sys.path.append('./python_code')
import preprocessing_data as imported_data

image data length: 108077
relationship data length: 108077
0 images processed, 41 relationships
Unspecified Error
2112
21
(2112, 21): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
2112
22
(2112, 22): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
2112
23
(2112, 23): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
3780
53
(3780, 53): [chair]-[]-[outside]

Error: [chair]-[]-[outside]

5000 images processed, 91320 relationships
Unspecified Error
6768
22
(6768, 22): [sky]-[]-[above]

Error: [sky]-[]-[above]

Unspecified Error
6839
15
(6839, 15): [is]-[]-[sidewalk]

Error: [is]-[]-[sidewalk]

Unspecified Error
6993
10
(6993, 10): [laptop]-[]-[open]

Error: [laptop]-[]-[open]

Unspecified Error
7023
21
(7023, 21): [box]-[]-[pizza hut]

Error: [box]-[]-[pizza hut]

Unspecified Error
7024
7
(7024, 7): [alcove]-[]-[open]

Error: [alcove]-[]-[open]

Unspecified Error
9015
31
(9015, 31): [girl]-[]-[outside]

Error: [girl]-[]-[outside]

Unspecified Error
9

# Filtering the Top-50 predicate and Top-150 object categories

In [1]:
def get_object_predicate_list(relationships):
    object_list = set()
    predicate_list = set()
    for im_id in relationships:
        for r_id, relationship_item in enumerate(relationships[im_id]['relationships']):
            object_list.add(relationship_item['subject'])
            object_list.add(relationship_item['object'])
            predicate_list.add(relationship_item['predicate'])
            
    print "Object classes: %d\nPredicate classes: %d\n" % (len(object_list), len(predicate_list))
    return object_list, predicate_list
    
def object_predicate_categories_count(relationships):
    object_list, predicate_list = get_object_predicate_list(relationships)
    freq_obj = {x:0 for x in object_list}
    freq_pred = {x:0 for x in predicate_list}
    for im_id in relationships:
        for r_id, relationship_item in enumerate(relationships[im_id]['relationships']):
            temp_sub = relationship_item['subject']
            temp_obj = relationship_item['object']
            temp_predicate = relationship_item['predicate']
            freq_obj[temp_sub] += 1    
            freq_obj[temp_obj] += 1
            freq_pred[temp_predicate] += 1    
    return freq_obj, freq_pred

def filter_categories(original_relationships, top_n_obj=150, top_n_pred=50):
    import copy
    original_im_num = len(original_relationships.keys())
    relationships = copy.deepcopy(original_relationships)
    freq_obj, freq_pred = object_predicate_categories_count(relationships)
    import operator
    sorted_obj = sorted(freq_obj.items(), key=operator.itemgetter(1), reverse=True)
    sorted_pred = sorted(freq_pred.items(), key=operator.itemgetter(1), reverse=True)
    selected_obj = set(x[0] for x in sorted_obj[:top_n_obj])
    selected_pred = set(x[0] for x in sorted_pred[:top_n_pred])
    for im_id in relationships.keys():
        for relationship_item in list(relationships[im_id]['relationships']):
            temp_sub = relationship_item['subject']
            temp_obj = relationship_item['object']
            temp_predicate = relationship_item['predicate']
            if temp_sub not in selected_obj or temp_obj not in selected_obj or temp_predicate not in selected_pred:
                relationships[im_id]['relationships'].remove(relationship_item)
        if len(relationships[im_id]['relationships']) == 0:
            relationships.pop(im_id)
    print "%d images left(%d/%d removed)" % \
        (len(relationships.keys()), original_im_num - len(relationships.keys()), original_im_num)
         
    return relationships, selected_obj, selected_pred

def count_instance_num(relationships):
    instance_counter = 0
    for im_id in relationships.keys():
        instance_counter+= len(relationships[im_id]['relationships'])  
    return instance_counter

def im_with_N_more_instances(relationships, N = 15):
    id_list = []
    for im_id in relationships.keys():
        if len(relationships[im_id]['relationships']) >= N:
            id_list.append(im_id)
    print "%d images with more than %d instances" % (len(id_list), N)
    return id_list

def select_top_N_ims(relationships, N):
    import operator
    instance_num = [(x, len(relationships[x]['relationships'])) for x in relationships]
    id_list = sorted(instance_num, key=operator.itemgetter(1), reverse=True)
    assert N<=len(id_list), "Select %d images from %d" % (N, len(id_list))
    return [x[0] for x in id_list[:N]]
    

## Now filter out the long-tail categories

In [3]:
relationships, selected_obj, selected_pred = filter_categories(imported_data.relationships)
freq_obj, freq_pred = object_predicate_categories_count(relationships)

Object classes: 48597
Predicate classes: 28987

95999 images left(8691/104690 removed)
Object classes: 150
Predicate classes: 50



In [26]:
def count_training_weight(relationships):
    import numpy as np
    freq_obj, freq_pred = object_predicate_categories_count(relationships)
    total_counter_obj = 0
    total_counter_pred = 0
    inverse_weight_pred = {}
    inverse_weight_obj = {}
    print '========= counting result ========'
    print '------------ object --------------'
    for c in freq_obj:
        print '[{}]: {}'.format(c, freq_obj[c])
        total_counter_obj += freq_obj[c]
    print '----------- predicate ------------'
    for c in freq_pred:
        print '[{}]: {}'.format(c, freq_pred[c])
        total_counter_pred+= freq_pred[c]
    print '========= counting result ========'
    print '------------ object --------------'
    for c in freq_obj:
        print '[{}]: {}'.format(c, np.sqrt(total_counter_obj / freq_obj[c] / 150.))
        inverse_weight_obj[c] = np.sqrt(total_counter_obj / freq_obj[c] / 150.)
    print '----------- predicate ------------'
    for c in freq_pred:
        print '[{}]: {}'.format(c, np.sqrt(total_counter_pred / freq_pred[c]  / 50.))
        inverse_weight_pred[c] = np.sqrt(total_counter_pred / freq_pred[c] / 50.)
    
    return inverse_weight_obj, inverse_weight_pred
    
def output_inverse_weight(filename, inverse_weight_obj, inverse_weight_pred):
    import json
    inverse_weight = {'object': inverse_weight_obj, 'predicate': inverse_weight_pred}
    with open(filename, 'w') as fp:
        json.dump(inverse_weight, fp)

## Partition dataset

1. subsample 25000 images for testing;
2. remaining 70999 images for training;
3. Different training/testing settings
    - original dataset: 70999 images for training and 25000 images for testing
    - Small dataset: 15000 images for training and 5000 for testing
    - Fat Dataset： 15000 images with 15+ instances for training and 5000 for testing

In [27]:
import random
im_list_full = relationships.keys()
random.shuffle(im_list_full)
training_list = im_list_full[:-25000]
testing_list = im_list_full[-25000:]
training_set = {x:relationships[x] for x in training_list}
testing_set = {x:relationships[x] for x in testing_list}
training_small_list = training_list[:15000]
testing_small_list = testing_list[:5000]
training_fat_list = select_top_N_ims(training_set, 15000)

## Loading region description dataset

In [28]:
import json
region_data = json.load(open('../region_descriptions_v2.json'))

In [29]:
def object_matching(objects, rel):
    obj_item = {}
    obj_item['class'] = rel['object']
    obj_item['box'] = rel['obj_box']
    sub_item = {}
    sub_item['class'] = rel['subject']
    sub_item['box'] = rel['sub_box']
    
    obj_id = -1
    sub_id = -1
    
    for idx, obj in enumerate(objects):
        if obj_item == obj:
            obj_id = idx
        if sub_item == obj:
            sub_id = idx
    if sub_id < 0 or obj_id < 0:
        raise Exception('Object Matching Error!')
        
    return sub_id, obj_id

def check_is_included(objects, obj_item):
    for ex_obj in objects:
        if ex_obj == obj_item:
            return True
    return False

def dataset_setup(id_list, relationships, raw_region_data):
    import copy
    import string
    replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
    dataset = []
    ignore_counter_out = 0
    ignore_counter_small = 0
    ignore_counter_short = 0
    ignore_counter_object_small = 0
    region_counter = 0
    region_min_num = 1000
    rel_counter = 0
    for im_id in id_list:
        im_item = {}
        im_item['id'] = im_id
        im_item['path'] = relationships[im_id]['path']
        im_item['height'] = relationships[im_id]['height']
        im_item['width'] = relationships[im_id]['width']
        regions = []
        objects = []
        relationships_new = []
        for region in raw_region_data[im_id]['regions']:
            region_item = {}
            # print region['phrase']
            region_item['phrase'] = region['phrase'].encode('ascii', 'ignore').lower().translate(replace_punctuation)
            # index starts from 
            region_item['box'] = (region['x'] , region['y'] , \
                                  region['x'] + region['width'] - 1, \
                                  region['y'] + region['height'] - 1)
            if region_item['box'][0] < 0 or region_item['box'][1] < 0 or region_item['box'][2] >= im_item['width'] or region_item['box'][3] >= im_item['height']:
#                 print 'ignore regions: (%d %d %d %d) in image(%d %d)' % \
#                         (region_item['box'][0], region_item['box'][1], region_item['box'][2], region_item['box'][3], im_item['width'], im_item['height'])
                ignore_counter_out += 1
            elif region_item['box'][3] - region_item['box'][1] < 64 or region_item['box'][2] - region_item['box'][0] < 64:
                ignore_counter_small += 1
#                 print 'Region too small: (%d %d %d %d)' % \
#                     (region_item['box'][0], region_item['box'][1], region_item['box'][2], region_item['box'][3])
            elif len(region_item['phrase'].split()) < 3:
                ignore_counter_short += 1
            else:
                region_counter+=1
                regions.append(region_item)
        if len(regions) < region_min_num:
            region_min_num = len(regions)
        for rel in relationships[im_id]['relationships']:
            obj_item = {}
            obj_item['class'] = rel['object']
            obj_item['box'] = rel['obj_box']
            if obj_item['box'][0] < 0 or obj_item['box'][1] < 0 or obj_item['box'][2] >= im_item['width'] or obj_item['box'][3] >= im_item['height']:
#                 print 'ignore relationship: (%d %d %d %d) in image(%d %d)' % \
#                         (obj_item['box'][0], obj_item['box'][1], obj_item['box'][2], obj_item['box'][3], im_item['width'], im_item['height'])
                continue
            elif obj_item['box'][3] - obj_item['box'][1] < 32 or obj_item['box'][2] - obj_item['box'][0] < 32:
                ignore_counter_object_small += 1
                continue 
#             obj_item['box'][2] = min(obj_item['box'][2], im_item['width'] - 1)
#             obj_item['box'][3] = min(obj_item['box'][3], im_item['height'] - 1)
            sub_item = {}
            sub_item['class'] = rel['subject']
            sub_item['box'] = rel['sub_box']
            if sub_item['box'][0] < 0 or sub_item['box'][1] < 0 or sub_item['box'][2] >= im_item['width'] or sub_item['box'][3] >= im_item['height']:
#                 print 'ignore relationship: (%d %d %d %d) in image(%d %d)'% \
#                         (sub_item['box'][0], sub_item['box'][1], sub_item['box'][2], sub_item['box'][3], im_item['width'], im_item['height'])
                continue
            elif sub_item['box'][3] - sub_item['box'][1] < 16 or sub_item['box'][2] - sub_item['box'][0] < 16:
                ignore_counter_object_small += 1
                continue 
            
#             sub_item['box'][2] = min(sub_item['box'][2], im_item['width'] - 1)
#             sub_item['box'][3] = min(sub_item['box'][3], im_item['height'] - 1)
            rel_item = {}
            if not check_is_included(objects, obj_item):
                objects.append(obj_item)
            if not check_is_included(objects, sub_item):
                objects.append(sub_item)    
            rel_item['sub_id'], rel_item['obj_id'] = object_matching(objects, rel)
            rel_item['predicate'] = rel['predicate']
            rel_counter+=1
            relationships_new.append(rel_item)
        if len(relationships_new) > 0 and len(regions) > 0:
            im_item['regions'] = regions
            im_item['objects'] = objects
            im_item['relationships'] = relationships_new
            dataset.append(im_item)
        
    print 'Ignore %d (small) and %d (out) and %d (short) regions... Remaining: %d regions (%.2f/im, min: %d)' % \
        (ignore_counter_small, ignore_counter_out, ignore_counter_short, region_counter, region_counter / len(id_list), region_min_num)
    print 'Ignore %d (small) relationships... Remaining: %d relationships (%.2f/im)' % \
        (ignore_counter_object_small, rel_counter, rel_counter / len(id_list))
    print 'Total: %d images (%d filtered)' % (len(dataset), len(id_list) - len(dataset))
    return dataset

def output_dataset(dataset, filename):
    import json
    with open(filename, 'w') as fp:
        json.dump(dataset, fp)
    print "Done output dataset: "+filename
    
def output_category_list(selected_obj, selected_pred, filename):
    categories = {'object': list(selected_obj), 'predicate': list(selected_pred)}
    
    import json
    with open(filename, 'w') as fp:
        json.dump(categories, fp)
    print "Done output category list: " + filename

In [30]:
training_small_set = dataset_setup(training_small_list, relationships, region_data)
testing_small_set = dataset_setup(testing_small_list, relationships, region_data)
training_set = dataset_setup(training_list, relationships, region_data)
testing_set = dataset_setup(testing_list, relationships, region_data)
training_fat_set = dataset_setup(training_fat_list, relationships, region_data)

Ignore 396496 (small) and 749 (out) and 4938 (short) regions... Remaining: 350336 regions (23.00/im, min: 0)
Ignore 36935 (small) relationships... Remaining: 101855 relationships (6.00/im)
Total: 14458 images (542 filtered)
Ignore 132273 (small) and 285 (out) and 1627 (short) regions... Remaining: 116290 regions (23.00/im, min: 1)
Ignore 12199 (small) relationships... Remaining: 33737 relationships (6.00/im)
Total: 4826 images (174 filtered)
Ignore 1876611 (small) and 3562 (out) and 22831 (short) regions... Remaining: 1648558 regions (23.00/im, min: 0)
Ignore 174199 (small) relationships... Remaining: 479861 relationships (6.00/im)
Total: 68446 images (2553 filtered)
Ignore 659387 (small) and 1222 (out) and 8138 (short) regions... Remaining: 581629 regions (23.00/im, min: 0)
Ignore 61631 (small) relationships... Remaining: 168375 relationships (6.00/im)
Total: 24126 images (874 filtered)
Ignore 415995 (small) and 647 (out) and 4339 (short) regions... Remaining: 335874 regions (22.00/im

In [32]:
output_dir = 'output/top_150_50_big/'
output_dataset(training_set, output_dir+'train.json')
output_dataset(testing_set, output_dir+'test.json')
output_dataset(training_small_set, output_dir+'train_small.json')
output_dataset(testing_small_set, output_dir+'test_small.json')
output_dataset(training_fat_set, output_dir+'train_fat.json')
output_category_list(selected_obj, selected_pred, output_dir + 'categories.json')
inverse_weight_obj, inverse_weight_pred = count_training_weight(relationships)
output_inverse_weight(output_dir + 'inverse_weight.json', inverse_weight_obj, inverse_weight_pred)

Done output dataset: output/top_150_50_big/train.json
Done output dataset: output/top_150_50_big/test.json
Done output dataset: output/top_150_50_big/train_small.json
Done output dataset: output/top_150_50_big/test_small.json
Done output dataset: output/top_150_50_big/train_fat.json
Done output category list: output/top_150_50_big/categories.json
Object classes: 150
Predicate classes: 50

------------ object --------------
[bush]: 4869
[kite]: 5735
[pant]: 11073
[laptop]: 4005
[paper]: 3494
[shoe]: 10701
[chair]: 8975
[ground]: 24742
[tire]: 5553
[cup]: 3511
[sky]: 33961
[bench]: 9558
[tail]: 11370
[bike]: 7198
[board]: 3931
[hat]: 13916
[skier]: 3803
[plate]: 15009
[woman]: 46337
[handle]: 4007
[food]: 6413
[bear]: 10177
[wave]: 5988
[giraffe]: 20126
[background]: 7082
[desk]: 3735
[foot]: 7594
[shadow]: 13425
[lady]: 3088
[button]: 2240
[glove]: 4576
[bag]: 5955
[sand]: 5744
[nose]: 10954
[rock]: 8633
[sidewalk]: 10652
[motorcycle]: 8378
[fence]: 9813
[people]: 10349
[house]: 5786
[s

## Buildup dictionary for sentences

In [33]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

def buildup_dictionary(trainset, testset, filename):
    import json
    dictionary = Dictionary()
    print('First, training set...')
    for im in trainset:
        for region in im['regions']:
            words = region['phrase'].split() + ['<eos>']
            for word in words:
                    dictionary.add_word(word)
    print('Then, testing set...')
    for im in testset:
        for region in im['regions']:
            words = region['phrase'].split() + ['<eos>']
            for word in words:
                    dictionary.add_word(word)
                    
    print 'Dictionary has %d words' % len(dictionary.idx2word)
    dict_ = {'word2idx': dictionary.word2idx, 'idx2word': dictionary.idx2word}
    with open(filename, 'w') as fp:
        json.dump(dict_, fp)
    print "Done output category list: " + filename
    return dict_
        

In [34]:
buildup_dictionary(training_set, testing_set, output_dir+'dict.json')

First, training set...
Then, testing set...
Dictionary has 39393 words
Done output category list: output/top_150_50_big/dict.json


{'idx2word': ['the',
  'man',
  'is',
  'laughing',
  '<eos>',
  'people',
  'are',
  'riding',
  'on',
  'bicycles',
  'buildings',
  'in',
  'background',
  'tall',
  'has',
  'a',
  'brown',
  'jacket',
  'black',
  'and',
  'tan',
  'backpack',
  'person',
  'bicycle',
  'wearing',
  'sunglasses',
  'hat',
  'helmet',
  'woman',
  'bag',
  'red',
  'maroon',
  'orange',
  'pant',
  'sits',
  'bike',
  'blue',
  'rides',
  'holds',
  'drink',
  'his',
  'hand',
  'sky',
  'scraper',
  'hats',
  'bench',
  'colorful',
  'dark',
  'photo',
  'light',
  'ground',
  'rim',
  'of',
  'green',
  'pruple',
  'next',
  'to',
  'one',
  'top',
  'rainbow',
  'under',
  'ranbow',
  'colored',
  'two',
  'six',
  'many',
  'color',
  'kept',
  'sitting',
  'made',
  'with',
  'steel',
  'near',
  'coated',
  'arranged',
  'same',
  'row',
  'table',
  'corner',
  'there',
  '6',
  'puffy',
  'clouds',
  'white',
  'plane',
  'tail',
  'grey',
  'engine',
  'silver',
  'lights',
  'pole',
  'wh

In [37]:
import json
import numpy as np
import torch
data = json.load(open(output_dir+'inverse_weight.json'))
weight = torch.zeros(151)
for idx, k in enumerate(data['object'].keys()):
    weight[idx + 1] = data['object'][k]


In [40]:
print data['object'].keys()

[u'kite', u'pant', u'laptop', u'paper', u'shoe', u'chair', u'ground', u'tire', u'cup', u'sky', u'bench', u'tail', u'bike', u'board', u'hat', u'skier', u'plate', u'woman', u'handle', u'food', u'bear', u'wave', u'giraffe', u'background', u'desk', u'foot', u'shadow', u'lady', u'button', u'glove', u'bag', u'sand', u'nose', u'rock', u'sidewalk', u'motorcycle', u'fence', u'people', u'house', u'sign', u'hair', u'street', u'bed', u'mirror', u'logo', u'girl', u'arm', u'flower', u'leaf', u'clock', u'dirt', u'hill', u'bird', u'umbrella', u'leg', u'reflection', u'water', u'sink', u'trunk', u'post', u'tower', u'box', u'boy', u'cow', u'skateboard', u'pillow', u'road', u'wall', u'number', u'pole', u'table', u'boat', u'sheep', u'horse', u'eye', u'top', u'bush', u'window', u'vehicle', u'brick', u'banana', u'ceiling', u'door', u'shelf', u'glass', u'cloud', u'train', u'child', u'line', u'ear', u'neck', u'ski', u'cap', u'tree', u'roof', u'cat', u'cake', u'grass', u'zebra', u'toilet', u'head', u'bus', u'pl

In [39]:
weight


 0.0000
 1.4329
 1.0296
 1.7146
 1.8367
 1.0488
 1.1460
 0.6880
 1.4560
 1.8312
 0.5888
 1.1075
 1.0165
 1.2780
 1.7301
 0.9201
 1.7588
 0.8832
 0.5033
 1.7146
 1.3540
 1.0739
 1.4024
 0.7616
 1.2884
 1.7758
 1.2437
 0.9345
 1.9528
 2.2935
 1.6042
 1.4048
 1.4306
 1.0360
 1.1662
 1.0488
 1.1860
 1.0954
 1.0646
 1.4259
 0.7165
 0.8524
 0.9201
 1.2517
 1.3013
 1.6021
 0.9381
 1.2675
 1.1015
 0.8206
 0.9557
 1.5853
 1.6166
 1.0296
 1.2356
 0.7257
 1.5362
 0.6782
 2.1323
 1.5055
 1.6693
 1.5663
 1.7645
 0.8206
 1.0646
 1.0863
 1.3952
 0.8832
 0.6733
 1.3441
 0.8406
 0.6782
 1.0392
 1.2000
 0.9165
 0.8641
 1.7889
 1.5556
 0.4472
 1.8385
 1.7301
 1.7378
 1.7720
 1.0985
 1.4674
 0.9055
 0.6831
 0.6782
 1.4922
 1.1747
 0.8406
 1.7907
 1.5853
 1.5513
 0.5416
 1.6492
 0.8794
 1.9630
 0.7118
 0.8406
 1.9967
 0.6928
 0.8042
 1.4353
 1.3663
 1.2220
 0.8124
 1.2193
 0.5715
 1.2166
 1.5144
 1.7795
 0.8718
 1.7758
 0.8794
 1.9131
 1.6000
 1.5078
 0.8406
 1.3687
 1.0832
 1.5556
 0.9487
 1.3880
 1.6104