# Visual Genome Dataset Cleansing

Loading dataset
-----------

- To read data from the JSON file and remove the annotations with wrong spelling.


In [1]:
import numpy as np
import math
import sys

sys.path.append('./python_code')
import preprocessing_data as imported_data

Loading image data...
image data length: 108077
Loading Relationship data...
relationship data length: 108077
Loading caption dataset...
caption dataset length: 108077
Unspecified Error
3780
53
(3780, 53): [chair]-[]-[outside]

Error: [chair]-[]-[outside]

5000 images processed, 56608 relationships
Unspecified Error
6768
22
(6768, 22): [sky]-[]-[above]

Error: [sky]-[]-[above]

Unspecified Error
9782
4
(9782, 4): [man]-[]-[sleeping]

Error: [man]-[]-[sleeping]

Unspecified Error
9834
27
(9834, 27): [man]-[]-[bending over]

Error: [man]-[]-[bending over]

10000 images processed, 86124 relationships
Unspecified Error
14024
53
(14024, 53): [ear]-[]-[side of head]

Error: [ear]-[]-[side of head]

Unspecified Error
14024
54
(14024, 54): [ear]-[]-[side of head]

Error: [ear]-[]-[side of head]

15000 images processed, 203671 relationships
20000 images processed, 309562 relationships
25000 images processed, 399991 relationships
Unspecified Error
25565
36
(25565, 36): [the]-[]-[sign]

Error: [t

# Filtering the Top-50 predicate and Top-150 object categories

In [2]:
def get_object_predicate_list(relationships):
    object_list = set()
    predicate_list = set()
    for im_id in relationships:
        for r_id, relationship_item in enumerate(relationships[im_id]['relationships']):
            object_list.add(relationship_item['subject'])
            object_list.add(relationship_item['object'])
            predicate_list.add(relationship_item['predicate'])
            
    print "Object classes: %d\nPredicate classes: %d\n" % (len(object_list), len(predicate_list))
    return object_list, predicate_list
    
def object_predicate_categories_count(relationships):
    object_list, predicate_list = get_object_predicate_list(relationships)
    freq_obj = {x:0 for x in object_list}
    freq_pred = {x:0 for x in predicate_list}
    for im_id in relationships:
        for r_id, relationship_item in enumerate(relationships[im_id]['relationships']):
            temp_sub = relationship_item['subject']
            temp_obj = relationship_item['object']
            temp_predicate = relationship_item['predicate']
            freq_obj[temp_sub] += 1    
            freq_obj[temp_obj] += 1
            freq_pred[temp_predicate] += 1    
    return freq_obj, freq_pred

def filter_categories(original_relationships, top_n_obj=150, top_n_pred=50):
    import copy
    original_im_num = len(original_relationships.keys())
    relationships = copy.deepcopy(original_relationships)
    freq_obj, freq_pred = object_predicate_categories_count(relationships)
    import operator
    sorted_obj = sorted(freq_obj.items(), key=operator.itemgetter(1), reverse=True)
    sorted_pred = sorted(freq_pred.items(), key=operator.itemgetter(1), reverse=True)
    selected_obj = set(x[0] for x in sorted_obj[:top_n_obj])
    selected_pred = set(x[0] for x in sorted_pred[:top_n_pred])
    for im_id in relationships.keys():
        for relationship_item in list(relationships[im_id]['relationships']):
            temp_sub = relationship_item['subject']
            temp_obj = relationship_item['object']
            temp_predicate = relationship_item['predicate']
            if temp_sub not in selected_obj or temp_obj not in selected_obj or temp_predicate not in selected_pred:
                relationships[im_id]['relationships'].remove(relationship_item)
        if len(relationships[im_id]['relationships']) < 5:
            relationships.pop(im_id)
    print "%d images left(%d/%d removed)" % \
        (len(relationships.keys()), original_im_num - len(relationships.keys()), original_im_num)
         
    return relationships, selected_obj, selected_pred

def count_instance_num(relationships):
    instance_counter = 0
    for im_id in relationships.keys():
        instance_counter+= len(relationships[im_id]['relationships'])  
    return instance_counter

def im_with_N_more_instances(relationships, N = 15):
    id_list = []
    for im_id in relationships.keys():
        if len(relationships[im_id]['relationships']) >= N:
            id_list.append(im_id)
    print "%d images with more than %d instances" % (len(id_list), N)
    return id_list

def select_top_N_ims(relationships, N):
    import operator
    instance_num = [(x, len(relationships[x]['relationships'])) for x in relationships]
    id_list = sorted(instance_num, key=operator.itemgetter(1), reverse=True)
    assert N<=len(id_list), "Select %d images from %d" % (N, len(id_list))
    return [x[0] for x in id_list[:N]]
    

## Now filter out the long-tail categories

In [4]:
relationships, selected_obj, selected_pred = filter_categories(imported_data.data)
freq_obj, freq_pred = object_predicate_categories_count(relationships)

Object classes: 41984
Predicate classes: 24126

56164 images left(30015/86179 removed)
Object classes: 150
Predicate classes: 50



In [5]:
def count_training_weight(relationships):
    import numpy as np
    freq_obj, freq_pred = object_predicate_categories_count(relationships)
    total_counter_obj = 0
    total_counter_pred = 0
    inverse_weight_pred = {}
    inverse_weight_obj = {}
    print '========= counting result ========'
    print '------------ object --------------'
    for c in freq_obj:
        print '[{}]: {}'.format(c, freq_obj[c])
        total_counter_obj += freq_obj[c]
    print '----------- predicate ------------'
    for c in freq_pred:
        print '[{}]: {}'.format(c, freq_pred[c])
        total_counter_pred+= freq_pred[c]
    print '========= counting result ========'
    print '------------ object --------------'
    for c in freq_obj:
        print '[{}]: {}'.format(c, np.sqrt(total_counter_obj / freq_obj[c] / 150.))
        inverse_weight_obj[c] = np.sqrt(total_counter_obj / freq_obj[c] / 150.)
    print '----------- predicate ------------'
    for c in freq_pred:
        print '[{}]: {}'.format(c, np.sqrt(total_counter_pred / freq_pred[c]  / 50.))
        inverse_weight_pred[c] = np.sqrt(total_counter_pred / freq_pred[c] / 50.)
    
    return inverse_weight_obj, inverse_weight_pred
    
def output_inverse_weight(filename, inverse_weight_obj, inverse_weight_pred):
    import json
    inverse_weight = {'object': inverse_weight_obj, 'predicate': inverse_weight_pred}
    with open(filename, 'w') as fp:
        json.dump(inverse_weight, fp)

## Partition dataset

1. subsample 25000 images for testing;
2. remaining 70999 images for training;
3. Different training/testing settings
    - original dataset: 70999 images for training and 25000 images for testing
    - Small dataset: 15000 images for training and 5000 for testing
    - Fat Dataset： 15000 images with 15+ instances for training and 5000 for testing

In [9]:
import random
im_list_full = relationships.keys()
random.shuffle(im_list_full)
training_list = im_list_full[:-10000]
testing_list = im_list_full[-10000:]
training_set = {x:relationships[x] for x in training_list}
testing_set = {x:relationships[x] for x in testing_list}
training_small_list = training_list[:15000]
testing_small_list = testing_list[:5000]
training_fat_list = select_top_N_ims(training_set, 15000)

In [16]:
relationships[2282]

{'height': 600,
 'image_id': 2283,
 'path': '2283.jpg',
 'regions': [{'box': (322, 237, 393, 321),
   'phrase': ['tail', 'of', 'the', 'squirrel']},
  {'box': (620, 2, 721, 215),
   'phrase': ['a', 'black', 'post', 'on', 'the', 'sidewalk']},
  {'box': (6, 4, 102, 365), 'phrase': ['trunk', 'of', 'a', 'tree']},
  {'box': (50, 10, 322, 77),
   'phrase': ['wooden', 'benches', 'in', 'a', 'park']},
  {'box': (368, 8, 440, 167),
   'phrase': ['a', 'black', 'post', 'to', 'the', 'fence']},
  {'box': (620, 133, 705, 220), 'phrase': ['base', 'of', 'a', 'post']},
  {'box': (104, 372, 533, 545), 'phrase': ['grass', 'that', 'is', 'lush']},
  {'box': (0, 0, 73, 320), 'phrase': ['bark', 'on', 'tree', 'trunk']},
  {'box': (5, 178, 793, 596),
   'phrase': ['green', 'grass', 'with', 'bare', 'patches']},
  {'box': (623, 2, 707, 198), 'phrase': ['base', 'of', 'lamp', 'post']},
  {'box': (42, 3, 535, 71), 'phrase': ['row', 'of', 'park', 'benches']},
  {'box': (337, 3, 425, 87),
   'phrase': ['legs', 'of', 't

## Loading region description dataset

In [18]:
def object_matching(objects, rel):
    obj_item = {}
    obj_item['class'] = rel['object']
    obj_item['box'] = rel['obj_box']
    sub_item = {}
    sub_item['class'] = rel['subject']
    sub_item['box'] = rel['sub_box']
    
    obj_id = -1
    sub_id = -1
    
    for idx, obj in enumerate(objects):
        if obj_item == obj:
            obj_id = idx
        if sub_item == obj:
            sub_id = idx
    if sub_id < 0 or obj_id < 0:
        raise Exception('Object Matching Error!')
        
    return sub_id, obj_id

def check_is_included(objects, obj_item):
    for ex_obj in objects:
        if ex_obj == obj_item:
            return True
    return False

def dataset_setup(id_list, relationships):
    import copy
    import string
    import nltk
    replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
    dataset = []
    ignore_counter_out = 0
    ignore_counter_small = 0
    ignore_counter_short = 0
    ignore_counter_object_small = 0
    region_counter = 0
    region_min_num = 1000
    rel_counter = 0
    for im_id in id_list:
        im_item = {}
        im_item['id'] = im_id
        im_item['path'] = relationships[im_id]['path']
        im_item['height'] = relationships[im_id]['height']
        im_item['width'] = relationships[im_id]['width']
        regions = relationships[im_id]['regions']
        objects = []
        relationships_new = []
        if len(regions) < region_min_num:
            region_min_num = len(regions)
        for rel in relationships[im_id]['relationships']:
            obj_item = {}
            obj_item['class'] = rel['object']
            obj_item['box'] = rel['obj_box']
            sub_item = {}
            sub_item['class'] = rel['subject']
            sub_item['box'] = rel['sub_box']
            rel_item = {}
            if not check_is_included(objects, obj_item):
                objects.append(obj_item)
            if not check_is_included(objects, sub_item):
                objects.append(sub_item)    
            rel_item['sub_id'], rel_item['obj_id'] = object_matching(objects, rel)
            rel_item['predicate'] = rel['predicate']
            rel_counter+=1
            relationships_new.append(rel_item)
        if len(relationships_new) >= 5 and len(regions) >= 20 and len(regions) <= 50:
            im_item['regions'] = regions
            im_item['objects'] = objects
            im_item['relationships'] = relationships_new
            dataset.append(im_item)
        else:
            print 'Image ignored: relationship: {}, regions: {}'.format(
                len(relationships_new), len(regions))
            
    print 'Total: %d images (%d filtered)' % (len(dataset), len(id_list) - len(dataset))
    return dataset

def output_dataset(dataset, filename):
    import json
    with open(filename, 'w') as fp:
        json.dump(dataset, fp)
    print "Done output dataset: "+filename
    
def output_category_list(selected_obj, selected_pred, filename):
    categories = {'object': list(selected_obj), 'predicate': list(selected_pred)}
    
    import json
    with open(filename, 'w') as fp:
        json.dump(categories, fp)
    print "Done output category list: " + filename

In [19]:
training_small_set = dataset_setup(training_small_list, relationships)
testing_small_set = dataset_setup(testing_small_list, relationships)
training_set = dataset_setup(training_list, relationships)
testing_set = dataset_setup(testing_list, relationships)
training_fat_set = dataset_setup(training_fat_list, relationships)

Total: 15000 images (0 filtered)
Total: 5000 images (0 filtered)
Total: 46164 images (0 filtered)
Total: 10000 images (0 filtered)
Total: 15000 images (0 filtered)


In [20]:
output_dir = 'output/top_150_50_full/'
output_dataset(training_set, output_dir+'train.json')
output_dataset(testing_set, output_dir+'test.json')
output_dataset(training_small_set, output_dir+'train_small.json')
output_dataset(testing_small_set, output_dir+'test_small.json')
output_dataset(training_fat_set, output_dir+'train_fat.json')
output_category_list(selected_obj, selected_pred, output_dir + 'categories.json')
inverse_weight_obj, inverse_weight_pred = count_training_weight(relationships)
output_inverse_weight(output_dir + 'inverse_weight.json', inverse_weight_obj, inverse_weight_pred)

Done output dataset: output/top_150_50_full/train.json
Done output dataset: output/top_150_50_full/test.json
Done output dataset: output/top_150_50_full/train_small.json
Done output dataset: output/top_150_50_full/test_small.json
Done output dataset: output/top_150_50_full/train_fat.json
Done output category list: output/top_150_50_full/categories.json
Object classes: 150
Predicate classes: 50

------------ object --------------
[bush]: 3591
[kite]: 3890
[pant]: 8796
[laptop]: 3005
[paper]: 2419
[motorcycle]: 6235
[chair]: 6499
[ground]: 17819
[tire]: 3919
[cup]: 2271
[sky]: 26605
[bench]: 7653
[tail]: 8354
[bike]: 5691
[board]: 2840
[orange]: 2099
[hat]: 8831
[skier]: 2870
[plate]: 11661
[woman]: 35882
[handle]: 2310
[branch]: 4005
[food]: 4259
[bear]: 7098
[vase]: 4696
[giraffe]: 13238
[background]: 5416
[desk]: 2352
[foot]: 5109
[shadow]: 10168
[lady]: 2389
[glove]: 3199
[bag]: 4241
[sand]: 4360
[nose]: 6306
[rock]: 5845
[tower]: 3300
[shoe]: 7574
[fence]: 7686
[people]: 7761
[house

## Buildup dictionary for sentences

In [49]:
# class Dictionary(object):
#     def __init__(self):
#         self.word2idx = {}
#         self.idx2word = []
#         self.word_freq = {}

#     def add_word(self, word):
#         if word not in self.word2idx:
#             self.idx2word.append(word)
#             self.word2idx[word] = len(self.idx2word) - 1
#             self.word_freq[word] = len(s)
#         return self.word2idx[word]

#     def __len__(self):
#         return len(self.idx2word)

# def buildup_dictionary(trainset, testset, filename):
#     import json
#     dictionary = Dictionary()
#     print('First, training set...')
#     for im in trainset:
#         for region in im['regions']:
#             words = region['phrase'].split() + ['<eos>']
#             for word in words:
#                     dictionary.add_word(word)
#     print('Then, testing set...')
#     for im in testset:
#         for region in im['regions']:
#             words = region['phrase'].split() + ['<eos>']
#             for word in words:
#                     dictionary.add_word(word)
                    
#     print 'Dictionary has %d words' % len(dictionary.idx2word)
#     dict_ = {'word2idx': dictionary.word2idx, 'idx2word': dictionary.idx2word}
#     with open(filename, 'w') as fp:
#         json.dump(dict_, fp)
#     print "Done output category list: " + filename
#     return dict_


def buildup_dictionary(trainset, testset, filename, freq_thres=10):
    import json
    import nltk
    import itertools
    
    descriptions = []
    for im in trainset:
        for region in im['regions']:
            descriptions.append(region['phrase'])
    for im in testset:
        for region in im['regions']:
            descriptions.append(region['phrase'])
    print 'descriptions.len', len(descriptions)
    word_freq = nltk.FreqDist(itertools.chain(*descriptions))
    vocab = [word for word in word_freq.keys() if word_freq[word] > freq_thres]
    index_to_word = vocab
    word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
    print 'Vocabulary size: {}'.format(len(vocab))
    dict_ = {'word2idx': word_to_index, 'idx2word': index_to_word}
    with open(filename, 'w') as fp:
        json.dump(dict_, fp)
    print "Done output category list: " + filename
    return dict_, word_freq
        

In [50]:
dictionary, freq = buildup_dictionary(training_set, testing_set, output_dir+'dict.json', freq_thres=10)

descriptions.len 2001747
Vocabulary size: 8380
Done output category list: output/top_150_50_full/dict.json


In [51]:
import json
import numpy as np
import torch
data = json.load(open(output_dir+'inverse_weight.json'))
weight = torch.zeros(151)
for idx, k in enumerate(data['object'].keys()):
    weight[idx + 1] = data['object'][k]


In [52]:
print data['object'].keys()

[u'kite', u'pant', u'laptop', u'paper', u'motorcycle', u'chair', u'ground', u'tire', u'cup', u'sky', u'bench', u'tail', u'bike', u'board', u'orange', u'hat', u'skier', u'plate', u'woman', u'handle', u'branch', u'food', u'bear', u'vase', u'giraffe', u'background', u'desk', u'foot', u'shadow', u'lady', u'glove', u'bag', u'sand', u'nose', u'rock', u'tower', u'shoe', u'fence', u'people', u'house', u'sign', u'hair', u'street', u'bed', u'mirror', u'racket', u'logo', u'girl', u'arm', u'flower', u'leaf', u'clock', u'dirt', u'hill', u'bird', u'umbrella', u'leg', u'reflection', u'bathroom', u'surfer', u'water', u'sink', u'trunk', u'post', u'sidewalk', u'box', u'boy', u'cow', u'skateboard', u'pillow', u'road', u'wall', u'number', u'pole', u'table', u'boat', u'sheep', u'horse', u'eye', u'top', u'bush', u'window', u'vehicle', u'banana', u'fork', u'head', u'door', u'bus', u'phone', u'cloud', u'train', u'child', u'line', u'ear', u'neck', u'ski', u'tree', u'roof', u'cat', u'donut', u'cake', u'grass', 

In [53]:
weight[0] = 100
weight / min(weight)


 327.3268
   4.7660
   3.1623
   5.4182
   6.0415
   3.7607
   3.6839
   2.2200
   4.7434
   6.2336
   1.8127
   3.3912
   3.2514
   3.9370
   5.5742
   6.4862
   3.1623
   5.5485
   2.7516
   1.5584
   6.1818
   4.6904
   4.5513
   3.5254
   4.3342
   2.5774
   4.0356
   6.1296
   4.1576
   2.9399
   6.0769
   5.2509
   4.5591
   4.4960
   3.7417
   3.8822
   5.1686
   3.4122
   3.3806
   3.3700
   4.7809
   2.4640
   2.6726
   2.9520
   3.9279
   4.1490
   5.5485
   5.8979
   2.8785
   3.9279
   3.7129
   2.9399
   3.2733
   5.0990
   5.1339
   3.3912
   3.8452
   2.2200
   4.8844
   6.2735
   5.4968
   2.1381
   6.5846
   4.5748
   5.9221
   3.5153
   5.4182
   2.5213
   3.3488
   3.6645
   4.4481
   2.8031
   2.2991
   5.6125
   2.9032
   2.1876
   3.5254
   3.9005
   2.8410
   3.6450
   6.0710
   4.9570
   1.6036
   6.4642
   5.8737
   6.2336
   2.2039
   3.7033
   2.6592
   5.5032
   2.0874
   2.3452
   4.6291
   4.0444
   2.7255
   5.5356
   5.0780
   1.7113
   5.6125
   2.8031