# Visual Genome Dataset Cleansing

Loading dataset
-----------

- To read data from the JSON file and remove the annotations with wrong spelling.


In [1]:
import numpy as np
import math
import sys

sys.path.append('./python_code')
import preprocessing_data as imported_data

image data length: 108077
relationship data length: 108077
0 images processed, 41 relationships
Unspecified Error
2112
21
(2112, 21): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
2112
22
(2112, 22): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
2112
23
(2112, 23): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
3780
53
(3780, 53): [chair]-[]-[outside]

Error: [chair]-[]-[outside]

5000 images processed, 91320 relationships
Unspecified Error
6768
22
(6768, 22): [sky]-[]-[above]

Error: [sky]-[]-[above]

Unspecified Error
6839
15
(6839, 15): [is]-[]-[sidewalk]

Error: [is]-[]-[sidewalk]

Unspecified Error
6993
10
(6993, 10): [laptop]-[]-[open]

Error: [laptop]-[]-[open]

Unspecified Error
7023
21
(7023, 21): [box]-[]-[pizza hut]

Error: [box]-[]-[pizza hut]

Unspecified Error
7024
7
(7024, 7): [alcove]-[]-[open]

Error: [alcove]-[]-[open]

Unspecified Error
9015
31
(9015, 31): [girl]-[]-[outside]

Error: [girl]-[]-[outside]

Unspecified Error
9

# Filtering the Top-50 predicate and Top-150 object categories

In [2]:
def get_object_predicate_list(relationships):
    object_list = set()
    predicate_list = set()
    for im_id in relationships:
        for r_id, relationship_item in enumerate(relationships[im_id]['relationships']):
            object_list.add(relationship_item['subject'])
            object_list.add(relationship_item['object'])
            predicate_list.add(relationship_item['predicate'])
            
    print "Object classes: %d\nPredicate classes: %d\n" % (len(object_list), len(predicate_list))
    return object_list, predicate_list
    
def object_predicate_categories_count(relationships):
    object_list, predicate_list = get_object_predicate_list(relationships)
    freq_obj = {x:0 for x in object_list}
    freq_pred = {x:0 for x in predicate_list}
    for im_id in relationships:
        for r_id, relationship_item in enumerate(relationships[im_id]['relationships']):
            temp_sub = relationship_item['subject']
            temp_obj = relationship_item['object']
            temp_predicate = relationship_item['predicate']
            freq_obj[temp_sub] += 1    
            freq_obj[temp_obj] += 1
            freq_pred[temp_predicate] += 1    
    return freq_obj, freq_pred

def filter_categories(original_relationships, top_n_obj=150, top_n_pred=50):
    import copy
    original_im_num = len(original_relationships.keys())
    relationships = copy.deepcopy(original_relationships)
    freq_obj, freq_pred = object_predicate_categories_count(relationships)
    import operator
    sorted_obj = sorted(freq_obj.items(), key=operator.itemgetter(1), reverse=True)
    sorted_pred = sorted(freq_pred.items(), key=operator.itemgetter(1), reverse=True)
    selected_obj = set(x[0] for x in sorted_obj[:top_n_obj])
    selected_pred = set(x[0] for x in sorted_pred[:top_n_pred])
    for im_id in relationships.keys():
        for relationship_item in list(relationships[im_id]['relationships']):
            temp_sub = relationship_item['subject']
            temp_obj = relationship_item['object']
            temp_predicate = relationship_item['predicate']
            if temp_sub not in selected_obj or temp_obj not in selected_obj or temp_predicate not in selected_pred:
                relationships[im_id]['relationships'].remove(relationship_item)
        if len(relationships[im_id]['relationships']) == 0:
            relationships.pop(im_id)
    print "%d images left(%d/%d removed)" % \
        (len(relationships.keys()), original_im_num - len(relationships.keys()), original_im_num)
         
    return relationships, selected_obj, selected_pred

def count_instance_num(relationships):
    instance_counter = 0
    for im_id in relationships.keys():
        instance_counter+= len(relationships[im_id]['relationships'])  
    return instance_counter

def im_with_N_more_instances(relationships, N = 15):
    id_list = []
    for im_id in relationships.keys():
        if len(relationships[im_id]['relationships']) >= N:
            id_list.append(im_id)
    print "%d images with more than %d instances" % (len(id_list), N)
    return id_list

def select_top_N_ims(relationships, N):
    import operator
    instance_num = [(x, len(relationships[x]['relationships'])) for x in relationships]
    id_list = sorted(instance_num, key=operator.itemgetter(1), reverse=True)
    assert N<=len(id_list), "Select %d images from %d" % (N, len(id_list))
    return [x[0] for x in id_list[:N]]
    

## Now filter out the long-tail categories

In [3]:
relationships, selected_obj, selected_pred = filter_categories(imported_data.relationships)

Object classes: 48597
Predicate classes: 28987

95999 images left(8691/104690 removed)


## Partition dataset

1. subsample 25000 images for testing;
2. remaining 70999 images for training;
3. Different training/testing settings
    - original dataset: 70999 images for training and 25000 images for testing
    - Small dataset: 15000 images for training and 5000 for testing
    - Fat Dataset： 15000 images with 15+ instances for training and 5000 for testing

In [4]:
import random
im_list_full = relationships.keys()
random.shuffle(im_list_full)
training_list = im_list_full[:-25000]
testing_list = im_list_full[-25000:]
training_set = {x:relationships[x] for x in training_list}
testing_set = {x:relationships[x] for x in testing_list}
training_small_list = training_list[:15000]
testing_small_list = testing_list[:5000]
training_fat_list = select_top_N_ims(training_set, 15000)

## Loading region description dataset

In [5]:
import json
region_data = json.load(open('../region_descriptions_v2.json'))

In [6]:
def object_matching(objects, rel):
    obj_item = {}
    obj_item['class'] = rel['object']
    obj_item['box'] = rel['obj_box']
    sub_item = {}
    sub_item['class'] = rel['subject']
    sub_item['box'] = rel['sub_box']
    
    obj_id = -1
    sub_id = -1
    
    for idx, obj in enumerate(objects):
        if obj_item == obj:
            obj_id = idx
        if sub_item == obj:
            sub_id = idx
    if sub_id < 0 or obj_id < 0:
        raise Exception('Object Matching Error!')
        
    return sub_id, obj_id

def check_is_included(objects, obj_item):
    for ex_obj in objects:
        if ex_obj == obj_item:
            return True
    return False

def dataset_setup(id_list, relationships, raw_region_data):
    import copy
    import string
    replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
    dataset = []
    for im_id in id_list:
        im_item = {}
        im_item['id'] = im_id
        im_item['path'] = relationships[im_id]['path']
        im_item['height'] = relationships[im_id]['height']
        im_item['width'] = relationships[im_id]['width']
        regions = []
        objects = []
        relationships_new = []
        for region in raw_region_data[im_id]['regions']:
            region_item = {}
            # print region['phrase']
            region_item['phrase'] = region['phrase'].encode('ascii', 'ignore').translate(replace_punctuation)
            region_item['box'] = (region['x'], region['y'], \
                                  region['x'] + region['width'] - 1, \
                                  region['y'] + region['height'] - 1)
            regions.append(region_item)
        for rel in relationships[im_id]['relationships']:
            obj_item = {}
            obj_item['class'] = rel['object']
            obj_item['box'] = rel['obj_box']
            if not check_is_included(objects, obj_item):
                objects.append(obj_item)
            sub_item = {}
            sub_item['class'] = rel['subject']
            sub_item['box'] = rel['sub_box']
            rel_item = {}
            if not check_is_included(objects, sub_item):
                objects.append(sub_item)    
            rel_item['sub_id'], rel_item['obj_id'] = object_matching(objects, rel)
            rel_item['predicate'] = rel['predicate']
            relationships_new.append(rel_item)
            
        im_item['regions'] = regions
        im_item['objects'] = objects
        im_item['relationships'] = relationships_new
        dataset.append(im_item)
    return dataset

def output_dataset(dataset, filename):
    import json
    with open(filename, 'w') as fp:
        json.dump(dataset, fp)
    print "Done output dataset: "+filename
    
def output_category_list(selected_obj, selected_pred, filename):
    categories = {'object': list(selected_obj), 'predicate': list(selected_pred)}
    
    import json
    with open(filename, 'w') as fp:
        json.dump(categories, fp)
    print "Done output category list: " + filename

In [7]:
training_small_set = dataset_setup(training_small_list, relationships, region_data)
testing_small_set = dataset_setup(testing_small_list, relationships, region_data)
training_set = dataset_setup(training_list, relationships, region_data)
testing_set = dataset_setup(testing_list, relationships, region_data)
training_fat_set = dataset_setup(training_fat_list, relationships, region_data)

In [15]:
training_small_set[0]['relationships']

[{'obj_id': 0, 'predicate': 'on', 'sub_id': 1},
 {'obj_id': 2, 'predicate': 'on', 'sub_id': 3},
 {'obj_id': 4, 'predicate': u'cover', 'sub_id': 5},
 {'obj_id': 6, 'predicate': 'on', 'sub_id': 7},
 {'obj_id': 8, 'predicate': 'on', 'sub_id': 5},
 {'obj_id': 6, 'predicate': 'on', 'sub_id': 9}]

In [16]:
training_small_set[0]['objects']

[{'box': (74, 32, 94, 82), 'class': 'hand'},
 {'box': (40, 24, 89, 96), 'class': u'glove'},
 {'box': (100, 51, 129, 72), 'class': 'head'},
 {'box': (96, 46, 123, 71), 'class': 'helmet'},
 {'box': (10, 261, 489, 330), 'class': 'ground'},
 {'box': (1, 277, 497, 331), 'class': 'snow'},
 {'box': (0, 268, 486, 329), 'class': 'ground'},
 {'box': (261, 265, 382, 321), 'class': 'snow'},
 {'box': (0, 277, 497, 330), 'class': 'ground'},
 {'box': (6, 280, 497, 329), 'class': 'snow'}]

In [18]:
relationships[training_small_list[0]]['relationships']

[{'obj_box': (74, 32, 94, 82),
  'object': 'hand',
  'predicate': 'on',
  'sub_box': (40, 24, 89, 96),
  'subject': u'glove'},
 {'obj_box': (100, 51, 129, 72),
  'object': 'head',
  'predicate': 'on',
  'sub_box': (96, 46, 123, 71),
  'subject': 'helmet'},
 {'obj_box': (10, 261, 489, 330),
  'object': 'ground',
  'predicate': u'cover',
  'sub_box': (1, 277, 497, 331),
  'subject': 'snow'},
 {'obj_box': (0, 268, 486, 329),
  'object': 'ground',
  'predicate': 'on',
  'sub_box': (261, 265, 382, 321),
  'subject': 'snow'},
 {'obj_box': (0, 277, 497, 330),
  'object': 'ground',
  'predicate': 'on',
  'sub_box': (1, 277, 497, 331),
  'subject': 'snow'},
 {'obj_box': (0, 268, 486, 329),
  'object': 'ground',
  'predicate': 'on',
  'sub_box': (6, 280, 497, 329),
  'subject': 'snow'}]

In [9]:
output_dir = 'output/top_150_50/'
output_dataset(training_small_set, output_dir+'train_small.json')
output_dataset(testing_small_set, output_dir+'test_small.json')
output_dataset(training_set, output_dir+'train.json')
output_dataset(testing_set, output_dir+'test.json')
output_dataset(training_fat_set, output_dir+'train_fat.json')
output_category_list(selected_obj, selected_pred, output_dir + 'categories.json')

Done output dataset: output/top_150_50/train_small.json
Done output dataset: output/top_150_50/test_small.json
Done output dataset: output/top_150_50/train.json
Done output dataset: output/top_150_50/test.json
Done output dataset: output/top_150_50/train_fat.json
Done output category list: output/top_150_50/categories.json


## Buildup dictionary for sentences

In [10]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

def buildup_dictionary(trainset, testset, filename):
    import json
    dictionary = Dictionary()
    print('First, training set...')
    for im in trainset:
        for region in im['regions']:
            words = region['phrase'].split() + ['<eos>']
            for word in words:
                    dictionary.add_word(word)
    print('Then, testing set...')
    for im in testset:
        for region in im['regions']:
            words = region['phrase'].split() + ['<eos>']
            for word in words:
                    dictionary.add_word(word)
                    
    print 'Dictionary has %d words' % len(dictionary.idx2word)
    dict_ = {'word2idx': dictionary.word2idx, 'idx2word': dictionary.idx2word}
    with open(filename, 'w') as fp:
        json.dump(dict_, fp)
    print "Done output category list: " + filename
    return dict_
        

In [11]:
buildup_dictionary(training_set, testing_set, output_dir+'dict.json')

First, training set...
Then, testing set...
Dictionary has 78300 words
Done output category list: output/top_150_50/dict.json


{'idx2word': ['a',
  'snowboarder',
  'in',
  'the',
  'air',
  '<eos>',
  'one',
  'arm',
  'straight',
  'black',
  'gloves',
  'on',
  'hands',
  'yellow',
  'patches',
  'pants',
  'gray',
  'jacket',
  'with',
  'hood',
  'white',
  'helmet',
  'head',
  'red',
  'and',
  'snow',
  'board',
  'blue',
  'around',
  'chin',
  'goggles',
  'face',
  'particles',
  'edge',
  'fo',
  'of',
  'part',
  'ground',
  'boarrd',
  'cold',
  'snowboard',
  'person',
  'snowboarding',
  'doing',
  'trick',
  'pair',
  'googles',
  'mittens',
  'covering',
  'heavy',
  'hand',
  'under',
  'thier',
  'A',
  'man',
  'playing',
  'diving',
  'heap',
  'clear',
  'sky',
  'background',
  'numerical',
  'clock',
  'hanging',
  'tree',
  'full',
  'clocks',
  'shopping',
  'basket',
  'car',
  'parked',
  'side',
  'road',
  'up',
  'street',
  'covered',
  'circles',
  'chock',
  'paint',
  'silver',
  'van',
  'house',
  'across',
  'bunch',
  'trees',
  'stairs',
  'railing',
  'has',
  'metal',