# Visual Genome Dataset Cleansing

Loading dataset
-----------

- Read the data from the JSON file and remove annotations that contain spelling errors.


In [1]:
# Imports for this notebook.
import numpy as np
import math
import sys

# Make the helper modules stored in ./python_code importable from here.
sys.path.append('./python_code')
# NOTE(review): the log output shown under this cell appears to be printed
# while this module is imported, i.e. the import itself presumably loads and
# cleans the dataset -- confirm against preprocessing_data.
import preprocessing_data as imported_data

image data length: 108077
relationship data length: 108077
0 images processed, 41 relationships
Unspecified Error
2112
21
(2112, 21): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
2112
22
(2112, 22): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
2112
23
(2112, 23): [block]-[]-[rug]

Error: [block]-[]-[rug]

Unspecified Error
3780
53
(3780, 53): [chair]-[]-[outside]

Error: [chair]-[]-[outside]

5000 images processed, 91320 relationships
Unspecified Error
6768
22
(6768, 22): [sky]-[]-[above]

Error: [sky]-[]-[above]

Unspecified Error
6839
15
(6839, 15): [is]-[]-[sidewalk]

Error: [is]-[]-[sidewalk]

Unspecified Error
6993
10
(6993, 10): [laptop]-[]-[open]

Error: [laptop]-[]-[open]

Unspecified Error
7023
21
(7023, 21): [box]-[]-[pizza hut]

Error: [box]-[]-[pizza hut]

Unspecified Error
7024
7
(7024, 7): [alcove]-[]-[open]

Error: [alcove]-[]-[open]

Unspecified Error
9015
31
(9015, 31): [girl]-[]-[outside]

Error: [girl]-[]-[outside]

Unspecified Error
9

# Filtering the Top-50 predicate and Top-150 object categories

In [7]:
def get_object_predicate_list(relationships):
    """Collect the distinct object and predicate labels of the dataset.

    Args:
        relationships: mapping of image id -> image record. Each record has
            a 'relationships' list whose items provide 'subject', 'object'
            and 'predicate' entries.

    Returns:
        A tuple (object_list, predicate_list) of two sets. Subject and
        object labels are pooled into the same object set.

    Side effects:
        Prints the number of distinct object and predicate classes.
    """
    object_list = set()
    predicate_list = set()
    for im_id in relationships:
        for relationship_item in relationships[im_id]['relationships']:
            object_list.add(relationship_item['subject'])
            object_list.add(relationship_item['object'])
            predicate_list.add(relationship_item['predicate'])

    # A single parenthesised argument prints the same text whether `print`
    # is the Python 2 statement or the Python 3 function.
    print("Object classes: %d\nPredicate classes: %d\n" % (len(object_list), len(predicate_list)))
    return object_list, predicate_list
    
def object_predicate_categories_count(relationships):
    """Count how often each object and predicate label occurs.

    Args:
        relationships: mapping of image id -> image record with a
            'relationships' list of items holding 'subject', 'object' and
            'predicate' entries.

    Returns:
        A tuple (freq_obj, freq_pred) of dicts mapping label -> number of
        occurrences. A label used as subject and as object is counted once
        per use in the same object dict.
    """
    object_list, predicate_list = get_object_predicate_list(relationships)
    # Start every known label at zero, then tally the occurrences.
    freq_obj = dict.fromkeys(object_list, 0)
    freq_pred = dict.fromkeys(predicate_list, 0)
    for im_id in relationships:
        for rel in relationships[im_id]['relationships']:
            for label in (rel['subject'], rel['object']):
                freq_obj[label] += 1
            freq_pred[rel['predicate']] += 1
    return freq_obj, freq_pred

def filter_categories(original_relationships, top_n_obj=150, top_n_pred=50):
    """Keep only relationships that use the most frequent categories.

    A relationship is kept when its subject and its object are both among
    the `top_n_obj` most frequent object labels and its predicate is among
    the `top_n_pred` most frequent predicate labels. Images left without
    any relationship are dropped.

    Args:
        original_relationships: mapping of image id -> image record with a
            'relationships' list. It is deep-copied and never modified.
        top_n_obj: number of object categories to keep.
        top_n_pred: number of predicate categories to keep.

    Returns:
        A tuple (relationships, selected_obj, selected_pred): the filtered
        deep copy and the sets of retained object / predicate labels.

    Side effects:
        Prints how many images remain and how many were removed.
    """
    import copy
    import operator

    original_im_num = len(original_relationships)
    relationships = copy.deepcopy(original_relationships)
    freq_obj, freq_pred = object_predicate_categories_count(relationships)
    sorted_obj = sorted(freq_obj.items(), key=operator.itemgetter(1), reverse=True)
    sorted_pred = sorted(freq_pred.items(), key=operator.itemgetter(1), reverse=True)
    selected_obj = set(x[0] for x in sorted_obj[:top_n_obj])
    selected_pred = set(x[0] for x in sorted_pred[:top_n_pred])

    # Iterate over a snapshot of the keys: entries are popped inside the loop
    # and mutating a dict while iterating its live key view raises
    # RuntimeError on Python 3.
    for im_id in list(relationships.keys()):
        im_relationships = relationships[im_id]['relationships']
        # Build the surviving list in one pass instead of calling
        # list.remove() per rejected item (each call is a linear scan).
        kept = [
            item for item in im_relationships
            if item['subject'] in selected_obj
            and item['object'] in selected_obj
            and item['predicate'] in selected_pred
        ]
        if kept:
            # Slice assignment keeps the same list object, as before.
            im_relationships[:] = kept
        else:
            relationships.pop(im_id)

    # Single parenthesised argument: valid as Python 2 statement and as
    # Python 3 function call, with identical output.
    print("%d images left(%d/%d removed)" %
          (len(relationships), original_im_num - len(relationships), original_im_num))

    return relationships, selected_obj, selected_pred

def count_instance_num(relationships):
    """Return the total number of relationship instances over all images.

    Args:
        relationships: mapping of image id -> image record with a
            'relationships' list.
    """
    return sum(len(im_item['relationships']) for im_item in relationships.values())

def im_with_N_more_instances(relationships, N = 15):
    """Return the ids of images that have at least N relationship instances.

    Args:
        relationships: mapping of image id -> image record with a
            'relationships' list.
        N: minimum number of relationships (inclusive) an image must have.

    Returns:
        List of matching image ids, in the mapping's iteration order.

    Side effects:
        Prints how many images matched.
    """
    id_list = [im_id for im_id in relationships
               if len(relationships[im_id]['relationships']) >= N]
    # The comparison is inclusive (>= N), so the message says "at least"
    # rather than "more than". Parenthesised form works on Python 2 and 3.
    print("%d images with at least %d instances" % (len(id_list), N))
    return id_list

def select_top_N_ims(relationships, N):
    """Return the ids of the N images with the most relationship instances.

    Args:
        relationships: mapping of image id -> image record with a
            'relationships' list.
        N: number of image ids to return.

    Returns:
        List of image ids ordered by decreasing relationship count; ties
        keep the mapping's iteration order (the sort is stable).

    Raises:
        AssertionError: if N exceeds the number of available images.
    """
    ranked_ids = sorted(
        relationships,
        key=lambda im_id: len(relationships[im_id]['relationships']),
        reverse=True,
    )
    assert N<=len(ranked_ids), "Select %d images from %d" % (N, len(ranked_ids))
    return ranked_ids[:N]
    

## Now filter out the long-tail categories

In [3]:
# Keep only the relationships whose subject, object and predicate all belong
# to the most frequent categories (function defaults: top 150 object and
# top 50 predicate categories); images left empty are dropped.
relationships, selected_obj, selected_pred = filter_categories(imported_data.relationships)

Object classes: 48597
Predicate classes: 28987

95999 images left(8691/104690 removed)


## Partition dataset

1. Subsample 25000 images for testing.
2. Use the remaining 70999 images for training.
3. Build the following training/testing settings:
    - Original dataset: 70999 images for training and 25000 images for testing
    - Small dataset: 15000 images for training and 5000 images for testing
    - Fat dataset: 15000 images with 15+ instances for training and 5000 images for testing

In [8]:
import random

# Materialise the ids as a list: random.shuffle needs a mutable sequence and
# the slices below need indexing, neither of which a Python 3 key view
# supports (on Python 2 keys() already returned a list).
im_list_full = list(relationships.keys())
# NOTE(review): no seed is set in the visible cells, so the split presumably
# differs from run to run -- confirm whether reproducibility is required.
random.shuffle(im_list_full)
# The last 25000 shuffled ids form the test split, everything before them
# the training split.
training_list = im_list_full[:-25000]
testing_list = im_list_full[-25000:]
training_set = {x:relationships[x] for x in training_list}
testing_set = {x:relationships[x] for x in testing_list}
# "Small" variants are prefixes of the already shuffled splits.
training_small_list = training_list[:15000]
testing_small_list = testing_list[:5000]
# "Fat" variant: the 15000 training images with the most relationships.
training_fat_list = select_top_N_ims(training_set, 15000)

## Loading region description dataset

In [9]:
import json

# Open the file in a context manager so the handle is closed as soon as the
# JSON has been parsed instead of being left to the garbage collector.
with open('../region_descriptions_v2.json') as region_file:
    region_data = json.load(region_file)

In [10]:
def dataset_setup(id_list, relationships, raw_region_data):
    """Assemble the output records for the given image ids.

    Each record is a deep copy of `relationships[im_id]` extended with:
      * 'id'      -- the image id
      * 'regions' -- one {'phrase', 'box'} dict per raw region, where
                     'box' is (x, y, x + width - 1, y + height - 1).

    Args:
        id_list: iterable of image ids to export.
        relationships: mapping of image id -> image record.
        raw_region_data: container indexed by image id whose entries hold a
            'regions' list of dicts with 'phrase', 'x', 'y', 'width' and
            'height'.

    Returns:
        List of records, in the order of `id_list`.
    """
    import copy

    def _to_region_item(raw_region):
        # Convert (x, y, width, height) into inclusive corner coordinates.
        phrase = raw_region['phrase']
        left, top = raw_region['x'], raw_region['y']
        right = left + raw_region['width'] - 1
        bottom = top + raw_region['height'] - 1
        return {'phrase': phrase, 'box': (left, top, right, bottom)}

    dataset = []
    for im_id in id_list:
        # Deep copy so the caller's records are never modified.
        record = copy.deepcopy(relationships[im_id])
        record['id'] = im_id
        record['regions'] = [_to_region_item(raw)
                             for raw in raw_region_data[im_id]['regions']]
        dataset.append(record)
    return dataset

def output_dataset(dataset, filename):
    """Serialise `dataset` as JSON into `filename` and report completion.

    Args:
        dataset: JSON-serialisable object to write.
        filename: path of the output file; it is overwritten if present.

    Side effects:
        Writes the file and prints a confirmation line.
    """
    import json
    with open(filename, 'w') as fp:
        json.dump(dataset, fp)
    # Parenthesised single argument: same output under Python 2 and 3.
    print("Done output dataset: " + filename)

In [11]:
# Build the exportable record lists for every split. All of them read from
# the full filtered `relationships` mapping and the raw region data.
training_small_set = dataset_setup(training_small_list, relationships, region_data)
testing_small_set = dataset_setup(testing_small_list, relationships, region_data)
# NOTE: training_set / testing_set were id -> record dicts after the
# partition cell; from here on they are rebound to lists of records.
training_set = dataset_setup(training_list, relationships, region_data)
testing_set = dataset_setup(testing_list, relationships, region_data)
training_fat_set = dataset_setup(training_fat_list, relationships, region_data)

In [12]:
# Write every split to its own JSON file. The file names are appended to
# output_dir by plain string concatenation, so the trailing '/' is required.
# NOTE(review): nothing in the visible cells creates this directory --
# presumably it must exist beforehand; confirm.
output_dir = 'output/top_150_50/'
output_dataset(training_small_set, output_dir+'train_small.json')
output_dataset(testing_small_set, output_dir+'test_small.json')
output_dataset(training_set, output_dir+'train.json')
output_dataset(testing_set, output_dir+'test.json')
output_dataset(training_fat_set, output_dir+'train_fat.json')

Done output dataset: output/top_150_50/train_small.json
Done output dataset: output/top_150_50/test_small.json
Done output dataset: output/top_150_50/train.json
Done output dataset: output/top_150_50/test.json
Done output dataset: output/top_150_50/train_fat.json


In [None]:
# Scratch cell: load a JSON file for manual inspection.
# NOTE(review): 't.json' is not written by any visible cell -- confirm where
# it comes from. The context manager closes the handle after parsing.
with open('t.json') as temp_file:
    temp_data = json.load(temp_file)

In [None]:
# Display the third record of the in-memory small training set.
training_small_set[2]

In [None]:
# Display the third record of the data loaded from 't.json'.
temp_data[2]