In [1]:
import sys
import os
import json
import h5py
import numpy as np

import torch
from torchtext import data

In [2]:
import vr.programs
from vr.preprocess import tokenize, encode, build_vocab

In [3]:
def program_to_str(program, mode):
    if mode == 'chain':
        if not vr.programs.is_chain(program):
            return None
        return vr.programs.list_to_str(program)
    elif mode == 'prefix':
        program_prefix = vr.programs.list_to_prefix(program)
        return vr.programs.list_to_str(program_prefix)
    elif mode == 'postfix':
        program_postfix = vr.programs.list_to_postfix(program)
        return vr.programs.list_to_str(program_postfix)
    return None

In [4]:
def encode_data(output_h5_file, questions, vocab):
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []

    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']] # takes last program for a given input and the type of program
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                            punct_to_keep=[';', ','],
                            punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                             vocab['question_token_to_idx'],
                             allow_unk=encode_unk == 1)
        questions_encoded.append(question_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])
                
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    
    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i
    print(mapping)
    types_coded = []
    for t in types:
        types_coded += [mapping[t]]
    
    with h5py.File(output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families', data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
    

In [5]:
def preprocess(output_h5_file, input_questions_json, input_vocab_json='', output_vocab_json=''):
    
    if (input_vocab_json == '') and (output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return
    
    print('Loading data')
    with open(input_questions_json, 'r') as f:
        questions = json.load(f)['questions']
        
    if input_vocab_json == '' or expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab(
                (q['answer'] for q in questions)
            )
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.']
        )
        all_program_strs = []
        for q in questions:
            if 'program' not in q: continue
            program_str = program_to_str(q['program'], mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }
    
    if input_vocab_json != '':
        print('Loading vocab')
        if expand_vocab == 1:
            new_vocab = vocab
        with open(input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    
    if output_vocab_json != '':
        with open(output_vocab_json, 'w') as f:
            json.dump(vocab, f)
            
    encode_data(output_h5_file, questions, vocab)

**====================================================== Args =========================================================**

In [6]:
mode = 'prefix'
expand_vocab = 0
unk_threshold = 1
encode_unk = 0

In [7]:
train_questions_path = 'data/CLEVR_v1.0/questions/CLEVR_train_questions.json'
valid_questions_path = 'data/CLEVR_v1.0/questions/CLEVR_val_questions.json'
test_questions_path = 'data/CLEVR_v1.0/questions/CLEVR_test_questions.json'

train_output_h5_file = 'data/train_questions.h5'
val_output_h5_file = 'data/val_questions.h5'
test_output_h5_file = 'data/test_questions.h5'

**====================================================== Main =========================================================**




In [8]:
preprocess(train_output_h5_file, train_questions_path, output_vocab_json='data/vocab.json')
preprocess(val_output_h5_file, valid_questions_path, input_vocab_json='data/vocab.json')
preprocess(test_output_h5_file, test_questions_path, input_vocab_json='data/vocab.json')

Loading data
Building vocab
Encoding data
Writing output
(699989, 46)
(699989, 27)
{'count': 0, 'equal_size': 1, 'exist': 2, 'equal_material': 3, 'query_material': 4, 'greater_than': 5, 'equal_integer': 6, 'equal_shape': 7, 'equal_color': 8, 'less_than': 9, 'query_color': 10, 'query_shape': 11, 'query_size': 12}
Loading data
Loading vocab
Encoding data
Writing output
(149991, 46)
(149991, 27)
{'query_size': 0, 'count': 1, 'equal_size': 2, 'exist': 3, 'equal_material': 4, 'query_material': 5, 'greater_than': 6, 'equal_integer': 7, 'equal_shape': 8, 'equal_color': 9, 'query_color': 10, 'query_shape': 11, 'less_than': 12}
Loading data
Loading vocab
Encoding data
Writing output
(149988, 45)
(0,)
{}


**Encoding Programs**

In [26]:
with open('data/vocab.json', 'r') as f:
    vocab = json.load(f)

with open('data/CLEVR_v1.0/questions/CLEVR_val_questions.json', 'r') as f:
    questions = json.load(f)['questions']
    for i, q in enumerate(questions):
        if 'program' not in q: continue
        print(q['question'])
        print(q['program'])
        program_str = program_to_str(q['program'], mode)
        if program_str is not None:
            print(program_str)
        program_tokens = tokenize(program_str)
        print(program_tokens)
        program_encoded = encode(program_tokens, vocab['program_token_to_idx'])
        print(program_encoded)
        print()
        if 'answer' in q:
            print([q['answer']])
        if i == 9: break

Are there any other things that are the same shape as the big metallic object?
[{'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [0], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [1], 'function': 'filter_material', 'value_inputs': ['metal']}, {'inputs': [2], 'function': 'unique', 'value_inputs': []}, {'inputs': [3], 'function': 'same_shape', 'value_inputs': []}, {'inputs': [4], 'function': 'exist', 'value_inputs': []}]
exist same_shape unique filter_material[metal] filter_size[large] scene
['<START>', 'exist', 'same_shape', 'unique', 'filter_material[metal]', 'filter_size[large]', 'scene', '<END>']
[1, 10, 39, 43, 19, 24, 41, 2]

['no']
Is there a big brown object of the same shape as the green thing?
[{'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [0], 'function': 'filter_color', 'value_inputs': ['green']}, {'inputs': [1], 'function': 'unique', 'value_inputs': []}, {'inputs': [2], 'function': 'same_shape', 'value_inputs': []