### Preliminary Data Exploration
Author: catwong@ 12/27/2018

Datasets:
- Regex, Learning with Latent Language (Andreas et. al) [https://github.com/jacobandreas/l3/tree/master/data]
- Spatial Navigation (Janner et. al)
[https://github.com/JannerM/spatial-reasoning]
- CLEVR-Humans (Johnson et. al) [https://cs.stanford.edu/people/jcjohns/iep/]

### Analyze Datasets

#### Utility Functions

In [1]:
%autocall 1 
# For print

Automatic calling is: Smart


In [3]:

from collections import Counter
import numpy as np
import random

def ngram_dataset_freq(dataset, key, n=1, verbose=False):
    """Frequency distribution of ngrams across the entire dataset."""
    fdist = Counter()
    num_descriptions = 0
    lens = []
    diversity = []
    untokenized = []
    
    for example in dataset:
        descriptions = []
        if isinstance(example[key][0], list):
            for description in example[key]:
                if description[0] == '<': # Remove start/end tokens
                    start, end = 1, -1
                else:
                    start, end = 0, len(description)
                descriptions += [description[start:end]]

        else:
            description = example[key]
            if description[0] == '<': # Remove start/end tokens
                start, end = 1, -1
            else:
                start, end = 0, len(description)
            descriptions += [description[start:end]]
        
        # n-grams on a per-example basis
        fdist_in_task = Counter()
        for description in descriptions:
            if len(description) > 0:
                for i in range(len(description) - n + 1):
                    fdist[tuple(description[i:i+n])] += 1
                    fdist_in_task[tuple(description[i:i+n])] += 1
                lens.append(len(description))
                num_descriptions += len(descriptions)
    
                untokenized.append(" ".join(description))
        if len(descriptions) > 1:
            diversity.append(float(len(fdist_in_task)) / np.sum(list(fdist_in_task.values())))
                
    if verbose:
        common_ngrams = [(word, num) for (word, num) in fdist.most_common(100) if (len(word) > 1 or len(word[0]) > 1 )][:50]
        
        
        print("Printing for ngram, n=%d" % n)
        print("Num descriptions: %d" % num_descriptions)
        print("Description avg: %d, med: %d, min: %d, max: %d" % (np.mean(lens), np.median(lens), np.min(lens), np.max(lens)))
        if len(diversity) > 1:
            print("Ngram diversity within tasks w. multiple examples: avg: %f, med: %f, min: %f, max: %d " % (np.mean(diversity), np.median(diversity), np.min(diversity), np.max(diversity)))
        print("Vocabulary size: %d" % len(fdist) )
        print("Ngrams with freq > 10: %d" % len([word for word in fdist if fdist[word] > 10]))
        print("Total ngram in corpus: %d" % np.sum(list(fdist.values())))
        print("50 most common: (not including letters): " + str(common_ngrams))
        print("Sample descriptions: ")
        rand = random.sample(untokenized, 5)
        for description in rand:
            print(description)
        
    return fdist

def ngram_cross_dataset_freq(fdists, verbose=False):
    """Frequency distributions intersected across several fdists from disparate datasets."""
    summed_fdist = Counter()
    for fdist in fdists:
        summed_fdist += fdist
        
    # Only get the intersecting vocabulary
    intersect_vocab = set.intersection(*[set(fdist.keys()) for fdist in fdists])
    intersected_fdist=Counter()
    for vocab in intersect_vocab:
        intersected_fdist[vocab] = summed_fdist[vocab]
    
    if verbose:
        common_ngrams = intersected_fdist.most_common(50)
        print("Cross dataset frequency for %d datasets." % len(fdists))
        print("Original vocabulary sizes are %s" % str([len(fdist) for fdist in fdists]))
        print("Combined vocabulary size is %d; intersected vocab is: %d" %(len(summed_fdist), len(intersect_vocab)))
        
        print("Intersection ngrams with freq > 10: %d" % len([word for word in intersected_fdist if intersected_fdist[word] > 10]))
        print("50 most common: (not including letters): " + str(common_ngrams))
    return intersected_fdist
        
#train_hint = ngram_dataset_freq(l3_regex['train'], 'hint', n=1, verbose=True)
local_sr_fdist = ngram_dataset_freq(local_sr['train'], 'hints_aug', verbose=True)
clevr_fdist = ngram_dataset_freq(clevr_humans['train'], 'tokenized', verbose=False)

_ = ngram_cross_dataset_freq([local_sr_fdist, clevr_fdist], verbose=True)

NameError: name 'local_sr' is not defined

#### L3-Regex

l3_regex: dict with keys {train, test, val}; each list of dicts with keys:
- examples: actual I/O pairs.
- hint: the actual NLP examples.
- hints_aug: templated, augmented.
- re: the regex

In [4]:
import json 

path = "./data/l3_regex/corpus.json"
json_data = open(path).read()
l3_regex = json.loads(json_data)

In [5]:
print "Basic summary: "
for slice in ['train', 'val', 'test']:
    n = len(l3_regex[slice])
    print "%s: %d tasks" % (slice, n)
    


SyntaxError: Missing parentheses in call to 'print'. Did you mean print("Basic summary: ")? (<ipython-input-5-578eda2adfe0>, line 1)

In [18]:
# Frequency Distributions

train_hint = ngram_dataset_freq(l3_regex['train'], 'hints_aug', verbose=True)
test_hint = ngram_dataset_freq(l3_regex['test'], 'hints_aug', verbose=True)

train_hint = ngram_dataset_freq(l3_regex['train'], 'hints_aug', n=2, verbose=True)
test_hint = ngram_dataset_freq(l3_regex['test'], 'hints_aug', n=2, verbose=True)

Printing for ngram, n=1
Num descriptions: 15840002
Description avg: 8, med: 7, min: 1, max: 44
Ngram diversity within tasks w. multiple examples: avg: 0.232712, med: 0.190391, min: 0.056038, max: 1 
Vocabulary size: 944
Ngrams with freq > 10: 462
Total ngram in corpus: 1242017
50 most common: (not including letters): [((u'with',), 107416), ((u'the',), 83882), ((u'replace',), 81802), ((u'letter',), 69266), ((u'word',), 31077), ((u'first',), 27903), ((u'is',), 25433), ((u'of',), 22846), ((u'consonant',), 21888), ((u'to',), 21455), ((u'replaced',), 21374), ((u'if',), 21131), ((u'change',), 19099), ((u'vowel',), 18528), ((u'all',), 15679), ((u'every',), 13222), ((u'last',), 12618), ((u'it',), 12162), ((u'by',), 11605), ((u'and',), 11477), ((u'in',), 9382), ((u'letters',), 9255), ((u'an',), 9141), ((u'two',), 8336), ((u'each',), 7070), ((u'beginning',), 6937), ((u'are',), 6724), ((u'vowels',), 6645), ((u'consonants',), 5745), ((u'that',), 4940), ((u'begins',), 4901), ((u'add',), 4635), ((u'

#### Spatial Reasoning - Janner Version

To load up to max_train train maps and max_val val maps with mode = [ local | global ] instructions and annotations = [ human | synthetic ] descriptions, run:

~~~~
>>> import data
>>> train_data, val_data = data.load(mode, annotations, max_train, max_val)
>>> layouts, objects, rewards, terminal, instructions, values, goals = train_data
~~~~
Local: 1566 train, 399 test
Global

In [38]:
from data.spatialreasoning import *
from data.spatialreasoning import environment
import data.spatialreasoning.data as srdata

In [39]:
# Load annotations into a dataset form
annotations='human'
n_local_train, n_local_val = 1566, 399
n_global_train, n_global_val = 1071, 272
mode='global' # Local is landmkar dependent; global is 'easternmost' or 'topmost'
data_path = '/Users/catwong/Desktop/Cathy/2018-2019 First Year/CoCoSci/Semantic Parsing/data/spatialreasoning/data'
local_train, local_val = srdata.load(data_path, 'local', annotations, n_local_train, n_local_val)
global_train, global_val = srdata.load(data_path, 'global', annotations, n_global_train, n_global_val)

  1%|          | 8/1566 [00:00<00:20, 76.11it/s]


<Data> Loading local train environments with human annotations


100%|█████████▉| 1563/1566 [00:18<00:00, 80.47it/s]
  0%|          | 0/399 [00:00<?, ?it/s][A
  2%|▏         | 9/399 [00:00<00:04, 86.85it/s][A

<Data> Found 1566 annotations

<Data> Loading local test environments with human annotations



  5%|▍         | 18/399 [00:00<00:04, 86.88it/s][A
  7%|▋         | 26/399 [00:00<00:04, 82.67it/s][A
  9%|▉         | 35/399 [00:00<00:04, 83.46it/s][A
 11%|█         | 43/399 [00:00<00:04, 82.16it/s][A
 13%|█▎        | 52/399 [00:00<00:04, 82.25it/s][A
 15%|█▌        | 61/399 [00:00<00:04, 83.89it/s][A
 18%|█▊        | 71/399 [00:00<00:03, 87.14it/s][A
 20%|██        | 80/399 [00:00<00:03, 87.11it/s][A
 22%|██▏       | 89/399 [00:01<00:03, 86.94it/s][A
 25%|██▍       | 98/399 [00:01<00:03, 87.22it/s][A
 27%|██▋       | 107/399 [00:01<00:03, 85.93it/s][A
 29%|██▉       | 116/399 [00:01<00:03, 86.38it/s][A
 31%|███▏      | 125/399 [00:01<00:03, 87.24it/s][A
 34%|███▍      | 135/399 [00:01<00:02, 89.20it/s][A
 36%|███▌      | 144/399 [00:01<00:02, 88.47it/s][A
 39%|███▊      | 154/399 [00:01<00:02, 89.46it/s][A
 41%|████      | 163/399 [00:01<00:02, 87.64it/s][A
 43%|████▎     | 172/399 [00:01<00:02, 85.11it/s][A
 45%|████▌     | 181/399 [00:02<00:02, 85.35it/s][A
 4

<Data> Found 399 annotations

<Data> Loading global train environments with human annotations




  3%|▎         | 29/1071 [00:00<00:07, 137.08it/s][A[A

  4%|▍         | 45/1071 [00:00<00:07, 141.53it/s][A[A

  5%|▌         | 58/1071 [00:00<00:07, 136.21it/s][A[A

  7%|▋         | 70/1071 [00:00<00:07, 130.23it/s][A[A

  8%|▊         | 85/1071 [00:00<00:07, 132.63it/s][A[A

  9%|▉         | 99/1071 [00:00<00:07, 133.55it/s][A[A

 11%|█         | 113/1071 [00:00<00:07, 133.62it/s][A[A

 12%|█▏        | 127/1071 [00:00<00:07, 132.52it/s][A[A

 13%|█▎        | 142/1071 [00:01<00:06, 136.12it/s][A[A

 15%|█▍        | 157/1071 [00:01<00:06, 138.36it/s][A[A

 16%|█▋        | 176/1071 [00:01<00:06, 148.93it/s][A[A

 18%|█▊        | 194/1071 [00:01<00:05, 154.68it/s][A[A

 20%|█▉        | 210/1071 [00:01<00:05, 151.95it/s][A[A

 21%|██        | 227/1071 [00:01<00:05, 155.77it/s][A[A

 23%|██▎       | 244/1071 [00:01<00:05, 155.83it/s][A[A

 24%|██▍       | 260/1071 [00:01<00:05, 151.92it/s][A[A

 26%|██▌       | 277/1071 [00:01<00:05, 155.93it/s][A[A

 

<Data> Found 1071 annotations

<Data> Loading global test environments with human annotations





 10%|█         | 28/272 [00:00<00:01, 136.80it/s][A[A[A


 17%|█▋        | 45/272 [00:00<00:01, 144.84it/s][A[A[A


 23%|██▎       | 62/272 [00:00<00:01, 151.42it/s][A[A[A


 29%|██▉       | 79/272 [00:00<00:01, 156.41it/s][A[A[A


 35%|███▌      | 96/272 [00:00<00:01, 157.68it/s][A[A[A


 42%|████▏     | 115/272 [00:00<00:00, 163.20it/s][A[A[A


 48%|████▊     | 131/272 [00:00<00:00, 160.36it/s][A[A[A


 55%|█████▌    | 150/272 [00:00<00:00, 165.49it/s][A[A[A


 62%|██████▏   | 169/272 [00:01<00:00, 168.71it/s][A[A[A


 69%|██████▉   | 187/272 [00:01<00:00, 170.86it/s][A[A[A


 75%|███████▌  | 204/272 [00:01<00:00, 169.60it/s][A[A[A


 82%|████████▏ | 222/272 [00:01<00:00, 170.46it/s][A[A[A


 88%|████████▊ | 239/272 [00:01<00:00, 168.20it/s][A[A[A


 94%|█████████▍| 257/272 [00:01<00:00, 171.25it/s][A[A[A


                                                  [A[A[A

<Data> Found 272 annotations


In [40]:
def make_sr_dataset(raw_train, raw_test):
    sr_dataset = {'train': [], 'test': []}
    for i, dataset in enumerate((raw_train, raw_test)):
        layouts, objects, rewards, terminal, instructions, values, goals = dataset
        mode = 'train' if i == 0 else 'test'
        print("Found %d %s instructions." % (len(instructions), mode))
        
        for instruction in instructions:
            sr_dataset[mode].append({
                'hints_aug': instruction.split()
            })
    return sr_dataset
            
local_sr = make_sr_dataset(local_train, local_val)
global_sr = make_sr_dataset(global_train, global_val)

Found 1566 train instructions.
Found 399 test instructions.
Found 1071 train instructions.
Found 272 test instructions.



100%|██████████| 399/399 [00:21<00:00, 82.40it/s][A

100%|██████████| 1071/1071 [00:26<00:00, 138.70it/s][A[A

In [38]:
print ("LOCAL:")
_= ngram_dataset_freq(local_sr['train'], 'hints_aug', verbose=True)
_= ngram_dataset_freq(local_sr['test'], 'hints_aug', verbose=True)

_= ngram_dataset_freq(local_sr['train'], 'hints_aug', n=2, verbose=True)
_= ngram_dataset_freq(local_sr['test'], 'hints_aug', n=2, verbose=True)


LOCAL:
Printing for ngram, n=1
Num descriptions: 1566
Description avg: 8, med: 8, min: 2, max: 24
Vocabulary size: 196
Ngrams with freq > 10: 62
Total ngram in corpus: 12757
50 most common: (not including letters): [((u'the',), 1780), ((u'reach',), 1082), ((u'cell',), 805), ((u'of',), 695), ((u'to',), 684), ((u'one',), 492), ((u'left',), 450), ((u'and',), 440), ((u'above',), 369), ((u'right',), 350), ((u'below',), 325), ((u'heart',), 254), ((u'house',), 248), ((u'tree',), 240), ((u'circle',), 235), ((u'spade',), 231), ((u'two',), 230), ((u'diamond',), 229), ((u'rock',), 217), ((u'horse',), 209), ((u'triangle',), 190), ((u'star',), 189), ((u'space',), 163), ((u'square',), 145), ((u'with',), 135), ((u'is',), 118), ((u'that',), 113), ((u'go',), 104), ((u'from',), 97), ((u'between',), 81), ((u'spaces',), 71), ((u'move',), 70), ((u'blue',), 70), ((u'down',), 66), ((u'up',), 63), ((u'cells',), 39), ((u'directly',), 39), ((u'green',), 34), ((u'purple',), 34), ((u'under',), 34), ((u'goal',), 3

In [39]:
print ("\nGLOBAL:")
_= ngram_dataset_freq(global_sr['train'], 'hints_aug', verbose=True)
_= ngram_dataset_freq(global_sr['test'], 'hints_aug', verbose=True)

_= ngram_dataset_freq(global_sr['train'], 'hints_aug', n=2, verbose=True)
_= ngram_dataset_freq(global_sr['test'], 'hints_aug', n=2, verbose=True)



GLOBAL:
Printing for ngram, n=1
Num descriptions: 1071
Description avg: 8, med: 8, min: 2, max: 22
Vocabulary size: 191
Ngrams with freq > 10: 56
Total ngram in corpus: 8702
50 most common: (not including letters): [((u'the',), 1599), ((u'to',), 801), ((u'cell',), 473), ((u'go',), 425), ((u'of',), 419), ((u'horse',), 294), ((u'house',), 292), ((u'most',), 272), ((u'rock',), 265), ((u'tree',), 254), ((u'left',), 251), ((u'one',), 244), ((u'move',), 226), ((u'right',), 189), ((u'above',), 175), ((u'square',), 158), ((u'reach',), 144), ((u'topmost',), 144), ((u'below',), 129), ((u'bottom',), 104), ((u'top',), 84), ((u'easternmost',), 63), ((u'westernmost',), 60), ((u'bottommost',), 55), ((u'is',), 51), ((u'directly',), 44), ((u'leftmost',), 43), ((u'southernmost',), 42), ((u'western',), 42), ((u'rightmost',), 40), ((u'lowest',), 40), ((u'two',), 38), ((u'that',), 36), ((u'northernmost',), 33), ((u'with',), 31), ((u'on',), 28), ((u'eastern',), 27), ((u'southern',), 24), ((u'and',), 23), (

### Spatial Reasoning - L3 Version

Note: additional processing is in the l3 tasks - generates additional templates

In [7]:
# Load annotations into a dataset form
annotations='human'
n_local_train, n_local_val = 2000, 500
n_global_train, n_global_val = 1500, 500
mode='global' # Local is landmkar dependent; global is 'easternmost' or 'topmost'
data_path = '/Users/catwong/Desktop/Cathy/2018-2019 First Year/CoCoSci/Semantic Parsing/data/l3_nav'
local_train, local_val = srdata.load(data_path, 'local', annotations, n_local_train, n_local_val)
global_train, global_val = srdata.load(data_path, 'global', annotations, n_global_train, n_global_val)








  0%|          | 0/2000 [00:00<?, ?it/s][A[A[A[A[A[A[A






  0%|          | 9/2000 [00:00<00:23, 84.90it/s][A[A[A[A[A[A[A


<Data> Loading local train environments with human annotations









  1%|          | 16/2000 [00:00<00:25, 78.77it/s][A[A[A[A[A[A[A






  1%|          | 24/2000 [00:00<00:25, 78.01it/s][A[A[A[A[A[A[A






  2%|▏         | 32/2000 [00:00<00:25, 78.29it/s][A[A[A[A[A[A[A






  2%|▏         | 40/2000 [00:00<00:25, 78.24it/s][A[A[A[A[A[A[A






  2%|▏         | 49/2000 [00:00<00:24, 79.43it/s][A[A[A[A[A[A[A






  3%|▎         | 57/2000 [00:00<00:24, 79.27it/s][A[A[A[A[A[A[A






  3%|▎         | 66/2000 [00:00<00:23, 81.54it/s][A[A[A[A[A[A[A






  4%|▍         | 75/2000 [00:00<00:23, 81.87it/s][A[A[A[A[A[A[A






  4%|▍         | 84/2000 [00:01<00:23, 81.97it/s][A[A[A[A[A[A[A






  5%|▍         | 92/2000 [00:01<00:23, 80.52it/s][A[A[A[A[A[A[A






  5%|▌         | 101/2000 [00:01<00:23, 81.54it/s][A[A[A[A[A[A[A






  6%|▌         | 110/2000 [00:01<00:23, 81.10it/s][A[A[A[A[A[A[A






  6%|▌         | 119/2000 [00:01<00:22, 83.03it/s][A[A[A[A[A[

<Data> Found 1566 annotations

<Data> Loading local test environments with human annotations










  4%|▍         | 20/500 [00:00<00:05, 94.12it/s][A[A[A[A[A[A[A[A







  6%|▌         | 29/500 [00:00<00:05, 92.18it/s][A[A[A[A[A[A[A[A







  8%|▊         | 38/500 [00:00<00:05, 89.83it/s][A[A[A[A[A[A[A[A







  9%|▉         | 45/500 [00:00<00:05, 78.68it/s][A[A[A[A[A[A[A[A







 10%|█         | 52/500 [00:00<00:05, 75.56it/s][A[A[A[A[A[A[A[A







 12%|█▏        | 61/500 [00:00<00:05, 77.03it/s][A[A[A[A[A[A[A[A







 14%|█▍        | 71/500 [00:00<00:05, 81.27it/s][A[A[A[A[A[A[A[A







 16%|█▌        | 80/500 [00:00<00:05, 81.71it/s][A[A[A[A[A[A[A[A







 18%|█▊        | 89/500 [00:01<00:05, 81.72it/s][A[A[A[A[A[A[A[A







 20%|█▉        | 99/500 [00:01<00:04, 84.35it/s][A[A[A[A[A[A[A[A







 22%|██▏       | 108/500 [00:01<00:04, 84.04it/s][A[A[A[A[A[A[A[A







 23%|██▎       | 117/500 [00:01<00:04, 84.38it/s][A[A[A[A[A[A[A[A







 25%|██▌       | 126/500 [00:0

<Data> Found 399 annotations

<Data> Loading global train environments with human annotations










  2%|▏         | 37/1500 [00:00<00:08, 180.35it/s][A[A[A[A[A[A[A[A







  3%|▎         | 52/1500 [00:00<00:08, 169.60it/s][A[A[A[A[A[A[A[A







  5%|▍         | 68/1500 [00:00<00:08, 163.17it/s][A[A[A[A[A[A[A[A







  6%|▌         | 87/1500 [00:00<00:08, 168.34it/s][A[A[A[A[A[A[A[A







  7%|▋         | 106/1500 [00:00<00:08, 172.87it/s][A[A[A[A[A[A[A[A







  8%|▊         | 122/1500 [00:00<00:08, 166.27it/s][A[A[A[A[A[A[A[A







  9%|▉         | 138/1500 [00:00<00:08, 161.57it/s][A[A[A[A[A[A[A[A







 10%|█         | 154/1500 [00:00<00:08, 156.47it/s][A[A[A[A[A[A[A[A







 12%|█▏        | 173/1500 [00:01<00:08, 164.88it/s][A[A[A[A[A[A[A[A







 13%|█▎        | 191/1500 [00:01<00:07, 167.44it/s][A[A[A[A[A[A[A[A







 14%|█▍        | 208/1500 [00:01<00:07, 167.27it/s][A[A[A[A[A[A[A[A







 15%|█▌        | 226/1500 [00:01<00:07, 168.77it/s][A[A[A[A[A[A[A[A









<Data> Found 1071 annotations

<Data> Loading global test environments with human annotations











  7%|▋         | 34/500 [00:00<00:02, 168.38it/s][A[A[A[A[A[A[A[A[A








 10%|█         | 51/500 [00:00<00:02, 166.20it/s][A[A[A[A[A[A[A[A[A








 14%|█▍        | 69/500 [00:00<00:02, 169.87it/s][A[A[A[A[A[A[A[A[A








 17%|█▋        | 86/500 [00:00<00:02, 168.43it/s][A[A[A[A[A[A[A[A[A








 20%|██        | 102/500 [00:00<00:02, 164.03it/s][A[A[A[A[A[A[A[A[A








 24%|██▍       | 120/500 [00:00<00:02, 167.83it/s][A[A[A[A[A[A[A[A[A








 27%|██▋       | 136/500 [00:00<00:02, 164.48it/s][A[A[A[A[A[A[A[A[A








 31%|███       | 155/500 [00:00<00:02, 169.72it/s][A[A[A[A[A[A[A[A[A








 34%|███▍      | 172/500 [00:01<00:01, 168.75it/s][A[A[A[A[A[A[A[A[A








 38%|███▊      | 191/500 [00:01<00:01, 172.36it/s][A[A[A[A[A[A[A[A[A








 42%|████▏     | 208/500 [00:01<00:01, 168.70it/s][A[A[A[A[A[A[A[A[A








 45%|████▌     | 227/500 [00:01<00:01, 172.43it/s

<Data> Found 272 annotations









 78%|███████▊  | 1566/2000 [00:35<00:05, 72.93it/s][A[A[A[A[A[A[A







 71%|███████▏  | 1071/1500 [00:20<00:03, 130.10it/s][A[A[A[A[A[A[A[A

In [8]:
def make_sr_dataset(raw_train, raw_test):
    sr_dataset = {'train': [], 'test': []}
    for i, dataset in enumerate((raw_train, raw_test)):
        layouts, objects, rewards, terminal, instructions, values, goals = dataset
        mode = 'train' if i == 0 else 'test'
        print("Found %d %s instructions." % (len(instructions), mode))
        
        for instruction in instructions:
            sr_dataset[mode].append({
                'hints_aug': instruction.split()
            })
    return sr_dataset
            
local_sr = make_sr_dataset(local_train, local_val)
global_sr = make_sr_dataset(global_train, global_val)

Found 1566 train instructions.
Found 399 test instructions.
Found 1071 train instructions.
Found 272 test instructions.


In [9]:
print ("LOCAL:")
_= ngram_dataset_freq(local_sr['train'], 'hints_aug', verbose=True)
_= ngram_dataset_freq(local_sr['test'], 'hints_aug', verbose=True)

_= ngram_dataset_freq(local_sr['train'], 'hints_aug', n=2, verbose=True)
_= ngram_dataset_freq(local_sr['test'], 'hints_aug', n=2, verbose=True)


LOCAL:
Printing for ngram, n=1
Num descriptions: 1566
Description avg: 6, med: 6, min: 0, max: 22
Vocabulary size: 166
Ngrams with freq > 10: 57
Total ngram in corpus: 9625
50 most common: (not including letters): [((u'the',), 1735), ((u'cell',), 766), ((u'of',), 695), ((u'to',), 678), ((u'and',), 440), ((u'one',), 435), ((u'left',), 420), ((u'above',), 360), ((u'right',), 328), ((u'below',), 317), ((u'two',), 209), ((u'house',), 172), ((u'space',), 159), ((u'rock',), 158), ((u'tree',), 149), ((u'horse',), 140), ((u'with',), 135), ((u'square',), 130), ((u'is',), 116), ((u'that',), 113), ((u'heart',), 113), ((u'circle',), 106), ((u'from',), 97), ((u'spade',), 97), ((u'diamond',), 92), ((u'star',), 85), ((u'triangle',), 82), ((u'between',), 77), ((u'spaces',), 71), ((u'blue',), 69), ((u'down',), 58), ((u'up',), 58), ((u'cells',), 39), ((u'directly',), 38), ((u'under',), 34), ((u'green',), 33), ((u'purple',), 32), ((u'goal',), 31), ((u'bottom',), 30), ((u'next',), 28), ((u'in',), 24), ((u

In [31]:
' '.join(local_sr['train'][40]['hints_aug'])

u'go to the empty cell under the circle .'

### CLEVR-Humans

Note: official paper preprocessing is available here. https://github.com/facebookresearch/clevr-iep/blob/master/TRAINING.md

Format: JSON files have keys ['info', 'questions']; questions is a list with format:
```
{u'answer': u'yes', u'question': u'Is there a blue cylinder?', u'split': u'train', u'image_index': 1429, u'image_filename': u'CLEVR_train_001429.png'}
```

In [23]:
import json 

def tokenize(s, delim=' ',
      add_start_token=True, add_end_token=True,
      punct_to_keep=[';', ','], punct_to_remove=['?', '.']):
    """Taken from Johnson et. al"""
    s = s.lower()
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))
    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')
    tokens = s.split(delim)

    return tokens

paths = ["./data/clevr_humans/CLEVR-Humans-%s.json" % split for split in ("train", "test", "val")]

clevr_humans = {}
for split in ('train', 'test', 'val'):
    path = "./data/clevr_humans/CLEVR-Humans-%s.json" % split
    json_data = open(path).read()
    clevr_humans[split] = json.loads(json_data)['questions']
    print("Found %d questions in %s" % (len(clevr_humans[split]), split))
    # Tokenize
    for j, example in enumerate(clevr_humans[split]):
        clevr_humans[split][j]['tokenized'] = tokenize(clevr_humans[split][j]['question'])
        

Found 17817 questions in train
Found 7145 questions in test
Found 7202 questions in val


In [37]:
print("TRAIN")
_= ngram_dataset_freq(clevr_humans['train'], 'tokenized', verbose=True)
_= ngram_dataset_freq(clevr_humans['train'], 'tokenized', n=2, verbose=True)

TRAIN
Printing for ngram, n=1
Num descriptions: 17817
Description avg: 8, med: 8, min: 4, max: 35
Vocabulary size: 990
Ngrams with freq > 10: 293
Total ngram in corpus: 155133
50 most common: (not including letters): [((u'the',), 20305), ((u'is',), 9668), ((u'are',), 8744), ((u'what',), 7279), ((u'color',), 5413), ((u'of',), 5090), ((u'how',), 4956), ((u'many',), 4947), ((u'there',), 4497), ((u'objects',), 4180), ((u'object',), 3829), ((u'shape',), 2889), ((u'same',), 2564), ((u'cube',), 2249), ((u'in',), 1970), ((u'cylinder',), 1906), ((u'large',), 1874), ((u'shiny',), 1771), ((u'cubes',), 1680), ((u'small',), 1569), ((u'sphere',), 1563), ((u'that',), 1442), ((u'cylinders',), 1438), ((u'metallic',), 1422), ((u'to',), 1386), ((u'as',), 1278), ((u'red',), 1257), ((u'matte',), 1238), ((u'purple',), 1182), ((u'green',), 1173), ((u'material',), 1169), ((u'blue',), 1159), ((u'any',), 1140), ((u'spheres',), 1128), ((u'and',), 1100), ((u'all',), 1081), ((u'ball',), 1051), ((u'yellow',), 896),

### Cross Domain Frequency Analyses

#### Spatial Reasoning (Janner) and CLEVR-Humans

In [60]:
print("Spatial Reasoning Local and CLEVR-Humans")
local_sr_fdist = ngram_dataset_freq(local_sr['train'], 'hints_aug', verbose=False)
clevr_fdist = ngram_dataset_freq(clevr_humans['train'], 'tokenized', verbose=False)
_ = ngram_cross_dataset_freq([local_sr_fdist, clevr_fdist], verbose=True)

local_sr_fdist = ngram_dataset_freq(local_sr['train'], 'hints_aug', n=2, verbose=False)
clevr_fdist = ngram_dataset_freq(clevr_humans['train'], 'tokenized', n=2, verbose=False)
_ = ngram_cross_dataset_freq([local_sr_fdist, clevr_fdist], verbose=True)

Spatial Reasoning Local and CLEVR-Humans
Cross dataset frequency for 2 datasets.
Original vocabulary sizes are [196, 990]
Combined vocabulary size is 1069; intersected vocab is: 117
Intersection ngrams with freq > 10: 89
50 most common: (not including letters): [((u'the',), 22085), ((u'is',), 9786), ((u'are',), 8745), ((u'of',), 5785), ((u'object',), 3830), ((u'same',), 2565), ((u'to',), 2070), ((u'in',), 1995), ((u'that',), 1555), ((u'and',), 1540), ((u'a',), 1437), ((u'red',), 1270), ((u'blue',), 1229), ((u'purple',), 1216), ((u'green',), 1207), ((u'left',), 1168), ((u'two',), 1065), ((u'right',), 1042), ((u'yellow',), 910), ((u'one',), 780), ((u',',), 639), ((u'brown',), 470), ((u'on',), 456), ((u'other',), 456), ((u'most',), 396), ((u'square',), 395), ((u'above',), 390), ((u'from',), 375), ((u'which',), 344), ((u'between',), 339), ((u'with',), 293), ((u'next',), 276), ((u'by',), 274), ((u'closest',), 269), ((u'circle',), 255), ((u'it',), 251), ((u'diamond',), 230), ((u'gold',), 223

In [63]:
print("Spatial Reasoning Global and CLEVR-Humans")
global_sr_fdist = ngram_dataset_freq(global_sr['train'], 'hints_aug', verbose=False)
clevr_fdist = ngram_dataset_freq(clevr_humans['train'], 'tokenized', verbose=False)
_ = ngram_cross_dataset_freq([global_sr_fdist, clevr_fdist], verbose=True)

global_sr_fdist = ngram_dataset_freq(global_sr['train'], 'hints_aug', n=2, verbose=False)
clevr_fdist = ngram_dataset_freq(clevr_humans['train'], 'tokenized', n=2, verbose=False)
_ = ngram_cross_dataset_freq([global_sr_fdist, clevr_fdist], verbose=True)

Spatial Reasoning Global and CLEVR-Humans
Cross dataset frequency for 2 datasets.
Original vocabulary sizes are [191, 990]
Combined vocabulary size is 1079; intersected vocab is: 102
Intersection ngrams with freq > 10: 81
50 most common: (not including letters): [((u'the',), 21904), ((u'is',), 9719), ((u'of',), 5509), ((u'object',), 3833), ((u'same',), 2565), ((u'to',), 2187), ((u'in',), 1984), ((u'that',), 1478), ((u'a',), 1411), ((u'as',), 1279), ((u'red',), 1259), ((u'and',), 1123), ((u'all',), 1082), ((u'left',), 969), ((u'right',), 881), ((u'two',), 873), ((u'most',), 667), ((u',',), 614), ((u'or',), 597), ((u'items',), 591), ((u'one',), 532), ((u'on',), 460), ((u'other',), 458), ((u'go',), 426), ((u'square',), 408), ((u'which',), 344), ((u'farthest',), 324), ((u'only',), 311), ((u'from',), 293), ((u'between',), 266), ((u'closest',), 266), ((u'next',), 255), ((u'it',), 245), ((u'move',), 228), ((u'both',), 214), ((u'above',), 196), ((u'with',), 189), ((u'furthest',), 184), ((u'blo