In [1]:
# Input: raw corpus file, read line by line further down.
DATA_PATH = 'geo880.txt'
# Output: JSON file that receives the train/test index sequences.
PP_DATA_PATH = 'pp_data.json'
# Output: JSON file that receives the two token -> index vocabularies.
VOCAB_PATH = 'vocab.json'

In [2]:
import re
import json
import random
import numpy as np
from collections import Counter
from string import punctuation
# Alias for string.punctuation; used below to strip punctuation from the
# natural-language side of each pair.
en_puns = punctuation

In [3]:
# Load the corpus: one list entry per line of the file, line breaks removed.
with open(DATA_PATH, 'r') as corpus_file:
    corpus_text = corpus_file.read()
raw_data_list = corpus_text.splitlines()

In [4]:
# Number of lines (question/formula pairs) loaded from the corpus; later
# preprocessing loops iterate over range(data_size).
data_size = len(raw_data_list)
data_size

880

In [6]:
# Normalize every raw line in place: lowercase it, then apply the known
# spelling corrections (misspelling -> replacement), in this order.
spelling_fixes = (
    ('hamsphire', 'hampshire'),
    ('mississsippi', 'mississippi'),
    ('cites', 'cities'),
)
for position, line in enumerate(raw_data_list):
    line = line.lower()
    for wrong, right in spelling_fixes:
        line = line.replace(wrong, right)
    raw_data_list[position] = line

In [7]:
# Spot-check: display one randomly chosen normalized line.
random.choice(raw_data_list)

"parse([how,many,people,live,in,the,biggest,city,in,new,york,state,?], answer(a,(population(b,a),largest(b,(city(b),loc(b,c),const(c,stateid('new york')),state(c))))))."

In [8]:
# Split each line into its natural-language part (everything before the first
# ', ') and its formula-language part (everything after it).
sample_list, label_list = [], []

for pair in raw_data_list:
    # maxsplit=1: cut only at the first ', ' so that a later ', ' inside the
    # formula cannot silently truncate the label. A line without any ', '
    # fails loudly (ValueError on unpacking) instead of being half-processed.
    sample, label = pair.split(', ', 1)
    sample_list.append(sample)
    label_list.append(label)

In [9]:
# Spot-check: print the two halves of one randomly chosen pair.
# randrange draws a single valid position directly, without first building a
# list of every index.
index = random.randrange(len(sample_list))
print(sample_list[index])
print(label_list[index])

parse([what,are,the,major,rivers,in,ohio,?]
answer(a,(major(a),river(a),loc(a,b),const(b,stateid(ohio))))).


In [10]:
# Preprocess the natural-language side: blank out the literal 'parse' and
# every punctuation character, then collapse the result to single-spaced words.
punct_to_space = str.maketrans({ch: ' ' for ch in en_puns})
nl_list = [
    ' '.join(
        word
        for word in sample_list[k].replace('parse', ' ').translate(punct_to_space).split(' ')
        if word
    )
    for k in range(data_size)
]

In [11]:
# Spot-check: display one randomly chosen cleaned question.
random.choice(nl_list)

'which is the longest river in usa'

In [12]:
# Preprocess the formula-language side of every pair.

# Substrings that are kept but padded with spaces so they become separate
# tokens. '\\+' is the two-character sequence backslash-plus; the backslash is
# escaped explicitly because '\+' is an invalid escape sequence in a normal
# string literal.
valid_pun = ['\\+', ')', '(', ',']
# Substrings that are deleted outright: the 'answer(a,' prefix, the ')).'
# sequence, and single quotes.
invalid_pun = ['answer(a,', ')).', "'"]

fl_list = []
for label in label_list:
    for p in invalid_pun:
        label = label.replace(p, '')
    for p in valid_pun:
        label = label.replace(p, ' ' + p + ' ')
    # Padding leaves runs of spaces; keep only the non-empty pieces.
    label = ' '.join(token for token in label.split(' ') if token)
    fl_list.append(label)

In [17]:
# Spot-check: display one randomly chosen cleaned formula.
random.choice(fl_list)

'smallest ( b , ( state ( a ) , density ( a , b ) ) )'

In [18]:
# Spot-check: print one randomly chosen cleaned question with its formula.
# randrange draws a single valid position directly, without first building a
# list of every index.
index = random.randrange(len(nl_list))
print(nl_list[index])
print(fl_list[index])

what is capital of iowa
( capital ( a ) , loc ( a , b ) , const ( b , stateid ( iowa ) ) )


In [19]:
# Tokenize both sides of every pair on single spaces.
def _split_nonempty(text):
    """Split ``text`` on single spaces and drop the empty pieces."""
    return [piece for piece in text.split(' ') if len(piece) > 0]


pair_count = len(nl_list)
tk_nl_list = [_split_nonempty(nl_list[k]) for k in range(pair_count)]
tk_fl_list = [_split_nonempty(fl_list[k]) for k in range(pair_count)]

In [20]:
# Spot-check: print the token lists of one randomly chosen pair.
# randrange draws a single valid position directly, without first building a
# list of every index.
index = random.randrange(len(tk_nl_list))
print(tk_nl_list[index])
print(tk_fl_list[index])

['how', 'many', 'rivers', 'are', 'in', 'the', 'state', 'with', 'the', 'highest', 'point']
['count', '(', 'b', ',', '(', 'river', '(', 'b', ')', ',', 'loc', '(', 'b', ',', 'c', ')', ',', 'state', '(', 'c', ')', ',', 'loc', '(', 'd', ',', 'c', ')', ',', 'highest', '(', 'd', ',', 'place', '(', 'd', ')', ')', ')', ',', 'a', ')']


In [21]:
# Token frequencies for each side, counted over the flattened token lists.
# Tokens are fed in corpus order, so the counters (and the plain-dict copies)
# keep first-seen key order.
nl_c = Counter(token for tokens in tk_nl_list for token in tokens)
fl_c = Counter(token for tokens in tk_fl_list for token in tokens)

nl_token_freq_dict = dict(nl_c)
fl_token_freq_dict = dict(fl_c)

In [22]:
# Natural-language side: number of distinct tokens, then every
# (token, count) pair from most to least frequent.
print(len(nl_c), nl_c.most_common(), sep='\n')

281
[('the', 923), ('what', 563), ('is', 418), ('in', 347), ('state', 258), ('states', 256), ('of', 226), ('how', 153), ('which', 131), ('population', 129), ('are', 125), ('river', 118), ('many', 118), ('through', 105), ('largest', 92), ('border', 92), ('rivers', 90), ('point', 87), ('highest', 86), ('cities', 83), ('texas', 82), ('capital', 75), ('has', 74), ('city', 74), ('that', 69), ('with', 67), ('major', 61), ('most', 59), ('smallest', 51), ('run', 47), ('mississippi', 44), ('usa', 42), ('does', 41), ('people', 41), ('have', 40), ('longest', 38), ('density', 36), ('borders', 36), ('lowest', 36), ('area', 36), ('us', 35), ('new', 34), ('colorado', 32), ('live', 31), ('runs', 31), ('where', 26), ('biggest', 23), ('california', 21), ('alaska', 19), ('united', 19), ('austin', 18), ('bordering', 18), ('there', 17), ('me', 16), ('york', 16), ('missouri', 16), ('named', 16), ('elevation', 15), ('all', 14), ('a', 14), ('high', 13), ('long', 13), ('ohio', 13), ('shortest', 13), ('iowa', 1

In [23]:
# Formula-language side: number of distinct tokens, then every
# (token, count) pair from most to least frequent.
print(len(fl_c), fl_c.most_common(), sep='\n')

172
[('(', 5029), (')', 5029), (',', 4488), ('b', 2159), ('a', 1715), ('const', 679), ('c', 614), ('loc', 496), ('state', 489), ('stateid', 404), ('river', 203), ('population', 167), ('next_to', 159), ('city', 151), ('largest', 144), ('traverse', 120), ('place', 103), ('cityid', 99), ('countryid', 96), ('usa', 96), ('d', 86), ('_', 85), ('capital', 81), ('texas', 79), ('highest', 79), ('count', 79), ('riverid', 71), ('major', 61), ('smallest', 61), ('mississippi', 44), ('density', 42), ('most', 42), ('longest', 40), ('area', 36), ('new', 34), ('colorado', 32), ('lowest', 32), ('len', 24), ('california', 21), ('alaska', 19), ('elevation', 18), ('austin', 18), ('york', 16), ('missouri', 15), ('\\+', 15), ('size', 14), ('ohio', 13), ('shortest', 13), ('iowa', 13), ('montana', 12), ('mountain', 12), ('mexico', 11), ('florida', 11), ('hawaii', 10), ('alabama', 9), ('placeid', 9), ('e', 8), ('pennsylvania', 8), ('wyoming', 8), ('washington', 8), ('lake', 7), ('utah', 7), ('mount', 7), ('rhod

In [24]:
# Rare words: natural-language tokens that occur fewer than two times.
rare_token_list = [token for token, freq in nl_token_freq_dict.items() if freq < 2]
print(len(rare_token_list), rare_token_list, sep='\n')

78
['could', 'count', 'elevations', 'lower', 'guadalupe', 'miles', 'platte', 'montgomery', 'chicago', 'detroit', 'minneapolis', 'riverside', 'spokane', 'lived', 'reside', 'stay', 'residents', 'found', 'traverse', 'longer', 'or', 'much', 'exist', 'list', 'washed', 'names', 'densities', 'potomac', 'about', 'adjacent', 'american', 'continental', 'durham', 'death', 'sea', 'level', 'traversed', 'flowing', 'maximum', 'dense', 'ga', 'erie', 'tempe', 'tucson', 'over', 'though', 'west', 'those', 'salem', 'flint', 'columbus', 'miami', 'pittsburgh', 'dover', 'contain', 'go', 'plano', 'salt', 'lake', 'them', 'neighbor', 'surround', 'whats', 'baton', 'rouge', 'fort', 'wayne', 'indianapolis', 'orleans', 'diego', 'jose', 'scotts', 'spot', 'goes', 'fewest', 'adjoin', 'pass', 'lie']


In [26]:
# Natural-language vocabulary: token -> integer index.
# Indices 0-3 are reserved for the special tokens; corpus tokens follow in the
# iteration order of nl_token_freq_dict.
nl_vocab_dict = {'<s>': 0, '</s>': 1, '<pad>': 2, '<unk>': 3}
for token_id, token in enumerate(nl_token_freq_dict, start=len(nl_vocab_dict)):
    nl_vocab_dict[token] = token_id
print(len(nl_vocab_dict))

285


In [27]:
# Formula-language vocabulary: token -> integer index.
# Indices 0-2 are reserved for the special tokens (no '<unk>' entry on this
# side); corpus tokens follow in the iteration order of fl_token_freq_dict.
fl_vocab_dict = {'<s>': 0, '</s>': 1, '<pad>': 2}
for token_id, token in enumerate(fl_token_freq_dict, start=len(fl_vocab_dict)):
    fl_vocab_dict[token] = token_id
print(len(fl_vocab_dict))

175


In [21]:
# replace unknown word with <unk>
# for i in range(len(tk_nl_list)):
#     for j in range(len(tk_nl_list[i])):
#         if tk_nl_list[i][j] not in all_vocab_dict.keys():
#             tk_nl_list[i][j] = '<unk>'

In [28]:
# Encoder inputs: every tokenized question mapped to its vocabulary indices.
encoder_inputs = [
    [nl_vocab_dict[word] for word in question]
    for question in tk_nl_list
]

In [29]:
# Decoder inputs: every tokenized formula mapped to its vocabulary indices.
decoder_inputs = [
    [fl_vocab_dict[symbol] for symbol in formula]
    for formula in tk_fl_list
]

In [30]:
# Decoder targets: every tokenized formula mapped to its vocabulary indices.
# NOTE(review): built exactly like decoder_inputs, and no '<s>' / '</s>'
# markers are added here - presumably the shift happens downstream; confirm.
decoder_targets = [
    [fl_vocab_dict[symbol] for symbol in formula]
    for formula in tk_fl_list
]

In [31]:
# Convert to NumPy arrays so the train/test split can select rows with an
# index array. The sequences differ in length, so dtype=object is required:
# since NumPy 1.24, np.array() on ragged nested lists raises ValueError
# unless the object dtype is requested explicitly.
encoder_inputs = np.array(encoder_inputs, dtype=object)
decoder_inputs = np.array(decoder_inputs, dtype=object)
decoder_targets = np.array(decoder_targets, dtype=object)

In [32]:
# Random train/test split of the encoded pairs.
TRAIN_SIZE = 680   # pairs used for training; all remaining pairs become test
SPLIT_SEED = 0     # fixed seed so re-running the notebook gives the same split

# Use the real number of pairs instead of a hard-coded corpus size.
n_pairs = len(encoder_inputs)
assert len(decoder_inputs) == n_pairs and len(decoder_targets) == n_pairs, \
    "encoder/decoder sequences are out of sync"

# A private RandomState keeps the shuffle reproducible without touching the
# global NumPy random state.
index = np.random.RandomState(SPLIT_SEED).permutation(n_pairs)
# One cut point -> exactly two chunks: [train indices, test indices].
index_list = np.split(index, [TRAIN_SIZE])

sequences = {
    'encoder_inputs': encoder_inputs,
    'decoder_inputs': decoder_inputs,
    'decoder_targets': decoder_targets,
}

# .tolist() turns the selected rows back into plain nested lists.
train_dict = {name: seqs[index_list[0]].tolist() for name, seqs in sequences.items()}
test_dict = {name: seqs[index_list[1]].tolist() for name, seqs in sequences.items()}

In [33]:
# Write the preprocessed splits and both vocabularies as JSON.
pp_data = {
    'train_dict': train_dict,
    'test_dict': test_dict,
}

vocab_dict = {
    'nl_vocab_dict': nl_vocab_dict,
    'fl_vocab_dict': fl_vocab_dict,
}

# ensure_ascii=False writes non-ASCII characters unescaped, so the file
# encoding is pinned to UTF-8 instead of depending on the platform default.
with open(PP_DATA_PATH, 'w', encoding='utf-8') as f:
    json.dump(pp_data, f, ensure_ascii=False)

with open(VOCAB_PATH, 'w', encoding='utf-8') as f:
    json.dump(vocab_dict, f, ensure_ascii=False)