In [1]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

In [2]:
import doctest
import sys
import json

from allennlp.nn import util as nn_util
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler, MaxTokensBatchSampler

sys.path.append("../../gts_test")
from src.train_and_evaluate import *
from src.models import *
from src.expressions_transfer import *

sys.path.append("..")
from libs.dataset_readers.math23k_reader import Math23kReader
from libs.dataset_readers.math23k_transformer_reader import Math23kTransformerReader



time: 4.84 s


In [3]:
def read_vocab(path):
    """Read a one-token-per-line vocabulary file into a set.

    Args:
        path: Location of the vocabulary text file.

    Returns:
        set: The distinct lines of the file, each with its trailing
        whitespace (including the line terminator) stripped.
    """
    # Decode explicitly as UTF-8 instead of the platform default: the
    # vocabularies compared in this notebook contain Chinese tokens, which
    # would be mis-decoded (or raise) under a non-UTF-8 locale.
    # NOTE(review): assumes the vocabulary file was written as UTF-8 -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterating the handle streams lines; no need for readlines() or an
        # intermediate list since only the set of tokens is returned.
        return {line.rstrip() for line in f}

time: 1.84 ms


In [4]:
# Read the raw problem file used by the GTS pipeline and report how many
# entries it contains.
data = load_raw_data(
    "../../gts_test/data/raw_original.json"
)
print(len(data))

Reading lines...
1398
time: 16.4 ms


---
## GTS part

In [5]:
# Run the GTS preprocessing on the raw problems.
# NOTE(review): judging by the printed pairs, each pair is
# (tokens with NUM placeholders, equation, numbers, number positions) -- confirm.
pairs, generate_nums, copy_nums = transfer_num(data)

# Rewrite only the equation (element 1) from infix to prefix order; the other
# three elements of each pair are carried over unchanged.
temp_pairs = [
    (pair[0], from_infix_to_prefix(pair[1]), pair[2], pair[3])
    for pair in pairs
]
pairs = temp_pairs

# The first 280 pairs form the test split, everything after them the train split.
pairs_tested, pairs_trained = pairs[:280], pairs[280:]

# NOTE(review): the literal 5 is presumably the minimum word count used when
# trimming the input vocabulary -- confirm against prepare_data.
input_lang, output_lang, train_pairs, test_pairs = prepare_data(
    pairs_trained,
    pairs_tested,
    5,
    generate_nums,
    copy_nums,
    tree=True,
)

batch_size = 32

(
    input_batches,
    input_lengths,
    output_batches,
    output_lengths,
    nums_batches,
    num_stack_batches,
    num_pos_batches,
    num_size_batches,
) = prepare_train_batch(train_pairs, batch_size)

Transfer numbers...
Indexing words...
keep_words 586 / 2006 = 0.2921
Indexed 589 words in input language, 16 words in output
Number of training data 1118
Number of testind data 280
time: 690 ms


In [6]:
# Show the first prepared (indexed) training pair, the first raw training pair
# it was built from, and the number of prepared pairs -- one item per line.
print(train_pairs[0], pairs_trained[0], len(train_pairs), sep="\n")

([3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 1, 9, 13, 14, 4, 5, 15, 16, 17, 18, 6, 19, 20, 21, 22, 23], 27, [0, 0, 5, 8, 7], 5, ['1.6', '0.5'], [6, 11], [])
(['一', '台', '压路机', '的', '滚筒', '长', 'NUM', '米', '，', '直径', '是', 'NUM', '米', '．', '这', '台', '压路机', '滚动', '一周', '压', '过', '的', '路面', '=', '多少', '平方米', '？'], ['*', '*', '3.14', 'N1', 'N0'], ['1.6', '0.5'], [6, 11])
1118
time: 2.35 ms


In [7]:
# Print the GTS output-language (equation-side) word list; a token's position
# in this list is its index.
print(output_lang.index2word)

['*', '/', '+', '-', '^', '3.14', '1', 'N0', 'N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'UNK']
time: 1.45 ms


---
## Normal reader

In [16]:
# Fold-0 training split. `path` is the single source of truth for the file
# location: the data-loader cell further down reads the same variable, so the
# instances listed here and the batches built there come from the same file.
path = "../data/geometry_5fold_raw/fold0_train.json"

reader = Math23kReader(num_token_type="NUM")
# Use `path` rather than repeating the literal, so the two cannot drift apart.
dataset = list(reader.read(path))

# Number of instances, then the source tokens of the first one, then the
# full first instance.
print(len(dataset))
print(dataset[0]["source_tokens"])
print(dataset[0])

1118
TextField of length 27 with text: 
 		[一, 台, 压路机, 的, 滚筒, 长, <NUM>, 米, ，, 直径, 是, <NUM>, 米, ．, 这, 台, 压路机, 滚动, 一周, 压, 过, 的, 路面, =, 多少, 平方米,
		？]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
Instance with fields:
 	 source_tokens: TextField of length 27 with text: 
 		[一, 台, 压路机, 的, 滚筒, 长, <NUM>, 米, ，, 直径, 是, <NUM>, 米, ．, 这, 台, 压路机, 滚动, 一周, 压, 过, 的, 路面, =, 多少, 平方米,
		？]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 target_tokens: TextField of length 5 with text: 
 		[*, *, 3.14, <N1>, <N0>]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 metadata: MetadataField (print field.metadata to see specific information). 

time: 176 ms


In [9]:
# Vocabulary check: compare three views of the source-side vocabulary and
# print every one-directional set difference between them.

# 1) Words known to the GTS input language.
GTS_vocab = set(input_lang.index2word)

# 2) Vocabulary built by AllenNLP from the instances read above.
vocab = Vocabulary.from_instances(
    dataset,
    min_count={"tokens": 5},
    pretrained_files={"target_vocab": "../extra_files/equation_vocab.txt"},
    only_include_pretrained_words=True,
)
allennlp_vocab = set(vocab.get_token_to_index_vocabulary(namespace='tokens').keys())

# 3) Token list stored on disk.
# NOTE(review): presumably saved by an earlier training run -- confirm.
file_vocab = read_vocab("../results/seq2tree/check_loss/fold0/vocabulary/tokens.txt")

# Differences, in order: GTS vs AllenNLP (both directions), AllenNLP vs file
# (both directions), GTS vs file (both directions).
print(GTS_vocab - allennlp_vocab)
print(allennlp_vocab - GTS_vocab)
print(allennlp_vocab - file_vocab)
print(file_vocab - allennlp_vocab)
print(GTS_vocab - file_vocab)
print(file_vocab - GTS_vocab)

HBox(children=(FloatProgress(value=0.0, description='building vocab', max=1118.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

{'UNK', 'PAD', 'NUM'}
{'<NUM>', '@@PADDING@@', '@@UNKNOWN@@'}
{'@@PADDING@@'}
set()
{'UNK', 'PAD', 'NUM'}
{'<NUM>', '@@UNKNOWN@@'}
time: 97.8 ms





In [10]:
print("Default:")

# One instance per batch and no shuffling, so batch k corresponds to the k-th
# instance in file order.
data_loader = MultiProcessDataLoader(
    reader=reader,
    data_path=path,
    batch_size=1,
    shuffle=False,
)
# Attach the vocabulary built above so the loader can turn tokens into ids.
data_loader.index_with(vocab)

# Materialise every batch up front so they can be indexed and zipped later.
allennlp_data = list(data_loader)
print(len(allennlp_data))

Default:


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='loading instances', max=1.0, style=Prog…

1118
time: 537 ms





In [11]:
# Load the JSON mapping used below to translate AllenNLP target tokens into
# GTS output ids. The encoding is given explicitly because JSON text is UTF-8
# while open() otherwise falls back to the platform's locale encoding.
with open("../extra_files/id_mapping_arithmetic.json", "r", encoding="utf-8") as f:
    mapp = json.load(f)

time: 1.75 ms


In [30]:
def generate_num_stack(metadata, vocabulary=None):
    """Build the per-problem "number stack" for a batch of metadata dicts.

    For every target token that is out-of-vocabulary in the ``target_vocab``
    namespace, one entry is produced: the indices in the problem's
    ``numbers`` list whose value equals that token or, when no number
    matches, the indices of all numbers. The entries of each problem are
    returned in reverse token order.

    Args:
        metadata: Iterable of dicts, each providing ``"target_tokens"`` and
            ``"numbers"``.
        vocabulary: Object exposing ``get_token_index(token, namespace)``.
            Defaults to the notebook-level ``vocab`` so existing one-argument
            calls keep working.

    Returns:
        list: One list per problem, each holding one list of number indices
        per out-of-vocabulary target token.
    """
    if vocabulary is None:
        # Backward-compatible fallback to the vocabulary built earlier in the
        # notebook.
        vocabulary = vocab

    # The OOV index does not change between tokens, so look it up once.
    unk_index = vocabulary.get_token_index("@@UNKNOWN@@", "target_vocab")

    num_stack_batch = []
    for prob_metadata in metadata:
        numbers = prob_metadata["numbers"]
        num_stack = []
        for word in prob_metadata["target_tokens"]:
            # Only tokens unknown to the target vocabulary get an entry.
            if vocabulary.get_token_index(word, "target_vocab") != unk_index:
                continue
            matches = [idx for idx, number in enumerate(numbers) if number == word]
            # No number equals the token: every number becomes a candidate.
            num_stack.append(matches or list(range(len(numbers))))
        num_stack.reverse()
        num_stack_batch.append(num_stack)
    return num_stack_batch

time: 2.96 ms


In [34]:
# Parity check: every GTS training pair must match the AllenNLP batch at the
# same position. What is compared / relied upon:
# 0. No shuffle
# 1. Input length
# 2. Convert to ids
# 3. Numbers, positions, num_stack

# zip() stops at the shorter input, so a missing example on either side would
# otherwise go unnoticed and the check would pass vacuously.
assert len(train_pairs) == len(allennlp_data), (
    f"GTS has {len(train_pairs)} examples but AllenNLP has {len(allennlp_data)}"
)

for index, (gts_pair, allen_batch) in enumerate(zip(train_pairs, allennlp_data)):

    # Compare the text
    gts_text = " ".join([input_lang.index2word[x] for x in gts_pair[0]])
    gts_length = gts_pair[1]
    # Batches hold a single instance, hence the [0].
    allen_tokens = [
        vocab.get_token_from_index(x.item())
        for x in allen_batch["source_tokens"]["tokens"]["tokens"][0]
    ]
    allen_length = len(allen_tokens)
    # Map AllenNLP's special tokens onto the GTS spellings before comparing.
    allen_text = " ".join(allen_tokens).replace("<NUM>", "NUM").replace("@@UNKNOWN@@", "UNK")
    assert gts_text == allen_text, (
        f"example {index}: text differs\n  GTS:      {gts_text}\n  AllenNLP: {allen_text}"
    )
    assert gts_length == allen_length, (
        f"example {index}: length differs ({gts_length} != {allen_length})"
    )

    # Compare the equation
    gts_equation_ids = gts_pair[2]
    allen_equation_ids = [
        mapp[vocab.get_token_from_index(x.item(), "target_vocab")]
        for x in allen_batch["target_tokens"]["tokens"]["tokens"][0]
    ]
    assert gts_equation_ids == allen_equation_ids, (
        f"example {index}: equation ids differ ({gts_equation_ids} != {allen_equation_ids})"
    )

    # Compare the numbers and positions
    gts_numbers = gts_pair[4]
    gts_positions = gts_pair[5]
    allen_numbers = allen_batch["metadata"][0]["numbers"]
    allen_positions = allen_batch["metadata"][0]["positions"]
    assert gts_numbers == allen_numbers, (
        f"example {index}: numbers differ ({gts_numbers} != {allen_numbers})"
    )
    assert gts_positions == allen_positions, (
        f"example {index}: positions differ ({gts_positions} != {allen_positions})"
    )

    # Compare the num stack
    gts_num_stack = gts_pair[6]
    allen_copy_positions = generate_num_stack(allen_batch["metadata"])[0]
    assert gts_num_stack == allen_copy_positions, (
        f"example {index}: num stack differs ({gts_num_stack} != {allen_copy_positions})"
    )

time: 102 ms


---
## BERT reader

In [3]:
# path = "../data/geometry_5fold_raw/fold0_train.json"

# reader = Math23kTransformerReader(num_token_type="NUM")
# dataset = list(reader.read("../data/geometry_5fold_raw/fold0_train.json"))
# print(len(dataset))
# print(dataset[0])
# print(dataset[0]["metadata"].human_readable_repr())

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


1118
Instance with fields:
 	 source_tokens: TextField of length 41 with text: 
 		[[CLS], 一, 台, 压, 路, 机, 的, 滚, 筒, 长, <NUM>, 米, ，, 直, 径, 是, <NUM>, 米, ．, 这, 台, 压, 路, 机, 滚, 动, 一, 周, 压,
		过, 的, 路, 面, =, 多, 少, 平, 方, 米, ？, [SEP]]
 		and TokenIndexers : {'tokens': 'PretrainedTransformerIndexer'} 
 	 target_tokens: TextField of length 5 with text: 
 		[*, *, 3.14, <N1>, <N0>]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 metadata: MetadataField (print field.metadata to see specific information). 

{'id': '281', 'problem': '一 台 压路机 的 滚筒 长 1.6 米 ， 直径 是 0.5 米 ． 这 台 压路机 滚动 一周 压 过 的 路面 = 多少 平方米 ？', 'equation': 'x=3.14*0.5*1.6', 'answer': '2.512', 'numbers': ['1.6', '0.5'], 'positions': [10, 16], 'source_tokens': ['一', '台', '压路机', '的', '滚筒', '长', '<NUM>', '米', '，', '直径', '是', '<NUM>', '米', '．', '这', '台', '压路机', '滚动', '一周', '压', '过', '的', '路面', '=', '多少', '平方米', '？'], 'target_tokens': ['*', '*', '3.14', '<N1>', '<N0>']}
time: 16.7 s
