In [1]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

In [1]:
import doctest
import sys
import json

sys.path.append("../../gts_test")
from src.train_and_evaluate import *
from src.models import *
from src.expressions_transfer import *

sys.path.append("..")
from libs.dataset_readers.text import preprocess_text
from libs.dataset_readers.equation import preprocess_equation
from libs.dataset_readers.equation_utils import infix_to_prefix

In [3]:
# Load the raw problem records and report how many there are.
raw_data_path = "../../gts_test/data/raw_original.json"
data = load_raw_data(raw_data_path)

print(len(data))

Reading lines...
1398
time: 14.2 ms


---
## Transfer num

In [4]:
# Run transfer_num over the raw records, then print the first raw record, the
# first resulting pair, and the two extra values transfer_num returns.
pairs, generate_nums, copy_nums = transfer_num(data)

for shown in (data[0], pairs[0], generate_nums, copy_nums):
    print(shown)

Transfer numbers...
{'id': '1', 'original_text': '一辆压路机每分钟行驶50米，压路的宽度为3米．如果行驶压路机12分钟，可以压路多少平方米？', 'segmented_text': '一 辆 压路机 每分钟 行驶 50 米 ， 压路 的 宽度 为 3 米 ． 如果 行驶 压路机 12 分钟 ， 可以 压路 多少 平方米 ？', 'equation': 'x=50*12*3', 'ans': '1800'}
(['一', '辆', '压路机', '每分钟', '行驶', 'NUM', '米', '，', '压路', '的', '宽度', '为', 'NUM', '米', '．', '如果', '行驶', '压路机', 'NUM', '分钟', '，', '可以', '压路', '多少', '平方米', '？'], ['N0', '*', 'N2', '*', 'N1'], ['50', '3', '12'], [5, 12, 18])
['3.14', '1']
8
time: 135 ms


In [5]:
# Preprocess every problem with the libs.dataset_readers helpers, collecting the
# results in four parallel lists (one entry per problem, in input order).
all_tokenized_text, all_numbers, all_positions, all_tokenized_equation = [], [], [], []

for record in data:
    segmented = record["segmented_text"]
    # Drop the first two characters of the equation (e.g. the "x=" in 'x=50*12*3').
    rhs = record["equation"][2:]
    tokens, nums, num_positions = preprocess_text(segmented, "NUM")
    eq_tokens = preprocess_equation(rhs, nums)

    all_tokenized_text.append(tokens)
    all_numbers.append(nums)
    all_positions.append(num_positions)
    all_tokenized_equation.append(eq_tokens)

time: 88.6 ms


In [6]:
# Check that the transfer_num output (`pairs`) and the libs.dataset_readers
# output agree on every problem, field by field and in the same order as before:
# text tokens, equation tokens, numbers, number positions.
# Every assertion reports the problem index and both values, so a mismatch is
# self-explanatory instead of surfacing as an empty AssertionError.
# NOTE(review): the recorded run of this cell fails; the printed samples show
# equation tokens like 'N0' in `pairs` but '<N0>' in all_tokenized_equation —
# confirm that this is the cause.
field_checks = (
    ("text tokens", all_tokenized_text),
    ("equation tokens", all_tokenized_equation),
    ("numbers", all_numbers),
    ("number positions", all_positions),
)
for idx in range(len(data)):
    for field, (label, ours) in enumerate(field_checks):
        expected, actual = pairs[idx][field], ours[idx]
        assert expected == actual, (
            f"{label} differ for problem index {idx}: "
            f"pairs={expected!r} vs libs={actual!r}"
        )


AssertionError: 

time: 383 ms


In [7]:
# Display the first equation as tokenized by preprocess_equation, for eyeballing
# against pairs[0][1] printed earlier.
all_tokenized_equation[0]

['<N0>', '*', '<N2>', '*', '<N1>']

time: 5.03 ms


---
## Infix to prefix

In [8]:
# Rewrite the equation (second field) of every pair with from_infix_to_prefix;
# the other three fields are carried over unchanged.
# NOTE: this rebinds `pairs`, so running the cell a second time would feed the
# already-converted equations through the conversion again.
temp_pairs = [
    (pair[0], from_infix_to_prefix(pair[1]), pair[2], pair[3])
    for pair in pairs
]
pairs = temp_pairs

time: 16.7 ms


In [9]:
# Apply infix_to_prefix to each of our tokenized equations.
# NOTE: rebinds all_tokenized_equation, so the cell is not safe to run twice.
all_tokenized_equation = list(map(infix_to_prefix, all_tokenized_equation))

time: 13 ms


In [10]:
# Repeat the field-by-field comparison after both equation sets went through
# their infix-to-prefix conversion. Fields are checked in the same order as
# before: text tokens, equation tokens, numbers, number positions.
# Every assertion reports the problem index and both values, so a mismatch is
# self-explanatory instead of surfacing as an empty AssertionError.
# NOTE(review): the recorded run of this cell fails as well — rerun to see
# which field and index the message names.
field_checks = (
    ("text tokens", all_tokenized_text),
    ("prefix equation tokens", all_tokenized_equation),
    ("numbers", all_numbers),
    ("number positions", all_positions),
)
for idx in range(len(data)):
    for field, (label, ours) in enumerate(field_checks):
        expected, actual = pairs[idx][field], ours[idx]
        assert expected == actual, (
            f"{label} differ for problem index {idx}: "
            f"pairs={expected!r} vs libs={actual!r}"
        )


AssertionError: 

time: 15.8 ms


---
## Create 5-fold split

In [11]:
# Cut `pairs` into five contiguous folds: the first four each hold fold_size
# items (20% of the data, truncated to an int) and the fifth takes the rest.
fold_size = int(len(pairs) * 0.2)
fold_pairs = [
    pairs[fold_size * k:fold_size * (k + 1)]
    for k in range(4)
]
fold_pairs.append(pairs[(fold_size * 4):])

time: 1.78 ms


---
## Create vocab

In [12]:
def read_vocab(path):
    """Read a one-token-per-line vocabulary file into a set.

    Args:
        path: Path of the vocabulary text file.

    Returns:
        set of str: The distinct lines of the file, each with its trailing
        whitespace (including the newline) stripped. A blank line therefore
        contributes the empty string.
    """
    # Decode explicitly as UTF-8: the vocabulary contains non-ASCII (Chinese)
    # tokens, and the platform default encoding is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file object directly instead of materialising readlines().
        return {line.rstrip() for line in f}

time: 1.9 ms


In [13]:
# Build the vocabularies from a fold-0 split: fold 0 is held out as the test
# set and the remaining four folds are concatenated into the training set.
fold = 0
pairs_tested = []
pairs_trained = []
for fold_t in range(5):
    destination = pairs_tested if fold_t == fold else pairs_trained
    destination.extend(fold_pairs[fold_t])

# NOTE(review): the positional 5 is presumably a minimum word-count threshold —
# confirm against prepare_data's signature.
input_lang, output_lang, train_pairs, test_pairs = prepare_data(
    pairs_trained,
    pairs_tested,
    5,
    generate_nums,
    copy_nums,
    tree=True,
)

Indexing words...
keep_words 587 / 2009 = 0.2922
Indexed 590 words in input language, 16 words in output
Number of training data 1119
Number of testind data 279
time: 277 ms


In [14]:
# Print the word count of the input and output languages, then the full
# index-to-word table of the output language.
for lang in (input_lang, output_lang):
    print(lang.n_words)
print(output_lang.index2word)

590
16
['/', '*', '+', '-', '^', '3.14', '1', 'N0', 'N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'UNK']
time: 2.28 ms


In [16]:
# Diff the input-language vocabulary against the tokens file saved for fold 0:
# first the entries only input_lang has, then the entries only the file has.
input_vocab1 = set(input_lang.index2word)
input_vocab2 = read_vocab("../results/seq2tree/check_loss/fold0/vocabulary/tokens.txt")
print(input_vocab1 - input_vocab2)
print(input_vocab2 - input_vocab1)

{'UNK', 'NUM', '沙', 'PAD'}
{'@@UNKNOWN@@', '<NUM>'}
time: 2.79 ms


In [17]:
# Look through the first 200 training examples and print, together with its
# position, every example whose last field is truthy (non-empty).
for position, example in enumerate(train_pairs[:200]):
    if not example[-1]:
        continue
    print(position, example)

101 ([3, 133, 272, 2, 6, 1, 255, 200, 2, 122, 2, 31, 2, 6, 199, 129, 1, 19, 7, 155, 351, 47, 2, 352, 26], 25, [1, 15, 15], 3, ['4', '4'], [5, 16], [[0, 1], [0, 1]])
133 ([2, 2, 7, 182, 136, 17, 77, 129, 1, 120, 6, 74, 2, 7, 16, 100, 6, 1, 255, 224, 1, 120, 391, 123, 340, 230, 2, 31, 210, 391, 136, 25, 368, 2, 31], 35, [0, 7, 15], 3, ['48', '4', '4'], [8, 17, 20], [[1, 2]])
175 ([3, 400, 401, 6, 18, 9, 1, 101, 7, 20, 9, 1, 101, 7, 99, 9, 1, 101, 31, 245, 203, 401, 6, 2, 106, 1, 184, 432, 7, 2, 2, 106, 2, 96, 1, 101, 98, 1, 7, 203, 6, 96, 37, 25, 38, 26], 46, [3, 2, 2, 1, 7, 8, 1, 1, 7, 15, 12, 1, 1, 8, 15, 12, 11], 17, ['10', '8', '4', '4', '25', '2'], [6, 11, 16, 25, 34, 37], [[2, 3], [2, 3]])
time: 3.9 ms


In [18]:
# Scratch check: list.reverse() reverses the list in place, so displaying `a`
# afterwards shows the reversed order.
a = [1, 2]
a.reverse()
a

[2, 1]

time: 2.78 ms


---
## Doctesting

In [19]:
# Run the examples in preprocess_text's docstring. With the default
# verbose=False only failures are printed, so no output means nothing failed.
doctest.run_docstring_examples(preprocess_text, globals())

time: 10.8 ms


In [20]:
# Run the examples in preprocess_equation's docstring. With the default
# verbose=False only failures are printed, so no output means nothing failed.
doctest.run_docstring_examples(preprocess_equation, globals())

time: 2.53 ms
