In [1]:
# coding: utf-8
from src.train_and_evaluate import *
from src.models import *
import time
import torch.optim
from src.expressions_transfer import *
import json
import numpy as np

def read_json(path):
    with open(path,'r') as f:
        file = json.load(f)
    return file

def write_json(path,file):
    with open(path,'w') as f:
        json.dump(file,f)
        

# Data Preprocessing

In [4]:
def get_train_test_fold(ori_path,prefix,data,pairs,group):
    mode_train = 'train'
    mode_valid = 'valid'
    mode_test = 'test'
    train_path = ori_path + mode_train + prefix
    valid_path = ori_path + mode_valid + prefix
    test_path = ori_path + mode_test + prefix
    train = read_json(train_path)
    train_id = [item['id'] for item in train]
    valid = read_json(valid_path)
    valid_id = [item['id'] for item in valid]
    test = read_json(test_path)
    test_id = [item['id'] for item in test]
    train_fold = []
    valid_fold = []
    test_fold = []
    for item,pair,g in zip(data, pairs, group):
        pair = list(pair)
        pair.append(g['group_num'])
        pair = tuple(pair)
        if item['id'] in train_id:
            train_fold.append(pair)
        elif item['id'] in test_id:
            test_fold.append(pair)
        else:
            valid_fold.append(pair)
    return train_fold, test_fold, valid_fold

In [5]:
def load_raw_data(filename):  # load the json data to list(dict()) for MATH 23K
    print("Reading lines...")
    f = open(filename, encoding="utf-8")
    js = ""
    data = []
    for i, s in enumerate(f):
        js += s
        i += 1
        if i % 7 == 0:  # every 7 line is a json
            data_d = json.loads(js)
            if "千米/小时" in data_d["equation"]:
                data_d["equation"] = data_d["equation"][:-5]
            data.append(data_d)
            js = ""

    return data

In [10]:
def prepare_data(pairs_trained, pairs_tested, trim_min_count, generate_nums, copy_nums,group_train, group_test, tree=False):
    input_lang = Lang()
    output_lang = Lang()
    train_pairs = []
    test_pairs = []

    print("Indexing words...")
    for pair in pairs_trained:
        if not tree:
            input_lang.add_sen_to_vocab(pair[0])
            output_lang.add_sen_to_vocab(pair[1])
        elif pair[-1]:
            input_lang.add_sen_to_vocab(pair[0])
            output_lang.add_sen_to_vocab(pair[1])
    input_lang.build_input_lang(trim_min_count)
    if tree:
        output_lang.build_output_lang_for_tree(generate_nums, copy_nums)
    else:
        output_lang.build_output_lang(generate_nums, copy_nums)

    for pair in pairs_trained:
        num_stack = []
        for word in pair[1]:
            temp_num = []
            flag_not = True
            if word not in output_lang.index2word:
                flag_not = False
                for i, j in enumerate(pair[2]):
                    if j == word:
                        temp_num.append(i)

            if not flag_not and len(temp_num) != 0:
                num_stack.append(temp_num)
            if not flag_not and len(temp_num) == 0:
                num_stack.append([_ for _ in range(len(pair[2]))])

        num_stack.reverse()
        input_cell = indexes_from_sentence(input_lang, pair[0])
        output_cell = indexes_from_sentence(output_lang, pair[1], tree)
        # train_pairs.append((input_cell, len(input_cell), output_cell, len(output_cell),
        #                     pair[2], pair[3], num_stack, pair[4]))
        train_pairs.append((input_cell, len(input_cell), output_cell, len(output_cell),
                            pair[2], pair[3], num_stack))
    print('Indexed %d words in input language, %d words in output' % (input_lang.n_words, output_lang.n_words))
    print('Number of training data %d' % (len(train_pairs)))
    for pair in pairs_tested:
        num_stack = []
        for word in pair[1]:
            temp_num = []
            flag_not = True
            if word not in output_lang.index2word:
                flag_not = False
                for i, j in enumerate(pair[2]):
                    if j == word:
                        temp_num.append(i)

            if not flag_not and len(temp_num) != 0:
                num_stack.append(temp_num)
            if not flag_not and len(temp_num) == 0:
                num_stack.append([_ for _ in range(len(pair[2]))])

        num_stack.reverse()
        input_cell = indexes_from_sentence(input_lang, pair[0])
        output_cell = indexes_from_sentence(output_lang, pair[1], tree)
        # train_pairs.append((input_cell, len(input_cell), output_cell, len(output_cell),
        #                     pair[2], pair[3], num_stack, pair[4]))
        test_pairs.append((input_cell, len(input_cell), output_cell, len(output_cell),
                           pair[2], pair[3], num_stack))
    print('Number of testind data %d' % (len(test_pairs)))
    return input_lang, output_lang, train_pairs, test_pairs

In [49]:
def change_num(num):
    new_num = []
    for item in num:
        if '/' in item:
            new_str = item.split(')')[0]
            new_str = new_str.split('(')[1]
            a = float(new_str.split('/')[0])
            b = float(new_str.split('/')[1])
            value = a/b
            new_num.append(value)
        elif '%' in item:
            value = float(item[0:-1])/100
            new_num.append(value)
        else:
            new_num.append(float(item))
    return new_num

[1.0, 0.5, 0.02, 2.0]


In [50]:
batch_size = 64
embedding_size = 128
hidden_size = 512
n_epochs = 80
learning_rate = 1e-3
weight_decay = 1e-5
beam_size = 5
n_layers = 2
ori_path = '../graph_quantity_multigraph_trans/data/'
prefix = '23k_processed.json'


data = load_raw_data("data/Math_23K.json")
group_data = read_json("data/Math_23K_processed.json")

data = load_raw_data("data/Math_23K.json")

pairs, generate_nums, copy_nums = transfer_num(data)

temp_pairs = []
for p in pairs:
    temp_pairs.append((p[0], from_infix_to_prefix(p[1]), change_num(p[2]), p[3]))
pairs = temp_pairs

train_fold, test_fold, valid_fold = get_train_test_fold(ori_path,prefix,data,pairs,group_data)


best_acc_fold = []

pairs_tested = test_fold
pairs_trained = train_fold


Reading lines...
Reading lines...
Transfer numbers...


In [51]:
def prepare_data(pairs_trained, pairs_tested, trim_min_count, generate_nums, copy_nums, tree=False):
    input_lang = Lang()
    output_lang = Lang()
    train_pairs = []
    test_pairs = []

    print("Indexing words...")
    for pair in pairs_trained:
        if not tree:
            input_lang.add_sen_to_vocab(pair[0])
            output_lang.add_sen_to_vocab(pair[1])
        elif pair[-1]:
            input_lang.add_sen_to_vocab(pair[0])
            output_lang.add_sen_to_vocab(pair[1])
    input_lang.build_input_lang(trim_min_count)
    if tree:
        output_lang.build_output_lang_for_tree(generate_nums, copy_nums)
    else:
        output_lang.build_output_lang(generate_nums, copy_nums)

    for pair in pairs_trained:
        num_stack = []
        for word in pair[1]:
            temp_num = []
            flag_not = True
            if word not in output_lang.index2word:
                flag_not = False
                for i, j in enumerate(pair[2]):
                    if j == word:
                        temp_num.append(i)

            if not flag_not and len(temp_num) != 0:
                num_stack.append(temp_num)
            if not flag_not and len(temp_num) == 0:
                num_stack.append([_ for _ in range(len(pair[2]))])

        num_stack.reverse()
        input_cell = indexes_from_sentence(input_lang, pair[0])
        output_cell = indexes_from_sentence(output_lang, pair[1], tree)
        # train_pairs.append((input_cell, len(input_cell), output_cell, len(output_cell),
        #                     pair[2], pair[3], num_stack, pair[4]))
        train_pairs.append((input_cell, len(input_cell), output_cell, len(output_cell),
                            pair[2], pair[3], num_stack, pair[4]))
    print('Indexed %d words in input language, %d words in output' % (input_lang.n_words, output_lang.n_words))
    print('Number of training data %d' % (len(train_pairs)))
    for pair in pairs_tested:
        num_stack = []
        for word in pair[1]:
            temp_num = []
            flag_not = True
            if word not in output_lang.index2word:
                flag_not = False
                for i, j in enumerate(pair[2]):
                    if j == word:
                        temp_num.append(i)

            if not flag_not and len(temp_num) != 0:
                num_stack.append(temp_num)
            if not flag_not and len(temp_num) == 0:
                num_stack.append([_ for _ in range(len(pair[2]))])

        num_stack.reverse()
        input_cell = indexes_from_sentence(input_lang, pair[0])
        output_cell = indexes_from_sentence(output_lang, pair[1], tree)
        # train_pairs.append((input_cell, len(input_cell), output_cell, len(output_cell),
        #                     pair[2], pair[3], num_stack, pair[4]))
        test_pairs.append((input_cell, len(input_cell), output_cell, len(output_cell),
                           pair[2], pair[3], num_stack,pair[4]))
    print('Number of testind data %d' % (len(test_pairs)))
    return input_lang, output_lang, train_pairs, test_pairs
input_lang, output_lang, train_pairs, test_pairs = prepare_data(pairs_trained, pairs_tested, 5, generate_nums,
                                                                copy_nums, tree=True)

Indexing words...
keep_words 3928 / 10543 = 0.3726
Indexed 3931 words in input language, 23 words in output
Number of training data 21162
Number of testind data 1000


In [13]:
def prepare_train_batch(pairs_to_batch, batch_size):
    pairs = copy.deepcopy(pairs_to_batch)
    random.shuffle(pairs)  # shuffle the pairs
    pos = 0
    input_lengths = []
    output_lengths = []
    nums_batches = []
    batches = []
    input_batches = []
    output_batches = []
    num_stack_batches = []  # save the num stack which
    num_pos_batches = []
    num_size_batches = []
    group_batches = []
    num_value_batches = []
    while pos + batch_size < len(pairs):
        batches.append(pairs[pos:pos+batch_size])
        pos += batch_size
    batches.append(pairs[pos:])

    for batch in batches:
        batch = sorted(batch, key=lambda tp: tp[1], reverse=True)
        input_length = []
        output_length = []
        for _, i, _, j, _, _, _,_ in batch:
            input_length.append(i)
            output_length.append(j)
        input_lengths.append(input_length)
        output_lengths.append(output_length)
        input_len_max = input_length[0]
        output_len_max = max(output_length)
        input_batch = []
        output_batch = []
        num_batch = []
        num_stack_batch = []
        num_pos_batch = []
        num_size_batch = []
        group_batch = []
        num_value_batch = []
        for i, li, j, lj, num, num_pos, num_stack, group in batch:
            num_batch.append(len(num))
            input_batch.append(pad_seq(i, li, input_len_max))
            output_batch.append(pad_seq(j, lj, output_len_max))
            num_stack_batch.append(num_stack)
            num_pos_batch.append(num_pos)
            num_size_batch.append(len(num_pos))
            num_value_batch.append(num)
            group_batch.append(group)
        input_batches.append(input_batch)
        nums_batches.append(num_batch)
        output_batches.append(output_batch)
        num_stack_batches.append(num_stack_batch)
        num_pos_batches.append(num_pos_batch)
        num_size_batches.append(num_size_batch)
        num_value_batches.append(num_value_batch)
        group_batches.append(group_batch)
        
    return input_batches, input_lengths, output_batches, output_lengths, nums_batches, num_stack_batches, num_pos_batches, num_size_batches, num_value_batches, group_batches

In [52]:
input_batches, input_lengths, output_batches, output_lengths, nums_batches, \
   num_stack_batches, num_pos_batches, num_size_batches, num_value_batches, group_batches = prepare_train_batch(train_pairs, batch_size)

# Build and Test Model 

## Build Model and Optimizers

In [53]:
# Initialize models
encoder = EncoderSeq(input_size=input_lang.n_words, embedding_size=embedding_size, hidden_size=hidden_size,
                     n_layers=n_layers)
predict = Prediction(hidden_size=hidden_size, op_nums=output_lang.n_words - copy_nums - 1 - len(generate_nums),
                     input_size=len(generate_nums))
generate = GenerateNode(hidden_size=hidden_size, op_nums=output_lang.n_words - copy_nums - 1 - len(generate_nums),
                        embedding_size=embedding_size)
merge = Merge(hidden_size=hidden_size, embedding_size=embedding_size)
# the embedding layer is  only for generated number embeddings, operators, and paddings

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
predict_optimizer = torch.optim.Adam(predict.parameters(), lr=learning_rate, weight_decay=weight_decay)
generate_optimizer = torch.optim.Adam(generate.parameters(), lr=learning_rate, weight_decay=weight_decay)
merge_optimizer = torch.optim.Adam(merge.parameters(), lr=learning_rate, weight_decay=weight_decay)

encoder_scheduler = torch.optim.lr_scheduler.StepLR(encoder_optimizer, step_size=20, gamma=0.5)
predict_scheduler = torch.optim.lr_scheduler.StepLR(predict_optimizer, step_size=20, gamma=0.5)
generate_scheduler = torch.optim.lr_scheduler.StepLR(generate_optimizer, step_size=20, gamma=0.5)
merge_scheduler = torch.optim.lr_scheduler.StepLR(merge_optimizer, step_size=20, gamma=0.5)

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    predict.cuda()
    generate.cuda()
    merge.cuda()

generate_num_ids = []
for num in generate_nums:
    generate_num_ids.append(output_lang.word2index[num])

In [16]:
#for epoch in range(n_epochs):
encoder_scheduler.step()
predict_scheduler.step()
generate_scheduler.step()
merge_scheduler.step()
loss_total = 0

## Add Sample Data 

In [54]:
idx = 0
(input_batch, input_length, target_batch, target_length, nums_stack_batch, num_size_batch, generate_nums,
               encoder, predict, generate, merge, encoder_optimizer, predict_optimizer, generate_optimizer,
               merge_optimizer, output_lang, num_pos, num_value, group) = (input_batches[idx], input_lengths[idx], output_batches[idx], output_lengths[idx],
    num_stack_batches[idx], num_size_batches[idx], generate_num_ids, encoder, predict, generate, merge,
    encoder_optimizer, predict_optimizer, generate_optimizer, merge_optimizer, output_lang, num_pos_batches[idx], num_value_batches[idx], group_batches[idx])
english=False

In [55]:
print(input_length)

[51, 50, 47, 46, 45, 44, 42, 42, 41, 39, 36, 33, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 29, 29, 27, 27, 27, 26, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 18, 17, 17, 17, 17, 15, 15, 11, 11, 10, 10, 9]


## Train_tree Function

In [56]:
# sequence mask for attention
seq_mask = []
max_len = max(input_length)
for i in input_length:
    seq_mask.append([0 for _ in range(i)] + [1 for _ in range(i, max_len)])
seq_mask = torch.ByteTensor(seq_mask)

In [57]:
num_mask = []
max_num_size = max(num_size_batch) + len(generate_nums)
for i in num_size_batch:
    d = i + len(generate_nums)
    num_mask.append([0] * d + [1] * (max_num_size - d))
num_mask = torch.ByteTensor(num_mask)

In [58]:
unk = output_lang.word2index["UNK"]

# Turn padded arrays into (batch_size x max_len) tensors, transpose into (max_len x batch_size)
input_var = torch.LongTensor(input_batch).transpose(0, 1)

target = torch.LongTensor(target_batch).transpose(0, 1)

padding_hidden = torch.FloatTensor([0.0 for _ in range(predict.hidden_size)]).unsqueeze(0)
batch_size = len(input_length)

In [59]:
encoder.train()
predict.train()
generate.train()
merge.train()

if USE_CUDA:
    input_var = input_var.cuda()
    seq_mask = seq_mask.cuda()
    padding_hidden = padding_hidden.cuda()
    num_mask = num_mask.cuda()

# Zero gradients of both optimizers
encoder_optimizer.zero_grad()
predict_optimizer.zero_grad()
generate_optimizer.zero_grad()
merge_optimizer.zero_grad()
# Run words through encoder

encoder_outputs, problem_output = encoder(input_var, input_length)

## Get Graph

In [91]:
# num net graph
def get_lower_num_graph(max_len, sentence_length, num_list, id_num_list,contain_zh_flag=True):
    diag_ele = np.zeros(max_len)
    for i in range(sentence_length):
        diag_ele[i] = 1
    graph = np.diag(diag_ele)
    if not contain_zh_flag:
        return graph
    for i in range(len(id_num_list)):
        for j in range(len(id_num_list)):
            if float(num_list[i]) <= float(num_list[j]):
                graph[id_num_list[i]][id_num_list[j]] = 1
            else:
                graph[id_num_list[j]][id_num_list[i]] = 1
    return graph

def get_greater_num_graph(max_len, sentence_length, num_list, id_num_list,contain_zh_flag=True):
    diag_ele = np.zeros(max_len)
    for i in range(sentence_length):
        diag_ele[i] = 1
    graph = np.diag(diag_ele)
    if not contain_zh_flag:
        return graph
    for i in range(len(id_num_list)):
        for j in range(len(id_num_list)):
            if float(num_list[i]) > float(num_list[j]):
                graph[id_num_list[i]][id_num_list[j]] = 1
            else:
                graph[id_num_list[j]][id_num_list[i]] = 1
    return graph

# quantity cell graph
def get_quantity_graph(max_len, sentence_length, quantity_cell_list,contain_zh_flag=True):
    diag_ele = np.zeros(max_len)
    for i in range(sentence_length):
        diag_ele[i] = 1
    graph = np.diag(diag_ele)
    if not contain_zh_flag:
        return graph
    for i in quantity_cell_list:
        for j in quantity_cell_list:
            graph[i][j] = 1
            graph[j][i] = 1
    return graph

In [62]:
batch_graph = []
max_len = max(input_length)
for i in range(len(input_length)):
    sentence_length = input_length[i]
    quantity_cell_list = group[i]
    num_list = num_value[i]
    id_num_list = num_pos[i]
    graph_newc = get_quantity_graph(max_len, sentence_length, quantity_cell_list)
    graph_greater = get_greater_num_graph(max_len, sentence_length, num_list, id_num_list)
    graph_lower = get_greater_num_graph(max_len, sentence_length, num_list, id_num_list)
    graph_total = [graph_newc.tolist(),graph_greater.tolist(),graph_lower.tolist()]
    batch_graph.append(graph_total)
batch_graph = np.array(batch_graph)
batch_graph = torch.LongTensor(batch_graph)

## Graph Computing

In [68]:
hidden_size = 512
gcn = Graph_Module(hidden_size, hidden_size, hidden_size).cuda()

In [72]:
print(encoder_outputs.shape)

torch.Size([51, 64, 512])


In [73]:
print(batch_graph.shape)

torch.Size([64, 3, 51, 51])


In [74]:
print(encoder_outputs.transpose(0, 1).shape)

torch.Size([64, 51, 512])


In [87]:
# Graph Module
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff,d_out, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_out)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

class Graph_Module(nn.Module):
    def __init__(self, indim, hiddim, outdim, dropout=0.3):
        super(Graph_Module, self).__init__()
        '''
        ## Variables:
        - indim: dimensionality of input node features
        - hiddim: dimensionality of the joint hidden embedding
        - outdim: dimensionality of the output node features
        - combined_feature_dim: dimensionality of the joint hidden embedding for graph
        - K: number of graph nodes/objects on the image
        '''
        self.in_dim = indim
        #self.combined_dim = outdim
        
        #self.edge_layer_1 = nn.Linear(indim, outdim)
        #self.edge_layer_2 = nn.Linear(outdim, outdim)
        
        #self.dropout = nn.Dropout(p=dropout)
        #self.edge_layer_1 = nn.utils.weight_norm(self.edge_layer_1)
        #self.edge_layer_2 = nn.utils.weight_norm(self.edge_layer_2)
        self.h = 4
        self.d_k = outdim//self.h
        
        #layer = GCN(indim, hiddim, self.d_k, dropout)
        self.graph = clones(GCN(indim, hiddim, self.d_k, dropout), 4)
        
        #self.Graph_0 = GCN(indim, hiddim, outdim//4, dropout)
        #self.Graph_1 = GCN(indim, hiddim, outdim//4, dropout)
        #self.Graph_2 = GCN(indim, hiddim, outdim//4, dropout)
        #self.Graph_3 = GCN(indim, hiddim, outdim//4, dropout)
        
        self.feed_foward = PositionwiseFeedForward(indim, hiddim, outdim, dropout)
        self.norm = LayerNorm(outdim)

    def get_adj(self, graph_nodes):
        '''
        ## Inputs:
        - graph_nodes (batch_size, K, in_feat_dim): input features
        ## Returns:
        - adjacency matrix (batch_size, K, K)
        '''
        self.K = graph_nodes.size(1)
        graph_nodes = graph_nodes.contiguous().view(-1, self.in_dim)
        
        # layer 1
        h = self.edge_layer_1(graph_nodes)
        h = F.relu(h)
        
        # layer 2
        h = self.edge_layer_2(h)
        h = F.relu(h)

        # outer product
        h = h.view(-1, self.K, self.combined_dim)
        adjacency_matrix = torch.matmul(h, h.transpose(1, 2))
        
        adjacency_matrix = self.b_normal(adjacency_matrix)

        return adjacency_matrix
    
    def normalize(self, A, symmetric=True):
        '''
        ## Inputs:
        - adjacency matrix (K, K) : A
        ## Returns:
        - adjacency matrix (K, K) 
        '''
        A = A + torch.eye(A.size(0)).cuda().float()
        d = A.sum(1)
        if symmetric:
            # D = D^{-1/2}
            D = torch.diag(torch.pow(d, -0.5))
            return D.mm(A).mm(D)
        else :
            D = torch.diag(torch.pow(d,-1))
            return D.mm(A)
       
    def b_normal(self, adj):
        batch = adj.size(0)
        for i in range(batch):
            adj[i] = self.normalize(adj[i])
        return adj

    def forward(self, graph_nodes, graph):
        '''
        ## Inputs:
        - graph_nodes (batch_size, K, in_feat_dim): input features
        ## Returns:
        - graph_encode_features (batch_size, K, out_feat_dim)
        '''
        nbatches = graph_nodes.size(0)
        mbatches = graph.size(0)
        if nbatches != mbatches:
            graph_nodes = graph_nodes.transpose(0, 1)
        # adj (batch_size, K, K): adjacency matrix
        if not bool(graph.numel()):
            adj = self.get_adj(graph_nodes)
            adj_list = [adj,adj,adj,adj]
        else:
            adj = graph.float()
            adj_list = [adj[:,0,:],adj[:,1,:],adj[:,2,:],adj[:,0,:]]
        #print(adj)
        
        g_feature = \
            tuple([l(graph_nodes,x) for l, x in zip(self.graph,adj_list)])
        
        g_feature = self.norm(torch.cat(g_feature,2))
        #print('g_feature')
        #print(g_feature.shape)
        
        graph_encode_features = self.feed_foward(g_feature)
        
        return adj, graph_encode_features

# GCN
class GCN(nn.Module):
    def __init__(self, in_feat_dim, nhid, out_feat_dim, dropout):
        super(GCN, self).__init__()
        '''
        ## Inputs:
        - graph_nodes (batch_size, K, in_feat_dim): input features
        - adjacency matrix (batch_size, K, K)
        ## Returns:
        - gcn_enhance_feature (batch_size, K, out_feat_dim)
        '''
        self.gc1 = GraphConvolution(in_feat_dim, nhid)
        self.gc2 = GraphConvolution(nhid, out_feat_dim)
        self.dropout = dropout

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        return x
    
# Graph_Conv
class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        #print(input.shape)
        #print(self.weight.shape)
        support = torch.matmul(input, self.weight)
        print('adj.shape')
        print(adj.shape)
        print('support.shape')
        print(support.shape)
        output = torch.matmul(adj, support)
        
        if self.bias is not None:
            return output + self.bias
        else:
            return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

In [88]:
hidden_size = 512
gcn = Graph_Module(hidden_size, hidden_size, hidden_size).cuda()

In [90]:
encoder_outputs = encoder_outputs.transpose(0, 1).cuda()
batch_graph = batch_graph.cuda()
_, encoder_outputs = gcn(encoder_outputs, batch_graph)

adj.shape
torch.Size([64, 51, 51])
support.shape
torch.Size([64, 51, 512])
adj.shape
torch.Size([64, 51, 51])
support.shape
torch.Size([64, 51, 128])
adj.shape
torch.Size([64, 51, 51])
support.shape
torch.Size([64, 51, 512])
adj.shape
torch.Size([64, 51, 51])
support.shape
torch.Size([64, 51, 128])
adj.shape
torch.Size([64, 51, 51])
support.shape
torch.Size([64, 51, 512])
adj.shape
torch.Size([64, 51, 51])
support.shape
torch.Size([64, 51, 128])
adj.shape
torch.Size([64, 51, 51])
support.shape
torch.Size([64, 51, 512])
adj.shape
torch.Size([64, 51, 51])
support.shape
torch.Size([64, 51, 128])


In [86]:
print(output.shape)

torch.Size([64, 51, 512])


In [64]:
max_len = max(input_length)
print(max_len)

51


In [65]:
print(encoder_outputs.shape)
print(problem_output.shape)

torch.Size([51, 64, 512])
torch.Size([64, 512])


[62, 53, 53, 50, 48, 48, 45, 44, 43, 42, 42, 41, 39, 39, 38, 38, 38, 34, 34, 34, 34, 34, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 28, 27, 27, 26, 26, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 21, 21, 21, 19, 18, 17, 16, 16, 15, 13, 11, 6]


In [40]:
print(train_pairs[0])

([2, 3, 4, 5, 6, 7, 8, 9, 10, 6, 11, 12, 13, 7, 14, 15, 1, 16, 17, 18, 19, 20, 21, 22, 17, 23, 24, 25, 26, 27, 28, 29, 17, 23, 1, 30, 26, 31, 32, 10, 33, 34, 16, 13], 44, [0, 1, 8, 5, 7], 5, ['2', '11'], [16, 34], [], [15, 16, 17, 32, 33, 34, 39, 40, 41])


In [38]:
print(pairs_trained[0])
print(pairs_tested[1])

(['镇海', '雅乐', '学校', '二年级', '的', '小朋友', '到', '一条', '小路', '的', '一边', '植树', '．', '小朋友', '们', '每隔', 'NUM', '米', '种', '一棵树', '（', '马路', '两头', '都', '种', '了', '树', '）', '，', '最后', '发现', '一共', '种', '了', 'NUM', '棵', '，', '这', '条', '小路', '长', '多少', '米', '．'], ['*', '-', 'N1', '1', 'N0'], ['2', '11'], [16, 34], [15, 16, 17, 32, 33, 34, 39, 40, 41])
(['张', '明', '有', 'NUM', '元', '钱', '，', '买', '书', '用', '去', 'NUM', '，', '买', '文具', '的', '钱', '是', '买', '书', '的', 'NUM', '．', '买', '文具', '用', '去', '多少', '元', '？'], ['*', '*', 'N0', 'N1', 'N2'], ['120', '80%', '15%'], [3, 11, 21], [1, 2, 3, 4, 7, 8, 9, 18, 19, 20, 21, 27, 28])


In [27]:
for i,j in zip(data,group_data):
    if i['id'] != j['id']:
        print(False)

In [14]:
print(data[0])
print(group_data[0])
print(pairs[0])

{'id': '1', 'original_text': '镇海雅乐学校二年级的小朋友到一条小路的一边植树．小朋友们每隔2米种一棵树（马路两头都种了树），最后发现一共种了11棵，这条小路长多少米．', 'segmented_text': '镇海 雅乐 学校 二年级 的 小朋友 到 一条 小路 的 一边 植树 ． 小朋友 们 每隔 2 米 种 一棵树 （ 马路 两头 都 种 了 树 ） ， 最后 发现 一共 种 了 11 棵 ， 这 条 小路 长 多少 米 ．', 'equation': 'x=(11-1)*2', 'ans': '20'}
{'id': '1', 'group_num': [15, 16, 17, 32, 33, 34, 39, 40, 41]}
(['镇海', '雅乐', '学校', '二年级', '的', '小朋友', '到', '一条', '小路', '的', '一边', '植树', '．', '小朋友', '们', '每隔', 'NUM', '米', '种', '一棵树', '（', '马路', '两头', '都', '种', '了', '树', '）', '，', '最后', '发现', '一共', '种', '了', 'NUM', '棵', '，', '这', '条', '小路', '长', '多少', '米', '．'], ['*', '-', 'N1', '1', 'N0'], ['2', '11'], [16, 34])


In [25]:
print(pairs_trained[0][0][16])
print(pairs_trained[0][0][34])

NUM
NUM


In [19]:
input_lang, output_lang, train_pairs, test_pairs = prepare_data(pairs_trained, pairs_tested, 5, generate_nums,
                                                                copy_nums, tree=True)

Indexing words...
keep_words 3928 / 10543 = 0.3726
Indexed 3931 words in input language, 23 words in output
Number of training data 21162
Number of testind data 1000


In [26]:
print(train_pairs[1])
print('-'*60)
print(test_pairs[0])

([35, 36, 37, 38, 26, 39, 40, 23, 1, 41, 26, 42, 43, 44, 45, 46, 47, 22, 40, 1, 41, 26, 48, 40, 23, 1, 45, 26, 49, 37, 50, 51, 38, 34, 41, 52], 36, [2, 7, 0, 8, 1, 9, 5], 7, ['316', '230', '6'], [8, 19, 25], [])
------------------------------------------------------------
([207, 35, 796, 2, 6, 1, 197, 481, 23, 1, 30, 484, 26, 58, 3269, 484, 1088, 6, 1903, 71, 1, 16, 26, 49, 796, 6, 439, 75, 34, 16, 52], 31, [0, 8, 9], 3, ['4', '44', '20'], [5, 9, 20], [])


In [21]:
print(len(train_pairs[0][0]))

44


In [19]:
# Initialize models
encoder = EncoderSeq(input_size=input_lang.n_words, embedding_size=embedding_size, hidden_size=hidden_size,
                     n_layers=n_layers)
predict = Prediction(hidden_size=hidden_size, op_nums=output_lang.n_words - copy_nums - 1 - len(generate_nums),
                     input_size=len(generate_nums))
generate = GenerateNode(hidden_size=hidden_size, op_nums=output_lang.n_words - copy_nums - 1 - len(generate_nums),
                        embedding_size=embedding_size)
merge = Merge(hidden_size=hidden_size, embedding_size=embedding_size)
# the embedding layer is  only for generated number embeddings, operators, and paddings

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
predict_optimizer = torch.optim.Adam(predict.parameters(), lr=learning_rate, weight_decay=weight_decay)
generate_optimizer = torch.optim.Adam(generate.parameters(), lr=learning_rate, weight_decay=weight_decay)
merge_optimizer = torch.optim.Adam(merge.parameters(), lr=learning_rate, weight_decay=weight_decay)

encoder_scheduler = torch.optim.lr_scheduler.StepLR(encoder_optimizer, step_size=20, gamma=0.5)
predict_scheduler = torch.optim.lr_scheduler.StepLR(predict_optimizer, step_size=20, gamma=0.5)
generate_scheduler = torch.optim.lr_scheduler.StepLR(generate_optimizer, step_size=20, gamma=0.5)
merge_scheduler = torch.optim.lr_scheduler.StepLR(merge_optimizer, step_size=20, gamma=0.5)

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    predict.cuda()
    generate.cuda()
    merge.cuda()

generate_num_ids = []
for num in generate_nums:
    generate_num_ids.append(output_lang.word2index[num])

In [20]:
#for epoch in range(n_epochs):
encoder_scheduler.step()
predict_scheduler.step()
generate_scheduler.step()
merge_scheduler.step()
loss_total = 0
input_batches, input_lengths, output_batches, output_lengths, nums_batches, num_stack_batches, \
    num_pos_batches, num_size_batches = prepare_train_batch(train_pairs, batch_size)
#print("fold:", fold + 1)
#print("epoch:", epoch + 1)

In [38]:
print(len(input_lengths))
print(input_lengths[0])
print(max(input_lengths[0]))

290
[50, 49, 47, 44, 41, 38, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33, 32, 32, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 25, 25, 24, 24, 24, 24, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 15, 14, 11, 11, 10]
50


In [30]:
print(num_size_batches[0])

[5, 4, 5, 5, 4, 4, 2, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 5, 3, 3, 3, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 3, 2, 2, 1, 2, 2, 1, 2, 3, 3, 2, 2]


In [33]:
idx = 0
(input_batch, input_length, target_batch, target_length, nums_stack_batch, num_size_batch, generate_nums,
               encoder, predict, generate, merge, encoder_optimizer, predict_optimizer, generate_optimizer,
               merge_optimizer, output_lang, num_pos) = (input_batches[idx], input_lengths[idx], output_batches[idx], output_lengths[idx],
    num_stack_batches[idx], num_size_batches[idx], generate_num_ids, encoder, predict, generate, merge,
    encoder_optimizer, predict_optimizer, generate_optimizer, merge_optimizer, output_lang, num_pos_batches[idx])
english=False

In [34]:
# sequence mask for attention
seq_mask = []
max_len = max(input_length)
for i in input_length:
    seq_mask.append([0 for _ in range(i)] + [1 for _ in range(i, max_len)])
seq_mask = torch.ByteTensor(seq_mask)

In [39]:
num_mask = []
max_num_size = max(num_size_batch) + len(generate_nums)
for i in num_size_batch:
    d = i + len(generate_nums)
    num_mask.append([0] * d + [1] * (max_num_size - d))
num_mask = torch.ByteTensor(num_mask)

In [41]:
unk = output_lang.word2index["UNK"]

# Turn padded arrays into (batch_size x max_len) tensors, transpose into (max_len x batch_size)
input_var = torch.LongTensor(input_batch).transpose(0, 1)

target = torch.LongTensor(target_batch).transpose(0, 1)

padding_hidden = torch.FloatTensor([0.0 for _ in range(predict.hidden_size)]).unsqueeze(0)
batch_size = len(input_length)

In [50]:
print(padding_hidden.shape)

torch.Size([1, 512])


In [45]:
print(torch.LongTensor(input_batch).shape)
print(input_var.shape)
print(torch.LongTensor(target_batch).shape)
print(target.shape)

torch.Size([64, 50])
torch.Size([50, 64])
torch.Size([64, 9])
torch.Size([9, 64])


In [51]:
print(encoder)

EncoderSeq(
  (embedding): Embedding(3674, 128, padding_idx=0)
  (em_dropout): Dropout(p=0.5)
  (gru_pade): GRU(128, 512, num_layers=2, dropout=0.5, bidirectional=True)
)


In [52]:
encoder.train()
predict.train()
generate.train()
merge.train()

if USE_CUDA:
    input_var = input_var.cuda()
    seq_mask = seq_mask.cuda()
    padding_hidden = padding_hidden.cuda()
    num_mask = num_mask.cuda()

# Zero gradients of both optimizers
encoder_optimizer.zero_grad()
predict_optimizer.zero_grad()
generate_optimizer.zero_grad()
merge_optimizer.zero_grad()
# Run words through encoder

encoder_outputs, problem_output = encoder(input_var, input_length)

In [53]:
print(encoder_outputs.shape)
print(problem_output.shape)

torch.Size([50, 64, 512])
torch.Size([64, 512])


In [None]:
start = time.time()
for idx in range(len(input_lengths)):
    loss = train_tree(
        input_batches[idx], input_lengths[idx], output_batches[idx], output_lengths[idx],
        num_stack_batches[idx], num_size_batches[idx], generate_num_ids, encoder, predict, generate, merge,
        encoder_optimizer, predict_optimizer, generate_optimizer, merge_optimizer, output_lang, num_pos_batches[idx])
    loss_total += loss

print("loss:", loss_total / len(input_lengths))
print("training time", time_since(time.time() - start))
print("--------------------------------")

In [11]:
print(pairs_trained[0])

(['时代', '超市', '“', 'NUM', '一', '”', '大', '酬宾', '，', '全场', '家电', '按', 'NUM', '销售', '，', '原价', 'NUM', '元', '的', '电饭锅', '，', '现在', '售价', '=', '多少', '元', '．'], ['*', 'N2', 'N1'], ['5', '0.8', '150'], [3, 12, 16])


In [5]:
print(data[0])

{'id': '1', 'original_text': '镇海雅乐学校二年级的小朋友到一条小路的一边植树．小朋友们每隔2米种一棵树（马路两头都种了树），最后发现一共种了11棵，这条小路长多少米．', 'segmented_text': '镇海 雅乐 学校 二年级 的 小朋友 到 一条 小路 的 一边 植树 ． 小朋友 们 每隔 2 米 种 一棵树 （ 马路 两头 都 种 了 树 ） ， 最后 发现 一共 种 了 11 棵 ， 这 条 小路 长 多少 米 ．', 'equation': 'x=(11-1)*2', 'ans': '20'}


In [16]:
ori_path = '../graph_quantity_multigraph_trans/data/'
prefix = '23k_processed.json'
def get_train_test_fold(ori_path,prefix,data,pairs):
    mode_train = 'train'
    mode_valid = 'valid'
    mode_test = 'test'
    train_path = ori_path + mode_train + prefix
    valid_path = ori_path + mode_valid + prefix
    test_path = ori_path + mode_test + prefix
    train = read_json(train_path)
    train_id = [item['id'] for item in train]
    valid = read_json(valid_path)
    valid_id = [item['id'] for item in valid]
    test = read_json(test_path)
    test_id = [item['id'] for item in test]
    train_fold = []
    valid_fold = []
    test_fold = []
    for item,pair in zip(data, pairs):
        if item['id'] in train_id:
            train_fold.append(pair)
        elif item['id'] in test_id:
            test_fold.append(pair)
        else:
            valid_fold.append(pair)
    return train_fold, test_fold, valid_fold
train_fold, test_fold, valid_fold = get_train_test_fold(ori_path,prefix,data, pairs)

In [17]:
print(train_fold[0])

(['镇海', '雅乐', '学校', '二年级', '的', '小朋友', '到', '一条', '小路', '的', '一边', '植树', '．', '小朋友', '们', '每隔', 'NUM', '米', '种', '一棵树', '（', '马路', '两头', '都', '种', '了', '树', '）', '，', '最后', '发现', '一共', '种', '了', 'NUM', '棵', '，', '这', '条', '小路', '长', '多少', '米', '．'], ['*', '-', 'N1', '1', 'N0'], ['2', '11'], [16, 34])
