In [1]:
import re
import os
import sys
import json
import random
from importlib import reload

# Seed the standard-library RNG with a fixed value so any use of `random`
# further down yields the same results on every run.
random.seed(666)

In [2]:
def get_program_seq(program):
    """Serialize a program into a single flat string.

    Args:
        program: iterable of step dicts, each providing a ``'function'``
            name and an ``'inputs'`` list of strings.

    Returns:
        The steps rendered as ``function(input<c>input...)`` and joined,
        in their original order, with ``<b>``. An empty program gives ``''``.
    """
    rendered_steps = [
        step['function'] + '(' + '<c>'.join(step['inputs']) + ')'
        for step in program
    ]
    return '<b>'.join(rendered_steps)

In [11]:
def get_sequence(program, idx):
    """Serialize the step at ``idx`` together with everything it depends on.

    The dependencies are serialized first (recursively), then the step
    itself, and all parts are joined with ``<s>``. Every input of a step is
    prefixed with ``<c>``.

    Args:
        program: indexable collection of step dicts with the keys
            ``'function'``, ``'inputs'`` and ``'dependencies'`` (indices into
            ``program``).
        idx: index of the step to serialize (``-1`` for the last step).

    Returns:
        The serialized sequence as a string.

    Raises:
        ValueError: if a step lists more than two dependencies.
    """
    node = program[idx]
    dep_indices = node['dependencies']

    if len(dep_indices) > 2:
        raise ValueError("Functions are not allowed to have more than 2 dependencies")

    # Children are serialized before the current step is formatted.
    child_seqs = [get_sequence(program, dep) for dep in dep_indices]

    name = node['function']
    arg_str = ''.join('<c>' + arg for arg in node['inputs'])

    if not child_seqs:
        # NOTE(review): for a dependency-free step the inputs are appended
        # AFTER the closing parenthesis ("Find()<c>x"), unlike the other cases
        # where they sit inside it. Kept as-is; confirm whether it is intended.
        return name + '()' + arg_str

    # For these two-argument functions the child sequences are emitted in
    # ascending string order (presumably because operand order is irrelevant
    # for them), which gives a canonical serialization.
    if (
        len(child_seqs) == 2
        and name in ['And', 'Or', 'SelectBetween']
        and child_seqs[0] > child_seqs[1]
    ):
        child_seqs.reverse()

    return '<s>'.join(child_seqs + [name + '(' + arg_str + ')'])

In [12]:
# new data
# Load the training split and pull out the per-example SPARQL and program
# annotations as two parallel lists. The file is opened in a context manager
# so the handle is closed as soon as parsing is done, and the encoding is
# pinned to UTF-8 rather than left to the platform default.
with open("./data/kqapro/dataset_new/train.json", encoding="utf-8") as train_file:
    train_data = json.load(train_file)
train_sparql = [item['sparql'] for item in train_data]
train_program = [item['program'] for item in train_data]

In [13]:
# Serialize every training program in both formats:
#   train_program_1 - flat, step-by-step form (get_program_seq)
#   train_program_2 - dependency-ordered form starting from the last step (get_sequence)
train_program_1 = list(map(get_program_seq, train_program))
train_program_2 = [get_sequence(program, -1) for program in train_program]

In [15]:
# Show both serializations of the last training program side by side.
train_program_1[-1], train_program_2[-1]

('FindAll()<b>FilterStr(official website<c>http://www.thesiege.com/)<b>FilterConcept(visual artwork)<b>QueryAttrQualifier(publication date<c>1999-01-21<c>place of publication)',
 'FindAll()<s>FilterStr(<c>official website<c>http://www.thesiege.com/)<s>FilterConcept(<c>visual artwork)<s>QueryAttrQualifier(<c>publication date<c>1999-01-21<c>place of publication)')

In [4]:
# Import the translator module and immediately reload it, so the class bound
# below comes from a fresh execution of the module source (presumably to pick
# up edits made to the module while the kernel is running).
import KqaPro_Parser.program_v2.ProgramIRTranslator
reload(KqaPro_Parser.program_v2.ProgramIRTranslator)
from KqaPro_Parser.program_v2.ProgramIRTranslator import IR_translator
# Module-level translator instance used by gen_ir.
translator = IR_translator()

def gen_ir(i):
    """Translate ``i`` to IR and remove the spaces that surround IR tags.

    ``i`` is handed unchanged to the module-level
    ``translator.program_to_ir``. In the result, one space directly before
    and one space directly after each opening/closing tag (E, ES, A, R, V,
    Q, C) is removed per tag occurrence.

    Args:
        i: the program to translate, in whatever form
            ``translator.program_to_ir`` accepts.

    Returns:
        The IR string with tag-adjacent spaces stripped.
    """
    ir = translator.program_to_ir(i)
    # Tags are processed as <X>, </X> pairs in this fixed order.
    for tag_name in ("E", "ES", "A", "R", "V", "Q", "C"):
        for tag in ("<" + tag_name + ">", "</" + tag_name + ">"):
            ir = ir.replace(" " + tag, tag)
            ir = ir.replace(tag + " ", tag)
    return ir

In [5]:
# Translate every training program to IR. On the first failure, print the
# flat serialization of the offending program and re-raise the ORIGINAL
# exception, so its type, message and traceback are not replaced by a bare
# Exception.
ir_list = []
for point in train_program:
    try:
        ir = gen_ir(point)
        ir_list.append(ir)
    except Exception:
        print(get_program_seq(point))
        raise

In [6]:
# Print every training example whose IR contains the phrase "the one whose":
# the IR, the flat program, then the example's "origin" and "answer" fields.
for i, t in zip(ir_list, train_data):
    if "the one whose" not in i:
        continue
    print(i)
    print(get_program_seq(t["program"]))
    for field in ("origin", "answer"):
        print(t[field])

In [5]:
# Spot-check one training example: print its flat program serialization,
# a blank gap, and then the IR generated from it.
i = 0
program_seq = get_program_seq(train_program[i])
print(program_seq)
print("\n")
# NOTE(review): gen_ir receives the serialized string here, while the bulk
# loop that builds ir_list passes the raw program list - confirm which form
# translator.program_to_ir expects.
ir = gen_ir(program_seq)
print(ir)

FindAll()<b>FilterStr(TOID<c>4000000074573917)<b>FilterConcept(town)<b>FindAll()<b>FilterStr(OS grid reference<c>SP8778)<b>FilterConcept(town)<b>And()<b>What()


what is<ES><ES><C>town</C>whose<A>TOID</A>is text<V>4000000074573917</V></ES>and<ES><C>town</C>whose<A>OS grid reference</A>is text<V>SP8778</V></ES></ES>


In [12]:
def postprocess_ir(ir):
    """Remove the spaces that directly surround IR tags.

    For each opening/closing tag (E, ES, A, R, V, Q, C), one space
    immediately before and one space immediately after every occurrence of
    the tag is removed.

    Args:
        ir: IR string to clean up.

    Returns:
        The IR string with tag-adjacent spaces stripped.
    """
    # Tags are processed as <X>, </X> pairs in this fixed order.
    for tag_name in ("E", "ES", "A", "R", "V", "Q", "C"):
        for tag in ("<" + tag_name + ">", "</" + tag_name + ">"):
            ir = ir.replace(" " + tag, tag).replace(tag + " ", tag)
    return ir

In [13]:
# Bring in the IR -> SPARQL translator class from the project package.
import KqaPro_Parser.ir.SparqlTranslator
from KqaPro_Parser.ir.SparqlTranslator import Sparql_translator
# Module-level translator instance used for the IR -> SPARQL conversion below.
sparql_translator = Sparql_translator()

In [16]:
# Convert the IR of the example inspected above into SPARQL, after stripping
# tag-adjacent spaces, and display the resulting query string.
sparql_translator.ir_to_sparql(postprocess_ir(ir))

'SELECT DISTINCT ?e WHERE { ?e <pred:instance_of> ?c . ?c <pred:name> "town" . ?e <TOID> ?pv . ?pv <pred:value> "4000000074573917" . ?e <OS_grid_reference> ?pv_1 . ?pv_1 <pred:value> "SP8778" .  }'