In [1]:
import re
import os
import sys
import json
import random
from importlib import reload

random.seed(666)

In [1]:
def get_program_seq(program):
    seq = []
    for item in program:
        func = item['function']
        inputs = item['inputs']
        seq.append(func + '(' + '<c>'.join(inputs) + ')')
    seq = '<b>'.join(seq)
    return seq

### Preparing Packages

In [1]:
from transformers import BartTokenizer
tokenizer = BartTokenizer.from_pretrained("./bart-base/")

2021-12-06 18:30:55.317386: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
tokenizer.add_tokens(["<b>", "<c>", "<E>","</E>","<A>","</A>","<R>","</R>","<V>","</V>","<Q>","</Q>","<C>","</C>"])
# entity, attribute, relation, value, qualifier, concept
tokenizer.save_pretrained("./bart-base/")

('./bart-base/tokenizer_config.json',
 './bart-base/special_tokens_map.json',
 './bart-base/vocab.json',
 './bart-base/merges.txt',
 './bart-base/added_tokens.json')

In [None]:
from bart2query.sparql.sparql_engine import get_sparql_answer
from utils.load_kb import DataForSPARQL

In [None]:
# loading CFQ IR parser
from IR_CFQ import sparql_parser
sparql_parser = reload(sparql_parser)

In [None]:
kb = DataForSPARQL(os.path.join("./dataset_new/", 'kb.json'))

### Dataset Preparation

In [None]:
train_data = json.load(open("./dataset_full/train.json"))
train_sparql = [item['sparql'] for item in train_data]
train_program = [item['program'] for item in train_data]

In [None]:
val_data = json.load(open("./dataset_full/val.json"))
val_sparql = [item['sparql'] for item in val_data]
val_program = [item['program'] for item in val_data]

In [None]:
test_data = json.load(open("./dataset_full/test.json"))
test_sparql = [item['sparql'] for item in test_data]
test_program = [item['program'] for item in test_data]

In [3]:
# new data
train_data = json.load(open("./dataset_new/train.json"))
train_sparql = [item['sparql'] for item in train_data]
train_program = [item['program'] for item in train_data]

In [None]:
val_data = json.load(open("./dataset_new/val.json"))
val_sparql = [item['sparql'] for item in val_data]
val_program = [item['program'] for item in val_data]

In [None]:
test_data = json.load(open("./dataset_new/test.json"))
test_sparql = [item['sparql'] for item in test_data]
test_program = [item['program'] for item in test_data]

### Dataset Overview

In [None]:
# measuring the average sequence length of sparql and program after being tokenized
sparql_total_len = 0
program_total_len = 0

assert len(train_sparql) == len(train_program)
for s, p in zip(train_sparql, train_program):
    sparql_total_len += len(tokenizer(s)['input_ids'])
    program_total_len += len(tokenizer(get_program_seq(p))['input_ids'])

print("sparql avg len: %f\nprogram avg len: %f" % (sparql_total_len / len(train_sparql), program_total_len / len(train_program)))

In [None]:
# measuring the average sequence length of ir after being tokenized
ir_total_len = 0
template_total_len = 0
train_template = [i["origin"] for i in train_data]

assert len(train_ir) == len(train_template)
for ir, template in zip(train_ir, train_template):
    ir_total_len += len(tokenizer(ir)['input_ids'])
    template_total_len += len(tokenizer(template)['input_ids'])

print("Template avg len: %f\nIR avg len: %f" % (template_total_len / len(train_template), ir_total_len / len(train_ir)))

### Testing Parsers

In [4]:
import KqaPro_Parser.CallProgramParser
reload(KqaPro_Parser.CallProgramParser)
from KqaPro_Parser.CallProgramParser import ParsingProgram
parser = ParsingProgram()
parser.parse(train_program[0])

(query (whatEntityQuery (entitySet (entitySet (entitySet (entitySet (findAll FindAll())) (filterAttr (filterStr FilterStr( (key (string TOID)) <c> (value (string 4000000074573917)) )))) (filterConcept FilterConcept( (concept (string town)) ))) (entitySet (entitySet (entitySet (findAll FindAll())) (filterAttr (filterStr FilterStr( (key (string OS grid reference)) <c> (value (string SP8778)) )))) (filterConcept FilterConcept( (concept (string town)) ))) (setOP (intersect And()))) (queryName What())) <EOF>)


In [None]:
(query (whatEntityQuery (entitySet (entitySet (entitySet (entitySet (findAll FindAll())) (filterAttr (filterStr FilterStr( (key (string TOID)) <c> (value (string 4000000074573917)) )))) (filterConcept FilterConcept( (concept (string town)) ))) (entitySet (entitySet (entitySet (findAll FindAll())) (filterAttr (filterStr FilterStr( (key (string OS grid reference)) <c> (value (string SP8778)) )))) (filterConcept FilterConcept( (concept (string town)) ))) (setOP (intersect And()))) (queryName What())) <EOF>)

In [None]:
target = []
for i in train_data:
    program = get_program_seq(i['program'])
    if "And(" in program:
        target.append([i['sparql'], program, gen_ir(i['program'])])
    if len(target) > 10:
        break
target
    

In [6]:
import KqaPro_Parser.program_v2.ProgramIRTranslator
reload(KqaPro_Parser.program_v2.ProgramIRTranslator)
from KqaPro_Parser.program_v2.ProgramIRTranslator import IR_translator
translator = IR_translator()

def gen_ir(i):
    ir = translator.program_to_ir(i)
    for token in ["<E>","</E>","<ES>","</ES>","<A>","</A>","<R>","</R>","<V>","</V>","<Q>","</Q>","<C>","</C>"]:
        ir = ir.replace(" {}".format(token), token)
        ir = ir.replace("{} ".format(token), token)
    return ir

In [7]:
# cfq_parser = sparql_parser.KqaParser(train_sparql)
ir_list = []
for point in train_program[:10]:
    try:
        ir = gen_ir(point)
        ir_list.append(ir)
    except Exception:
        print(get_program_seq(point))
        raise Exception

Find(Georgia national football team)<b>QueryAttrQualifier(ranking<c>78<c>review score by)


Exception: 

In [7]:
ir_list[:10]

['what is<ES><ES>the<C>town</C><ES><ES>the ones</ES></ES></ES>and<ES>the<C>town</C><ES><ES>the ones</ES></ES></ES></ES>',
 'what is the qualifier<Q>review score by</Q>of<E>Georgia national football team</E>whose<A>ranking</A>is<V>78</V>',
 'whether<ES>the<C>human</C><E>high school</E></ES><A>name in native language</A>is text<V>Laura Linney</V>',
 'which one has the largest<A>elevation above sea level</A>among<ES><E>Baghdad</E>or<ES><E>Jerusalem</E></ES></ES>',
 'what is the attribute<A>date of birth</A>of<ES>the<C>human</C><ES><ES>the ones</ES></ES></ES>',
 'what is the qualifier<Q>for work</Q>of<ES><ES>the ones <E>Jewish people</E></ES>and<E>John Houseman</E></ES>that<R>nominated for</R>to<E>Academy Award for Best Picture</E>',
 'what is the qualifier<Q>point in time</Q>of<ES>the<C>big city</C><ES><ES>the ones</ES></ES></ES>whose<A>population</A>is<V>104072</V>',
 'whether<E>Eve Myles</E><A>official website</A>is text<V>http://www.cheechandchong.com</V>',
 'what is the relation from<

In [None]:
target=[]
for ir in ir_list:
    if "what is the qualifier" in ir:
        target.append(ir)
    if len(target) > 10:
        break
target

In [None]:
from KqaPro_Parser.ir.UnifiedIRLexer import *
from KqaPro_Parser.ir.UnifiedIRParser import *

from antlr4 import *
from antlr4.InputStream import InputStream
wrong_list = []
for ir in ir_list:
    input_stream = InputStream(ir)
    lexer = UnifiedIRLexer(input_stream)
    token_stream = CommonTokenStream(lexer)
    parser = UnifiedIRParser(token_stream)
    try:
        tree = parser.query()
    except:
        wrong_list.append(ir)


In [None]:
wrong_list

In [None]:
import json
from KqaPro_Parser.program import ProgramIRTranslator
translator = ProgramIRTranslator.IR_translator()
train_data = json.load(open("./dataset_full/train.json"))
for point in train_data:
    translator.program_to_ir(point["program"])

In [None]:
program_func = "Select"
target = []
for i in new_train_data:
    s = i['rewrite']
    p = i['program']
    q = i['sparql']
    if program_func in get_program_seq(p):
        target.append(p)
        if len(target) == 20:
            break

for t in target:
    print(get_program_seq(t))
    print(program_to_ir(t))

In [None]:
pred = [line for line in open("./exp_results/UIR/val_predict.txt")]
gold = [line for line in open("./exp_results/UIR/val_gold.txt")]
wrong = []
test_nl = [item["rewrite"] for item in val_data]
for nl, g, p in zip(test_nl, gold, pred):
    if g != p:
        wrong.append(["nl: "+nl.strip(), "gold: "+g.strip(),"pred: "+p.strip()])
print(len(wrong))
wrong

In [None]:
import pickle
parser = pickle.load(open("./exp_files/CFQ_IR/full/parser.pkl",'rb'))
from utils.data import DataLoader, DistributedDataLoader, prepare_dataset
train_dataset, train_vocab = prepare_dataset("./exp_files/CFQ_IR/full/vocab.json", "./exp_files/CFQ_IR/full/train.pt", training=True, pretrain=False)

### Self-correct

In [None]:
from bart2query.program.executor_rule_new import RuleExecutor
# from utils.data import load_vocab
# vocab = load_vocab(os.path.join("./exp_files_new/UIR/full/", 'vocab.json'))
rule_executor = RuleExecutor(os.path.join("./dataset_new/", 'kb.json'))

In [None]:
rule_executor.key_type["publication date"]

In [None]:
entities = rule_executor.entities
concepts = rule_executor.concepts

In [None]:
unique_entity = set()
unique_attribute = set()
unique_relation = set()
# unique_qualifier = set()

entity_mapping = dict()
entity_attribute_mapping = dict()
entity_relation_mapping = dict()

for name_id, entity in entities.items():
    entity_mapping[name_id] = entity["name"]
    unique_entity.add(entity["name"])

    entity_attribute_mapping[name_id] = set()
    entity_relation_mapping[name_id] = set()

    for attribute in entity["attributes"]:
        unique_attribute.add(attribute["key"])
        entity_attribute_mapping[name_id].add(attribute["key"])    
    for relation in entity["relations"]:
        unique_relation.add(relation["predicate"])
        entity_relation_mapping[name_id].add(relation["predicate"])  

In [None]:
print(len(entity_mapping.keys()))
print(len(entity_attribute_mapping.keys()))
print(len(unique_entity))
print(len(unique_attribute))
print(len(unique_relation))  

In [None]:
# find and count repeating entities
repeat_entity = dict()
for name_id, entity in entity_mapping.items():
    if entity in repeat_entity.keys():
        repeat_entity[entity][0] += 1
        repeat_entity[entity][1].append(name_id)
    else:
        repeat_entity[entity] = [1, [name_id]]

In [None]:
unique_concept = set()
concept_mapping = dict()
concept_relation_mapping = dict()

for name_id, concept in concepts.items():
    concept_mapping[name_id] = concept["name"]
    unique_concept.add(concept["name"])
    
    concept_relation_mapping[name_id] = set()
    try:
        for relation in concept["relations"]:
            unique_relation.add(relation["predicate"])
            concept_relation_mapping[name_id].add(relation["predicate"])
    except:
        pass

In [None]:
print(len(unique_concept))
print(len(unique_relation)) 

In [None]:
for i in unique_relation:
    try:
        assert "{" not in i and "}" not in i
    except:
        print(i)

In [None]:
import re
ir = """what is the <Q> for work </Q> of <E> <E> the one that <R> ethnic group </R> backward to <E> Jewish people </E> </E> and <E> John Houseman </E> </E> that <R> nominated for </R> to <E> Academy Award for Best Picture </E>"""

In [None]:
from IR_unified.self_correct import IRCorrector
corrector = IRCorrector()

In [None]:
entities = re.findall(r'\{([^{}]+)\}', ir.replace("<E>", "{").replace("</E>", "}"))
entities[0].strip() 


### Results checking

In [None]:
pred_sparql = [line.strip() for line in open("./sparql_results.txt")]

In [None]:
from tqdm import tqdm
complete_same = 0
execute_same = 0
wrong = []
assert len(pred_sparql) == len(test_sparql)
for pred, gold, query, i in tqdm(zip(pred_sparql, test_sparql, range(len(test_data)))):
    if pred.strip() == gold.strip():
        complete_same += 1
        execute_same += 1
    elif get_sparql_answer(pred, kb) == get_sparql_answer(gold, kb):
        execute_same += 1
    else:
        wrong.append([query['origin'], pred, gold])


In [None]:
complete_same, execute_same, len(wrong), len(pred_sparql)

In [None]:
import pickle
test_pickle = []
for i in range(5):
    test_pickle.append(pickle.load(open("./exp_files_new/UIR/full/end2end/test.pt", 'rb')))

In [None]:
test_ir = [tokenizer.decode(test_pickle[0][i], skip_special_tokens=True) for i in range(len(test_pickle))]

In [None]:
(complete_same/len(pred_sparql), execute_same/len(pred_sparql))


In [None]:
wrong[:10]