In [1]:
import re
import os
import json
import random
from importlib import reload

# Seed Python's `random` module so anything sampled through it is repeatable across runs.
random.seed(666)

In [None]:
from transformers import BartTokenizer
# Load the BART tokenizer from the local ./bart-base/ directory.
tokenizer = BartTokenizer.from_pretrained("./bart-base/")

In [38]:
# Opening/closing marker pairs added to the tokenizer vocabulary, one pair
# per kind of span.
marker_pairs = [
    ("<E>", "</E>"),  # entity
    ("<A>", "</A>"),  # attribute
    ("<R>", "</R>"),  # relation
    ("<V>", "</V>"),  # value
    ("<Q>", "</Q>"),  # qualifier
    ("<C>", "</C>"),  # concept
]
tokenizer.add_tokens([marker for pair in marker_pairs for marker in pair])
# Write the extended tokenizer back to ./bart-base/ (the directory it was loaded from).
tokenizer.save_pretrained("./bart-base/")

('./bart-base/tokenizer_config.json',
 './bart-base/special_tokens_map.json',
 './bart-base/vocab.json',
 './bart-base/merges.txt',
 './bart-base/added_tokens.json')

In [5]:
from bart2sparql.sparql_engine import get_sparql_answer
from utils.load_kb import DataForSPARQL

In [12]:
# loading CFQ IR parser
from IR_CFQ import sparql_parser
# reload() re-executes the module, so the name reflects the module's current
# source without restarting the kernel.
sparql_parser = reload(sparql_parser)

In [6]:
# Build the SPARQL-side knowledge base wrapper from ./dataset_full/kb.json.
kb = DataForSPARQL(os.path.join("./dataset_full/", 'kb.json'))

In [51]:
def get_program_seq(program):
    """Flatten a program into a single string.

    `program` is an iterable of steps, each a mapping with a 'function'
    name and an 'inputs' list of strings. Every step is rendered as
    ``function(in1<c>in2...)`` and the rendered steps are joined with
    ``<b>``. An empty program yields an empty string.
    """
    return '<b>'.join(
        step['function'] + '(' + '<c>'.join(step['inputs']) + ')'
        for step in program
    )

### Dataset Preparation

In [13]:
# Load the training split from ./dataset_full/ and pull out the SPARQL query
# and the program of every item. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes.
# NOTE(review): a later cell rebinds these three names from ./dataset_new/;
# which version is live depends on which cell ran last.
with open("./dataset_full/train.json") as train_file:
    train_data = json.load(train_file)
train_sparql = [item['sparql'] for item in train_data]
train_program = [item['program'] for item in train_data]

In [43]:
# new data
# Load the training split from ./dataset_new/ and pull out the SPARQL query
# and the program of every item. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes.
with open("./dataset_new/train.json") as train_file:
    train_data = json.load(train_file)
train_sparql = [item['sparql'] for item in train_data]
train_program = [item['program'] for item in train_data]

In [9]:
# Load the validation split from ./dataset_full/ and pull out the SPARQL
# query and the program of every item. The file is opened in a `with` block
# so the handle is closed as soon as parsing finishes.
with open("./dataset_full/val.json") as val_file:
    val_data = json.load(val_file)
val_sparql = [item['sparql'] for item in val_data]
val_program = [item['program'] for item in val_data]

In [7]:
# Load the test split from ./dataset_full/ and pull out the SPARQL query and
# the program of every item. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes.
with open("./dataset_full/test.json") as test_file:
    test_data = json.load(test_file)
test_sparql = [item['sparql'] for item in test_data]
test_program = [item['program'] for item in test_data]

### Dataset Overview

In [8]:
# Average number of tokenizer ids per training example, for the raw SPARQL
# text and for the flattened program sequence.
assert len(train_sparql) == len(train_program)

sparql_total_len = sum(
    len(tokenizer(sparql_text)['input_ids']) for sparql_text in train_sparql
)
program_total_len = sum(
    len(tokenizer(get_program_seq(program))['input_ids']) for program in train_program
)

print("sparql avg len: %f\nprogram avg len: %f" % (sparql_total_len / len(train_sparql), program_total_len / len(train_program)))

sparql avg len: 103.476964
program avg len: 52.921400


In [10]:
# Average number of tokenizer ids per training example, for the IR strings
# and for the template strings taken from each item's "origin" field.
# NOTE(review): `train_ir` is not defined in any cell visible here — confirm
# where it is built before re-running on a fresh kernel.
train_template = [i["origin"] for i in train_data]
assert len(train_ir) == len(train_template)

ir_total_len = sum(len(tokenizer(ir_text)['input_ids']) for ir_text in train_ir)
template_total_len = sum(
    len(tokenizer(template_text)['input_ids']) for template_text in train_template
)

print("Template avg len: %f\nIR avg len: %f" % (template_total_len / len(train_template), ir_total_len / len(train_ir)))

Template avg len: 25.130510
IR avg len: 39.855832


### Testing Parsers

In [None]:
from KqaPro_Parser.program_v2 import ProgramIRTranslator

# Spot-check the program -> IR translation: for every training item whose
# "origin" text mentions "Manhattan", print the origin, the rewrite, the
# flattened program and the translated IR, followed by a blank separator.
translator = ProgramIRTranslator.IR_translator()
# cfq_parser = sparql_parser.KqaParser(train_sparql)
for point in train_data:
    if "Manhattan" not in point["origin"]:
        continue
    print(point["origin"])
    print(point["rewrite"])
    print(get_program_seq(point["program"]))
    print(translator.program_to_ir(point["program"]))
    # print(cfq_parser.f_reversible(point["sparql"]))
    print("\n")


In [None]:
# Collect up to 20 programs whose flattened sequence contains `program_func`,
# then print each sequence followed by its IR translation.
# NOTE(review): `new_train_data` and the bare `program_to_ir` are not defined
# in any cell visible here (an earlier cell calls `translator.program_to_ir`
# instead) — presumably they come from kernel state; confirm before re-running.
program_func = "Select"
target = []
for i in new_train_data:
    s = i['rewrite']  # not used in this cell
    p = i['program']
    q = i['sparql']  # not used in this cell
    if program_func in get_program_seq(p):
        target.append(p)
        # stop scanning once 20 matching programs have been collected
        if len(target) == 20:
            break

for t in target:
    print(get_program_seq(t))
    print(program_to_ir(t))

In [None]:
# Compare predicted and gold lines for the validation run and collect every
# mismatch together with its natural-language question.
# Both files are read inside `with` blocks so the handles are closed promptly.
with open("./exp_results/UIR/val_predict.txt") as pred_file:
    pred = pred_file.readlines()
with open("./exp_results/UIR/val_gold.txt") as gold_file:
    gold = gold_file.readlines()
wrong = []
# NOTE(review): despite the name, `test_nl` is built from `val_data`.
test_nl = [item["rewrite"] for item in val_data]
# zip() stops at the shortest input, so surplus lines in any of the three
# sequences are ignored.
for nl, g, p in zip(test_nl, gold, pred):
    # Compare without the line terminator so a missing newline at the end of
    # one file is not reported as a wrong prediction.
    if g.rstrip("\n") != p.rstrip("\n"):
        wrong.append(["nl: "+nl.strip(), "gold: "+g.strip(),"pred: "+p.strip()])
print(len(wrong))
wrong

In [17]:
import pickle
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file;
# only load parser.pkl when it comes from a trusted source.
# The file is opened in a `with` block so the handle is closed after loading.
with open("./exp_files/CFQ_IR/full/parser.pkl", 'rb') as parser_file:
    parser = pickle.load(parser_file)
from utils.data import DataLoader, DistributedDataLoader, prepare_dataset
# Build the training dataset and its vocabulary from the preprocessed files.
train_dataset, train_vocab = prepare_dataset("./exp_files/CFQ_IR/full/vocab.json", "./exp_files/CFQ_IR/full/train.pt", training=True, pretrain=False)

#vocab of answer: 81629


### Self-correct

In [4]:
from bart2program.executor_rule_new import RuleExecutor
# from utils.data import load_vocab
# vocab = load_vocab(os.path.join("./exp_files_new/UIR/full/", 'vocab.json'))
# Build the rule executor from ./dataset_new/kb.json; the cells below read
# its `entities`, `concepts` and `entity_name_to_ids` attributes.
rule_executor = RuleExecutor(os.path.join("./dataset_new/", 'kb.json'))

load kb


In [5]:
# Probe: index `entity_name_to_ids` with 'Q7325' (the recorded output is an empty list).
rule_executor.entity_name_to_ids['Q7325']

[]

In [6]:
# Keep direct references to the executor's entity and concept tables; the
# indexing cells below iterate over them with .items().
entities = rule_executor.entities
concepts = rule_executor.concepts

In [7]:
# Global vocabularies collected over all entities.
unique_entity = set()
unique_attribute = set()
unique_relation = set()
# unique_qualifier = set()

# Per-entity lookups, all keyed by the entity id used in `entities`.
entity_mapping = {}            # id -> entity name
entity_attribute_mapping = {}  # id -> set of attribute keys
entity_relation_mapping = {}   # id -> set of relation predicates

for name_id, entity in entities.items():
    entity_name = entity["name"]
    entity_mapping[name_id] = entity_name
    unique_entity.add(entity_name)

    attribute_keys = {attribute["key"] for attribute in entity["attributes"]}
    relation_predicates = {relation["predicate"] for relation in entity["relations"]}

    entity_attribute_mapping[name_id] = attribute_keys
    entity_relation_mapping[name_id] = relation_predicates

    # Fold this entity's keys/predicates into the global vocabularies.
    unique_attribute.update(attribute_keys)
    unique_relation.update(relation_predicates)

In [76]:
# Print, one per line, the sizes of: the id->name map, the id->attributes
# map, and the sets of distinct entity names, attribute keys and relation
# predicates.
for collection in (
    entity_mapping,
    entity_attribute_mapping,
    unique_entity,
    unique_attribute,
    unique_relation,
):
    print(len(collection))

16960
16960
13693
629
363


In [8]:
# Group entity ids by entity name: each value is [occurrence count, [ids]],
# so names shared by several ids can be found by looking for counts above 1.
repeat_entity = {}
for name_id, entity_name in entity_mapping.items():
    record = repeat_entity.setdefault(entity_name, [0, []])
    record[0] += 1
    record[1].append(name_id)

In [82]:
# Index the concepts: id -> name, the set of distinct concept names, and
# id -> set of relation predicates. Predicates are also added to the shared
# `unique_relation` set created by the entity-indexing cell.
unique_concept = set()
concept_mapping = dict()
concept_relation_mapping = dict()

for name_id, concept in concepts.items():
    concept_mapping[name_id] = concept["name"]
    unique_concept.add(concept["name"])

    concept_relation_mapping[name_id] = set()
    try:
        for relation in concept["relations"]:
            unique_relation.add(relation["predicate"])
            concept_relation_mapping[name_id].add(relation["predicate"])
    except (KeyError, TypeError):
        # Best-effort: presumably some concepts carry no usable "relations"
        # entry (TODO confirm against the KB schema). Only these data-shape
        # errors are tolerated; anything else, including KeyboardInterrupt,
        # now propagates instead of being silently swallowed.
        pass

In [54]:
"Howard Ashmans" in unique_entity

False

In [83]:
# Number of distinct concept names, then number of distinct relation
# predicates, one per line.
print(len(unique_concept), len(unique_relation), sep="\n")

791
363


In [87]:
# Print every relation predicate that contains a curly brace. A plain
# conditional is used instead of assert/except so the check still runs when
# assertions are disabled (python -O) and unrelated errors are not hidden.
# Presumably this guards the "{"/"}" substitution used for <E> spans further
# down — confirm.
for i in unique_relation:
    if "{" in i or "}" in i:
        print(i)

In [2]:
import re
# Sample IR string with nested <E> spans plus <Q> and <R> spans; the
# extraction cell further down runs its regex over this value.
ir = """what is the <Q> for work </Q> of <E> <E> the one that <R> ethnic group </R> backward to <E> Jewish people </E> </E> and <E> John Houseman </E> </E> that <R> nominated for </R> to <E> Academy Award for Best Picture </E>"""

In [1]:
from IR_unified.self_correct import IRCorrector
# Instantiate the IR corrector (the recorded output shows it prints "load kb");
# later cells read its `unique_entity` and `concept_mapping` attributes.
corrector = IRCorrector()

load kb


In [29]:
# Turn <E>/</E> markers into braces, then capture every brace pair that has
# no brace inside it, i.e. the innermost entity spans.
# NOTE(review): this rebinds `entities`, which an earlier cell set to
# `rule_executor.entities` — re-running the entity-indexing cell after this
# one would iterate over the wrong object.
innermost_span = re.compile(r'\{([^{}]+)\}')
entities = innermost_span.findall(ir.replace("<E>", "{").replace("</E>", "}"))
entities[0].strip()


'Jewish people'

In [30]:
import difflib
# Fuzzy-match the extracted span against the corrector's entity names. The
# recorded output returns only near misses and not 'Jewish people' itself.
difflib.get_close_matches('Jewish people', corrector.unique_entity)

['Welsh people', 'English people', 'white people']

In [4]:
# Look up id 'Q7325' in the concept table; the recorded output is
# 'Jewish people', i.e. the name is present as a concept.
corrector.concept_mapping['Q7325']

'Jewish people'

### Results checking

In [7]:
from bart2sparql.sparql_engine import get_sparql_answer
# Load the test split from ./dataset_new/ and run every gold SPARQL query
# against the knowledge base. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes.
with open("./dataset_new/test.json") as test_file:
    test_data = json.load(test_file)
test_sparql = [item['sparql'] for item in test_data]

# Append each answer rather than rebinding `gold_answer`, so the list ends up
# with one entry per query instead of holding only the last query's answer.
# NOTE(review): `kb` is built from ./dataset_full/kb.json in an earlier cell
# while these queries come from ./dataset_new/ — confirm the pairing is intended.
gold_answer = []
for sparql in test_sparql:
    gold_answer.append(get_sparql_answer(sparql, kb))
    