In [122]:
import json
import collections

In [43]:
# Load the raw training split. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default text encoding.
with open("../data/raw/train.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [498]:
# Split the raw entries by type while keeping every annotation in `raw_all`,
# so that position i in `raw_all` matches the key i used in `raw_t1`/`raw_t2`.
raw_all, raw_t1, raw_t2 = [], {}, {}
for i, e in enumerate(data):
    # Type 1 entries carry a "qa" key, type 2 entries carry "qa_0" and "qa_1";
    # anything else is labelled 3 and reported below.
    type_ = 1 if "qa" in e else 2 if "qa_0" in e and "qa_1" in e else 3
    e = e["annotation"]
    # Store the computed label. The original assigned the builtin `type`
    # (missing trailing underscore), which tagged every entry with a class object.
    e["type"] = type_
    raw_all.append(e)
    if type_ == 1:
        raw_t1[i] = e
    elif type_ == 2:
        raw_t2[i] = e
    else:
        print("unexpected")

# Preparation

Find all the different answer formats: keep extending `answer_format` until it returns an empty `bad` list.

In [80]:
def answer_format(entries):
    """Return the entries that contain an answer in an unrecognised format.

    An answer is recognised when it is ``A`` followed by an integer, a plain
    number, or a number followed by ``%``.

    Args:
        entries: iterable of dicts, each with an ``"answer_list"`` of strings.

    Returns:
        A list of offending entries. An entry is appended once per
        unparsable answer, so it can appear more than once.
    """
    bad = []
    for e in entries:
        for answer in e["answer_list"]:
            if answer.startswith("A"):
                parse, text = int, answer[1:]
            elif answer.endswith("%"):
                parse, text = float, answer[:-1]
            else:
                parse, text = float, answer
            try:
                parse(text)
            except ValueError:
                # Only a failed conversion marks the entry as bad; a bare
                # `except` would also swallow KeyboardInterrupt and real bugs.
                bad.append(e)
    return bad

In [504]:
# Call the function defined in this notebook; `answer_format1` is not defined
# in any cell and raises NameError on a fresh kernel.
answer_format(raw_t1.values())

[]

Separating a step into the operation and its arguments.

In [116]:
def process_step(step):
    """Split a step such as ``"divide(707, #1)"`` into ``(op, arg1, arg2)``.

    All spaces are removed first. The step must contain exactly one ``(``
    and exactly two comma-separated arguments before the first ``)``;
    otherwise the tuple unpacking raises ``ValueError``.
    """
    compact = "".join(step.split(" "))
    operation, remainder = compact.split("(")
    inside_parens = remainder.split(")")[0]
    first, second = inside_parens.split(",")
    return operation, first, second

Seeing all the different types of arguments.

In [406]:
def see_arg_types(entries):
    """Count the kinds of arguments used by the computed steps of ``entries``.

    A turn is counted only when its original question is not ``"n/a"`` and
    its answer label starts with ``A`` or ``#``. Both arguments of such a
    step are classified with ``get_arg_type``.

    Returns:
        A ``defaultdict(int)`` mapping each argument kind to its count.
    """
    counts = collections.defaultdict(int)
    for entry in entries:
        turns = zip(entry["step_list"], entry["answer_list"], entry["dialogue_break_ori"])
        for step, answer, question in turns:
            # Skip hidden turns first, then turns whose answer is a literal value.
            if question == "n/a" or not answer.startswith(("A", "#")):
                continue
            _, first, second = process_step(step)
            for operand in (first, second):
                counts[get_arg_type(operand)] += 1
    return counts

def get_arg_type(arg):
    """Classify a step argument by its textual form.

    Returns:
        ``"A"`` / ``"#"`` for arguments starting with that character,
        ``"%"`` for arguments ending in ``%``,
        ``"N"`` for ``const_m...`` and ``"P"`` for any other ``const_...``,
        ``"F"`` for anything ``float()`` can parse,
        and the argument itself when none of the above applies, so that
        unknown formats show up as their own key in a tally.
    """
    if arg.startswith("A"):
        return "A"
    if arg.startswith("#"):
        return "#"
    if arg.endswith("%"):
        return "%"
    # "const_m" must be tested before the more general "const_" prefix.
    if arg.startswith("const_m"):
        return "N"
    if arg.startswith("const_"):
        return "P"
    try:
        float(arg)
    except ValueError:
        # Only a failed conversion means "unknown format"; a bare `except`
        # would also hide unrelated errors.
        return arg
    return "F"

In [506]:
# Tally the argument kinds that occur in the type-1 entries.
see_arg_types(raw_t1.values())

defaultdict(int, {'F': 5870, 'A': 2516, 'P': 790, '%': 56, 'N': 6})

Now that I know what all the types of arguments are, I can process them.

In [437]:
def process_arg(arg, ans_map):
    """Convert one textual step argument into the value used when formatting.

    Args:
        arg: the argument text taken from a step.
        ans_map: mapping from answer labels (``A...`` / ``#...``) to their
            replacement; a missing label raises ``KeyError``.

    Returns:
        The ``ans_map`` entry for label arguments, a fraction for ``N%``,
        a negative int for ``const_m<N>``, an int for ``const_<N>``,
        and a float for everything else.
    """
    if arg.startswith(("A", "#")):
        return ans_map[arg]
    if arg.endswith("%"):
        return float(arg[:-1]) / 100
    # The "const_m" prefix has to be checked before the shorter "const_".
    if arg.startswith("const_m"):
        return -int(arg[len("const_m"):])
    if arg.startswith("const_"):
        return int(arg[len("const_"):])
    return float(arg)

Create a list of answers in a clean, standard format for easy comparison.

In [None]:
def build(op, arg1, arg2, step_list):
    """Format a step as ``op(arg1, arg2)``; the ``#i`` reference path is an unfinished draft.

    When neither argument starts with ``#`` the formatted string is returned
    directly. Otherwise the function tries to look up the referenced step.

    NOTE(review): this cell carries no execution count (``In [None]``) and the
    reference path cannot work as written:
      * ``ans_map`` is neither a parameter nor a local of this function;
      * only ``arg1`` is checked for a ``#`` reference, ``arg2`` is not;
      * nothing is returned after ``answer.append(...)``, so the call yields None;
      * the recursive call receives the results of ``process_arg``, which are
        not always strings, yet ``.startswith`` is called on them.
    """
    # Base case: no reference to an earlier step, so format the step as is.
    if not arg1.startswith("#") and not arg2.startswith("#"):
        return f"{op}({arg1}, {arg2})"
    answer = []
    if arg1.startswith("#"):
        # NOTE(review): uses the number after "#" as an index into step_list --
        # confirm that "#i" labels really line up with step_list positions.
        i = int(arg1[1:])
        prev_op, prev_arg1, prev_arg2 = process_step(step_list[i])
        prev_arg1, prev_arg2 = process_arg(prev_arg1, ans_map), process_arg(prev_arg2, ans_map)
        prev_answer = build(prev_op, prev_arg1, prev_arg2, step_list)
        answer.append(prev_answer)

In [638]:
def create_answers(e):
    """Turn one raw entry into its list of normalised per-turn answers.

    Walks ``step_list``, ``answer_list`` and ``dialogue_break_ori`` in
    lockstep. Turns whose original question is ``"n/a"`` produce no output.
    Every other turn contributes either ``op(arg1, arg2)`` (when its answer
    label starts with ``A`` or ``#``) or the raw answer string.

    Returns:
        ``(answers, multi)``. ``multi`` is True, and ``answers`` is empty,
        as soon as a step uses an argument that resolved to ``"multi"``.
    """
    a = []
    ans_map = {}
    steps = e["step_list"]
    j = 0  # position of the next visible turn in `a`
    for i, (step, answer, q) in enumerate(zip(steps, e["answer_list"], e["dialogue_break_ori"])):
        is_computed = answer.startswith("A") or answer.startswith("#")
        if q == "n/a":
            if is_computed:
                # A hidden turn that repeats an earlier step points at that
                # step; one that does not is marked "multi".
                # NOTE(review): the index found here is a position in
                # step_list, while the other labels use `j` (visible turns
                # only) -- confirm these agree when hidden turns come first.
                try:
                    # No quote reuse inside the replacement field: that is a
                    # SyntaxError before Python 3.12.
                    ans_map[answer] = f"ANS{steps[:i].index(step)}"
                except ValueError:
                    ans_map[answer] = "multi"
            continue
        if is_computed:
            ans_map[answer] = f"ANS{j}"
            op, arg1, arg2 = process_step(step)
            arg1, arg2 = process_arg(arg1, ans_map), process_arg(arg2, ans_map)
            if arg1 == "multi" or arg2 == "multi":
                return [], True
            a.append(f"{op}({arg1}, {arg2})")
        else:
            a.append(answer)
        j += 1
    return a, False

# Producing the Final Datasets

In [694]:
def create_dataset(entries, type_):
    """Build the processed dataset for one entry type.

    Args:
        entries: mapping from entry id to raw annotation dict.
        type_: label stored under ``"type"`` in every produced record.

    Returns:
        ``(dataset, multis)``. ``dataset`` maps entry id to the processed
        record; ``multis`` lists the ids that ``create_answers`` flagged as
        multi, which get no record here.
    """
    t = {}
    multis = []
    for i, raw_e in entries.items():
        # Use the function this notebook defines; `create_answers1` is not
        # defined in any cell and raises NameError on a fresh kernel.
        answers, multi = create_answers(raw_e)
        if multi:
            multis.append(i)
            continue
        t[i] = {
            "type": type_,
            "context": " ".join([raw_e["amt_pre_text"], raw_e["amt_table"], raw_e["amt_post_text"]]),
            "answers": answers,
            "dialogue_break": raw_e["dialogue_break"],
            "exe_ans_list": raw_e["exe_ans_list"],
        }
    return t, multis

In [712]:
# Build the type-1 and type-2 datasets. multis1 / multis2 receive the entry
# ids that create_dataset set aside as multi instead of adding to t1 / t2.
t1, multis1 = create_dataset(raw_t1, 1)
t2, multis2 = create_dataset(raw_t2, 2)

In [758]:
# Type 3 gathers the entries that create_dataset set aside as multi. They keep
# their raw step/answer lists and get an empty "answers" list.
t3 = {}
# The loop originally read `multis`, which no cell defines; the ids are held
# in `multis1` and `multis2`, so iterate over both.
for i in multis1 + multis2:
    raw_e = raw_all[i]
    e = {"type": 3}
    # NOTE(review): unlike t1/t2, no "context" is stored here because the line
    # below is commented out -- confirm this is intended.
    #e["context"] = " ".join([raw_e["amt_pre_text"], raw_e["amt_table"], raw_e["amt_post_text"]])
    e["answers"] = []
    # Copy the fields needed to inspect the entry later.
    for key in ("dialogue_break", "exe_ans_list", "dialogue_break_ori", "step_list", "answer_list"):
        e[key] = raw_e[key]
    t3[i] = e

In [754]:
# Order matters: the export loop writes position k of this list to train{k+1}.json.
t = [t1, t2, t3]

In [756]:
# Write each processed dataset to its own file: train1.json, train2.json, train3.json.
for i in (0, 1, 2):
    target_path = f"../data/processed/train{i+1}.json"
    with open(target_path, "w") as out:
        serialized = json.dumps(t[i], indent=4)
        out.write(serialized)

In [786]:
# Spot-check one type-3 record (entry id 3003).
t3[3003]

{'type': 3,
 'answers': [],
 'dialogue_break': ['as of december 31, 2017, how much did the balance of class a common stock represent in relation to the one of class b common stock?',
  'and what is this class b balance as a percentage of the total number of authorized shares of that stock, in that same date?'],
 'exe_ans_list': [0.5972, 0.5656],
 'dialogue_break_ori': ['n/a',
  'n/a',
  'as of december 31, 2017, how much did the balance of class a common stock represent in relation to the one of class b common stock?',
  'n/a',
  'n/a',
  'and what is this class b balance as a percentage of the total number of authorized shares of that stock, in that same date?'],
 'step_list': ['Ask for number 422208',
  'Ask for number 706985',
  'divide(422208, 706985)',
  'multiply(1.25, const_1000)',
  'Ask for number 707',
  'divide(707, #1)'],
 'answer_list': ['422208', '706985', '#0', '#1', '707', '#2']}