In [24]:
from xml.etree import ElementTree
from collections import Counter
from random import sample
import random
import numpy as np
import json
import re
from termcolor import colored
import os

# Move the working directory one level up so the relative "data/..." paths
# used by the cells below resolve (assumes the notebook is started from a
# direct sub-directory of the project root -- TODO confirm).
# NOTE: not idempotent -- every execution of this cell climbs one more level,
# so run it exactly once per kernel session.
path_parent = os.path.split(os.getcwd())[0]
os.chdir(path_parent)

# ASDiv

In [25]:
dom = ElementTree.parse("data/nlu-asdiv-dataset/dataset/ASDiv.xml")

# XML parsing: collect one flat list of elements per child tag of <Problem>.
# The lists are later indexed with the same position, which presumes every
# <Problem> carries exactly one element of each tag -- TODO confirm.
body_list, answer_list, question_list, formula_list, stype_list = (
    dom.findall(f"ProblemSet/Problem/{tag}")
    for tag in ("Body", "Answer", "Question", "Formula", "Solution-Type")
)

def _group_positions_by_text(elements):
    """Map each distinct ``.text`` value to the ascending positions holding it."""
    grouped = {}
    for position, element in enumerate(elements):
        grouped.setdefault(element.text, []).append(position)
    return grouped


# Tally the solution types once; the Counter keeps first-seen key order.
stype_counts = Counter(stype.text for stype in stype_list)
print(stype_counts)
list_stype_cat = list(stype_counts)

# Solution type -> positions (in the parsed lists) of the problems of that type.
indexes_dict = _group_positions_by_text(stype_list)

# Number of problems to draw per solution type, grouped by quota.
# Only the categories listed here are visited by the sampling step below;
# a quota of 0 yields an empty sample for that category.
nr_sample_dict = {
    **dict.fromkeys(
        ("Subtraction", "Addition", "Multiplication", "Common-Division"),
        2,
    ),
    **dict.fromkeys(
        (
            "Sum",
            "Ratio",
            "Geometry",
            "Algebra-2",
            "TVQ-Final",
            "GCD",
            "Algebra-1",
            "Surplus",
            "LCM",
            "Difference",
            "Comparison",
            "Floor-Division",
        ),
        1,
    ),
    **dict.fromkeys(
        (
            "Sequential-Operation",
            "Ceil-Division",
            "TVQ-Change",
            "TVQ-Initial",
            "Set-Operation",
        ),
        0,
    ),
}
# Total number of problems that will be sampled.
print(sum(nr_sample_dict.values()))

# Fix the stdlib RNG so the drawn indices are the same on every run.
random.seed(0)

# Draw `quota` distinct positions per category, visiting categories in the
# order they are declared in nr_sample_dict.
sample_idx_dict = {
    category: random.sample(indexes_dict[category], quota)
    for category, quota in nr_sample_dict.items()
}
print(sample_idx_dict)


def print_as_div_from_idx(sample_idx):
    """Print one ASDiv problem followed by a horizontal rule.

    Reads the module-level ``body_list``, ``question_list``, ``formula_list``
    and ``answer_list`` at position ``sample_idx`` and prints, in order:
    body and question on one line, the formula, the answer, and a
    100-dash separator surrounded by blank lines.

    Args:
        sample_idx: Position of the problem in the parsed element lists.
    """
    separator = "-" * 100
    print(body_list[sample_idx].text, question_list[sample_idx].text)
    print(formula_list[sample_idx].text)
    print(answer_list[sample_idx].text)
    print(f"\n{separator}\n")


Counter({'Subtraction': 452, 'Addition': 351, 'Multiplication': 260, 'Common-Division': 224, 'Sum': 95, 'Ratio': 94, 'Geometry': 88, 'Algebra-2': 83, 'TVQ-Final': 80, 'GCD': 73, 'Algebra-1': 71, 'Surplus': 70, 'LCM': 65, 'Difference': 59, 'Comparison': 57, 'Floor-Division': 52, 'Sequential-Operation': 37, 'Ceil-Division': 32, 'TVQ-Change': 20, 'TVQ-Initial': 19, 'Set-Operation': 15, 'UnitTrans': 4, 'Number-Operation': 2, 'Number-Pattern': 1, 'Substraction': 1})
20
{'Subtraction': [1936, 702], 'Addition': [1106, 50], 'Multiplication': [570, 2250], 'Common-Division': [1582, 2281], 'Sum': [218], 'Ratio': [1659], 'Geometry': [1525], 'Algebra-2': [1477], 'TVQ-Final': [883], 'GCD': [2084], 'Algebra-1': [1302], 'Surplus': [800], 'LCM': [1224], 'Difference': [1282], 'Comparison': [189], 'Floor-Division': [1558], 'Sequential-Operation': [], 'Ceil-Division': [], 'TVQ-Change': [], 'TVQ-Initial': [], 'Set-Operation': []}


In [49]:
# Show the single problem that was sampled for the "Floor-Division" category.
print_as_div_from_idx(sample_idx_dict["Floor-Division"][0])

Each house a carpenter builds needs six sinks. If he bought two hundred sixty-six sinks, how many houses would that cover?
266/6=44 r2
44 (houses)

----------------------------------------------------------------------------------------------------



In [27]:
# Show a hand-picked problem by its position in the parsed lists; 605 is not
# one of the randomly sampled indices printed by the sampling cell.
print_as_div_from_idx(605)

Tony had $20. He paid $8 for a ticket to a baseball game. At the game, he bought a hot dog for $3. What amount of money did Tony have then?
20-8-3=9
9 (dollars)

----------------------------------------------------------------------------------------------------



# GSM8K

In [2]:
def load_gsm8k(dataset_path):
    """Load a JSON Lines file (one JSON object per line) into a list.

    Args:
        dataset_path: Path to the ``.jsonl`` file.

    Returns:
        list: The decoded objects in file order. Blank and whitespace-only
        lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(dataset_path, encoding="utf-8") as fh:
        # Each line still carries its trailing newline, so a bare truthiness
        # check never filters blank lines; strip before testing.
        return [json.loads(line) for line in fh if line.strip()]


def print_gsm8k_sample_from_idx(
    sample_idx,
    dataset_path="data/grade-school-math/grade_school_math/data/train.jsonl",
):
    """Print one GSM8K problem: question, worked answer, and final answer.

    The dataset file is re-read through ``load_gsm8k`` on every call.

    Args:
        sample_idx: Position of the problem in the loaded list.
        dataset_path: JSON Lines file to read; defaults to the training split
            path used throughout this notebook.

    Raises:
        IndexError: If ``sample_idx`` is out of range, or the answer text
            contains no ``#### <value>`` marker.
    """
    problem = load_gsm8k(dataset_path)[sample_idx]
    print(problem["question"])
    print(colored(problem["answer"], "green"))
    # Capture the whole token after "#### ". Matching only word characters
    # would cut "1,000" down to "1", "0.5" down to "0", and find nothing at
    # all for a negative value such as "-5".
    final_answer = re.findall(r"#### (\S+)", problem["answer"])[0]
    print(colored(final_answer, "yellow"))
    print("\n" + "-" * 100 + "\n")

# Reproducibly draw 15 positions (with replacement, upper bound exclusive)
# from the GSM8K training split.
np.random.seed(1)
num_train_problems = len(
    load_gsm8k("data/grade-school-math/grade_school_math/data/train.jsonl")
)
rand_index = [*np.random.randint(low=0, high=num_train_problems, size=15)]
print(rand_index)

[5157, 235, 3980, 5192, 905, 2763, 2895, 5056, 144, 4225, 2797, 6652, 3462, 7449, 1202]


In [22]:
# 3335 matches the last index printed by the seed-2 ("Testing") draw that
# appears later in this notebook.
print_gsm8k_sample_from_idx(3335)

The average score on last week's Spanish test was 90.  Marco scored 10% less than the average test score and Margaret received 5 more points than Marco.  What score did Margaret receive on her test?
[32mThe average test score was 90 and Marco scored 10% less so 90*.10 = <<90*.10=9>>9 points lower
The average test score was 90 and Marco scored 9 points less so his test score was 90-9 = <<90-9=81>>81
Margret received 5 more points than Marco whose test score was 81 so she made 5+81 = <<5+81=86>>86 on her test
#### 86[0m
[33m86[0m

----------------------------------------------------------------------------------------------------



In [12]:
"""Testing"""
# Second reproducible draw: 10 positions from the GSM8K training split using
# a different seed. Note that this rebinds `rand_index`.
np.random.seed(2)
num_train_problems = len(
    load_gsm8k("data/grade-school-math/grade_school_math/data/train.jsonl")
)
rand_index = [*np.random.randint(low=0, high=num_train_problems, size=10)]
print(rand_index)

[7336, 2575, 6637, 5704, 3606, 6443, 2514, 1099, 6504, 3335]


# SingleEq

In [86]:
def sample_singleEq(dataset_path):
    """Return the question and solutions of one randomly chosen problem.

    The choice uses NumPy's global random state, so seed it beforehand for a
    reproducible pick.

    Args:
        dataset_path: Path to a JSON file holding a list of problem objects
            with ``"sQuestion"`` and ``"lSolutions"`` entries.

    Returns:
        tuple: ``(sQuestion, lSolutions)`` of the selected problem.
    """
    with open(dataset_path, "r") as handle:
        problems = json.load(handle)

    # Uniform pick over valid positions (upper bound is exclusive).
    chosen = problems[np.random.randint(0, len(problems))]
    return chosen["sQuestion"], chosen["lSolutions"]


def load_singleEq(dataset_path):
    """Read a JSON file and return its decoded content.

    Args:
        dataset_path: Path to the JSON file.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(dataset_path, "r") as source:
        return json.load(source)


def print_singleEq_sample_from_idx(sample_idx):
    """Print one SingleEq problem: question, solutions (green), and a rule.

    The dataset is re-read from ``data/TACL2015/questions.json`` through
    ``load_singleEq`` on every call.

    Args:
        sample_idx: Position of the problem in the loaded list.
    """
    record = load_singleEq("data/TACL2015/questions.json")[sample_idx]
    separator = "-" * 100
    print(record["sQuestion"])
    print(colored(record["lSolutions"], "green"))
    print(f"\n{separator}\n")


# Reproducibly draw 15 positions (with replacement, upper bound exclusive)
# from the SingleEq problem list. Note that this rebinds `rand_index`.
np.random.seed(1)
num_singleeq_problems = len(load_singleEq("data/TACL2015/questions.json"))
rand_index = [*np.random.randint(low=0, high=num_singleeq_problems, size=15)]
print(rand_index)


[37, 235, 396, 72, 255, 393, 203, 133, 335, 448, 144, 129, 460, 71, 237]


In [116]:
# 237 is the last index printed by the seed-1 SingleEq draw above.
print_singleEq_sample_from_idx(237)

A car company produced 3,884 cars in North America and 2,871 cars in Europe. How many cars is that in all?
[32m[6755.0][0m

----------------------------------------------------------------------------------------------------

