# Testing Flan-T5 performance on the ASDiv dataset

In [48]:
import datasets
import torch
import re
import json

import pickle
import numpy as np
import xml.etree.ElementTree as ET

from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [76]:
import sys
sys.path.append('..')

In [77]:
from src.utils import parse_pred_ans

In [10]:
# Parse the ASDiv XML file into an ElementTree.
tree = ET.parse('../data/asdiv/dataset/ASDiv.xml')

In [12]:
# Root element of the parsed XML tree.
root = tree.getroot()

In [15]:
# Show the tag and attributes of every direct child of the root element.
for child in root:
    print(child.tag, child.attrib)

ProblemSet {}


In [43]:
# Build three parallel lists (question text, answer, formula), one entry per
# problem element found two levels below `root`.
questions = []
answers = []
equations = []
# Walk down from `root` explicitly instead of relying on a loop variable left
# over from an earlier cell; this also covers every child of the root rather
# than only the last one.
for problem_set in root:
    for problem in problem_set:
        question = ''
        answer = ''
        formula = ''
        for field in problem:
            # Element.text is None for an empty element; treat it as ''.
            text = field.text or ''
            if field.tag == 'Body':
                question += text
            elif field.tag == 'Question':
                # The question sentence follows the body, separated by a space.
                question += ' ' + text
            elif field.tag == 'Answer':
                answer += text
            elif field.tag == 'Formula':
                formula += text
        questions.append(question)
        answers.append(answer)
        equations.append(formula)

In [47]:
# Sanity check: the three parallel lists should have the same length.
len(questions), len(answers), len(equations)

(2305, 2305, 2305)

In [44]:
# Inspect the first assembled question string.
questions[0]

'Seven red apples and two green apples are in the basket. How many apples are in the basket?'

In [45]:
# Inspect the first answer string.
answers[0]

'9 (apples)'

In [46]:
# Inspect the first formula string.
equations[0]

'7+2=9'

In [49]:
# pickle.dump(questions, open('../processed_data/ASDiv_questions.pkl', 'wb'))
# pickle.dump(answers, open('../processed_data/ASDiv_answers.pkl', 'wb'))
# pickle.dump(equations, open('../processed_data/ASDiv_equations.pkl', 'wb'))

In [50]:
# Reload the cached question / answer / equation lists from disk.
# `with` guarantees each file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself or otherwise trust.
with open('../processed_data/ASDiv_questions.pkl', 'rb') as fd:
    questions = pickle.load(fd)
with open('../processed_data/ASDiv_answers.pkl', 'rb') as fd:
    answers = pickle.load(fd)
with open('../processed_data/ASDiv_equations.pkl', 'rb') as fd:
    equations = pickle.load(fd)

In [55]:
# Sanity check after reloading: the three lists should have the same length.
len(questions), len(answers), len(equations)

(2305, 2305, 2305)

In [57]:
# Read the three few-shot prompt variants (hardest / original / random).
# `with` closes each file handle instead of leaking it.
with open('../data/multiarith/prompt_hardest_from_gsm8k.txt') as fd:
    prompt_complex = fd.read()
with open('../data/multiarith/prompt_original_from_gsm8k.txt') as fd:
    prompt_original = fd.read()
with open('../data/multiarith/prompt_random_from_gsm8k.txt') as fd:
    prompt_random = fd.read()

In [59]:
# Device string the model is moved to; input tensors must be placed on the
# same device before calling generate.
gpu_id = 'cuda:4'

In [60]:
# Load the Flan-T5 XL tokenizer and model from one checkpoint name, then move
# the model onto the configured device.
checkpoint = "google/flan-t5-xl"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
model = model.to(gpu_id)

In [63]:
# Take the first question as a one-off example for building a prompt.
q = questions[0]

In [68]:
# Full model input: few-shot prompt, then the target question, then the
# step-by-step trigger line.
prompt_q = ''.join([
    prompt_original,
    '\nQuestion: ',
    q,
    '\n',
    "Let's think step by step\n",
])

In [70]:
# Reference answer for the example question.
answers[0]

'9 (apples)'

In [69]:
# Print the assembled prompt to check its layout.
print(prompt_q)

Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
Let's think step by step
There are 15 trees originally.
Then there were 21 trees after some more were planted.
So there must have been 21 - 15 = 6.
The answer is 6.

Question: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
Let's think step by step
There are originally 3 cars.
2 more cars arrive.
3 + 2 = 5.
The answer is 5.

Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
Let's think step by step
Originally, Leah had 32 chocolates.
Her sister had 42.
So in total they had 32 + 42 = 74.
After eating 35, they had 74 - 35 = 39.
The answer is 39.

Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
Let's

# Test Data with Original Prompt, accuracy 20.56%

In [74]:
# Run every ASDiv question through the model with the "original" few-shot
# prompt and log, per question: the question, the raw decoded generation and
# the reference answer.
with open('../outputs/test_flan_t5_3b_asdiv_original.txt', 'w') as fd:
    progress = tqdm(zip(questions, answers), total=len(questions))
    for q, a in progress:
        # Few-shot prompt + target question + step-by-step trigger.
        prompt_q = f"{prompt_original}\nQuestion: {q}\nLet's think step by step\n"

        encoded = tokenizer(prompt_q, return_tensors="pt")
        input_ids = encoded.input_ids.to(gpu_id)
        outputs = model.generate(input_ids, max_length=256)
        # Decode the first (only) returned sequence.
        ans_ = tokenizer.decode(outputs[0])

        fd.write(f'Q: {q}\nA_model:\n{ans_}\nA:\n{a}\n\n')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2305/2305 [1:06:24<00:00,  1.73s/it]


In [78]:
# Score the logged generations; the three returned values are discarded and
# only the summary the helper prints (num_q / correct / ratio / skipped) is used.
# NOTE(review): scoring rules live in src.utils.parse_pred_ans — not visible here.
_, _, _ = parse_pred_ans('../outputs/test_flan_t5_3b_asdiv_original.txt')

num_q 2305 correct 474 ratio 0.2056 skipped 55


# Test Data with Complex Prompt, TBC

In [13]:
# Run each test case with the "complex" few-shot prompt and log question,
# raw decoded generation and reference solution.
# NOTE(review): `test_data` is not defined in the visible part of this
# notebook and the output path says "multiarith" — this cell looks carried
# over from a MultiArith notebook; confirm before re-running.
with open('../outputs/test_flan_t5_3b_multiarith_complex.txt', 'w') as fd:
    for case in tqdm(test_data):
        # Drop the first and last character of the stored question text.
        q = case['sQuestion'][1:-1]
        a = case['lSolutions']

        prompt_q = prompt_complex + 'Question: ' + q + '\n'
        prompt_q += "Let's think step by step\n"

        # Inputs must sit on the same device as the model (gpu_id), not on a
        # hard-coded GPU index.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(gpu_id)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [15:30<00:00,  2.33s/it]


In [14]:
# Score the complex-prompt log; return values are discarded, only the summary
# printed by the helper is used.
_, _, _ = parse_pred_ans('../outputs/test_flan_t5_3b_multiarith_complex.txt')

num_q 400 correct 95 ratio 0.2375


# Test Data with Random Prompt, TBC

In [15]:
# Run each test case with the "random" few-shot prompt and log question,
# raw decoded generation and reference solution.
# NOTE(review): `test_data` is not defined in the visible part of this
# notebook and the output path says "multiarith" — this cell looks carried
# over from a MultiArith notebook; confirm before re-running.
with open('../outputs/test_flan_t5_3b_multiarith_random.txt', 'w') as fd:
    for case in tqdm(test_data):
        # Drop the first and last character of the stored question text.
        q = case['sQuestion'][1:-1]
        a = case['lSolutions']

        prompt_q = prompt_random + 'Question: ' + q + '\n'
        prompt_q += "Let's think step by step\n"

        # Inputs must sit on the same device as the model (gpu_id), not on a
        # hard-coded GPU index.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(gpu_id)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [12:25<00:00,  1.86s/it]


In [16]:
# Score the random-prompt log; return values are discarded, only the summary
# printed by the helper is used.
_, _, _ = parse_pred_ans('../outputs/test_flan_t5_3b_multiarith_random.txt')

num_q 400 correct 93 ratio 0.2325


# Test Data with Direct Answering, TBC

In [17]:
# Read the direct-answer prompt; `with` closes the file handle after reading.
with open('../lib_prompt/prompt_direct.txt') as fd:
    prompt_direct = fd.read()

In [18]:
# Run each test case with the direct-answer prompt (no step-by-step trigger
# line) and log question, raw decoded generation and reference solution.
# NOTE(review): `test_data` is not defined in the visible part of this
# notebook and the output path says "multiarith" — this cell looks carried
# over from a MultiArith notebook; confirm before re-running.
with open('../outputs/test_flan_t5_3b_multiarith_direct.txt', 'w') as fd:
    for case in tqdm(test_data):
        # Drop the first and last character of the stored question text.
        q = case['sQuestion'][1:-1]
        a = case['lSolutions']

        prompt_q = prompt_direct + 'Question: ' + q + '\n'

        # Inputs must sit on the same device as the model (gpu_id), not on a
        # hard-coded GPU index.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(gpu_id)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [07:27<00:00,  1.12s/it]


In [19]:
# Score the direct-answer log; return values are discarded, only the summary
# printed by the helper is used.
_, _, _ = parse_pred_ans('../outputs/test_flan_t5_3b_multiarith_direct.txt')

num_q 400 correct 51 ratio 0.1275
