# Testing FLAN-T5 performance on MultiArith

In [1]:
import datasets
import torch
import re
import json

import numpy as np

from tqdm import tqdm
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the full MultiArith dataset and split it into dev / test using the saved index file.
# `with` closes the JSON file handle instead of leaving it to the garbage collector.
with open('../data/multiarith/MultiArith.json') as dataset_file:
    dataset = json.load(dataset_file)
dev_ind = np.load('../data/multiarith/validation_index.npy')
# Membership on an ndarray rescans the whole array for every example; a set lookup is constant time.
dev_ind_set = set(dev_ind.ravel().tolist())
dev_data = [dataset[i] for i in dev_ind]
test_data = [d for i, d in enumerate(dataset) if i not in dev_ind_set]

In [3]:
# Read the three few-shot prompt variants; `with` makes sure each file handle is closed.
with open('../data/multiarith/prompt_hardest_from_gsm8k.txt') as prompt_file:
    prompt_complex = prompt_file.read()
with open('../data/multiarith/prompt_original_from_gsm8k.txt') as prompt_file:
    prompt_original = prompt_file.read()
with open('../data/multiarith/prompt_random_from_gsm8k.txt') as prompt_file:
    prompt_random = prompt_file.read()

In [9]:
# NOTE(review): `prompt` is not referenced by any cell shown in this notebook and this
# cell's execution count is out of order — confirm it is still needed.
with open('../lib_prompt/prompt_simple_4_cases.txt') as prompt_file:
    prompt = prompt_file.read()

In [4]:
# Shell escape: print the current GPU status before a device is picked for the model.
!nvidia-smi

Thu Dec 22 06:59:20 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:00:05.0 Off |                    0 |
| N/A   52C    P0   101W / 400W |  53207MiB / 81920MiB |     13%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:00:06.0 Off |                    0 |
| N/A   52C    P0   137W / 400W |  70769MiB / 81920MiB |     74%      Default |
|       

In [5]:
# The tokenizer is fetched by its hub name; the model weights are read from a local checkpoint directory.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained(
    "/mnt/data_10t/flan_t5_distill/checkpoints/0.0.2.1_epoch_0_iter_1000"
)
# Move the loaded weights onto the GPU used by the evaluation cells.
model = model.to('cuda:2')

In [6]:
def test_answer(pred_str, ans_str):
    """Find the last number as the predicted answer"""
    pattern = '\d*\.?\d+'
    pred = re.findall(pattern, pred_str)
    if(len(pred) >= 1):
        # print(pred_str)
        pred = float(pred[-1])
        gold = re.findall(pattern, ans_str)
        # print(ans_str)
        gold = float(gold[-1])
        return pred == gold
    else: return False


def parse_pred_ans(filename, stop_at=-1):
    """Parse a saved prediction file, print the accuracy and return the parsed records.

    The file is read line by line. A line starting with 'Q: ' opens a new
    record, 'A_model:' starts the model output and 'A:' starts the gold
    answer; every other line is appended to whichever section is open.

    Args:
        filename: Path of the text file to parse.
        stop_at: If positive, only the first `stop_at` questions are read.
            Zero or negative values (the default) read the whole file.

    Returns:
        (questions, ans_pred, ans_gold): three parallel lists holding the raw
        text of every complete record.

    Raises:
        ValueError: If a non-marker line appears before the first 'Q: ' line.

    Note:
        A question that lacks an 'A_model:' or 'A:' section is still counted in
        num_q (i.e. as incorrect) but is left out of the returned lists.
        NOTE(review): a model-output line that itself starts with 'Q: ',
        'A_model:' or 'A:' cannot be told apart from a marker line.
    """
    questions, ans_pred, ans_gold = [], [], []
    num_q, acc = 0, 0
    q, am, a = None, None, None
    current_mode = 'none'

    def _commit(question, pred, gold):
        """Store one record if it is complete; return 1 when it is correct, else 0."""
        if question is None or pred is None or gold is None:
            return 0
        questions.append(question)
        ans_pred.append(pred)
        ans_gold.append(gold)
        return 1 if test_answer(pred, gold) else 0

    with open(filename) as fd:
        for l in fd:
            if l.startswith('Q: '):
                # Close the previous record, then reset so that stale answers can
                # never be paired with a later question or scored twice.
                acc += _commit(q, am, a)
                q, am, a = None, None, None
                # Stop before opening question number stop_at + 1.
                if stop_at > 0 and num_q == stop_at:
                    break
                current_mode = 'q'
                q = l
                num_q += 1
            elif l.startswith('A_model:'):
                current_mode = 'am'
                am = l
            elif l.startswith('A:'):
                current_mode = 'a'
                a = l
            elif current_mode == 'q':
                q += l
            elif current_mode == 'am':
                am += l
            elif current_mode == 'a':
                a += l
            else:
                raise ValueError(current_mode)

    # Last record of the file (a no-op after an early stop, since the state was reset).
    acc += _commit(q, am, a)

    # Guard the ratio so that a file without any question does not divide by zero.
    ratio = acc / num_q if num_q else 0.0
    print('num_q %d correct %d ratio %.4f' % (num_q, acc, ratio))
    return questions, ans_pred, ans_gold

# Test Data with Original Prompt, Acc 24

In [None]:
# Run the model over the test split with the original prompt and save one
# 'Q: / A_model: / A:' record per example for later scoring.
with open('../outputs/test_distilled_flan_t5_3b_multiarith.txt', 'w') as fd:
    for case in tqdm(test_data):
        # Drops the first and last character of the question text —
        # presumably wrapping quotes; TODO confirm against the dataset.
        q = case['sQuestion'][1:-1]
        a = case['lSolutions']

        prompt_q = prompt_original + 'Q: ' + q + '\n'
        prompt_q += "Let's think step by step\n"

        # Follow the model's own device instead of repeating a hard-coded "cuda:2",
        # so this cell keeps working if the model is loaded onto another GPU.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

 50%|███████████████████████████████████████████████████████████▊                                                           | 201/400 [13:42<20:28,  6.17s/it]

In [None]:
# Score the saved predictions; parse_pred_ans prints the accuracy summary itself,
# so the three returned lists are discarded here.
_, _, _ = parse_pred_ans('../outputs/test_distilled_flan_t5_3b_multiarith.txt')

# Test Data with Complex Prompt

In [13]:
# Run the model over the test split with the complex prompt and save one
# 'Q: / A_model: / A:' record per example for later scoring.
with open('../outputs/test_flan_t5_3b_multiarith_complex.txt', 'w') as fd:
    for case in tqdm(test_data):
        # Drops the first and last character of the question text —
        # presumably wrapping quotes; TODO confirm against the dataset.
        q = case['sQuestion'][1:-1]
        a = case['lSolutions']

        prompt_q = prompt_complex + 'Question: ' + q + '\n'
        prompt_q += "Let's think step by step\n"

        # Follow the model's own device instead of repeating a hard-coded "cuda:2",
        # so this cell keeps working if the model is loaded onto another GPU.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [15:30<00:00,  2.33s/it]


In [14]:
# Score the complex-prompt predictions; parse_pred_ans prints the accuracy summary itself,
# so the three returned lists are discarded here.
_, _, _ = parse_pred_ans('../outputs/test_flan_t5_3b_multiarith_complex.txt')

num_q 400 correct 95 ratio 0.2375


# Test Data with Random Prompt

In [15]:
# Run the model over the test split with the random prompt and save one
# 'Q: / A_model: / A:' record per example for later scoring.
with open('../outputs/test_flan_t5_3b_multiarith_random.txt', 'w') as fd:
    for case in tqdm(test_data):
        # Drops the first and last character of the question text —
        # presumably wrapping quotes; TODO confirm against the dataset.
        q = case['sQuestion'][1:-1]
        a = case['lSolutions']

        prompt_q = prompt_random + 'Question: ' + q + '\n'
        prompt_q += "Let's think step by step\n"

        # Follow the model's own device instead of repeating a hard-coded "cuda:2",
        # so this cell keeps working if the model is loaded onto another GPU.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [12:25<00:00,  1.86s/it]


In [16]:
# Score the random-prompt predictions; parse_pred_ans prints the accuracy summary itself,
# so the three returned lists are discarded here.
_, _, _ = parse_pred_ans('../outputs/test_flan_t5_3b_multiarith_random.txt')

num_q 400 correct 93 ratio 0.2325


# Test Data with Direct Answering

In [17]:
# Read the direct-answer prompt; `with` makes sure the file handle is closed.
with open('../lib_prompt/prompt_direct.txt') as prompt_file:
    prompt_direct = prompt_file.read()

In [18]:
# Run the model over the test split with the direct-answer prompt (no
# "Let's think step by step" suffix) and save one 'Q: / A_model: / A:' record per example.
with open('../outputs/test_flan_t5_3b_multiarith_direct.txt', 'w') as fd:
    for case in tqdm(test_data):
        # Drops the first and last character of the question text —
        # presumably wrapping quotes; TODO confirm against the dataset.
        q = case['sQuestion'][1:-1]
        a = case['lSolutions']

        prompt_q = prompt_direct + 'Question: ' + q + '\n'

        # Follow the model's own device instead of repeating a hard-coded "cuda:2",
        # so this cell keeps working if the model is loaded onto another GPU.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [07:27<00:00,  1.12s/it]


In [19]:
# Score the direct-answer predictions; parse_pred_ans prints the accuracy summary itself,
# so the three returned lists are discarded here.
_, _, _ = parse_pred_ans('../outputs/test_flan_t5_3b_multiarith_direct.txt')

num_q 400 correct 51 ratio 0.1275
