# Testing Flan-T5 on GSM8K

In [1]:
import datasets
import torch
import re
import numpy as np

from tqdm import tqdm
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the 'main' configuration of GSM8K and keep a handle on its test split.
gsm8k = load_dataset('gsm8k', 'main')
gsm8k_test = gsm8k['test']

# Build the validation set by selecting rows of the train split with the
# indices stored in lib_prompt/validation_index.npy.
validation_index = np.load('lib_prompt/validation_index.npy')
validation_data = gsm8k['train'].select(validation_index)

Found cached dataset gsm8k (/data/huggingface/datasets/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  7.68it/s]


In [None]:
# Load the Flan-T5-XXL tokenizer and model; the model weights are moved to 'cuda:1'.
# NOTE(review): tensors passed to the model should be placed on this same device.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl").to('cuda:1')

Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 792k/792k [00:00<00:00, 11.3MB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2.20k/2.20k [00:00<00:00, 1.65MB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2.54k/2.54k [00:00<00:00, 2.07MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 674/674 [00:00<00:00, 620kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 50.8k/50.8k [00:00<00:00, 870kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 9.45G/9.45G [01:45<00:00, 89.5MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████

In [6]:
# Shell out to nvidia-smi to inspect per-GPU memory usage and utilisation.
!nvidia-smi

Fri Nov 11 03:20:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   34C    P0    66W / 400W |  61053MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  Off  | 00000000:00:06.0 Off |                    0 |
| N/A   32C    P0    66W / 400W |  43433MiB / 81920MiB |      0%      Default |
|       

In [4]:
# Use the first validation question as a single smoke-test input.
input_text = validation_data[0]['question']

In [5]:
# Read the "complex" few-shot prompt. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('lib_prompt/prompt_complex.txt') as fd:
    prompt_complex = fd.read()

In [6]:
# Append the test question to the few-shot prompt as a trailing "Question: ..." entry.
prompt_q = ''.join((prompt_complex, '\nQuestion: ', input_text, '\n'))

In [11]:
# Print the fully assembled prompt for a visual sanity check.
print(prompt_q)

Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
Let's think step by step
Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
However, they need to include time for breaks and lunch. Every hour they want 

In [13]:
# NOTE: the tokenizer reports a maximum model input size of 512 for the stock T5
# checkpoints, yet nothing prevents passing a longer sequence.
# T5 also uses relative position embeddings, which are designed to handle longer sequences.

tokenizer.max_model_input_sizes

{'t5-small': 512, 't5-base': 512, 't5-large': 512, 't5-3b': 512, 't5-11b': 512}

In [8]:
# Tokenize the prompt and move the ids to whichever device holds the model.
# The model is loaded on 'cuda:1', so a hard-coded "cuda:0" here leaves the inputs
# and the weights on different GPUs when the notebook is run top to bottom.
input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
input_ids.size()

Token indices sequence length is longer than the specified maximum sequence length for this model (2230 > 512). Running this sequence through the model will result in indexing errors


torch.Size([1, 2230])

In [14]:
# Generate with max_length=256 and print the decoded first returned sequence.
# Special tokens are not stripped, so the text shows markers such as <pad> and </s>.
outputs = model.generate(input_ids, max_length=256)
print(tokenizer.decode(outputs[0]))

<pad> Let's think step by step The boy gives 12 / 3 = 4 oranges to his brother. He has 12 - 4 = 8 oranges left. The boy gives 8 / 4 = 2 oranges to his friend. The answer is 2</s>


In [16]:
# Same generation call as the cell above, run a second time; prints the decoded first sequence.
outputs = model.generate(input_ids, max_length=256)
print(tokenizer.decode(outputs[0]))

<pad> Let's think step by step The boy gives 12 / 3 = 4 oranges to his brother. He has 12 - 4 = 8 oranges left. The boy gives 8 / 4 = 2 oranges to his friend. The answer is 2</s>


In [7]:
def test_answer(pred_str, ans_str):
    pattern = '\d*\.?\d+'
    pred = re.findall(pattern, pred_str)
    if(len(pred) >= 1):
        # print(pred_str)
        pred = pred[-1]
        gold = re.findall(pattern, ans_str)
        # print(ans_str)
        gold = gold[-1]
        return pred == gold
    else: return False

def parse_pred_ans(filename):
    """Parse a saved prediction file, print the accuracy summary and return the records.

    The file is read line by line. A line starting with ``Q: `` opens a new
    record, ``A_model:`` opens its model answer and ``A:`` opens its gold
    answer; any other line is appended to the field currently being read.
    Note that a content line which itself begins with one of these markers is
    treated as a marker.

    Args:
        filename: Path of the prediction file to parse.

    Returns:
        A tuple ``(questions, ans_pred, ans_gold)`` of parallel lists holding
        the raw text of each complete record (marker lines included).

    Raises:
        ValueError: If a non-marker line appears before any marker line.
    """
    with open(filename) as fd:
        lines = fd.readlines()

    questions, ans_pred, ans_gold = [], [], []
    num_q, acc = 0, 0
    q, am, a = None, None, None
    current_mode = 'none'

    for l in lines:
        if l.startswith('Q: '):
            # A new question closes the previous record, if it was complete.
            if am is not None and a is not None:
                questions.append(q)
                ans_pred.append(am)
                ans_gold.append(a)
                if test_answer(am, a):
                    acc += 1
            # Reset so an incomplete record cannot reuse the previous record's answers.
            am, a = None, None
            current_mode = 'q'
            q = l
            num_q += 1
        elif l.startswith('A_model:'):
            current_mode = 'am'
            am = l
        elif l.startswith('A:'):
            current_mode = 'a'
            a = l
        else:
            # Continuation line: extend whichever field is currently open.
            if current_mode == 'q':
                q += l
            elif current_mode == 'am':
                am += l
            elif current_mode == 'a':
                a += l
            else:
                raise ValueError(current_mode)

    # Flush the last record; skipped for an empty file or an incomplete final
    # record, which previously raised an unbound-name or type error here.
    if q is not None and am is not None and a is not None:
        questions.append(q)
        ans_pred.append(am)
        ans_gold.append(a)
        if test_answer(am, a):
            acc += 1

    # Guard against a file without any question (avoids ZeroDivisionError).
    ratio = acc / num_q if num_q else 0.0
    print('num_q %d correct %d ratio %.4f' % (num_q, acc, ratio))
    return questions, ans_pred, ans_gold

# Complex Prompt, Acc 21.0

In [23]:
# Run the model with the complex prompt over every validation question and save
# question / model answer / gold answer records for later scoring.
with open('outputs/dev_flan_t5_complex.txt', 'w') as fd:
    for q, a in tqdm(zip(validation_data['question'], validation_data['answer']),
                     total=len(validation_data['question'])):
        prompt_q = prompt_complex + '\nQuestion: ' + q + '\n'

        # Send the inputs to the model's own device: the model is loaded on
        # 'cuda:1', so a hard-coded "cuda:0" puts inputs and weights on different GPUs.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [14:13<00:00,  4.27s/it]


In [24]:
# Score the saved predictions; parse_pred_ans prints the summary line itself.
# The trailing semicolon hides the returned tuple without binding `_`, which
# IPython uses for its last-output cache.
parse_pred_ans('outputs/dev_flan_t5_complex.txt');

num_q 200 correct 42 ratio 0.2100


# Original Prompt, Acc 19.5

In [25]:
# Read the "original" few-shot prompt; the context manager closes the file handle.
with open('lib_prompt/prompt_original.txt') as fd:
    prompt_original = fd.read()

In [26]:
# Run the model with the original prompt over every validation question and save
# question / model answer / gold answer records for later scoring.
with open('outputs/dev_flan_t5_original.txt', 'w') as fd:
    for q, a in tqdm(zip(validation_data['question'], validation_data['answer']),
                     total=len(validation_data['question'])):

        prompt_q = prompt_original + '\nQuestion: ' + q + '\n'
        # Send the inputs to the model's own device: the model is loaded on
        # 'cuda:1', so a hard-coded "cuda:0" puts inputs and weights on different GPUs.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [09:01<00:00,  2.71s/it]


In [27]:
# Score the saved predictions; parse_pred_ans prints the summary line itself.
# The trailing semicolon hides the returned tuple without binding `_`, which
# IPython uses for its last-output cache.
parse_pred_ans('outputs/dev_flan_t5_original.txt');

num_q 200 correct 39 ratio 0.1950


# Prompt Simple, Acc 22.0

In [28]:
# Read the "simple" few-shot prompt; the context manager closes the file handle.
with open('lib_prompt/prompt_simple.txt') as fd:
    prompt_simple = fd.read()

In [29]:
# Run the model with the simple prompt over every validation question and save
# question / model answer / gold answer records for later scoring.
with open('outputs/dev_flan_t5_simple.txt', 'w') as fd:
    for q, a in tqdm(zip(validation_data['question'], validation_data['answer']),
                     total=len(validation_data['question'])):

        prompt_q = prompt_simple + '\nQuestion: ' + q + '\n'
        # Send the inputs to the model's own device: the model is loaded on
        # 'cuda:1', so a hard-coded "cuda:0" puts inputs and weights on different GPUs.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [09:09<00:00,  2.75s/it]


In [30]:
# Score the saved predictions; parse_pred_ans prints the summary line itself.
# The trailing semicolon hides the returned tuple without binding `_`, which
# IPython uses for its last-output cache.
parse_pred_ans('outputs/dev_flan_t5_simple.txt');

num_q 200 correct 44 ratio 0.2200


# Prompt Random, Acc 21.0

In [33]:
# Read the "random" few-shot prompt; the context manager closes the file handle.
with open('lib_prompt/prompt_random.txt') as fd:
    prompt_random = fd.read()

In [34]:
# Run the model with the random prompt over every validation question and save
# question / model answer / gold answer records for later scoring.
with open('outputs/dev_flan_t5_random.txt', 'w') as fd:
    for q, a in tqdm(zip(validation_data['question'], validation_data['answer']),
                     total=len(validation_data['question'])):

        prompt_q = prompt_random + '\nQuestion: ' + q + '\n'
        # Send the inputs to the model's own device: the model is loaded on
        # 'cuda:1', so a hard-coded "cuda:0" puts inputs and weights on different GPUs.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [09:57<00:00,  2.99s/it]


In [35]:
# Score the saved predictions; parse_pred_ans prints the summary line itself.
# The trailing semicolon hides the returned tuple without binding `_`, which
# IPython uses for its last-output cache.
parse_pred_ans('outputs/dev_flan_t5_random.txt');

num_q 200 correct 42 ratio 0.2100


# Prompt Direct, Acc 7.0

In [8]:
# Read the "direct" (answer-only) few-shot prompt; the context manager closes the file handle.
with open('lib_prompt/prompt_direct.txt') as fd:
    prompt_direct = fd.read()

In [13]:
# Run the model with the direct prompt over every validation question and save
# question / model answer / gold answer records for later scoring.
# NOTE: the output file name (spelled 'diret') is kept as-is so that it keeps
# matching the path used when the predictions are scored.
with open('outputs/dev_flan_t5_diret.txt', 'w') as fd:
    for q, a in tqdm(zip(validation_data['question'], validation_data['answer']),
                     total=len(validation_data['question'])):

        prompt_q = prompt_direct + '\nQuestion: ' + q + '\n'
        # Follow the model's own device instead of repeating a hard-coded GPU id,
        # so this cell keeps working if the model is loaded elsewhere.
        input_ids = tokenizer(prompt_q, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids, max_length=256)
        ans_ = tokenizer.decode(outputs[0])

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:21<00:00,  1.01s/it]


In [10]:
# Print the current value of prompt_q for a visual check of the prompt layout.
print(prompt_q)

Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
The answer is 6.

Question: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
The answer is 5.

Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
The answer is 39.

Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
The answer is 8.

Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?
The answer is 9.

Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
The answer is 29.

Question: Michael had 58 golf balls. On tues

In [14]:
# Score the saved predictions; parse_pred_ans prints the summary line itself.
# The trailing semicolon hides the returned tuple without binding `_`, which
# IPython uses for its last-output cache.
parse_pred_ans('outputs/dev_flan_t5_diret.txt');

num_q 200 correct 14 ratio 0.0700
