## Scoring AGIEval using zero-shot, zero-shot-CoT, few-shot, and few-shot-CoT

## few-shot

In [2]:
from src import post_process, utils, dataset_loader
from src import evaluation
import os
import re

# Identifiers of the AGIEval datasets handled by this cell, in processing order.
dataset_name_list = [
    "aqua-rat",
    "math",
    "logiqa-en",
    "logiqa-zh",
    "jec-qa-kd",
    "jec-qa-ca",
    "lsat-ar",
    "lsat-lr",
    "lsat-rc",
    "sat-math",
    "sat-en",
    "sat-en-without-passage",
    "gaokao-chinese",
    "gaokao-english",
    "gaokao-geography",
    "gaokao-history",
    "gaokao-biology",
    "gaokao-chemistry",
    "gaokao-physics",
    "gaokao-mathqa",
    "gaokao-mathcloze",
]

# --- Configuration: where the data, prompts and stored predictions live ------
dataset_dir = "data/v1"
raw_prompt_path = "./data/few_shot_prompts.csv"
chat_mode = True

output_dir = "./outputs-few-shot/gpt-4-32k/"
gpt_model = "gpt-4-32k"
setting_name = 'few-shot'

# --- Score every dataset from its stored first-stage prediction file ---------
for dataset_name in dataset_name_list:
    # Rebuild the prompts so each result row can record the model input.
    # `setting_name` is passed (instead of repeating the literal) so the prompt
    # setting and the scored setting cannot drift apart.
    dataset = dataset_loader.load_dataset(
        dataset_name, setting_name, dataset_dir,
        prompt_path=raw_prompt_path, max_tokens=2048,
        end_of_example="<END>\n", chat_mode=chat_mode)

    first_stage_output_path = os.path.join(
        output_dir, "outputs",
        f'predict.{gpt_model}.{dataset_name}.{setting_name}.first_stage.jsonl')

    result_for_human = dataset_loader.load_dataset_as_result_schema(
        dataset_name, dataset_dir
    )

    # Always (re)load the predictions for the current dataset; a conditional
    # load would silently reuse stale kernel state if the setting ever changed.
    first_stage_output_jsons = utils.read_jsonl(first_stage_output_path)

    # Rows are matched to prompts and predictions purely by position.
    for i, result in enumerate(result_for_human):
        result.model_input = dataset[i]["context"]
        result.model_output = utils.extract_answer(first_stage_output_jsons[i])
        result.parse_result = post_process.post_process(
            dataset_name, setting_name, result.model_output)
        result.is_correct = evaluation.evaluate_single_sample(
            dataset_name, result.parse_result, result.label)

    correct_count = sum(1 for result in result_for_human if result.is_correct)
    accuracy = correct_count / len(result_for_human)

    # Accuracy is reported as a percentage rounded to two decimals.
    print(f"{dataset_name:30}:  {round(accuracy*100, 2):5}")

aqua-rat                      :   50.0
math                          :   17.0
logiqa-en                     :   63.9
logiqa-zh                     :  64.98
jec-qa-kd                     :   39.9
jec-qa-ca                     :   37.0
lsat-ar                       :  33.91
lsat-lr                       :  86.67
lsat-rc                       :  87.36
sat-math                      :  71.82
sat-en                        :  91.75
sat-en-without-passage        :  67.48
gaokao-chinese                :  60.98
gaokao-english                :  92.81
gaokao-geography              :  74.87
gaokao-history                :  79.57
gaokao-biology                :   81.9
gaokao-chemistry              :  56.04
gaokao-physics                :   47.0
gaokao-mathqa                 :   43.3
gaokao-mathcloze              :   8.47


## few-shot-CoT

In [3]:
from src import post_process, utils, dataset_loader
from src import evaluation
import os
import re

# Identifiers of the AGIEval datasets handled by this cell, in processing order.
dataset_name_list = [
    "aqua-rat",
    "math",
    "logiqa-en",
    "logiqa-zh",
    "jec-qa-kd",
    "jec-qa-ca",
    "lsat-ar",
    "lsat-lr",
    "lsat-rc",
    "sat-math",
    "sat-en",
    "sat-en-without-passage",
    "gaokao-chinese",
    "gaokao-english",
    "gaokao-geography",
    "gaokao-history",
    "gaokao-biology",
    "gaokao-chemistry",
    "gaokao-physics",
    "gaokao-mathqa",
    "gaokao-mathcloze",
]

# --- Configuration: where the data, prompts and stored predictions live ------
dataset_dir = "data/v1"
raw_prompt_path = "./data/few_shot_prompts.csv"
chat_mode = True

output_dir = "./outputs-few-shot-cot/gpt-4-32k/"
gpt_model = "gpt-4-32k"
setting_name = 'few-shot-CoT'

# --- Score every dataset from its stored first-stage prediction file ---------
for dataset_name in dataset_name_list:
    # Build the prompts for the setting that is actually being scored, so the
    # model_input stored on each row matches this cell's setting rather than
    # the plain 'few-shot' one.
    # NOTE(review): assumes load_dataset accepts 'few-shot-CoT' the same way it
    # accepts 'zero-shot-CoT' elsewhere in this notebook — confirm.
    dataset = dataset_loader.load_dataset(
        dataset_name, setting_name, dataset_dir,
        prompt_path=raw_prompt_path, max_tokens=2048,
        end_of_example="<END>\n", chat_mode=chat_mode)

    first_stage_output_path = os.path.join(
        output_dir, "outputs",
        f'predict.{gpt_model}.{dataset_name}.{setting_name}.first_stage.jsonl')

    result_for_human = dataset_loader.load_dataset_as_result_schema(
        dataset_name, dataset_dir
    )

    # Always (re)load the predictions for the current dataset; a conditional
    # load would silently reuse stale kernel state if the setting ever changed.
    first_stage_output_jsons = utils.read_jsonl(first_stage_output_path)

    # Rows are matched to prompts and predictions purely by position.
    for i, result in enumerate(result_for_human):
        result.model_input = dataset[i]["context"]
        result.model_output = utils.extract_answer(first_stage_output_jsons[i])
        result.parse_result = post_process.post_process(
            dataset_name, setting_name, result.model_output)
        result.is_correct = evaluation.evaluate_single_sample(
            dataset_name, result.parse_result, result.label)

    correct_count = sum(1 for result in result_for_human if result.is_correct)
    accuracy = correct_count / len(result_for_human)

    # Accuracy is reported as a percentage rounded to two decimals.
    print(f"{dataset_name:30}:  {round(accuracy*100, 2):5}")

aqua-rat                      :  70.87
math                          :   26.9
logiqa-en                     :  59.91
logiqa-zh                     :  63.29
jec-qa-kd                     :   38.1
jec-qa-ca                     :   34.0
lsat-ar                       :  33.04
lsat-lr                       :  84.12
lsat-rc                       :  86.62
sat-math                      :  90.91
sat-en                        :  85.44
sat-en-without-passage        :  65.05
gaokao-chinese                :  54.47
gaokao-english                :  92.16
gaokao-geography              :  76.88
gaokao-history                :  81.28
gaokao-biology                :  72.86
gaokao-chemistry              :  55.07
gaokao-physics                :   59.0
gaokao-mathqa                 :   49.0
gaokao-mathcloze              :  13.56


## zero-shot

In [8]:
from src import post_process, utils, dataset_loader
from src import evaluation
import os
import re

# Identifiers of the AGIEval datasets handled by this cell, in processing order.
dataset_name_list = [
    "aqua-rat",
    "math",
    "logiqa-en",
    "logiqa-zh",
    "jec-qa-kd",
    "jec-qa-ca",
    "lsat-ar",
    "lsat-lr",
    "lsat-rc",
    "sat-math",
    "sat-en",
    "sat-en-without-passage",
    "gaokao-chinese",
    "gaokao-english",
    "gaokao-geography",
    "gaokao-history",
    "gaokao-biology",
    "gaokao-chemistry",
    "gaokao-physics",
    "gaokao-mathqa",
    "gaokao-mathcloze",
]

def find_first_capital_letter_en(answer):
    """Return the option letter of the last ``(X)`` group found in ``answer``.

    This cell uses it as the answer parser for the gaokao-physics dataset.

    Note: despite the name, the *last* parenthesised letter (A-F) is returned,
    not the first. Another cell of this notebook defines a function with the
    same name but different behaviour; whichever cell ran last is the one in
    effect.

    Args:
        answer: Raw model output. Anything that is not a ``str`` is treated as
            having no answer.

    Returns:
        A single character in ``"A"``-``"F"``, or ``""`` when ``answer`` is not
        a string or contains no ``(A)``..``(F)`` group.
    """
    if not isinstance(answer, str):
        return ""
    matches = re.findall(r'\(([A-F])\)', answer)
    # An explicit emptiness check replaces the bare ``except`` that used to
    # swallow the IndexError (and every other exception) raised here.
    return matches[-1] if matches else ""

# --- Configuration: where the data, prompts and stored predictions live ------
dataset_dir = "data/v1"
raw_prompt_path = "./data/few_shot_prompts.csv"
chat_mode = True

output_dir = "./outputs-zero-shot/gpt-4-32k/"
gpt_model = "gpt-4-32k"
setting_name = 'zero-shot'

# --- Score every dataset from its stored first-stage prediction file ---------
for dataset_name in dataset_name_list:
    # Rebuild the prompts so each result row can record the model input.
    dataset = dataset_loader.load_dataset(
        dataset_name, setting_name, dataset_dir,
        prompt_path=raw_prompt_path, max_tokens=2048,
        end_of_example="<END>\n", chat_mode=chat_mode)

    first_stage_output_path = os.path.join(
        output_dir, "outputs",
        f'predict.{gpt_model}.{dataset_name}.{setting_name}.first_stage.jsonl')

    result_for_human = dataset_loader.load_dataset_as_result_schema(
        dataset_name, dataset_dir
    )

    # Always (re)load the predictions for the current dataset; a conditional
    # load would silently reuse stale kernel state if the setting ever changed.
    first_stage_output_jsons = utils.read_jsonl(first_stage_output_path)

    # Rows are matched to prompts and predictions purely by position.
    for i, result in enumerate(result_for_human):
        result.model_input = dataset[i]["context"]
        result.model_output = utils.extract_answer(first_stage_output_jsons[i])

        # gaokao-physics bypasses the shared post-processor and takes the
        # last "(X)" option letter found in the raw output.
        if dataset_name == 'gaokao-physics':
            result.parse_result = find_first_capital_letter_en(result.model_output)
        else:
            result.parse_result = post_process.post_process(
                dataset_name, setting_name, result.model_output)

        # gaokao-mathcloze is compared with "$" stripped from both the
        # prediction and the label; every other dataset is compared as-is.
        if dataset_name == 'gaokao-mathcloze':
            result.is_correct = evaluation.evaluate_single_sample(
                dataset_name,
                result.parse_result.replace("$", ""),
                result.label.replace('$', ''))
        else:
            result.is_correct = evaluation.evaluate_single_sample(
                dataset_name, result.parse_result, result.label)

    correct_count = sum(1 for result in result_for_human if result.is_correct)
    accuracy = correct_count / len(result_for_human)

    # Accuracy is reported as a percentage rounded to two decimals.
    print(f"{dataset_name:30}:  {round(accuracy*100, 2):5}")

aqua-rat                      :  38.58
math                          :   20.1
logiqa-en                     :  55.91
logiqa-zh                     :  57.76
jec-qa-kd                     :   30.2
jec-qa-ca                     :   27.4
lsat-ar                       :  37.39
lsat-lr                       :   80.0
lsat-rc                       :  85.87
sat-math                      :  64.55
sat-en                        :  88.83
sat-en-without-passage        :  49.51
gaokao-chinese                :  54.07
gaokao-english                :  92.81
gaokao-geography              :  74.87
gaokao-history                :  74.47
gaokao-biology                :  75.71
gaokao-chemistry              :  49.76
gaokao-physics                :   39.0
gaokao-mathqa                 :  43.02
gaokao-mathcloze              :  18.64


## zero-shot-CoT

In [7]:
from src import post_process, utils, dataset_loader
from src import evaluation
import os
import re

# Identifiers of the AGIEval datasets handled by this cell, in processing order.
dataset_name_list = [
    "aqua-rat",
    "math",
    "logiqa-en",
    "logiqa-zh",
    "jec-qa-kd",
    "jec-qa-ca",
    "lsat-ar",
    "lsat-lr",
    "lsat-rc",
    "sat-math",
    "sat-en",
    "sat-en-without-passage",
    "gaokao-chinese",
    "gaokao-english",
    "gaokao-geography",
    "gaokao-history",
    "gaokao-biology",
    "gaokao-chemistry",
    "gaokao-physics",
    "gaokao-mathqa",
    "gaokao-mathcloze",
]

def find_last_capital_letter_en(answer):
    """Return the option letter of the last ``(X)`` group found in ``answer``.

    This cell uses it as the answer parser for lsat-lr, lsat-rc, sat-en and
    logiqa-en.

    The earlier version carried a fallback call to ``find_last_capital_letter``,
    a name that is not defined in this notebook. That branch could never run
    (a regex match is always a non-empty string), so it has been dropped;
    outputs without a parenthesised option still yield ``""``.

    Args:
        answer: Raw model output. Anything that is not a ``str`` is treated as
            having no answer.

    Returns:
        A single character in ``"A"``-``"F"``, or ``""`` when ``answer`` is not
        a string or contains no ``(A)``..``(F)`` group.
    """
    if not isinstance(answer, str):
        return ""
    matches = re.findall(r'\(([A-F])\)', answer)
    # An explicit emptiness check replaces the bare ``except`` that used to
    # swallow the IndexError (and every other exception) raised here.
    return matches[-1] if matches else ""
    
def find_first_capital_letter_en(answer):
    """Return the first character of ``answer`` that is one of ``A``-``F``.

    This cell uses it as the answer parser for gaokao-english. The scan is
    character-based: the first capital A-F anywhere in the text wins, even
    when it is part of a word (``"Answer: (C)"`` yields ``"A"``).

    Another cell of this notebook defines a function with the same name but
    regex-based behaviour; whichever cell ran last is the one in effect.

    Args:
        answer: Raw model output, normally a ``str``.

    Returns:
        The matching character, or ``""`` when there is none or ``answer``
        cannot be scanned (e.g. ``None``). The "no match" case used to fall
        through and return ``None``; it now returns ``""`` like the error path.
    """
    option_letters = frozenset("ABCDEF")
    try:
        return next((ch for ch in answer if ch in option_letters), "")
    except TypeError:
        # Non-iterable input such as None, or unhashable elements.
        return ""

# --- Configuration: where the data, prompts and stored predictions live ------
dataset_dir = "data/v1"
raw_prompt_path = "./data/few_shot_prompts.csv"
chat_mode = True

output_dir = "./outputs-zero-shot-cot/gpt-4-32k/"
gpt_model = "gpt-4-32k"
setting_name = 'zero-shot-CoT'

# --- Score every dataset from its stored prediction file ---------------------
for dataset_name in dataset_name_list:
    # Rebuild the prompts so each result row can record the model input.
    dataset = dataset_loader.load_dataset(
        dataset_name, setting_name, dataset_dir,
        prompt_path=raw_prompt_path, max_tokens=2048,
        end_of_example="<END>\n", chat_mode=chat_mode)

    # This setting is scored from `predict.<...>.jsonl`, not from the
    # `.first_stage.jsonl` file the other cells read.
    output_path = os.path.join(
        output_dir, "outputs",
        f'predict.{gpt_model}.{dataset_name}.{setting_name}.jsonl')

    result_for_human = dataset_loader.load_dataset_as_result_schema(
        dataset_name, dataset_dir
    )

    # The variable keeps the name used by the other cells although it holds
    # the contents of `output_path` here. It is always (re)loaded so that no
    # stale kernel state from a previously run cell can be scored.
    first_stage_output_jsons = utils.read_jsonl(output_path)

    # Rows are matched to prompts and predictions purely by position.
    for i, result in enumerate(result_for_human):
        result.model_input = dataset[i]["context"]
        result.model_output = utils.extract_answer(first_stage_output_jsons[i])

        # A few datasets bypass the shared post-processor and are parsed
        # straight from the raw output with the helpers defined in this cell.
        if dataset_name in ['lsat-lr', 'lsat-rc', 'sat-en', 'logiqa-en']:
            result.parse_result = find_last_capital_letter_en(result.model_output)
        elif dataset_name == 'gaokao-english':
            result.parse_result = find_first_capital_letter_en(result.model_output)
        else:
            result.parse_result = post_process.post_process(
                dataset_name, setting_name, result.model_output)

        result.is_correct = evaluation.evaluate_single_sample(
            dataset_name, result.parse_result, result.label)

    correct_count = sum(1 for result in result_for_human if result.is_correct)
    accuracy = correct_count / len(result_for_human)

    # Accuracy is reported as a percentage rounded to two decimals.
    print(f"{dataset_name:30}:  {round(accuracy*100, 2):5}")

aqua-rat                      :   74.8
math                          :   40.7
logiqa-en                     :  57.91
logiqa-zh                     :   59.6
jec-qa-kd                     :   33.1
jec-qa-ca                     :   27.5
lsat-ar                       :  30.43
lsat-lr                       :  81.57
lsat-rc                       :  84.01
sat-math                      :  91.82
sat-en                        :  86.89
sat-en-without-passage        :  35.44
gaokao-chinese                :  49.59
gaokao-english                :   91.5
gaokao-geography              :  73.37
gaokao-history                :   76.6
gaokao-biology                :  71.43
gaokao-chemistry              :  53.62
gaokao-physics                :   49.5
gaokao-mathqa                 :  50.71
gaokao-mathcloze              :  10.17


In [22]:
# Print every sample currently held in `result_for_human` (raw output, parsed
# prediction, label and correctness flag) for manual inspection.
divider = "=" * 50
for sample in result_for_human:
    print(sample.model_output, "\n\npred:", sample.parse_result,
          "\n\nlabel:", sample.label, sample.is_correct)
    print("\n", divider, "\n\n")

(B) finish your homework on time. 

pred: B 

label: B True



(D) know what the weather is like 

pred: D 

label: D True



(B) $299. 

pred: B 

label: B True



(C) On a teenage website. 

pred: C 

label: C True



(D) angry bees. 

pred: D 

label: D True



(B) needs to test more elephant groups. 

pred: B 

label: B True



(A) To record the sound of bees. 

pred: A 

label: A True



(C) Elephants do not go near trees with bees living in them. 

pred: C 

label: C True



(B) By showing what ordinary people have collected. 

pred: B 

label: B True



(A) Who they are. 

pred: A 

label: A True



(C) To study the significance of collecting. 

pred: C 

label: C True



(A) become adults. 

pred: A 

label: A True



(A) Changemakers. 

pred: A 

label: A True



(D) the poor people in Dhaka. 

pred: D 

label: D True



(C) tries to improve social conditions. 

pred: C 

label: C True



(D) Positive. 

pred: D 

label: D True



(A) Cherry Blossom Bike Tour in Washington, D.

In [None]:
+--------------------+--------+--------+
| Dataset            | Paper  |  Obser | 
+--------------------+--------+--------+
| AQuA-RAT           |  74.0  |  70.87 |
+--------------------+--------+--------+
| MATH               |  25.3  |  26.9  |
+--------------------+--------+--------+
| LogiQA (English)   |  62.7  |  59.91 |
+--------------------+--------+--------+
| LogiQA (Chinese)   |  61.9  |  63.29 |
+--------------------+--------+--------+
| JEC-QA-KD          |  40.4  |  38.1  |  
+--------------------+--------+--------+
| JEC-QA-CA          |  34.7  |  34.0  |  
+--------------------+--------+--------+
| LSAT-AR            |  31.7  |  33.04 |
+--------------------+--------+--------+
| LSAT-LR            |  84.5  |  84.12 |
+--------------------+--------+--------+
| LSAT-RC            |  87.7  |  86.62 |  
+--------------------+--------+--------+
| SAT-Math           |  89.6  |  90.91 |
+--------------------+--------+--------+
| SAT-English        |  85.9  |  85.44 |
+--------------------+--------+--------+
| SAT-En (w/o Psg.)  |  62.6  |  65.05 |
+--------------------+--------+--------+
| GK-Cn              |  51.6  |  54.47 |
+--------------------+--------+--------+
| GK-En              |  93.1  |  92.16 |
+--------------------+--------+--------+
| GK-geography       |  76.4  |  76.88 |
+--------------------+--------+--------+
| GK-history         |  78.2  |  81.28 |
+--------------------+--------+--------+
| GK-biology         |  72.9  |  72.86 |
+--------------------+--------+--------+
| GK-chemistry       |  54.1  |  55.07 |
+--------------------+--------+--------+
| GK-physics         |  54.5  |  59.0  |
+--------------------+--------+--------+
| GK-Math-QA         |  49.1  |  49.0  |
+--------------------+--------+--------+
| GK-Math-Cloze      |  16.1  |  13.56 |
+--------------------+--------+--------+

In [None]:
# NOTE(review): bare list of integers in an unexecuted cell, with no name or
# explanation — presumably sample positions singled out for manual inspection;
# confirm what they refer to (and for which dataset) or remove the cell.
[11, 16, 17, 57, 58,  63, 73, 74]