# part3/matheval_3.py

import json
import math
import os
import re
from statistics import mean

import torch
from vllm import LLM, SamplingParams

from cot_prompts import get_examples

# Make sure the directory that receives the per-sample result files exists.
# exist_ok=True avoids the check-then-create race of testing os.path.exists first.
os.makedirs("results", exist_ok=True)


def get_seperation_trigger(dataset: str):
    """Return the phrases that mark where the final answer starts in a response.

    Args:
        dataset: dataset name; ``'gsm8k'`` additionally uses the ``####`` marker.

    Returns:
        A new list of trigger strings. ``'The answer is:'`` is listed before
        ``'The answer is'`` so that, once the triggers are joined into a regex
        alternation, the colon is consumed together with the phrase.
    """
    triggers = ['The answer is:', 'The answer is', 'the answer is']
    # the answer format of gsm8k is a bit different
    if dataset == 'gsm8k':
        triggers.append('####')
    return triggers


# Datasets answered with an option letter; answer_clean also treats any
# dataset whose name contains "mmlu" this way.
_CHOICE_DATASETS = ("aqua", "sat", "arc", "mmlu")
# Datasets answered with a plain number.
_NUMERIC_DATASETS = ("gsm8k", "numglue", "svamp", "deepmind", "simuleq")
# A signed integer or decimal number.
_NUMBER_PATTERN = re.compile(r'-?\d+\.?\d*')
# A capital letter standing on its own, e.g. the "B" in "(B)".
_OPTION_PATTERN = re.compile(r'\b[A-Z]\b')


def answer_clean(dataset, pred):
    """Extract the final answer from a raw model response.

    The text after the last answer trigger (see ``get_seperation_trigger``) is
    cleaned up and reduced to a list of candidates: standalone option letters
    for multiple-choice datasets, numbers otherwise, or the cleaned text itself
    when neither is present. If a trigger was found the first candidate is
    returned, otherwise the last one (the whole response was searched, so the
    answer is assumed to come at the end).

    NOTE: for free-form datasets ("math", "theoremqa", "bbh") a response that
    contains a number is still reduced to that number, so an expression such as
    ``\\frac{3}{4}`` becomes ``3``. That mirrors the original number-first
    policy of this function.

    Args:
        dataset: dataset name, used to pick the cleaning rules.
        pred: raw text generated by the model.

    Returns:
        The extracted answer as a string, or ``""`` when nothing is left.

    Raises:
        ValueError: if ``dataset`` is not one of the supported names.
    """
    triggers = get_seperation_trigger(dataset)
    pred = pred.strip('\n')

    # In-context learning (ICL) adds solved (question, answer) pairs to the
    # prompt, and the model may keep generating more of them. A trigger that
    # occurs more than once signals this, so keep only the first chunk.
    if any(pred.count(trigger) > 1 for trigger in triggers):
        pred = pred.split('\n\n')[0]

    # Split on the triggers; they are escaped because they are literal phrases.
    pieces = re.split('|'.join(re.escape(trigger) for trigger in triggers), pred)
    answer_flag = len(pieces) > 1
    if answer_flag:
        pred = pieces[-1]

    pred = pred.strip('\n').rstrip('.').rstrip('/').strip(' ')

    is_choice = dataset in _CHOICE_DATASETS or "mmlu" in dataset
    is_numeric = dataset in _NUMERIC_DATASETS

    # Dataset-specific clean-up.
    if dataset == "math":
        pred = re.sub(r'\b(\d+)(th|st|nd|rd)\b', r'\1', pred)  # drop ordinal suffixes
        pred = pred.replace('$', '')  # drop LaTeX math delimiters
    elif is_numeric:
        # Remove thousands separators so "1,250" is read as one number; the
        # gsm8k ground truth has its commas removed by data_reader as well.
        pred = pred.replace(",", "")
    elif not (is_choice or dataset == "theoremqa" or "bbh" in dataset):
        raise ValueError("dataset is not properly defined ...")

    candidates = []
    if is_choice:
        candidates = _OPTION_PATTERN.findall(pred)
        if not candidates:
            # No option letter: fall back to the text without stray punctuation.
            pred = pred.strip().strip('.')
    if not candidates:
        candidates = _NUMBER_PATTERN.findall(pred)
        if is_numeric:
            # "42.0" and "42" are the same answer.
            candidates = [re.sub(r'\.0+$', '', number) for number in candidates]
        if not candidates and pred:
            candidates = [pred]

    # If there is no candidate at all, the answer is empty.
    if not candidates:
        return ""

    chosen = candidates[0] if answer_flag else candidates[-1]
    # The number pattern can capture a sentence-ending period ("18.").
    return chosen.rstrip('.').rstrip('/')


def data_reader(dataset: str, base_path="data/math"):
    """Read the validation split of ``dataset`` and print a short summary.

    Args:
        dataset: dataset name; selects both the file under ``base_path`` and
            the way questions and answers are assembled.
        base_path: directory that contains the per-dataset sub-directories.

    Returns:
        questions: list(str) questions
        answers: list of ground-truth answers. Most datasets yield strings;
            "theoremqa" yields ``[text, value]`` pairs where ``value`` is the
            raw list/int/float answer or ``None``.

    Raises:
        ValueError: if ``dataset`` is not one of the supported names.
    """

    questions = []
    answers = []
    decoder = json.JSONDecoder()

    if dataset == "aqua":
        with open(f'{base_path}/AQuA/AQuA.json', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the JSON-lines file
                json_res = decoder.raw_decode(line)[0]
                # Turn options like "A)12" into " (A) 12 (B) 15 ...".
                choice = "(" + "(".join(json_res["options"])
                choice = choice.replace("(", " (").replace(")", ") ")
                choice = "Answer Choices:" + choice
                questions.append(json_res["question"].strip() + "\n" + choice)
                answers.append(json_res["correct"])
    elif dataset == 'math':
        with open(f'{base_path}/math/MATH.json', 'r', encoding='utf-8') as f:
            loaded = json.load(f)
        for d in loaded:
            questions.append(d['question'])
            answers.append(d['answer'])
    elif dataset == "gsm8k":
        with open(f'{base_path}/gsm8k/gsm8k.jsonl', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the JSON-lines file
                json_res = decoder.raw_decode(line)[0]
                questions.append(json_res["question"].strip())
                # The final answer follows "#### "; drop thousands separators.
                answers.append(delete_extra_zero(json_res["answer"].split("#### ")[-1].replace(",", "")))
    elif dataset == "svamp":
        with open(f'{base_path}/SVAMP/SVAMP.json', encoding='utf-8') as f:
            json_data = json.load(f)
        for line in json_data:
            q = line["Body"].strip() + " " + line["Question"].strip()
            a = str(line["Answer"])
            if a[-2:] == ".0":
                a = a[:-2]
            questions.append(q)
            answers.append(delete_extra_zero(a))
    elif dataset == 'theoremqa':
        with open(f'{base_path}/theoremqa/theoremqa_test.json', encoding='utf-8') as f:
            test_set = json.load(f)
        for row in test_set:
            raw_answer = row['Answer']
            questions.append(row['Question'])
            # bool is a subclass of int, so it is excluded explicitly: only
            # lists and real numbers are kept as a comparable value.
            keep_value = (not isinstance(raw_answer, bool)
                          and isinstance(raw_answer, (list, int, float)))
            answers.append([str(raw_answer), raw_answer if keep_value else None])
    elif dataset == 'arc':
        with open(f'{base_path}/arc/challenge.json', encoding='utf-8') as f:
            test_set = json.load(f)
        for row in test_set:
            questions.append(row['question'])
            answers.append(row['answer'])
    elif dataset == 'mmlu_pro':
        with open(f'{base_path}/mmlu_pro/test.json', encoding='utf-8') as f:
            json_data = json.load(f)
        for line in json_data:
            questions.append(line['question'])
            answers.append(line['answer'])
    elif 'mmlu' in dataset:
        # The file name is the second "_"-separated part of the dataset name.
        with open(f'{base_path}/mmlu/{dataset.split("_")[1]}.json', encoding='utf-8') as f:
            json_data = json.load(f)
        for line in json_data:
            options = f'(A) {line["choices"][0]} (B) {line["choices"][1]} (C) {line["choices"][2]} (D) {line["choices"][3]}'
            q = line["question"] + '\n' + 'Answer Choices: ' + options
            # The stored answer is an index into the four choices.
            a = ['A', 'B', 'C', 'D'][line['answer']]
            questions.append(q)
            answers.append(a)
    elif dataset in ['numglue', 'simuleq', 'deepmind', 'sat']:
        with open(f'{base_path}/{dataset}/{dataset}.json', encoding='utf-8') as f:
            json_data = json.load(f)
        for line in json_data:
            assert isinstance(line['question'], str), line
            questions.append(line['question'])
            answers.append(str(line['answer']))
    elif 'gpqa' in dataset:
        with open(f'{base_path}/gpqa/{dataset}.jsonl', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the JSON-lines file
                data = json.loads(line)
                # NOTE(review): generate_question_and_answers is not defined in
                # the visible part of this file - confirm where it comes from.
                tmp = generate_question_and_answers(data)
                questions.append(tmp['question'])
                answers.append(tmp['answer'])
    elif 'bbh' in dataset:
        with open(f'{base_path}/bbh/bbh.json', 'r', encoding='utf-8') as f:
            test_set = json.load(f)
        for entry in test_set:
            questions.append(entry['question'])
            answers.append(entry['answer'])
    else:
        raise ValueError("dataset is not properly defined ...")

    # mean() raises StatisticsError on an empty list, so report 0 for an
    # empty dataset instead of crashing.
    q_len_list = [len(q.split(" ")) for q in questions]
    q_len_mean = mean(q_len_list) if q_len_list else 0

    print("dataset : {}".format(dataset))
    print("data size : {}".format(len(answers)))
    print("average num of words for each sample : {}".format(q_len_mean))

    return questions, answers


def delete_extra_zero(n):
    """Return ``n`` as a numeric string without a redundant fractional part.

    ``"18.0"`` becomes ``"18"`` and ``"2.50"`` becomes ``"2.5"``. Input that
    ``float()`` rejects is evaluated as an expression, so ``"1/2"`` becomes
    ``"0.5"``.

    Args:
        n: a number, or text holding a number or an arithmetic expression.

    Returns:
        The normalised string. ``n`` is returned unchanged when it cannot be
        converted or when the evaluated expression is not a number.
    """
    try:
        value = float(n)
    except (TypeError, ValueError, OverflowError):
        try:
            # SECURITY NOTE(review): eval() runs arbitrary code. It is kept
            # for expression-style answers; only use this on trusted data.
            value = eval(n)
        except Exception:
            print("Conversion to floating number fails: {}".format(n))
            return n
    if isinstance(value, int):
        return str(value)
    if isinstance(value, float):
        # Test for a whole number numerically instead of stripping "0"
        # characters from the text: stripping turns "1e+20" into "1e+2".
        if value.is_integer():
            return str(int(value))
        return str(value)
    # The expression evaluated to something that is not a number.
    return n


def _strip_latex_wrapper(text, opener):
    """Remove every ``opener...}`` wrapper from ``text`` but keep its content."""
    start = text.find(opener)
    while start != -1:
        body_start = start + len(opener)
        depth = 1
        pos = body_start
        # Walk to the brace that closes the wrapper, skipping nested pairs.
        while pos < len(text) and depth:
            if text[pos] == '{':
                depth += 1
            elif text[pos] == '}':
                depth -= 1
            pos += 1
        if depth == 0:
            text = text[:start] + text[body_start:pos - 1] + text[pos:]
        else:
            # Unbalanced braces: drop only the opener.
            text = text[:start] + text[body_start:]
        start = text.find(opener, start)
    return text


def _normalise_answer_text(text):
    """Lower-case ``text``, unwrap ``\\text{}``/``\\boxed{}`` and bare option labels."""
    text = str(text).strip().lower()
    for opener in ('\\text{', '\\boxed{'):
        text = _strip_latex_wrapper(text, opener)
    text = text.strip()
    # "(b)", "b)" and "b." all name option "b".
    label = re.fullmatch(r'\(?([a-z])\)?[.:]?', text)
    return label.group(1) if label else text


def _parse_float(value):
    """Return ``value`` as a float, or ``None`` when it is not a single number."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def compare_answer_with_groundtruth(answer, groundtruth_str, groundtruth_num=None):
    """Decide whether a model answer matches the ground truth.

    Both texts are compared case-insensitively after removing ``\\text{...}``
    and ``\\boxed{...}`` wrappers and option-label punctuation. Numbers are
    considered equal when they differ by less than 0.01.

    Args:
        answer: answer extracted from the model response.
        groundtruth_str: ground-truth answer as text (other types are
            converted with ``str``).
        groundtruth_num: optional numeric ground truth. When both it and
            ``answer`` are numbers, the numeric comparison alone decides.

    Returns:
        True if the answer matches, False otherwise.
    """
    answer = _normalise_answer_text(answer)
    groundtruth_str = _normalise_answer_text(groundtruth_str)
    answer_num = _parse_float(answer)

    # Numeric comparison against the explicit numeric ground truth.
    if groundtruth_num is not None and answer_num is not None:
        reference = _parse_float(groundtruth_num)
        if reference is not None:
            return abs(answer_num - reference) < 0.01

    # Identical text covers option letters and free-form answers.
    if answer == groundtruth_str:
        return True

    # Different text can still be the same number, e.g. "18.0" and "18".
    # Only whole-string numbers are compared; "5 apples" is not a number.
    reference = _parse_float(groundtruth_str)
    if answer_num is not None and reference is not None:
        return abs(answer_num - reference) < 0.01

    return False


# Helper that turns a string into a number when it holds one.
def number_it(s):
    """Return ``float(s)``, or ``s`` itself when ``float`` raises ValueError."""
    try:
        number = float(s)
    except ValueError:
        return s
    else:
        return number


def compare_two_numbers(p, gt):
    """Compare a predicted number with a ground-truth number.

    An integer ground truth is compared with the rounded prediction; any other
    ground truth is delegated to ``within_eps``.

    Args:
        p: predicted value.
        gt: ground-truth value.

    Returns:
        True if the values match; False for NaN predictions and for values
        that cannot be compared as numbers.
    """
    # Requires the module-level ``import math``. Without it the NameError was
    # swallowed by a broad handler and every comparison returned False.
    try:
        if math.isnan(p):
            return False
        if isinstance(gt, int):
            return round(p) == gt
        # NOTE(review): within_eps is not defined in the visible part of this
        # file - confirm it is provided before relying on this branch.
        return within_eps(pred=p, gt=gt)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric or infinite predictions simply do not match.
        return False


def compare_two_list(pred, gt):
    """Compare two lists of numbers irrespective of element order.

    Returns False unless ``pred`` is a list of ints/floats with the same
    length as ``gt``. Otherwise both lists are sorted and compared pairwise
    with ``compare_two_numbers``.
    """
    if not isinstance(pred, list) or len(pred) != len(gt):
        return False
    if not all(isinstance(item, (int, float)) for item in pred):
        return False
    ordered_pred = sorted(pred)
    ordered_gt = sorted(gt)
    return all(
        compare_two_numbers(predicted, expected)
        for predicted, expected in zip(ordered_pred, ordered_gt)
    )


# ---- Evaluation configuration ----
dataset = 'gsm8k'  # options: 'gsm8k' 'math' 'svamp' 'simuleq' 'numglue'
template = "### Instruction:\n{}\n\n### Response:{}\n\n"
num_samples = 50  # -1 means all samples
model_name = "meta-llama/Llama-3.2-3B-Instruct"

inference_dtype = "bfloat16" if torch.cuda.is_bf16_supported() else "float32"
# Settings passed to vLLM as gpu_memory_utilization and tensor_parallel_size.
gpu_memory_utilization = 0.9
tensor_parallel_size = torch.cuda.device_count()  # Use all available GPUs

model = LLM(
    model=model_name,
    tensor_parallel_size=tensor_parallel_size,
    dtype=inference_dtype,
    trust_remote_code=True,
    max_model_len=4096,
    gpu_memory_utilization=gpu_memory_utilization
)
questions, groundtruths = data_reader(dataset)

# ---- Build the few-shot chain-of-thought prefix ----
num_shots = 1
cot_examples = get_examples(task=dataset, num_shots=num_shots)
cot_prompt = ""
for example in cot_examples:
    question, answer = example
    cot_prompt += template.format(question, answer) + "\n\n"

# A negative num_samples selects every question. Slicing with [:-1] would
# silently drop the last one.
selected_questions = questions if num_samples < 0 else questions[:num_samples]
prompts = [cot_prompt + "### Instruction:\n{}\n\n### Response:".format(q) for q in selected_questions]
outputs = model.generate(prompts, SamplingParams(temperature=0.8, top_p=0.95, max_tokens=500))

# ---- Score the responses and append one JSON line per sample ----
correct, wrong = 0, 0
results_path = f"results/{model_name.split('/')[-1]}_{dataset}.jsonl"
with open(results_path, "a") as f:
    # zip stops at the shorter sequence, i.e. at the number of prompts.
    for output, groundtruth in zip(outputs, groundtruths):
        result = {}
        prompt = output.prompt
        response = output.outputs[0].text

        # extract answer from the generated text
        # TODO: improve the answer_clean function
        model_answer = answer_clean(dataset, response)

        # data_reader returns [text, value] pairs for theoremqa; only a
        # scalar value is usable as the numeric ground truth.
        if isinstance(groundtruth, (list, tuple)):
            groundtruth_str, groundtruth_num = groundtruth
            if not isinstance(groundtruth_num, (int, float)):
                groundtruth_num = None
        else:
            groundtruth_str, groundtruth_num = groundtruth, None

        # TODO: improve the compare_answer_with_groundtruth function
        if compare_answer_with_groundtruth(model_answer, groundtruth_str, groundtruth_num):
            correct += 1
            result["correctness"] = True
        else:
            wrong += 1
            result["correctness"] = False

        result["model_answer"] = model_answer
        result["groundtruth"] = groundtruth
        result["prompt"] = prompt
        result["response"] = response

        json.dump(result, f)
        f.write("\n")

# Guard against an empty evaluation set.
total = correct + wrong
accuracy = correct / total if total else 0.0
print(f"Accuracy: {accuracy}")