# gsm8k svamp tabmwp

In [53]:
from datasets import load_dataset

dataset_name = "tabmwp"
dataset = load_dataset("json",
                       data_files="/Users/ariete/Projects/self-improve/output/v2/tabmwp/critic_no_tool_1000_temperature_0.5_top-p_1.jsonl",
                       split="train")
# dataset = dataset.rename_column("Answer", "answer")
print(dataset, {dataset[11][k] for k in ["question", "predictions", "answer"]})

Generating train split: 0 examples [00:00, ? examples/s]

TypeError: unhashable type: 'list'

In [29]:

from typing import Union


def normalize_answer(answer: str) -> Union[float, None]:
    answer = str(answer).replace(",", "")
    digits = re.findall(r"-?\d+\.?\d*", answer)
    if digits:
        try:
            return floatify_ans(digits[-1])
        except (ValueError, IndexError):
            return None
    return None


def get_precision(gt_ans: float) -> int:
    return len(str(gt_ans).split('.')[-1]) if '.' in str(gt_ans) else 5


def finqa_equal(prediction: Union[bool, float, str],
                reference: Union[float, str],
                include_percentage: bool = True,
                is_close: bool = False) -> bool:
    if prediction is None:
        return False
    if isinstance(prediction, bool):
        return reference == 'yes' if prediction else reference == 'no'
    if isinstance(reference, str) or isinstance(prediction, str):
        return prediction == reference

    gt_result = [reference / 100, reference, reference * 100] if include_percentage else [reference]
    for item in gt_result:
        try:
            if is_close and isclose(item, prediction, rel_tol=0.001):
                return True
            precision = min(get_precision(prediction), get_precision(item))
            if round(prediction, precision) == round(item, precision):
                return True
        except Exception:
            continue
    return False


def round_with_error(x: float) -> float:
    return round(x * 1e5) / 1e5


def floatify_ans(ans: Union[str, dict, bool, list, tuple]) -> Union[float, str, None]:
    if ans is None:
        return None
    if isinstance(ans, dict):
        ans = list(ans.values())[0]
    if isinstance(ans, (list, tuple)):
        return float(ans[0]) if ans else None
    try:
        return round_with_error(float(ans))
    except ValueError:
        return str(ans)

In [38]:
from decimal import Decimal, InvalidOperation
from fractions import Fraction
import re


def clean_number(value):
    """
    去除数字中的逗号并返回一个干净的字符串
    """
    return re.sub(r",", "", value)


def is_fraction(value):
    """判断是否为分数格式"""
    try:
        # 尝试将字符串转换为 Fraction
        Fraction(value)
        return True
    except ValueError:
        return False


def is_number(value):
    """判断是否为整数或小数"""
    try:
        # 去除逗号后使用 Decimal 判断是否为合法的数字
        Decimal(clean_number(value))
        return True
    except InvalidOperation:
        return False


def to_float(value):
    """
    将字符串转换为浮点数
    支持整数、小数和分数
    """
    if is_fraction(value):
        return float(Fraction(value))  # 将分数转换为浮点数
    elif is_number(value):
        return float(Decimal(clean_number(value)))  # 将整数或小数转换为浮点数
    else:
        raise ValueError(f"Cannot convert {value} to float")


def validate_prediction(prediction, answer):
    """
    验证单条数据的 prediction 和 answer 是否匹配
    处理小数、整数、分数、文本
    """
    # 尝试将数字或分数转换为浮点数进行比较
    try:
        rounded_prediction = round(to_float(prediction), 3)
        rounded_answer = round(to_float(answer), 3)
        return rounded_prediction == rounded_answer
    except ValueError:
        # 如果不是数字或分数，按文本比较
        return prediction.strip().lower() == answer.strip().lower()

In [56]:
scores = []

from langchain_experimental.utilities import PythonREPL

python = PythonREPL()


def exract_code(text: str):
    pattern = r'```python(.*?)```'
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    else:
        return text


for idx, sample in enumerate(dataset):
    predictions: list[str] = sample["predictions"]
    if any("FINAL ANSWER:" in prediction for prediction in predictions):
        predictions = [prediction.split("FINAL ANSWER:")[-1].strip() for prediction in predictions]
    else:
        predictions = [python.run(exract_code(prediction)) for prediction in predictions]

    while len(predictions) < 4:
        predictions.append(predictions[-1])

    # predictions = [normalize_answer(prediction) for prediction in predictions]

    # normalized_answer = sample["answer"]
    is_correct = [validate_prediction(prediction, sample["answer"]) for prediction in predictions]
    if not any(is_correct):
        print(sample["predictions"][-1])
        print(idx, predictions, sample["answer"], is_correct, "\n\n")
    is_correct.append(any(is_correct))
    scores.append(is_correct)

accuracies = [sum(col) / len(col) if len(col) > 0 else 0 for col in zip(*scores)]
print(accuracies)


# Python code, return answer
weight_change_sprinkles = 5
weight_change_champ = -6

# Calculate the absolute weight change for both dogs
abs_weight_change_sprinkles = abs(weight_change_sprinkles)
abs_weight_change_champ = abs(weight_change_champ)

# Determine which dog's weight has changed the most and the implications of each change
if abs_weight_change_sprinkles > abs_weight_change_champ:
    answer = ("Sprinkles gained weight (5 oz), which is concerning as it suggests overeating "
              "and could lead to health issues if not addressed.")
elif abs_weight_change_champ > abs_weight_change_sprinkles:
    answer = ("Champ lost weight (-6 oz), which is concerning as it may indicate health problems "
              "or insufficient food intake.")
else:
    answer = "Both dogs have the same absolute weight change."

# Emphasizing the significance of each dog's weight change in the context of Jordan's concern.
print(answer)
11 ['Champ lost weight\n', 'Champ lost weight, which is conce