In [1]:
# 安装特定版本的必要包
!pip install fsspec==2023.10.0  # 调整为有效版本
!pip install transformers datasets==2.14.0 accelerate pandas
!pip install -U evaluate peft bitsandbytes

Collecting fsspec==2023.10.0
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.10.0
    Uninstalling fsspec-2024.10.0:
      Successfully uninstalled fsspec-2024.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2023.10.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2023.10.0
Collecting datasets==2.14.0
  Downloading datasets-2.14.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.0)
  Downloading dill-0.3.7-py3-none-any.whl.meta

In [2]:
import os
import shutil
import os
import json
import re
import numpy as np
from pathlib import Path

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments
)
from datasets import Dataset
from evaluate import load
from peft import LoraConfig, get_peft_model
import torch
import pandas as pd

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32,expandable_segments:True"

In [3]:

# Step 1: 挂载Google Drive
from google.colab import drive
drive.mount('/content/drive')

source_dir = '/content/drive/My Drive/Colab Notebooks'
public_tar = os.path.join(source_dir, 'MATH.tar')
model_tar = os.path.join(source_dir, 'fine-tuned-qwen2.5-cotfull.tar')
destination_dir = '/content'
shutil.copy(public_tar, destination_dir)
shutil.copy(model_tar, destination_dir)

Mounted at /content/drive


'/content/fine-tuned-qwen2.5-cotfull.tar'

In [4]:
# 解压MATH数据集
!tar -xf MATH.tar
# 如果需要，解压预训练/微调的Qwen模型
!tar -xf fine-tuned-qwen2.5-cotfull.tar

In [5]:
# 常量
IGNORE_INDEX = -100

# 定义特殊token（保留用于提示模板，但不添加到 tokenizer）
DEFAULT_PAD_TOKEN = "<|endoftext|>"
DEFAULT_BOS_TOKEN = "<|im_start|>"
DEFAULT_EOS_TOKEN = "<|im_end|>"
DEFAULT_UNK_TOKEN = "<|extra_0|>"

# 定义提示模板（移除 {text} 占位符）
PROMPT_DICT = {
    "prompt_input": (
        f"{DEFAULT_BOS_TOKEN}system\n"
        "You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.\n"
        f"{DEFAULT_EOS_TOKEN}\n"
        f"{DEFAULT_BOS_TOKEN}user\n"
        "Problem: {problem}\n"
        f"{DEFAULT_EOS_TOKEN}\n"
        f"{DEFAULT_BOS_TOKEN}assistant\n"
        "Here is the step-by-step solution:\n"
    ),
    "prompt_output": (
        "{answer}" + f"{DEFAULT_EOS_TOKEN}"
    )
}


In [6]:
def load_math_dataset(root_dir):
    """
    加载数学数据集，转换为适合模型训练的格式。

    参数：
    - root_dir (str): 包含各类别 JSON 文件的根目录。

    返回：
    - list of dict: 每个字典包含 'prompt' 和 'response' 字段。
    """
    data = []
    categories = [
        'algebra',
        'counting_and_probability',
        'geometry',
        'intermediate_algebra',
        'number_theory',
        'prealgebra',
        'precalculus'
    ]
    for category in categories:
        category_dir = Path(root_dir) / category
        for json_file in category_dir.glob('*.json'):
            with open(json_file, 'r', encoding='utf-8') as f:
                entry = json.load(f)
                problem = entry.get('problem', '').strip()
                solution = entry.get('solution', '').strip()
                if problem and solution:
                    prompt = PROMPT_DICT["prompt_input"].format(
                        problem=problem
                    )
                    response = PROMPT_DICT["prompt_output"].format(
                        answer=solution
                    )
                    data.append({"prompt": prompt, "response": response})
    return data

# 加载训练和测试数据
train_data = load_math_dataset('/content/MATH/train')
test_data = load_math_dataset('/content/MATH/test')

print(f"训练集样本数: {len(train_data)}")
print(f"测试集样本数: {len(test_data)}")

# 检查第一个训练样本
print(train_data[0])

训练集样本数: 7500
测试集样本数: 5000
{'prompt': '<|im_start|>system\nYou are a logical reasoning expert. Analyze the following problem and provide a detailed solution.\n<|im_end|>\n<|im_start|>user\nProblem: Evaluate $\\left\\lfloor \\left\\lceil \\left(\\frac{13}{7}\\right)^2\\right\\rceil+\\frac{17}{4}\\right\\rfloor$.\n<|im_end|>\n<|im_start|>assistant\nHere is the step-by-step solution:\n', 'response': 'We know that $\\left(\\frac{13}{7}\\right)^2=\\frac{169}{49}$. Then, since $3=\\frac{147}{49}<\\frac{169}{49}<\\frac{196}{49}=4$, we conclude that $\\left\\lceil\\left(\\frac{13}{7}\\right)^2\\right\\rceil=4$. Because $4+\\frac{17}{4}=\\frac{33}{4}$, which is a number between $8$ and $9$, $\\left\\lfloor \\left\\lceil \\left(\\frac{13}{7}\\right)^2\\right\\rceil+\\frac{17}{4}\\right\\rfloor=\\boxed{8}$.<|im_end|>'}


In [7]:
# 将列表转换为pandas DataFrame
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# 创建Hugging Face数据集
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# 可选：打乱并选择子集，以便更快地进行实验
# 如果你想使用较小的子集，请取消注释以下行

# train_dataset = train_dataset.shuffle(seed=42).select(range(len(train_dataset) // 5))  # 使用20%的数据
# test_dataset = test_dataset.shuffle(seed=42).select(range(len(test_dataset) // 500))  # 使用0.2%的数据

# 打印数据集大小
print(f"训练数据集大小: {len(train_dataset)}")
print(f"测试数据集大小: {len(test_dataset)}")

# 检查第一个训练样本
print(train_dataset[19])

训练数据集大小: 7500
测试数据集大小: 5000
{'prompt': '<|im_start|>system\nYou are a logical reasoning expert. Analyze the following problem and provide a detailed solution.\n<|im_end|>\n<|im_start|>user\nProblem: Suppose $x$ and $y$ are integers such that  $xy+5x+4y=-5$.  Find the greatest possible value of $y$.\n<|im_end|>\n<|im_start|>assistant\nHere is the step-by-step solution:\n', 'response': "Note that $(x+4)(y+5)$ equals $xy+5x+4y+20$.  So, add $20$ to both sides of the original equation to get $xy+5x+4y+20=15$, so now we may apply Simon's Favorite Factoring Trick and write the equation as $(x+4)(y+5)=15$.\n\nThen, the potential ordered pairs $((x+4),(y+5))$ with $x<y$ are $(-15,-1)$, $(-5,-3)$, $(1,15)$ and $(3,5)$, since these are the pairs of integers that multiply to 15.  The greatest value for $y+5$ is thus $15$.  We solve $y+5=15$ for $y$ to yield $y=\\boxed{10}$.<|im_end|>"}


In [8]:
# 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained("fine-tuned-qwen2.5-cot")

# 加载预训练模型
model = AutoModelForCausalLM.from_pretrained(
    "fine-tuned-qwen2.5-cot",
    device_map="auto"
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [9]:
# 定义LoRA配置
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # 根据模型架构调整
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# 将LoRA应用于模型
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,081,344 || all params: 495,114,112 || trainable%: 0.2184


In [10]:
def tokenize_function(example):
    # 拼接 prompt 和 response
    input_text = example["prompt"] + example["response"]

    # 对输入进行分词
    encoding = tokenizer(
        input_text,
        truncation=True,
        max_length=256,  # 根据需要调整最大长度
        padding="max_length",
        return_tensors="pt"
    )

    labels = encoding["input_ids"].clone()

    # 计算 prompt 部分的长度
    prompt_len = len(tokenizer.encode(example["prompt"], add_special_tokens=False))

    # 将 prompt 部分的 labels 设置为 IGNORE_INDEX
    labels[:, :prompt_len] = IGNORE_INDEX
    encoding["labels"] = labels

    return {
        "input_ids": encoding["input_ids"].squeeze(),
        "attention_mask": encoding["attention_mask"].squeeze(),
        "labels": labels.squeeze()
    }

# 将分词函数应用于数据集
tokenized_train = train_dataset.map(tokenize_function, batched=False)
# 仅使用部分验证数据
test_dataset = test_dataset.shuffle(seed=42).select(range(200))  # 使用前200个样本
tokenized_test = test_dataset.map(tokenize_function, batched=False)

# 移除不必要的列以加快训练
tokenized_train = tokenized_train.remove_columns(["prompt", "response"])
tokenized_test = tokenized_test.remove_columns(["prompt", "response"])

# 设置数据集格式为 PyTorch tensors
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [12]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=22,  # 根据GPU显存调整
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    fp16=True
)

# 加载准确率评估指标
metric = load("accuracy")



In [34]:
import re
import numpy as np

# 编译正则表达式
pattern = re.compile(r'\\boxed\{([^{}]+)\}')

def extract_boxed_number(text):
    """提取字符串中最后一个被boxed包围的内容"""
    matches = pattern.findall(text)
    return matches[-1] if matches else None

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # If predictions are logits, take the argmax to get token IDs
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    # Replace IGNORE_INDEX with pad_token_id in labels
    # Perform this operation in place to save memory
    labels = np.where(labels != IGNORE_INDEX, labels, tokenizer.pad_token_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Use list comprehensions to extract boxed numbers efficiently
    pred_answers = [extract_boxed_number(pred) for pred in decoded_preds]
    label_answers = [extract_boxed_number(label) for label in decoded_labels]

    # Zip the answers together and filter out any pairs where either is None
    valid_pairs = [
        (pred_ans, label_ans)
        for pred_ans, label_ans in zip(pred_answers, label_answers)
        if pred_ans is not None and label_ans is not None
    ]

    # Compute the number of correct predictions
    correct = sum(pred_ans == label_ans for pred_ans, label_ans in valid_pairs)
    total = len(valid_pairs)
    accuracy = correct / total if total > 0 else 0.0

    return {"accuracy": accuracy}

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
for i in range(5):
    sample = tokenized_train[i]
    print(f"Sample {i}:")

    # 将 input_ids 直接解码没问题
    print("\nDecoded Input:", tokenizer.decode(sample["input_ids"], skip_special_tokens=True))

    # 处理 labels，将 -100 替换为 pad_token_id
    labels = sample["labels"].tolist()
    labels = [token if token != IGNORE_INDEX else tokenizer.pad_token_id for token in labels]
    print("Decoded Labels:", tokenizer.decode(labels, skip_special_tokens=True))
    print("-" * 50)


Sample 0:

Decoded Input: system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: Evaluate $\left\lfloor \left\lceil \left(\frac{13}{7}\right)^2\right\rceil+\frac{17}{4}\right\rfloor$.

assistant
Here is the step-by-step solution:
We know that $\left(\frac{13}{7}\right)^2=\frac{169}{49}$. Then, since $3=\frac{147}{49}<\frac{169}{49}<\frac{196}{49}=4$, we conclude that $\left\lceil\left(\frac{13}{7}\right)^2\right\rceil=4$. Because $4+\frac{17}{4}=\frac{33}{4}$, which is a number between $8$ and $9$, $\left\lfloor \left\lceil \left(\frac{13}{7}\right)^2\right\rceil+\frac{17}{4}\right\rfloor=\boxed{8}$.
Decoded Labels: We know that $\left(\frac{13}{7}\right)^2=\frac{169}{49}$. Then, since $3=\frac{147}{49}<\frac{169}{49}<\frac{196}{49}=4$, we conclude that $\left\lceil\left(\frac{13}{7}\right)^2\right\rceil=4$. Because $4+\frac{17}{4}=\frac{33}{4}$, which is a number between $8$ and $9$, $\left\lfloor \left\lceil \left(\fra

In [16]:
# 开始训练
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.6608,0.623175
1,0.6334,0.601873
2,0.6099,0.595643
4,0.6054,0.589077
5,0.6206,0.587303
6,0.6073,0.58569
8,0.6126,0.583601
9,0.6083,0.582881
10,0.6026,0.582217
12,0.5715,0.581472


TrainOutput(global_step=1700, training_loss=0.6518233063641716, metrics={'train_runtime': 1591.8959, 'train_samples_per_second': 94.227, 'train_steps_per_second': 1.068, 'total_flos': 8.24673567717458e+16, 'train_loss': 0.6518233063641716, 'epoch': 19.941348973607038})

In [18]:
# 保存微调后的模型
trainer.save_model("fine-tuned-qwen2.5-cot-2")

# 保存tokenizer
tokenizer.save_pretrained("fine-tuned-qwen2.5-cot-2")

('fine-tuned-qwen2.5-cot-2/tokenizer_config.json',
 'fine-tuned-qwen2.5-cot-2/special_tokens_map.json',
 'fine-tuned-qwen2.5-cot-2/vocab.json',
 'fine-tuned-qwen2.5-cot-2/merges.txt',
 'fine-tuned-qwen2.5-cot-2/added_tokens.json',
 'fine-tuned-qwen2.5-cot-2/tokenizer.json')

In [40]:
# 压缩微调后的模型目录
!tar -cvf fine-tuned-qwen2.5-cotfull-2.tar fine-tuned-qwen2.5-cot-2
source_dir = '/content/drive/My Drive/Colab Notebooks'
shutil.copy("/content/fine-tuned-qwen2.5-cotfull-2.tar", source_dir)

fine-tuned-qwen2.5-cot-2/
fine-tuned-qwen2.5-cot-2/vocab.json
fine-tuned-qwen2.5-cot-2/README.md
fine-tuned-qwen2.5-cot-2/adapter_model.safetensors
fine-tuned-qwen2.5-cot-2/added_tokens.json
fine-tuned-qwen2.5-cot-2/adapter_config.json
fine-tuned-qwen2.5-cot-2/tokenizer.json
fine-tuned-qwen2.5-cot-2/tokenizer_config.json
fine-tuned-qwen2.5-cot-2/special_tokens_map.json
fine-tuned-qwen2.5-cot-2/training_args.bin
fine-tuned-qwen2.5-cot-2/merges.txt


'/content/drive/My Drive/Colab Notebooks/fine-tuned-qwen2.5-cotfull-2.tar'

In [33]:
from transformers import GenerationConfig

# Method 1: Set when creating config
generation_config = GenerationConfig(
    max_new_tokens=256  # Will generate at most 100 new tokens
)
# 选择几个验证集样本进行预测展示
num_examples = 5  # 展示前5个样本
test_samples = test_dataset.select(range(num_examples))

# 设置模型为评估模式
model.eval()

print("\n=== 模型预测结果与真实答案对比 ===\n")

with torch.no_grad():
    for idx, sample in enumerate(test_samples):
        # 准备输入
        prompt = sample['prompt']
        true_response = sample['response']

        # 对输入进行编码
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to(model.device)

        # 生成预测
        outputs = model.generate(
            **inputs,
            max_length=256,
            num_return_sequences=1,
            temperature=0.4,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            generation_config=generation_config
        )

        # 解码预测结果
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # 移除prompt部分，只保留模型的回答
        predicted_answer = predicted_text[len(prompt):]

        print(f"\n示例 {idx + 1}:")
        print("\n问题:")
        print(sample['prompt'].replace(DEFAULT_BOS_TOKEN, "").replace(DEFAULT_EOS_TOKEN, ""))
        print("\n模型预测答案:")
        print(predicted_answer)
        print("\n真实答案:")
        print(sample['response'].replace(DEFAULT_EOS_TOKEN, ""))
        print("\n" + "="*80)

Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



=== 模型预测结果与真实答案对比 ===



Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



示例 1:

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: When two consecutive whole numbers are randomly selected, what is the probability that one of them is a multiple of 4? Express your answer as a common fraction.

assistant
Here is the step-by-step solution:


模型预测答案:
s of 4, and 10 numbers between 1 and 100 that are not multiples of 4.  So the probability that one of the two numbers is a multiple of 4 is $\boxed{\frac{1}{2}}$.  (Note that if both numbers are multiples of 4, then the probability that one of them is a multiple of 4 is $\frac{1}{2}$ as well.)  Alternatively, the probability that one of the two numbers is a multiple of 4 is the probability that the first number is a multiple of 4 and the second number is not, which is $\frac{1}{10}$.  (Note that if both numbers are not multiples of 4, then the probability that one of them is a multiple of 4 is $\frac{1}{2}$.  There are no other cases to consi

Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



示例 2:

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: Eric's sports car's specifications include a $500$ metric horsepower engine. Eric wants to know how many kilowatts of power his car's engine can generate. What result should Eric obtain if $1$ kilowatt is equivalent to $1.36$ horsepower? (You may use a calculator on this problem; answer to the nearest kilowatt.)

assistant
Here is the step-by-step solution:


模型预测答案:
 engine has a 500 horsepower engine, it has a power of $500 \cdot 1,360 = \boxed{680,000}$ kilowatts.  The answer is 680,000

真实答案:
We multiply the engine's power measured in horsepower, $500$, by the conversion factor $\frac{1\ \text{kW}}{1.36\ \text{hp}}$ to obtain $500\ \text{hp} \cdot \frac{1\ \text{kW}}{1.36\ \text{hp}} \approx \boxed{368}\ \text{kW}$.



Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



示例 3:

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: Your friend has over 200 stamps. He has a stamp book where he can put 30 stamps on each page. He has placed the stamps in his stamp book in such a way that the only page which is not completely filled with stamps is the last one. The last page has 5 stamps in it. What is the smallest number of stamps your friend can have?

assistant
Here is the step-by-step solution:


模型预测答案:
l the last page is to place 5 stamps on the first page and 4 on the second page.  This leaves 1 stamp on the third page, 2 on the fourth page, 3 on the fifth page, 4 on the sixth page, and so on.  The smallest number of stamps is 5.  The answer is $\boxed{5}$.  [asy]
draw((0,0)--(0,5)--(1,5)--(1,0)--(2,0)--(2,5)--(3,5)--(3,0)--(4,0)--(4,5)--(4,0));
dot((0,0));
dot((0,5));
dot((1,5));
dot((1,0));
dot((2,0));
dot((2,5));
dot((3,5));
dot((3,0));
dot((4,0));
dot((4,5));
dot((4,0));
[/as

Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



示例 4:

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: How many numbers are in the list $$ 2.5, 5.5, 8.5, 11.5, \ldots, 80.5, 83.5? $$

assistant
Here is the step-by-step solution:


模型预测答案:
y 3. So, the list is $2.5, 5.5, 8.5, 11.5, \ldots, 80.5, 83.5$, which is a sequence of 30 numbers. Thus, the answer is $\boxed{30}$.  [asy]
unitsize(10mm);
draw((0,0)--(10,0),Arrow);
draw((10,0)--(10,10),Arrow);
draw((10,10)--(0,10),Arrow);
draw((0,10)--(0,0),Arrow);
draw((0,0)--(5,0),Arrow);
draw((5,0)--(5,5),Arrow);
draw((5,5)--(10,5),Arrow);
draw((10,5)--(10,10),Arrow);
draw((10,10)--(0,10),Arrow);
draw((0,10)--(0,0),Arrow);
draw((0,0)--(5,5),Arrow);
draw((5,5)--(

真实答案:
We can add $0.5$ to each member of the list, to make it easier to deal with: $$
3, 6, 9, 12, \ldots, 81, 84.
$$ Now if we divide by 3, we get $$
1, 2, 3, 4, \ldots, 27, 28,
$$ so there are $\boxed{28}$ numbers in the list.


示例 5:

问题:
system
You are a

In [38]:
from tqdm.auto import tqdm  # 导入tqdm

# 设置模型为评估模式
model.eval()

# 用于存储结果的列表
results = {
    'problem': [],
    'predicted_box': [],
    'true_box': [],
    'is_correct': []
}

print("\n=== 开始验证集完整评估 ===\n")

# 使用tqdm包装验证集
progress_bar = tqdm(
    test_dataset,
    total=len(test_dataset),
    desc="评估进度",
    ncols=500  # 进度条宽度
)

with torch.no_grad():
    for sample in progress_bar:
        # 准备输入
        prompt = sample['prompt']
        true_response = sample['response']

        # 对输入进行编码
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to(model.device)

        # 生成预测
        outputs = model.generate(
            **inputs,
            num_return_sequences=1,
            temperature=0.3,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            generation_config=generation_config
        )

        # 解码预测结果
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predicted_answer = predicted_text[len(prompt):]

        # 提取boxed内容
        pred_box = extract_boxed_number(predicted_answer)
        true_box = extract_boxed_number(true_response)

        # 存储结果
        results['problem'].append(prompt.replace(DEFAULT_BOS_TOKEN, "").replace(DEFAULT_EOS_TOKEN, ""))
        results['predicted_box'].append(pred_box)
        results['true_box'].append(true_box)
        results['is_correct'].append(
            pred_box == true_box if pred_box is not None and true_box is not None else False
        )

        # 更新进度条描述以显示当前正确率
        correct_so_far = sum(results['is_correct'])
        current_total = len(results['is_correct'])
        progress_bar.set_postfix({
            'correct': f"{correct_so_far}/{current_total}",
            'accuracy': f"{correct_so_far/current_total:.4f}"
        })

# 计算总体统计信息
total_samples = len(results['is_correct'])
valid_samples = sum(1 for p, t in zip(results['predicted_box'], results['true_box'])
                   if p is not None and t is not None)
correct_samples = sum(results['is_correct'])

print("\n=== 评估结果统计 ===")
print(f"总样本数: {total_samples}")
print(f"有效样本数 (both boxes extracted): {valid_samples}")
print(f"正确样本数: {correct_samples}")
print(f"总体正确率: {correct_samples/total_samples:.4f}")
print(f"有效样本正确率: {correct_samples/valid_samples:.4f}")

# 创建详细的结果DataFrame
results_df = pd.DataFrame(results)

# 显示部分错误案例
print("\n=== 错误案例样本 ===")
error_cases = results_df[
    (results_df['predicted_box'].notna()) &
    (results_df['true_box'].notna()) &
    ~results_df['is_correct']
].head(5)

for _, row in error_cases.iterrows():
    print("\n问题:")
    print(row['problem'])
    print(f"预测boxed值: {row['predicted_box']}")
    print(f"真实boxed值: {row['true_box']}")
    print("-" * 80)

# 保存结果到CSV文件
results_df.to_csv('validation_results.csv', index=False)
print("\n结果已保存到 validation_results.csv")


=== 开始验证集完整评估 ===



评估进度:   0%|                                                                                                   …


=== 评估结果统计 ===
总样本数: 200
有效样本数 (both boxes extracted): 92
正确样本数: 20
总体正确率: 0.1000
有效样本正确率: 0.2174

=== 错误案例样本 ===

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: Eric's sports car's specifications include a $500$ metric horsepower engine. Eric wants to know how many kilowatts of power his car's engine can generate. What result should Eric obtain if $1$ kilowatt is equivalent to $1.36$ horsepower? (You may use a calculator on this problem; answer to the nearest kilowatt.)

assistant
Here is the step-by-step solution:

预测boxed值: 680
真实boxed值: 368
--------------------------------------------------------------------------------

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: Your friend has over 200 stamps. He has a stamp book where he can put 30 stamps on each page. He has placed the stamps in his stamp book in such a way that the onl

In [39]:
# 显示所有正确案例
print("\n=== 正确案例 ===")
correct_cases = results_df[
    (results_df['predicted_box'].notna()) &
    (results_df['true_box'].notna()) &
    results_df['is_correct']
]

for _, row in correct_cases.iterrows():
    print("\n问题:")
    print(row['problem'])
    print(f"预测boxed值: {row['predicted_box']}")
    print(f"真实boxed值: {row['true_box']}")
    print("-" * 80)

# 打印正确案例的统计信息
print(f"\n总共正确案例数: {len(correct_cases)}")


=== 正确案例 ===

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: How many numbers are in the list $$ 2.5, 5.5, 8.5, 11.5, \ldots, 80.5, 83.5? $$

assistant
Here is the step-by-step solution:

预测boxed值: 28
真实boxed值: 28
--------------------------------------------------------------------------------

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: For each positive integer $n$, let $n!$ denote the product $1\cdot 2\cdot 3\cdot\,\cdots\,\cdot (n-1)\cdot n$.

What is the remainder when $9!$ is divided by $10$?

assistant
Here is the step-by-step solution:

预测boxed值: 0
真实boxed值: 0
--------------------------------------------------------------------------------

问题:
system
You are a logical reasoning expert. Analyze the following problem and provide a detailed solution.

user
Problem: If $x$ and $y$ are real, and $x^2 + y^2 = 1,$ compute the 