In [1]:
pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import os
from typing import List, Dict, Tuple
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import re
from collections import defaultdict, Counter
import time
import logging
import jsonlines

# 检查环境和 GPU 信息
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# 设置设备
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1. 加载 GLM-4-9B 模型和分词器
model_name = "XiaomiMiMo/MiMo-7B-RL"  # 确认 GLM-4-9B 模型名称
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'left'  # 确保与 Qwen2 一致
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # 使用 float16 减少显存
    device_map="cuda:0",  # 强制加载到 P100
    trust_remote_code=True  # GLM-4-9B 可能需要
)
model.eval()
print("Model loaded successfully")
print(f"Model device: {next(model.parameters()).device}")

PyTorch version: 2.5.1+cu124
CUDA available: True
CUDA version: 12.4
GPU: Tesla P100-PCIE-16GB
Total GPU memory: 15.89 GB
Using device: cuda:0


tokenizer_config.json:   0%|          | 0.00/7.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/988 [00:00<?, ?B/s]

configuration_mimo.py:   0%|          | 0.00/376 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/XiaomiMiMo/MiMo-7B-RL:
- configuration_mimo.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_mimo.py:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/XiaomiMiMo/MiMo-7B-RL:
- modeling_mimo.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
2025-05-11 03:44:17.221191: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746935057.413141      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746935057.464688      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/37.0k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.71G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Model loaded successfully
Model device: cuda:0


In [3]:
# 打印初始显存
if torch.cuda.is_available():
    print(f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

# 2. 构造提示函数
def create_zero_shot_prompt(passage: str, number: str) -> str:
    return f"""Answer with only 'Yes' or 'No'. Do not provide explanations. Is "{number}" in the following passage an error? "{passage}"
Answer:"""

def create_few_shot_prompt(passage: str, number: str) -> str:
    examples = [
        {"passage": "Spiders have 9 limbs.", "number": "9", "answer": "Yes"},
        {"passage": "Spiders have 8 limbs.", "number": "8", "answer": "No"},
        {"passage": "Mike's height is -3.6 meters.", "number": "-3.6", "answer": "Yes"},
        {"passage": "Mike's height is 1.8 meters.", "number": "1.8", "answer": "No"}
    ]
    prompt = "Answer with only 'Yes' or 'No'. Do not provide explanations.\n"
    for ex in examples:
        prompt += f"""Question: Is "{ex['number']}" in the following passage an error? "{ex['passage']}"
Answer: {ex['answer']}\n"""
    prompt += f"""Question: Is "{number}" in the following passage an error? "{passage}"
Answer:"""
    return prompt

# 3. 加载 BeNEDect 数据集
def load_benedect_dataset(file_path: str) -> List[Dict]:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"数据集文件 {file_path} 不存在，请确认路径！")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            dataset_dict = json.load(f)
    except json.JSONDecodeError as e:
        raise ValueError(f"JSON 文件解析错误：{e}")
    
    dataset = list(dataset_dict.values())
    save_list = []
    
    for i, data in enumerate(tqdm(dataset, desc="Processing dataset")):
        required_fields = ['correct_number', 'correct_passage', 'error_number', 'error_passage', 'dataset', 'operation']
        for field in required_fields:
            if field not in data:
                print(f"样本 {data.get('id', '未知')} 缺少字段 {field}")
                continue
        
        prompt_fn = create_few_shot_prompt if i % 48 == 0 else create_zero_shot_prompt
        correct_item = {
            "prompt": prompt_fn(data['correct_passage'], data['correct_number']),
            "expected_answer": "No",
            "dataset": data['dataset'],
            "operation": data['operation'],
            "error_annotation": data.get('error_annotation', {}),
            "passage": data['correct_passage'],
            "number": data['correct_number'],
            "prompt_type": "few_shot" if i % 48 == 0 else "zero_shot"
        }
        error_item = {
            "prompt": prompt_fn(data['error_passage'], data['error_number']),
            "expected_answer": "Yes",
            "dataset": data['dataset'],
            "operation": data['operation'],
            "error_annotation": data.get('error_annotation', {}),
            "passage": data['error_passage'],
            "number": data['error_number'],
            "prompt_type": "few_shot" if i % 48 == 0 else "zero_shot"
        }
        save_list.append(correct_item)
        save_list.append(error_item)
    
    return save_list

# 4. 单条推理
def predict_single(prompt: str, max_retries: int = 3) -> str:
    print(f"Single prediction prompt: {prompt[:100]}...")
    attempt = 0
    success = False
    prediction = None
    
    while attempt < max_retries and not success:
        try:
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            print(f"Input device: {inputs['input_ids'].device}")
            print(f"Input shape: {inputs['input_ids'].shape}")
            
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=5,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    do_sample=False,
                    top_k=1,
                    top_p=0.0
                )
            
            prediction = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
            success = True
            print(f"Single prediction success: Raw Prediction: {prediction}")
            
        except RuntimeError as e:
            print(f"单条推理失败（尝试 {attempt + 1}/{max_retries}）：{e}")
            attempt += 1
            torch.cuda.empty_cache()
            time.sleep(1)
            if attempt == max_retries:
                print("单条推理失败，跳过")
                prediction = "generation_error"
        
        finally:
            if 'inputs' in locals():
                for v in inputs.values():
                    del v
            torch.cuda.empty_cache()
            print(f"GPU memory after single prediction: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    
    return prediction

# 5. 批次推理
def predict_batch(prompts: List[str], batch_size: int = 8, max_retries: int = 3) -> List[str]:
    predictions = []
    
    for i in tqdm(range(0, len(prompts), batch_size), desc="Predicting"):
        batch_prompts = prompts[i:i + batch_size]
        attempt = 0
        success = False
        batch_preds = None
        
        while attempt < max_retries and not success:
            try:
                # 强制统一序列长度，检查张量形状
                inputs = tokenizer(
                    batch_prompts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_attention_mask=True
                )
                inputs = {k: v.to(device) for k, v in inputs.items()}
                print(f"Batch {i//batch_size} input shapes: input_ids={inputs['input_ids'].shape}, attention_mask={inputs['attention_mask'].shape}")
                
                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=5,
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id,
                        do_sample=False,
                        top_k=1,
                        top_p=0.0
                    )
                
                batch_preds = [
                    tokenizer.decode(output[inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
                    for output in outputs
                ]
                success = True
                if i % (10 * batch_size) == 0:
                    print(f"批次 {i//batch_size}: 原始预测: {batch_preds}")
                
            except RuntimeError as e:
                print(f"批次推理失败（尝试 {attempt + 1}/{max_retries}）：{e}")
                attempt += 1
                torch.cuda.empty_cache()
                time.sleep(1)
                if attempt == max_retries:
                    print(f"批次 {i//batch_size} 推理失败，跳过")
                    batch_preds = ["generation_error"] * len(batch_prompts)
            
            finally:
                if 'inputs' in locals():
                    for v in inputs.values():
                        del v
                torch.cuda.empty_cache()
                print(f"GPU memory after batch {i//batch_size}: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
        
        predictions.extend(batch_preds)
    
    return predictions

# 6. 保存预测结果到 JSONL
def save_predictions_to_jsonl(data: List[Dict], predictions: List[str], output_file: str):
    with jsonlines.open(output_file, mode='w') as writer:
        for item, pred in zip(data, predictions):
            result = {
                "prompt": item['prompt'],
                "passage": item['passage'],
                "number": item['number'],
                "expected_answer": item['expected_answer'],
                "raw_prediction": pred,
                "dataset": item['dataset'],
                "operation": item['operation'],
                "error_annotation": item['error_annotation'],
                "prompt_type": item['prompt_type']
            }
            writer.write(result)

GPU memory allocated: 14.59 GB
GPU memory reserved: 14.93 GB


In [4]:
file_path = "/kaggle/input/xiaomi-unparsed/xiaomi_unparsed_predictions.json"  # 确认路径
dataset = load_benedect_dataset(file_path)

random_sample = random.choice(dataset)
print("=== 随机样本推理 ===")
print(f"Passage: {random_sample['passage']}")
print(f"Number: {random_sample['number']}")
print(f"Expected Answer: {random_sample['expected_answer']}")
print(f"Prompt Type: {random_sample['prompt_type']}")
print(f"Prompt:\n{random_sample['prompt']}")

random_pred = predict_single(random_sample['prompt'])
print(f"Raw Prediction: {random_pred}")

prompts = [item['prompt'] for item in dataset]
predictions = predict_batch(prompts, batch_size=4, max_retries=3)  # 减小批次大小
output_file = "/kaggle/working/predictions.jsonl"
save_predictions_to_jsonl(dataset, predictions, output_file)
print(f"预测结果已保存到 {output_file}")

Processing dataset: 100%|██████████| 327/327 [00:00<00:00, 89972.28it/s]


=== 随机样本推理 ===
Passage: In a rematch of their only loss so far this season, the Patriots faced the Buffalo Bills in a road game. After taking the opening kick, the Bills raced 70 yards all the way to the Patriots 5, but the Patriots defense kept Buffalo out of the end zone, forcing them to kick a 23-yard field goal. On the ensuing kickoff, Walt Powell forced Matthew Slater to fumble, but Shea McClellin recovered for the Patriots at their own 30. The Patriots drained over half of the remainder of the first quarter, going 70 yards to score on a 9-yard touchdown pass from Brady to Amendola. After a Bills three-and-out, the Patriots were given good field at the Bills 45. An offensive pass interference penalty on Amendola moved the ball back to the Patriots 45, but a holding penalty on Robert Blanton of the Bills moved the ball to mid field. A two-yard run by Blount and incomplete pass brought up a 3rd-and-8 from the Bills 48, and Brady hit Edelman with a 47-yard bomb to the 1-yard line, bu

Predicting:   0%|          | 0/164 [00:00<?, ?it/s]

Batch 0 input shapes: input_ids=torch.Size([4, 175]), attention_mask=torch.Size([4, 175])


Predicting:   1%|          | 1/164 [00:02<05:41,  2.10s/it]

批次 0: 原始预测: ['No\n\nQuestion: Is', 'Yes\n\nQuestion: Is', 'Yes\n\nIs "the', '<think>\nOkay']
GPU memory after batch 0: 14.60 GB
Batch 1 input shapes: input_ids=torch.Size([4, 145]), attention_mask=torch.Size([4, 145])


Predicting:   1%|          | 2/164 [00:03<05:10,  1.92s/it]

GPU memory after batch 1: 14.60 GB
Batch 2 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   2%|▏         | 3/164 [00:08<07:54,  2.95s/it]

GPU memory after batch 2: 14.60 GB
Batch 3 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   2%|▏         | 4/164 [00:12<09:08,  3.43s/it]

GPU memory after batch 3: 14.60 GB
Batch 4 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   3%|▎         | 5/164 [00:16<09:47,  3.70s/it]

GPU memory after batch 4: 14.60 GB
Batch 5 input shapes: input_ids=torch.Size([4, 364]), attention_mask=torch.Size([4, 364])


Predicting:   4%|▎         | 6/164 [00:20<09:39,  3.67s/it]

GPU memory after batch 5: 14.60 GB
Batch 6 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   4%|▍         | 7/164 [00:24<10:01,  3.83s/it]

GPU memory after batch 6: 14.60 GB
Batch 7 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   5%|▍         | 8/164 [00:28<10:14,  3.94s/it]

GPU memory after batch 7: 14.60 GB
Batch 8 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   5%|▌         | 9/164 [00:32<10:21,  4.01s/it]

GPU memory after batch 8: 14.60 GB
Batch 9 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   6%|▌         | 10/164 [00:36<10:25,  4.06s/it]

GPU memory after batch 9: 14.60 GB
Batch 10 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   7%|▋         | 11/164 [00:40<10:26,  4.09s/it]

批次 10: 原始预测: ['Yes\n\nIs "the', 'in control as QB Peyton', 'Yes\n\nIs "the', 'to run the ball well']
GPU memory after batch 10: 14.60 GB
Batch 11 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   7%|▋         | 12/164 [00:45<10:25,  4.12s/it]

GPU memory after batch 11: 14.60 GB
Batch 12 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   8%|▊         | 13/164 [00:49<10:24,  4.13s/it]

GPU memory after batch 12: 14.60 GB
Batch 13 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   9%|▊         | 14/164 [00:53<10:21,  4.14s/it]

GPU memory after batch 13: 14.60 GB
Batch 14 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:   9%|▉         | 15/164 [00:57<10:18,  4.15s/it]

GPU memory after batch 14: 14.60 GB
Batch 15 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  10%|▉         | 16/164 [01:01<10:15,  4.16s/it]

GPU memory after batch 15: 14.60 GB
Batch 16 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  10%|█         | 17/164 [01:05<10:11,  4.16s/it]

GPU memory after batch 16: 14.60 GB
Batch 17 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  11%|█         | 18/164 [01:10<10:07,  4.16s/it]

GPU memory after batch 17: 14.60 GB
Batch 18 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  12%|█▏        | 19/164 [01:14<10:04,  4.17s/it]

GPU memory after batch 18: 14.60 GB
Batch 19 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  12%|█▏        | 20/164 [01:18<10:00,  4.17s/it]

GPU memory after batch 19: 14.60 GB
Batch 20 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  13%|█▎        | 21/164 [01:22<09:56,  4.17s/it]

批次 20: 原始预测: ['Yes\n\nIs "the', 'continued to dominate the line', 'Yes\n\nIs "the', '-yard run play. However']
GPU memory after batch 20: 14.60 GB
Batch 21 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  13%|█▎        | 22/164 [01:26<09:51,  4.17s/it]

GPU memory after batch 21: 14.60 GB
Batch 22 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  14%|█▍        | 23/164 [01:30<09:47,  4.17s/it]

GPU memory after batch 22: 14.60 GB
Batch 23 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  15%|█▍        | 24/164 [01:35<09:43,  4.17s/it]

GPU memory after batch 23: 14.60 GB
Batch 24 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  15%|█▌        | 25/164 [01:39<09:39,  4.17s/it]

GPU memory after batch 24: 14.60 GB
Batch 25 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  16%|█▌        | 26/164 [01:43<09:35,  4.17s/it]

GPU memory after batch 25: 14.60 GB
Batch 26 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  16%|█▋        | 27/164 [01:47<09:31,  4.17s/it]

GPU memory after batch 26: 14.60 GB
Batch 27 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  17%|█▋        | 28/164 [01:51<09:27,  4.17s/it]

GPU memory after batch 27: 14.60 GB
Batch 28 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  18%|█▊        | 29/164 [01:55<09:22,  4.17s/it]

GPU memory after batch 28: 14.60 GB
Batch 29 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  18%|█▊        | 30/164 [02:00<09:18,  4.17s/it]

GPU memory after batch 29: 14.60 GB
Batch 30 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  19%|█▉        | 31/164 [02:04<09:14,  4.17s/it]

批次 30: 原始预测: ['Yes\n\nIs "the', '. The Bills then drove', 'Yes\n\nIs "the', '-16. The']
GPU memory after batch 30: 14.60 GB
Batch 31 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  20%|█▉        | 32/164 [02:08<09:10,  4.17s/it]

GPU memory after batch 31: 14.60 GB
Batch 32 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  20%|██        | 33/164 [02:12<09:06,  4.17s/it]

GPU memory after batch 32: 14.60 GB
Batch 33 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  21%|██        | 34/164 [02:16<09:01,  4.17s/it]

GPU memory after batch 33: 14.60 GB
Batch 34 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  21%|██▏       | 35/164 [02:20<08:57,  4.17s/it]

GPU memory after batch 34: 14.60 GB
Batch 35 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  22%|██▏       | 36/164 [02:25<08:53,  4.17s/it]

GPU memory after batch 35: 14.60 GB
Batch 36 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  23%|██▎       | 37/164 [02:29<08:49,  4.17s/it]

GPU memory after batch 36: 14.60 GB
Batch 37 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  23%|██▎       | 38/164 [02:33<08:45,  4.17s/it]

GPU memory after batch 37: 14.60 GB
Batch 38 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  24%|██▍       | 39/164 [02:37<08:40,  4.17s/it]

GPU memory after batch 38: 14.60 GB
Batch 39 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  24%|██▍       | 40/164 [02:41<08:36,  4.17s/it]

GPU memory after batch 39: 14.60 GB
Batch 40 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  25%|██▌       | 41/164 [02:45<08:32,  4.17s/it]

批次 40: 原始预测: ['Yes\n\nIs "the', 'from their own 1', 'Yes\n\nIs "the', ', the Packers fell to']
GPU memory after batch 40: 14.60 GB
Batch 41 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  26%|██▌       | 42/164 [02:50<08:28,  4.17s/it]

GPU memory after batch 41: 14.60 GB
Batch 42 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  26%|██▌       | 43/164 [02:54<08:24,  4.17s/it]

GPU memory after batch 42: 14.60 GB
Batch 43 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  27%|██▋       | 44/164 [02:58<08:20,  4.17s/it]

GPU memory after batch 43: 14.60 GB
Batch 44 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  27%|██▋       | 45/164 [03:02<08:16,  4.17s/it]

GPU memory after batch 44: 14.60 GB
Batch 45 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  28%|██▊       | 46/164 [03:06<08:12,  4.17s/it]

GPU memory after batch 45: 14.60 GB
Batch 46 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  29%|██▊       | 47/164 [03:10<08:07,  4.17s/it]

GPU memory after batch 46: 14.60 GB
Batch 47 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  29%|██▉       | 48/164 [03:15<08:03,  4.17s/it]

GPU memory after batch 47: 14.60 GB
Batch 48 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  30%|██▉       | 49/164 [03:19<07:59,  4.17s/it]

GPU memory after batch 48: 14.60 GB
Batch 49 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  30%|███       | 50/164 [03:23<07:55,  4.17s/it]

GPU memory after batch 49: 14.60 GB
Batch 50 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  31%|███       | 51/164 [03:27<07:51,  4.17s/it]

批次 50: 原始预测: ['Yes\n\nIs "the', 'the half was a back', 'Yes\n\nIs "the', '2003']
GPU memory after batch 50: 14.60 GB
Batch 51 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  32%|███▏      | 52/164 [03:31<07:46,  4.17s/it]

GPU memory after batch 51: 14.60 GB
Batch 52 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  32%|███▏      | 53/164 [03:35<07:42,  4.17s/it]

GPU memory after batch 52: 14.60 GB
Batch 53 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  33%|███▎      | 54/164 [03:40<07:38,  4.17s/it]

GPU memory after batch 53: 14.60 GB
Batch 54 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  34%|███▎      | 55/164 [03:44<07:34,  4.17s/it]

GPU memory after batch 54: 14.60 GB
Batch 55 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  34%|███▍      | 56/164 [03:48<07:30,  4.17s/it]

GPU memory after batch 55: 14.60 GB
Batch 56 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  35%|███▍      | 57/164 [03:52<07:25,  4.17s/it]

GPU memory after batch 56: 14.60 GB
Batch 57 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  35%|███▌      | 58/164 [03:56<07:21,  4.17s/it]

GPU memory after batch 57: 14.60 GB
Batch 58 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  36%|███▌      | 59/164 [04:00<07:17,  4.17s/it]

GPU memory after batch 58: 14.60 GB
Batch 59 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  37%|███▋      | 60/164 [04:05<07:13,  4.17s/it]

GPU memory after batch 59: 14.60 GB
Batch 60 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  37%|███▋      | 61/164 [04:09<07:09,  4.17s/it]

批次 60: 原始预测: ['Yes\n\nIs "the', '10 yards to Jeff', 'Yes\n\nIs "the', 'of the Lions each had']
GPU memory after batch 60: 14.60 GB
Batch 61 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  38%|███▊      | 62/164 [04:13<07:05,  4.17s/it]

GPU memory after batch 61: 14.60 GB
Batch 62 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  38%|███▊      | 63/164 [04:17<07:00,  4.17s/it]

GPU memory after batch 62: 14.60 GB
Batch 63 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  39%|███▉      | 64/164 [04:21<06:56,  4.17s/it]

GPU memory after batch 63: 14.60 GB
Batch 64 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  40%|███▉      | 65/164 [04:25<06:52,  4.17s/it]

GPU memory after batch 64: 14.60 GB
Batch 65 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  40%|████      | 66/164 [04:30<06:48,  4.17s/it]

GPU memory after batch 65: 14.60 GB
Batch 66 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  41%|████      | 67/164 [04:34<06:44,  4.17s/it]

GPU memory after batch 66: 14.60 GB
Batch 67 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  41%|████▏     | 68/164 [04:38<06:40,  4.17s/it]

GPU memory after batch 67: 14.60 GB
Batch 68 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  42%|████▏     | 69/164 [04:42<06:35,  4.17s/it]

GPU memory after batch 68: 14.60 GB
Batch 69 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  43%|████▎     | 70/164 [04:46<06:31,  4.17s/it]

GPU memory after batch 69: 14.60 GB
Batch 70 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  43%|████▎     | 71/164 [04:50<06:27,  4.17s/it]

批次 70: 原始预测: ['Yes\n\nIs "the', '4th and', 'Yes\n\nIs "the', '24 first downs']
GPU memory after batch 70: 14.60 GB
Batch 71 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  44%|████▍     | 72/164 [04:55<06:23,  4.17s/it]

GPU memory after batch 71: 14.60 GB
Batch 72 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  45%|████▍     | 73/164 [04:59<06:19,  4.17s/it]

GPU memory after batch 72: 14.60 GB
Batch 73 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  45%|████▌     | 74/164 [05:03<06:15,  4.17s/it]

GPU memory after batch 73: 14.60 GB
Batch 74 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  46%|████▌     | 75/164 [05:07<06:11,  4.17s/it]

GPU memory after batch 74: 14.60 GB
Batch 75 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  46%|████▋     | 76/164 [05:11<06:06,  4.17s/it]

GPU memory after batch 75: 14.60 GB
Batch 76 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  47%|████▋     | 77/164 [05:15<06:02,  4.17s/it]

GPU memory after batch 76: 14.60 GB
Batch 77 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  48%|████▊     | 78/164 [05:20<05:58,  4.17s/it]

GPU memory after batch 77: 14.60 GB
Batch 78 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  48%|████▊     | 79/164 [05:24<05:54,  4.17s/it]

GPU memory after batch 78: 14.60 GB
Batch 79 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  49%|████▉     | 80/164 [05:28<05:49,  4.17s/it]

GPU memory after batch 79: 14.60 GB
Batch 80 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  49%|████▉     | 81/164 [05:32<05:45,  4.17s/it]

批次 80: 原始预测: ['Yes\n\nIs "the', '1:11 left', 'Yes\n\nIs "the', 'game with a 1']
GPU memory after batch 80: 14.60 GB
Batch 81 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  50%|█████     | 82/164 [05:36<05:41,  4.17s/it]

GPU memory after batch 81: 14.60 GB
Batch 82 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  51%|█████     | 83/164 [05:40<05:37,  4.16s/it]

GPU memory after batch 82: 14.60 GB
Batch 83 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  51%|█████     | 84/164 [05:45<05:33,  4.16s/it]

GPU memory after batch 83: 14.60 GB
Batch 84 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  52%|█████▏    | 85/164 [05:49<05:28,  4.16s/it]

GPU memory after batch 84: 14.60 GB
Batch 85 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  52%|█████▏    | 86/164 [05:53<05:24,  4.16s/it]

GPU memory after batch 85: 14.60 GB
Batch 86 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  53%|█████▎    | 87/164 [05:57<05:20,  4.17s/it]

GPU memory after batch 86: 14.60 GB
Batch 87 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  54%|█████▎    | 88/164 [06:01<05:16,  4.16s/it]

GPU memory after batch 87: 14.60 GB
Batch 88 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  54%|█████▍    | 89/164 [06:05<05:12,  4.17s/it]

GPU memory after batch 88: 14.60 GB
Batch 89 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  55%|█████▍    | 90/164 [06:10<05:08,  4.17s/it]

GPU memory after batch 89: 14.60 GB
Batch 90 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  55%|█████▌    | 91/164 [06:14<05:04,  4.17s/it]

批次 90: 原始预测: ['Yes\n\nIs "the', 'Carter were all on the', 'Yes\n\nIs "the', 'Raiders took the knee and']
GPU memory after batch 90: 14.60 GB
Batch 91 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  56%|█████▌    | 92/164 [06:18<05:00,  4.17s/it]

GPU memory after batch 91: 14.60 GB
Batch 92 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  57%|█████▋    | 93/164 [06:22<04:55,  4.17s/it]

GPU memory after batch 92: 14.60 GB
Batch 93 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  57%|█████▋    | 94/164 [06:26<04:51,  4.17s/it]

GPU memory after batch 93: 14.60 GB
Batch 94 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  58%|█████▊    | 95/164 [06:30<04:47,  4.17s/it]

GPU memory after batch 94: 14.60 GB
Batch 95 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  59%|█████▊    | 96/164 [06:35<04:43,  4.17s/it]

GPU memory after batch 95: 14.60 GB
Batch 96 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  59%|█████▉    | 97/164 [06:39<04:39,  4.17s/it]

GPU memory after batch 96: 14.60 GB
Batch 97 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  60%|█████▉    | 98/164 [06:43<04:34,  4.17s/it]

GPU memory after batch 97: 14.60 GB
Batch 98 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  60%|██████    | 99/164 [06:47<04:30,  4.17s/it]

GPU memory after batch 98: 14.60 GB
Batch 99 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  61%|██████    | 100/164 [06:51<04:26,  4.17s/it]

GPU memory after batch 99: 14.60 GB
Batch 100 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  62%|██████▏   | 101/164 [06:55<04:22,  4.17s/it]

批次 100: 原始预测: ['Yes\n\nIs "the', 'in a 14', 'Yes\n\nIs "the', 'a 14-yard']
GPU memory after batch 100: 14.60 GB
Batch 101 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  62%|██████▏   | 102/164 [07:00<04:18,  4.17s/it]

GPU memory after batch 101: 14.60 GB
Batch 102 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  63%|██████▎   | 103/164 [07:04<04:14,  4.17s/it]

GPU memory after batch 102: 14.60 GB
Batch 103 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  63%|██████▎   | 104/164 [07:08<04:09,  4.16s/it]

GPU memory after batch 103: 14.60 GB
Batch 104 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  64%|██████▍   | 105/164 [07:12<04:05,  4.17s/it]

GPU memory after batch 104: 14.60 GB
Batch 105 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  65%|██████▍   | 106/164 [07:16<04:01,  4.17s/it]

GPU memory after batch 105: 14.60 GB
Batch 106 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  65%|██████▌   | 107/164 [07:20<03:57,  4.17s/it]

GPU memory after batch 106: 14.60 GB
Batch 107 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  66%|██████▌   | 108/164 [07:25<03:53,  4.17s/it]

GPU memory after batch 107: 14.60 GB
Batch 108 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  66%|██████▋   | 109/164 [07:29<03:49,  4.17s/it]

GPU memory after batch 108: 14.60 GB
Batch 109 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  67%|██████▋   | 110/164 [07:33<03:45,  4.17s/it]

GPU memory after batch 109: 14.60 GB
Batch 110 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  68%|██████▊   | 111/164 [07:37<03:40,  4.17s/it]

批次 110: 原始预测: ['Yes\n\nIs "the', 'kiemedei and forced', 'Yes\n\nIs "the', 'Floyd, resulting in a']
GPU memory after batch 110: 14.60 GB
Batch 111 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  68%|██████▊   | 112/164 [07:41<03:36,  4.17s/it]

GPU memory after batch 111: 14.60 GB
Batch 112 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  69%|██████▉   | 113/164 [07:45<03:32,  4.17s/it]

GPU memory after batch 112: 14.60 GB
Batch 113 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  70%|██████▉   | 114/164 [07:50<03:28,  4.17s/it]

GPU memory after batch 113: 14.60 GB
Batch 114 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  70%|███████   | 115/164 [07:54<03:24,  4.17s/it]

GPU memory after batch 114: 14.60 GB
Batch 115 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  71%|███████   | 116/164 [07:58<03:20,  4.17s/it]

GPU memory after batch 115: 14.60 GB
Batch 116 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  71%|███████▏  | 117/164 [08:02<03:15,  4.17s/it]

GPU memory after batch 116: 14.60 GB
Batch 117 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  72%|███████▏  | 118/164 [08:06<03:11,  4.17s/it]

GPU memory after batch 117: 14.60 GB
Batch 118 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  73%|███████▎  | 119/164 [08:11<03:07,  4.17s/it]

GPU memory after batch 118: 14.60 GB
Batch 119 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  73%|███████▎  | 120/164 [08:15<03:03,  4.17s/it]

GPU memory after batch 119: 14.60 GB
Batch 120 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  74%|███████▍  | 121/164 [08:19<02:59,  4.17s/it]

批次 120: 原始预测: ['No\n\nQuestion: Is', 'icius for a touchdown', 'Yes\n\nIs "the', 'he was already out of']
GPU memory after batch 120: 14.60 GB
Batch 121 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  74%|███████▍  | 122/164 [08:23<02:55,  4.17s/it]

GPU memory after batch 121: 14.60 GB
Batch 122 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  75%|███████▌  | 123/164 [08:27<02:50,  4.17s/it]

GPU memory after batch 122: 14.60 GB
Batch 123 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  76%|███████▌  | 124/164 [08:31<02:46,  4.17s/it]

GPU memory after batch 123: 14.60 GB
Batch 124 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  76%|███████▌  | 125/164 [08:36<02:42,  4.17s/it]

GPU memory after batch 124: 14.60 GB
Batch 125 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  77%|███████▋  | 126/164 [08:40<02:38,  4.17s/it]

GPU memory after batch 125: 14.60 GB
Batch 126 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  77%|███████▋  | 127/164 [08:44<02:34,  4.17s/it]

GPU memory after batch 126: 14.60 GB
Batch 127 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  78%|███████▊  | 128/164 [08:48<02:30,  4.17s/it]

GPU memory after batch 127: 14.60 GB
Batch 128 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  79%|███████▊  | 129/164 [08:52<02:25,  4.17s/it]

GPU memory after batch 128: 14.60 GB
Batch 129 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  79%|███████▉  | 130/164 [08:56<02:21,  4.17s/it]

GPU memory after batch 129: 14.60 GB
Batch 130 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  80%|███████▉  | 131/164 [09:01<02:17,  4.17s/it]

批次 130: 原始预测: ['Yes\n\nIs "the', '-yard line. On the', 'Yes\n\nIs "the', 'was ruled out of bounds']
GPU memory after batch 130: 14.60 GB
Batch 131 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  80%|████████  | 132/164 [09:05<02:13,  4.17s/it]

GPU memory after batch 131: 14.60 GB
Batch 132 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  81%|████████  | 133/164 [09:09<02:09,  4.17s/it]

GPU memory after batch 132: 14.60 GB
Batch 133 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  82%|████████▏ | 134/164 [09:13<02:05,  4.17s/it]

GPU memory after batch 133: 14.60 GB
Batch 134 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  82%|████████▏ | 135/164 [09:17<02:00,  4.17s/it]

GPU memory after batch 134: 14.60 GB
Batch 135 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  83%|████████▎ | 136/164 [09:21<01:56,  4.17s/it]

GPU memory after batch 135: 14.60 GB
Batch 136 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  84%|████████▎ | 137/164 [09:26<01:52,  4.17s/it]

GPU memory after batch 136: 14.60 GB
Batch 137 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  84%|████████▍ | 138/164 [09:30<01:48,  4.17s/it]

GPU memory after batch 137: 14.60 GB
Batch 138 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  85%|████████▍ | 139/164 [09:34<01:44,  4.17s/it]

GPU memory after batch 138: 14.60 GB
Batch 139 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  85%|████████▌ | 140/164 [09:38<01:40,  4.17s/it]

GPU memory after batch 139: 14.60 GB
Batch 140 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  86%|████████▌ | 141/164 [09:42<01:35,  4.17s/it]

批次 140: 原始预测: ['Yes\n\nIs "the', 'Brandon Lloyd) (CH', 'Yes\n\nIs "the', '12:00']
GPU memory after batch 140: 14.60 GB
Batch 141 input shapes: input_ids=torch.Size([4, 239]), attention_mask=torch.Size([4, 239])


Predicting:  87%|████████▋ | 142/164 [09:45<01:21,  3.68s/it]

GPU memory after batch 141: 14.60 GB
Batch 142 input shapes: input_ids=torch.Size([4, 238]), attention_mask=torch.Size([4, 238])


Predicting:  87%|████████▋ | 143/164 [09:47<01:10,  3.34s/it]

GPU memory after batch 142: 14.60 GB
Batch 143 input shapes: input_ids=torch.Size([4, 239]), attention_mask=torch.Size([4, 239])


Predicting:  88%|████████▊ | 144/164 [09:50<01:02,  3.10s/it]

GPU memory after batch 143: 14.60 GB
Batch 144 input shapes: input_ids=torch.Size([4, 512]), attention_mask=torch.Size([4, 512])


Predicting:  88%|████████▊ | 145/164 [09:54<01:05,  3.42s/it]

GPU memory after batch 144: 14.60 GB
Batch 145 input shapes: input_ids=torch.Size([4, 411]), attention_mask=torch.Size([4, 411])


Predicting:  89%|████████▉ | 146/164 [09:57<01:01,  3.41s/it]

GPU memory after batch 145: 14.60 GB
Batch 146 input shapes: input_ids=torch.Size([4, 441]), attention_mask=torch.Size([4, 441])


Predicting:  90%|████████▉ | 147/164 [10:01<00:58,  3.45s/it]

GPU memory after batch 146: 14.60 GB
Batch 147 input shapes: input_ids=torch.Size([4, 403]), attention_mask=torch.Size([4, 403])


Predicting:  90%|█████████ | 148/164 [10:04<00:54,  3.42s/it]

GPU memory after batch 147: 14.60 GB
Batch 148 input shapes: input_ids=torch.Size([4, 377]), attention_mask=torch.Size([4, 377])


Predicting:  91%|█████████ | 149/164 [10:08<00:52,  3.49s/it]

GPU memory after batch 148: 14.60 GB
Batch 149 input shapes: input_ids=torch.Size([4, 219]), attention_mask=torch.Size([4, 219])


Predicting:  91%|█████████▏| 150/164 [10:10<00:44,  3.15s/it]

GPU memory after batch 149: 14.60 GB
Batch 150 input shapes: input_ids=torch.Size([4, 77]), attention_mask=torch.Size([4, 77])


Predicting:  92%|█████████▏| 151/164 [10:12<00:33,  2.56s/it]

批次 150: 原始预测: ['Yes\n\nIs "the', '<think>\nOkay', 'Yes\n\nIs "the', '<think>\nOkay']
GPU memory after batch 150: 14.60 GB
Batch 151 input shapes: input_ids=torch.Size([4, 79]), attention_mask=torch.Size([4, 79])


Predicting:  93%|█████████▎| 152/164 [10:13<00:25,  2.16s/it]

GPU memory after batch 151: 14.60 GB
Batch 152 input shapes: input_ids=torch.Size([4, 80]), attention_mask=torch.Size([4, 80])


Predicting:  93%|█████████▎| 153/164 [10:14<00:20,  1.87s/it]

GPU memory after batch 152: 14.60 GB
Batch 153 input shapes: input_ids=torch.Size([4, 86]), attention_mask=torch.Size([4, 86])


Predicting:  94%|█████████▍| 154/164 [10:15<00:16,  1.69s/it]

GPU memory after batch 153: 14.60 GB
Batch 154 input shapes: input_ids=torch.Size([4, 82]), attention_mask=torch.Size([4, 82])


Predicting:  95%|█████████▍| 155/164 [10:16<00:14,  1.57s/it]

GPU memory after batch 154: 14.60 GB
Batch 155 input shapes: input_ids=torch.Size([4, 72]), attention_mask=torch.Size([4, 72])


Predicting:  95%|█████████▌| 156/164 [10:18<00:11,  1.44s/it]

GPU memory after batch 155: 14.60 GB
Batch 156 input shapes: input_ids=torch.Size([4, 81]), attention_mask=torch.Size([4, 81])


Predicting:  96%|█████████▌| 157/164 [10:19<00:09,  1.39s/it]

GPU memory after batch 156: 14.60 GB
Batch 157 input shapes: input_ids=torch.Size([4, 78]), attention_mask=torch.Size([4, 78])


Predicting:  96%|█████████▋| 158/164 [10:20<00:08,  1.33s/it]

GPU memory after batch 157: 14.60 GB
Batch 158 input shapes: input_ids=torch.Size([4, 94]), attention_mask=torch.Size([4, 94])


Predicting:  97%|█████████▋| 159/164 [10:21<00:06,  1.33s/it]

GPU memory after batch 158: 14.60 GB
Batch 159 input shapes: input_ids=torch.Size([4, 90]), attention_mask=torch.Size([4, 90])


Predicting:  98%|█████████▊| 160/164 [10:23<00:05,  1.33s/it]

GPU memory after batch 159: 14.60 GB
Batch 160 input shapes: input_ids=torch.Size([4, 74]), attention_mask=torch.Size([4, 74])


Predicting:  98%|█████████▊| 161/164 [10:24<00:03,  1.29s/it]

批次 160: 原始预测: ['Yes\n\nIs "the', '<think>\nOkay', 'Yes\n\nIs "the', '<think>\nOkay']
GPU memory after batch 160: 14.60 GB
Batch 161 input shapes: input_ids=torch.Size([4, 78]), attention_mask=torch.Size([4, 78])


Predicting:  99%|█████████▉| 162/164 [10:25<00:02,  1.27s/it]

GPU memory after batch 161: 14.60 GB
Batch 162 input shapes: input_ids=torch.Size([4, 79]), attention_mask=torch.Size([4, 79])


Predicting:  99%|█████████▉| 163/164 [10:26<00:01,  1.25s/it]

GPU memory after batch 162: 14.60 GB
Batch 163 input shapes: input_ids=torch.Size([2, 63]), attention_mask=torch.Size([2, 63])


Predicting: 100%|██████████| 164/164 [10:27<00:00,  3.83s/it]

GPU memory after batch 163: 14.60 GB
预测结果已保存到 /kaggle/working/predictions.jsonl





In [1]:
import json
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

In [2]:
def parse_prediction(raw_prediction: str) -> str:
    raw_prediction = raw_prediction.lower()
    if 'yes' in raw_prediction:
        return 'yes'
    elif 'no' in raw_prediction:
        return 'no'
    else:
        # print(f"无法解析响应: {raw_prediction}")
        return 'generation_error'

def evaluate_model(data_list: List[Dict], unparsed_output_file: str = 'unparsed_predictions.json') -> Tuple[Dict, Dict]:
    metrics = Counter()
    detailed_metrics = {
        'by_domain': defaultdict(Counter),
        'by_error_type': defaultdict(Counter),
        'by_operation': defaultdict(Counter),
        'by_prompt_type': defaultdict(Counter)
    }
    unparsed_data = {}  # 存储无法解析的样本，格式为 {id: {...}}
    
    for idx, item in enumerate(data_list):
        expected = item['expected_answer'].lower()  # Yes/No 转为小写
        pred = parse_prediction(item['raw_prediction'])
        item['parsel_prediction'] = pred  # 保存解析结果
        
        # 如果无法解析，添加到 unparsed_data
        if pred == 'generation_error':
            # 只保存 expected_answer == "Yes" 的样本（错误样本）
            if expected == 'yes':
                sample_id = f"unparsed_{idx}"
                unparsed_data[sample_id] = {
                    "error_number": item['number'],
                    "error_passage": item['passage'],
                    "dataset": item['dataset'],
                    "operation": item['operation'],
                    "error_annotation": item['error_annotation'],
                    # 以下字段需补充（若有正确数据）
                    "correct_number": "",  # 需手动补充或从原始数据推导
                    "correct_passage": ""  # 需手动补充或从原始数据推导
                }
        
        domain = item['dataset']
        operation = item['operation']
        prompt_type = item['prompt_type']
        error_types = [k for k, v in item['error_annotation'].items() if v > 0]
        
        # 计算总体指标
        if pred == expected:
            if expected == 'yes':
                metrics['TP'] += 1
                for et in error_types:
                    detailed_metrics['by_error_type'][et]['TP'] += 1
                detailed_metrics['by_domain'][domain]['TP'] += 1
                detailed_metrics['by_operation'][operation]['TP'] += 1
                detailed_metrics['by_prompt_type'][prompt_type]['TP'] += 1
            else:  # expected == 'no'
                metrics['TN'] += 1
                for et in error_types:
                    detailed_metrics['by_error_type'][et]['TN'] += 1
                detailed_metrics['by_domain'][domain]['TN'] += 1
                detailed_metrics['by_operation'][operation]['TN'] += 1
                detailed_metrics['by_prompt_type'][prompt_type]['TN'] += 1
        else:
            if expected == 'yes':
                metrics['FN'] += 1
                for et in error_types:
                    detailed_metrics['by_error_type'][et]['FN'] += 1
                detailed_metrics['by_domain'][domain]['FN'] += 1
                detailed_metrics['by_operation'][operation]['FN'] += 1
                detailed_metrics['by_prompt_type'][prompt_type]['FN'] += 1
            else:  # expected == 'no'
                metrics['FP'] += 1
                for et in error_types:
                    detailed_metrics['by_error_type'][et]['FP'] += 1
                detailed_metrics['by_domain'][domain]['FP'] += 1
                detailed_metrics['by_operation'][operation]['FP'] += 1
                detailed_metrics['by_prompt_type'][prompt_type]['FP'] += 1
        
        if pred == 'generation_error':
            metrics['Generation Error'] += 1
            for et in error_types:
                detailed_metrics['by_error_type'][et]['Generation Error'] += 1
            detailed_metrics['by_domain'][domain]['Generation Error'] += 1
            detailed_metrics['by_operation'][operation]['Generation Error'] += 1
            detailed_metrics['by_prompt_type'][prompt_type]['Generation Error'] += 1
    
    # 保存无法解析的数据到 JSON
    if unparsed_data:
        with open(unparsed_output_file, 'w', encoding='utf-8') as f:
            json.dump(unparsed_data, f, indent=2, ensure_ascii=False)
        # print(f"无法解析的 {len(unparsed_data)} 条数据已保存到 {unparsed_output_file}")
        print("注意：JSON 文件仅包含 expected_answer='Yes' 的样本，correct_number 和 correct_passage 需手动补充")
    else:
        print("没有无法解析的数据")
    
    total = len(data_list)
    metrics['Accuracy'] = (metrics['TP'] + metrics['TN']) / total if total > 0 else 0
    return metrics, detailed_metrics

In [3]:
# 读取 predictions.jsonl
data_list = []
input_file = 'xiaomi_predictions.jsonl'  # 确认路径
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        data_list.append(data)

# 评测模型并保存无法解析的数据
unparsed_output_file = 'xiaomi_unparsed_predictions.json'
metrics, detailed_metrics = evaluate_model(data_list, unparsed_output_file)

# 打印总体指标
print("\nOverall Metrics:")
total = len(data_list)
for key, value in metrics.items():
    if key == 'Accuracy':
        print(f"{key}: {value:.3f}")
    else:
        print(f"{key}: {value} ({value / total:.3f})")

# 打印分维度指标
print("\nMetrics by Domain:")
for domain, counts in detailed_metrics['by_domain'].items():
    print(f"{domain}: {dict(counts)}")

print("\nMetrics by Error Type:")
for error_type, counts in detailed_metrics['by_error_type'].items():
    print(f"{error_type}: {dict(counts)}")

print("\nMetrics by Operation:")
for operation, counts in detailed_metrics['by_operation'].items():
    print(f"{operation}: {dict(counts)}")

print("\nMetrics by Prompt Type:")
for prompt_type, counts in detailed_metrics['by_prompt_type'].items():
    print(f"{prompt_type}: {dict(counts)}")

注意：JSON 文件仅包含 expected_answer='Yes' 的样本，correct_number 和 correct_passage 需手动补充

Overall Metrics:
FP: 4619 (0.481)
TP: 4347 (0.453)
TN: 181 (0.019)
FN: 453 (0.047)
Generation Error: 634 (0.066)
Accuracy: 0.472

Metrics by Domain:
Numeracy_600K_article_title: {'FP': 994, 'TP': 997, 'TN': 6, 'FN': 3, 'Generation Error': 1}
aclsent: {'FP': 904, 'TP': 921, 'TN': 44, 'FN': 27, 'Generation Error': 8}
DROP: {'FP': 921, 'TP': 669, 'TN': 73, 'FN': 325, 'Generation Error': 550}
qa-text-source-comparison: {'FP': 872, 'TP': 867, 'FN': 57, 'TN': 52, 'Generation Error': 35}
FinNum: {'FP': 928, 'TP': 893, 'FN': 41, 'Generation Error': 40, 'TN': 6}

Metrics by Error Type:
Error in Number Relationships: {'FP': 190, 'TP': 187, 'TN': 6, 'FN': 9, 'Generation Error': 7}
Undetectable Error: {'FP': 457, 'TP': 446, 'TN': 7, 'FN': 18, 'Generation Error': 24}
Type Error: {'FP': 502, 'TP': 495, 'TN': 16, 'FN': 23, 'Generation Error': 23}
Anomaly: {'FP': 216, 'TP': 214, 'TN': 14, 'FN': 16, 'Generation Error': 15}
