### 1. Prepare the prompt:

In [12]:
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")
frame_ds[0]  # 查看一个样本

Total frames: 5884


{'prompt': 'Observation Grid:\n[0,0]: depth=0.00, sem=void; [0,1]: depth=0.00, sem=void; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void\n[1,0]: depth=0.00, sem=void; [1,1]: depth=0.00, sem=void; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void\n[2,0]: depth=2.37, sem=void; [2,1]: depth=0.09, sem=void; [2,2]: depth=0.00, sem=void; [2,3]: depth=0.00, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void\n[3,0]: depth=4.81, sem=wall; [3,1]: depth=0.28, sem=void; [3,2]: depth=0.00, sem=void; [3,3]: depth=0.00, sem=void; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void\n[4,0]: depth=2.38, sem=void; [4,1]: depth=0.05, sem=void; [4,2]: depth=0.00, sem=void; [4,3]: depth=0.00, sem=void; [4,4]: depth=0.00, sem=void; [4,5]: depth=0.00, sem=void\n[5,0]: depth=0.00, sem=void; [5,1]: depth=0.00, sem=void; [5,2]: depth=0.00, sem=void; [5,3]: depth

In [13]:
print("Prompt:")
print("------------------------------>")
print(frame_ds[0]['prompt'])  # 查看对应的文本提示
print("+" * 30)
print("Ground Truth Action:")  # 查看对应的动作标签
print("------------------------------>")
print(frame_ds[0]['action'])

Prompt:
------------------------------>
Observation Grid:
[0,0]: depth=0.00, sem=void; [0,1]: depth=0.00, sem=void; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=0.00, sem=void; [1,1]: depth=0.00, sem=void; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=2.37, sem=void; [2,1]: depth=0.09, sem=void; [2,2]: depth=0.00, sem=void; [2,3]: depth=0.00, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=4.81, sem=wall; [3,1]: depth=0.28, sem=void; [3,2]: depth=0.00, sem=void; [3,3]: depth=0.00, sem=void; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=2.38, sem=void; [4,1]: depth=0.05, sem=void; [4,2]: depth=0.00, sem=void; [4,3]: depth=0.00, sem=void; [4,4]: depth=0.00, sem=void; [4,5]: depth=0.00, sem=void
[5,0]: depth=0.00, sem=void; [5,1]: depth=0.00, sem=void; [5,2]: depth=0.00, 

### 2. Inference example:

In [14]:
prompt = frame_ds[0]['prompt']


In [2]:
# inference.py
import os
import torch
from transformers import AutoTokenizer
from l2am.model_zoo import WeightedSequenceClassifier  # 确保能导入
import numpy as np
from safetensors.torch import load_file

# ======================
# 配置
# ======================
MODEL_CHECKPOINT = "data/l2a_longformer_action_classifier/checkpoint-81000"
HF_CACHE_DIR = "data/hf_model_cache"
MAX_LENGTH = 1024

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 41  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']



def main(hf_cache_dir=HF_CACHE_DIR, model_checkpoint=MODEL_CHECKPOINT, max_length=MAX_LENGTH):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Step 1: 加载 tokenizer（从 checkpoint 目录或原始模型）
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        cache_dir=hf_cache_dir,
        clean_up_tokenization_spaces=True
    )

    # Step 2: 推断 num_labels（根据你的任务，已知是 4 类？）
    num_labels = 4  # class_0 ~ class_3 

    # Step 3: 重建模型结构（必须与训练时完全一致）
    # 注意：class_weights 在推理时不影响前向传播（只用于 loss），所以可以传 dummy 值
    dummy_class_weights = torch.ones(num_labels)  # 推理时 loss 不计算，权重无影响

    model = WeightedSequenceClassifier(
        model_name="allenai/longformer-base-4096",  # 或从 checkpoint 加载 config
        num_labels=num_labels,
        class_weights=dummy_class_weights,
        cache_dir=hf_cache_dir,
    )

    # Step 4: 加载训练好的权重
    
    model_file = os.path.join(model_checkpoint, "model.safetensors")
    state_dict = load_file(model_file, device=str(device))
    model.load_state_dict(state_dict, strict=True)
    model.to(device)
    model.eval()

    # Step 5: Tokenize & 推理
    inputs = tokenizer(
        EXAMPLE_PROMPT,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # logits = outputs.logits
        logits = outputs["logits"]
        pred_class = torch.argmax(logits, dim=-1).item()

    print("\n" + "="*50)
    print("Prompt:")
    print(EXAMPLE_PROMPT)
    print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
    print("Predicted Action:", pred_class)
    print("="*50)

    if pred_class == GROUND_TRUTH_ACTION:
        print("✅ Prediction matches ground truth!")
    else:
        print("❌ Prediction differs from ground truth.")


if __name__ == "__main__":
    main()

Total frames: 5884
Using device: cuda


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Initializing global attention on CLS token...



Prompt:
Observation Grid:
[0,0]: depth=0.67, sem=wall; [0,1]: depth=2.05, sem=wall; [0,2]: depth=5.54, sem=void; [0,3]: depth=5.24, sem=ceiling; [0,4]: depth=4.81, sem=void; [0,5]: depth=3.20, sem=wall
[1,0]: depth=0.67, sem=wall; [1,1]: depth=2.01, sem=wall; [1,2]: depth=4.70, sem=wall; [1,3]: depth=2.86, sem=wall; [1,4]: depth=2.85, sem=wall; [1,5]: depth=0.62, sem=wall
[2,0]: depth=0.67, sem=wall; [2,1]: depth=1.34, sem=stairs; [2,2]: depth=1.71, sem=stairs; [2,3]: depth=1.30, sem=wall; [2,4]: depth=0.75, sem=wall; [2,5]: depth=0.52, sem=wall
[3,0]: depth=0.65, sem=wall; [3,1]: depth=0.86, sem=stairs; [3,2]: depth=0.91, sem=stairs; [3,3]: depth=0.94, sem=stairs; [3,4]: depth=0.74, sem=wall; [3,5]: depth=0.52, sem=wall
[4,0]: depth=0.54, sem=stairs; [4,1]: depth=0.58, sem=stairs; [4,2]: depth=0.61, sem=stairs; [4,3]: depth=0.65, sem=stairs; [4,4]: depth=0.66, sem=stairs; [4,5]: depth=0.52, sem=wall
[5,0]: depth=0.44, sem=stairs; [5,1]: depth=0.44, sem=stairs; [5,2]: depth=0.44, sem=

#### 2.1 inference with bigbird (single step)

In [3]:
# inference.py
import os
import torch
from transformers import AutoTokenizer
from transformers import BigBirdTokenizer
from l2am.model_zoo import WeightedSequenceClassifier  # 确保能导入
import numpy as np
from safetensors.torch import load_file

# ======================
# 配置
# ======================
MODEL_CHECKPOINT = "data/l2a_bigbird_action_classifier1/checkpoint-1500"
MODEL_NAME = "google/bigbird-roberta-base"
HF_CACHE_DIR = "data/hf_model_cache"
MAX_LENGTH = 1024

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']



def main(hf_cache_dir=HF_CACHE_DIR, model_checkpoint=MODEL_CHECKPOINT, max_length=MAX_LENGTH):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    tokenizer = BigBirdTokenizer.from_pretrained(
        model_checkpoint,
        cache_dir=HF_CACHE_DIR,
        clean_up_tokenization_spaces=True,
    )

    # Step 2: 推断 num_labels（根据你的任务，已知是 4 类？）
    num_labels = 4  # class_0 ~ class_3 

    # Step 3: 重建模型结构（必须与训练时完全一致）
    # 注意：class_weights 在推理时不影响前向传播（只用于 loss），所以可以传 dummy 值
    dummy_class_weights = torch.ones(num_labels)  # 推理时 loss 不计算，权重无影响

    model = WeightedSequenceClassifier(
        model_name=MODEL_NAME,  # 或从 checkpoint 加载 config
        num_labels=num_labels,
        class_weights=dummy_class_weights,
        cache_dir=hf_cache_dir,
    )

    # Step 4: 加载训练好的权重
    
    model_safetensors = os.path.join(model_checkpoint, "model.safetensors")
    model_bin = os.path.join(model_checkpoint, "pytorch_model.bin")

    if os.path.exists(model_safetensors):
        print(f"Loading weights from safetensors: {model_safetensors}")
        state_dict = load_file(model_safetensors, device=str(device))
    elif os.path.exists(model_bin):
        print(f"Loading weights from pytorch_model.bin: {model_bin}")
        state_dict = torch.load(model_bin, map_location=device)
    else:
        raise FileNotFoundError(
            f"Neither 'model.safetensors' nor 'pytorch_model.bin' found in {model_checkpoint}"
        )

    model.load_state_dict(state_dict, strict=True)

    # model_file = os.path.join(model_checkpoint, "model.safetensors")
    # state_dict = load_file(model_file, device=str(device))
    model.load_state_dict(state_dict, strict=True)
    model.to(device)
    model.eval()

    # Step 5: Tokenize & 推理
    inputs = tokenizer(
        EXAMPLE_PROMPT,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # logits = outputs.logits
        logits = outputs["logits"]
        pred_class = torch.argmax(logits, dim=-1).item()

    print("\n" + "="*50)
    print("Prompt:")
    print(EXAMPLE_PROMPT)
    print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
    print("Predicted Action:", pred_class)
    print("="*50)

    if pred_class == GROUND_TRUTH_ACTION:
        print("✅ Prediction matches ground truth!")
    else:
        print("❌ Prediction differs from ground truth.")


if __name__ == "__main__":
    main()

Total frames: 5884
Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-1500/pytorch_model.bin

Prompt:
Observation Grid:
[0,0]: depth=1.71, sem=wall; [0,1]: depth=0.75, sem=window; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=1.63, sem=wall; [1,1]: depth=0.74, sem=window; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=1.62, sem=wall; [2,1]: depth=1.07, sem=window; [2,2]: depth=3.68, sem=void; [2,3]: depth=1.50, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=1.60, sem=wall; [3,1]: depth=1.25, sem=window; [3,2]: depth=6.75, sem=window; [3,3]: depth=3.86, sem=wall; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=1.34, sem=floor; [4,1]: depth=1.08, sem=window; [4,2]: depth=1.23, sem=void; [4,3]: depth=0.70, sem=void; [4,4]: depth=0.00, sem=void; [4,5]: depth=

#### 2.2 inference with bigbird with api (single step)

In [2]:
from l2am.inference_api import L2AMActionClassifier

# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-4500",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']

# 单次预测
prompt = EXAMPLE_PROMPT
action = classifier.predict(prompt)
print("\n" + "="*50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:", action)
print("="*50)

if action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")



# 批量预测
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-4500/pytorch_model.bin
Total frames: 5884

Prompt:
Observation Grid:
[0,0]: depth=1.71, sem=wall; [0,1]: depth=0.75, sem=window; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=1.63, sem=wall; [1,1]: depth=0.74, sem=window; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=1.62, sem=wall; [2,1]: depth=1.07, sem=window; [2,2]: depth=3.68, sem=void; [2,3]: depth=1.50, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=1.60, sem=wall; [3,1]: depth=1.25, sem=window; [3,2]: depth=6.75, sem=window; [3,3]: depth=3.86, sem=wall; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=1.34, sem=floor; [4,1]: depth=1.08, sem=window; [4,2]: depth=1.23, sem=void; [4,3]: depth=0.70, sem=void; 

In [2]:
from l2am.inference_api import L2AMActionClassifier

# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-40500",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']

# 单次预测
prompt = EXAMPLE_PROMPT
action = classifier.predict(prompt)
print("\n" + "="*50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:", action)
print("="*50)

if action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")



# 批量预测
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-40500/pytorch_model.bin
Total frames: 5884

Prompt:
Observation Grid:
[0,0]: depth=1.71, sem=wall; [0,1]: depth=0.75, sem=window; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=1.63, sem=wall; [1,1]: depth=0.74, sem=window; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=1.62, sem=wall; [2,1]: depth=1.07, sem=window; [2,2]: depth=3.68, sem=void; [2,3]: depth=1.50, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=1.60, sem=wall; [3,1]: depth=1.25, sem=window; [3,2]: depth=6.75, sem=window; [3,3]: depth=3.86, sem=wall; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=1.34, sem=floor; [4,1]: depth=1.08, sem=window; [4,2]: depth=1.23, sem=void; [4,3]: depth=0.70, sem=void;

In [1]:
from l2am.inference_api import L2AMActionClassifier

# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-88500",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']

# 单次预测
prompt = EXAMPLE_PROMPT
action = classifier.predict(prompt)
print("\n" + "="*50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:", action)
print("="*50)

if action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")



# 批量预测
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

  from .autonotebook import tqdm as notebook_tqdm


[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-88500/pytorch_model.bin
Total frames: 5884

Prompt:
Observation Grid:
[0,0]: depth=1.71, sem=wall; [0,1]: depth=0.75, sem=window; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=1.63, sem=wall; [1,1]: depth=0.74, sem=window; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=1.62, sem=wall; [2,1]: depth=1.07, sem=window; [2,2]: depth=3.68, sem=void; [2,3]: depth=1.50, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=1.60, sem=wall; [3,1]: depth=1.25, sem=window; [3,2]: depth=6.75, sem=window; [3,3]: depth=3.86, sem=wall; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=1.34, sem=floor; [4,1]: depth=1.08, sem=window; [4,2]: depth=1.23, sem=void; [4,3]: depth=0.70, sem=void;

In [1]:
from l2am.inference_api import L2AMActionClassifier

# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-101000",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']

# 单次预测
prompt = EXAMPLE_PROMPT
action = classifier.predict(prompt)
print("\n" + "="*50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:", action)
print("="*50)

if action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")



# 批量预测
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

  from .autonotebook import tqdm as notebook_tqdm


[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-101000/pytorch_model.bin
Total frames: 5884

Prompt:
Observation Grid:
[0,0]: depth=1.71, sem=wall; [0,1]: depth=0.75, sem=window; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=1.63, sem=wall; [1,1]: depth=0.74, sem=window; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=1.62, sem=wall; [2,1]: depth=1.07, sem=window; [2,2]: depth=3.68, sem=void; [2,3]: depth=1.50, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=1.60, sem=wall; [3,1]: depth=1.25, sem=window; [3,2]: depth=6.75, sem=window; [3,3]: depth=3.86, sem=wall; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=1.34, sem=floor; [4,1]: depth=1.08, sem=window; [4,2]: depth=1.23, sem=void; [4,3]: depth=0.70, sem=void

In [1]:
from l2am.inference_api import L2AMActionClassifier

# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-156500",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']

# 单次预测
prompt = EXAMPLE_PROMPT
action = classifier.predict(prompt)
print("\n" + "="*50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:", action)
print("="*50)

if action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")



# 批量预测
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

  from .autonotebook import tqdm as notebook_tqdm


[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-156500/pytorch_model.bin
Total frames: 5884

Prompt:
Observation Grid:
[0,0]: depth=1.71, sem=wall; [0,1]: depth=0.75, sem=window; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=1.63, sem=wall; [1,1]: depth=0.74, sem=window; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=1.62, sem=wall; [2,1]: depth=1.07, sem=window; [2,2]: depth=3.68, sem=void; [2,3]: depth=1.50, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=1.60, sem=wall; [3,1]: depth=1.25, sem=window; [3,2]: depth=6.75, sem=window; [3,3]: depth=3.86, sem=wall; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=1.34, sem=floor; [4,1]: depth=1.08, sem=window; [4,2]: depth=1.23, sem=void; [4,3]: depth=0.70, sem=void

In [1]:
from l2am.inference_api import L2AMActionClassifier

# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-248000",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4,  # 并行加速（可选）
        load_from_cache_file=False  # ← 关键！强制重新计算
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']

# 单次预测
prompt = EXAMPLE_PROMPT
action = classifier.predict(prompt)
print("\n" + "="*50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:", action)
print("="*50)

if action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")



# 批量预测
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

  from .autonotebook import tqdm as notebook_tqdm


[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-248000/pytorch_model.bin
Total frames: 5884

Prompt:
Observation Grid:
[0,0]: depth=1.71, sem=wall; [0,1]: depth=0.75, sem=window; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=1.63, sem=wall; [1,1]: depth=0.74, sem=window; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=1.62, sem=wall; [2,1]: depth=1.07, sem=window; [2,2]: depth=3.68, sem=void; [2,3]: depth=1.50, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=1.60, sem=wall; [3,1]: depth=1.25, sem=window; [3,2]: depth=6.75, sem=window; [3,3]: depth=3.86, sem=wall; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=1.34, sem=floor; [4,1]: depth=1.08, sem=window; [4,2]: depth=1.23, sem=void; [4,3]: depth=0.70, sem=void

In [25]:
from l2am.inference_api import L2AMActionClassifier

# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-461500",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch, get_or_create_dataset
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0001.json"
raw_ds = load_dataset("json", data_files=data_path, split="train")
print("Loaded JSON files:")
# 应用转换
print("Expanding episodes to frames...")
frame_ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=16,  # 并行加速（可选）
        load_from_cache_file=False  # ← 关键！强制重新计算
    )


DATA_DIR = "data/l2am_r2r_test"
CACHE_DIR = "data/cache/train_frames1_test"
ds = get_or_create_dataset(DATA_DIR, CACHE_DIR)

print(ds==frame_ds)
print(f"Total frames: {len(frame_ds)}")

# 批量预测
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")



[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-461500/pytorch_model.bin
Loaded JSON files:
Expanding episodes to frames...


Building text prompts (num_proc=16): 100%|██████████| 100/100 [00:00<00:00, 148.74 examples/s]


Loading cached dataset from data/cache/train_frames1_test
False
Total frames: 5806

Batch Prediction Results:
Sample 0: Predicted Action = 1, Ground Truth Action = 3
Sample 1: Predicted Action = 2, Ground Truth Action = 3
Sample 2: Predicted Action = 2, Ground Truth Action = 3
Sample 3: Predicted Action = 3, Ground Truth Action = 3
Sample 4: Predicted Action = 3, Ground Truth Action = 3
Sample 5: Predicted Action = 3, Ground Truth Action = 3
Sample 6: Predicted Action = 1, Ground Truth Action = 1
Sample 7: Predicted Action = 1, Ground Truth Action = 1
Sample 8: Predicted Action = 1, Ground Truth Action = 1
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 1, Ground Truth Action = 1
Sample 11: Predicted Action = 1, Ground Truth Action = 1
Sample 12: Predicted Action = 1, Ground Truth Action = 1
Sample 13: Predicted Action = 1, Ground Truth Action = 1
Sample 14: Predicted Action = 1, Ground Truth Action = 1
Sample 15: Predicted Action = 1, Ground Truth

In [27]:
# 批量预测
# Step 2: 划分训练/验证集
ds = ds.train_test_split(test_size=0.05, seed=42)
train_ds = ds["train"]
eval_ds = ds["test"]
num_batch = int(len(eval_ds)/10)
EXAMPLE_PROMPTS = [
    eval_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    eval_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")


Batch Prediction Results:
Sample 0: Predicted Action = 1, Ground Truth Action = 1
Sample 1: Predicted Action = 3, Ground Truth Action = 3
Sample 2: Predicted Action = 1, Ground Truth Action = 1
Sample 3: Predicted Action = 3, Ground Truth Action = 3
Sample 4: Predicted Action = 2, Ground Truth Action = 2
Sample 5: Predicted Action = 1, Ground Truth Action = 1
Sample 6: Predicted Action = 3, Ground Truth Action = 3
Sample 7: Predicted Action = 1, Ground Truth Action = 1
Sample 8: Predicted Action = 1, Ground Truth Action = 3
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 1, Ground Truth Action = 1
Sample 11: Predicted Action = 2, Ground Truth Action = 2
Sample 12: Predicted Action = 1, Ground Truth Action = 1
Sample 13: Predicted Action = 3, Ground Truth Action = 1
Sample 14: Predicted Action = 1, Ground Truth Action = 1
Sample 15: Predicted Action = 1, Ground Truth Action = 1
Sample 16: Predicted Action = 2, Ground Truth Action = 2
Sample 17: Pre

In [3]:
ds

Dataset({
    features: ['prompt', 'action'],
    num_rows: 5806
})

In [4]:
frame_ds

Dataset({
    features: ['prompt', 'action'],
    num_rows: 5806
})

In [6]:
# 检查两者是否相等，包括内容和顺序
print(ds==frame_ds)


False


#### 直接从训练预构建的prompt来推理评测对比

In [1]:
from l2am.inference_api import L2AMActionClassifier
from l2am.dataset_utils import get_or_create_dataset
# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-461500",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

DATA_DIR = "data/l2am_r2r_test"
CACHE_DIR = "data/cache/train_frames_test"
ds = get_or_create_dataset(DATA_DIR, CACHE_DIR)

# Step 2: 划分训练/验证集
ds = ds.train_test_split(test_size=0.05, seed=42)
train_ds = ds["train"]
eval_ds = ds["test"]


  from .autonotebook import tqdm as notebook_tqdm


[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-461500/pytorch_model.bin
Loading cached dataset from data/cache/train_frames_test


In [19]:
train_ds

Dataset({
    features: ['prompt', 'action'],
    num_rows: 5515
})

In [20]:
print(train_ds[0]['prompt'])
print(train_ds[0]['action'])

Observation Grid:
[0,0]: depth=4.40, sem=ceiling; [0,1]: depth=4.41, sem=ceiling; [0,2]: depth=4.41, sem=ceiling; [0,3]: depth=3.48, sem=wall; [0,4]: depth=1.99, sem=wall; [0,5]: depth=1.38, sem=wall
[1,0]: depth=7.28, sem=ceiling; [1,1]: depth=7.41, sem=ceiling; [1,2]: depth=7.16, sem=ceiling; [1,3]: depth=3.65, sem=wall; [1,4]: depth=1.99, sem=wall; [1,5]: depth=1.38, sem=wall
[2,0]: depth=8.68, sem=window; [2,1]: depth=8.52, sem=window; [2,2]: depth=9.09, sem=wall; [2,3]: depth=3.41, sem=wall; [2,4]: depth=1.96, sem=wall; [2,5]: depth=1.38, sem=wall
[3,0]: depth=4.37, sem=floor; [3,1]: depth=4.67, sem=floor; [3,2]: depth=4.37, sem=floor; [3,3]: depth=2.55, sem=floor; [3,4]: depth=1.90, sem=wall; [3,5]: depth=1.36, sem=wall
[4,0]: depth=1.18, sem=floor; [4,1]: depth=1.18, sem=floor; [4,2]: depth=1.18, sem=floor; [4,3]: depth=1.18, sem=floor; [4,4]: depth=1.19, sem=floor; [4,5]: depth=1.13, sem=floor
[5,0]: depth=0.69, sem=floor; [5,1]: depth=0.70, sem=floor; [5,2]: depth=0.70, sem=fl

In [21]:
eval_ds

Dataset({
    features: ['prompt', 'action'],
    num_rows: 291
})

In [4]:

# 批量预测
frame_ds = eval_ds
num_batch = int(len(frame_ds)/1)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch_fast(prompts=EXAMPLE_PROMPTS, batch_size=128)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB. GPU 0 has a total capacity of 23.65 GiB of which 207.25 MiB is free. Process 2074561 has 15.60 GiB memory in use. Including non-PyTorch memory, this process has 7.80 GiB memory in use. Of the allocated memory 7.10 GiB is allocated by PyTorch, and 260.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [6]:

# 批量预测
frame_ds = eval_ds
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(prompts=EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")


Batch Prediction Results:
Sample 0: Predicted Action = 1, Ground Truth Action = 1
Sample 1: Predicted Action = 3, Ground Truth Action = 3
Sample 2: Predicted Action = 1, Ground Truth Action = 1
Sample 3: Predicted Action = 3, Ground Truth Action = 3
Sample 4: Predicted Action = 2, Ground Truth Action = 2
Sample 5: Predicted Action = 1, Ground Truth Action = 1
Sample 6: Predicted Action = 3, Ground Truth Action = 3
Sample 7: Predicted Action = 1, Ground Truth Action = 1
Sample 8: Predicted Action = 1, Ground Truth Action = 3
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 1, Ground Truth Action = 1
Sample 11: Predicted Action = 2, Ground Truth Action = 2
Sample 12: Predicted Action = 1, Ground Truth Action = 1
Sample 13: Predicted Action = 3, Ground Truth Action = 1
Sample 14: Predicted Action = 1, Ground Truth Action = 1
Sample 15: Predicted Action = 1, Ground Truth Action = 1
Sample 16: Predicted Action = 2, Ground Truth Action = 2
Sample 17: Pre

In [16]:

# 批量预测
frame_ds = train_ds
num_batch = int(len(frame_ds)/1000)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch_fast(prompts=EXAMPLE_PROMPTS, batch_size=128)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")


Batch Prediction Results:
Sample 0: Predicted Action = 2, Ground Truth Action = 2
Sample 1: Predicted Action = 1, Ground Truth Action = 1
Sample 2: Predicted Action = 1, Ground Truth Action = 1
Sample 3: Predicted Action = 1, Ground Truth Action = 1
Sample 4: Predicted Action = 1, Ground Truth Action = 1
Sample 5: Predicted Action = 1, Ground Truth Action = 1
Sample 6: Predicted Action = 1, Ground Truth Action = 1
Sample 7: Predicted Action = 1, Ground Truth Action = 1
Sample 8: Predicted Action = 3, Ground Truth Action = 3
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 1, Ground Truth Action = 1
Sample 11: Predicted Action = 1, Ground Truth Action = 1
Sample 12: Predicted Action = 2, Ground Truth Action = 2
Sample 13: Predicted Action = 1, Ground Truth Action = 1
Sample 14: Predicted Action = 3, Ground Truth Action = 3
Sample 15: Predicted Action = 1, Ground Truth Action = 1
Sample 16: Predicted Action = 1, Ground Truth Action = 1
Sample 17: Pre

In [6]:

# 批量预测
frame_ds = train_ds
num_batch = int(len(frame_ds)/1000)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(prompts=EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")


Batch Prediction Results:
Sample 0: Predicted Action = 2, Ground Truth Action = 2
Sample 1: Predicted Action = 1, Ground Truth Action = 1
Sample 2: Predicted Action = 1, Ground Truth Action = 1
Sample 3: Predicted Action = 1, Ground Truth Action = 1
Sample 4: Predicted Action = 1, Ground Truth Action = 1
Sample 5: Predicted Action = 1, Ground Truth Action = 1
Sample 6: Predicted Action = 1, Ground Truth Action = 1
Sample 7: Predicted Action = 1, Ground Truth Action = 1
Sample 8: Predicted Action = 3, Ground Truth Action = 3
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 1, Ground Truth Action = 1
Sample 11: Predicted Action = 1, Ground Truth Action = 1
Sample 12: Predicted Action = 2, Ground Truth Action = 2
Sample 13: Predicted Action = 1, Ground Truth Action = 1
Sample 14: Predicted Action = 3, Ground Truth Action = 3
Sample 15: Predicted Action = 1, Ground Truth Action = 1
Sample 16: Predicted Action = 1, Ground Truth Action = 1
Sample 17: Pre

### 3. Action chunk inference with bigbird

In [16]:
# inference.py
import os
import torch
from transformers import AutoTokenizer
from l2am.model_zoo import MultiStepWeightedClassifier  # 确保能导入
import numpy as np
from safetensors.torch import load_file

# ======================
# 配置
# ======================
MODEL_CHECKPOINT = "data/l2a_bigbird_action_classifier_chunk4/checkpoint-185500"
HF_CACHE_DIR = "data/hf_model_cache"
MAX_LENGTH = 1024
MODEL_NAME = "google/bigbird-roberta-base"  # 可替换为 roberta-base、 bert-base-uncased、allenai/longformer-base-4096、google/bigbird-roberta-base等

NUM_CHUNK = 4  # 与训练时保持一致

# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch_chunk_v1
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
# 应用转换
frame_ds = raw_ds.map(
        prepare_text_samples_batch_chunk_v1,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=16  # 并行加速（可选）
    )
print(f"Total frames: {len(frame_ds)}")

test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']

GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']



def main(hf_cache_dir=HF_CACHE_DIR, model_checkpoint=MODEL_CHECKPOINT, max_length=MAX_LENGTH):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Step 1: 加载 tokenizer（从 checkpoint 目录或原始模型）
    from transformers import BigBirdTokenizer

    tokenizer = BigBirdTokenizer.from_pretrained(
        model_checkpoint,
        cache_dir=hf_cache_dir,
        clean_up_tokenization_spaces=True,
    )

    # Step 2: 推断 num_labels（根据你的任务，已知是 4 类？）
    num_labels = 4  # class_0 ~ class_3 

    # Step 3: 重建模型结构（必须与训练时完全一致）
    # 注意：class_weights 在推理时不影响前向传播（只用于 loss），所以可以传 dummy 值
    dummy_class_weights = torch.ones(num_labels)  # 推理时 loss 不计算，权重无影响

    
    model = MultiStepWeightedClassifier(
        MODEL_NAME,
        num_labels=num_labels,
        class_weights=dummy_class_weights,
        num_steps=NUM_CHUNK,
        cache_dir=hf_cache_dir,
    )

    # Step 4: 加载训练好的权重
    model_safetensors = os.path.join(model_checkpoint, "model.safetensors")
    model_bin = os.path.join(model_checkpoint, "pytorch_model.bin")

    if os.path.exists(model_safetensors):
        state_dict = load_file(model_safetensors, device=str(device))
    elif os.path.exists(model_bin):
        state_dict = torch.load(model_bin, map_location=device)
    else:
        raise FileNotFoundError("Neither 'model.safetensors' nor 'pytorch_model.bin' found in checkpoint directory.")

    model.load_state_dict(state_dict, strict=True)
    model.to(device)
    model.eval()

    # Step 5: Tokenize & 推理
    inputs = tokenizer(
        EXAMPLE_PROMPT,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs["logits"]
        # 对每个 step 取 argmax -> (B, num_steps)
        pred_classes = torch.argmax(logits, dim=-1)  # 注意：不是 .item()
        # pred_class = torch.argmax(logits, dim=-1).item()
         # 如果 batch_size == 1，可以 squeeze 得到 (num_steps,)
        if pred_classes.shape[0] == 1:
            pred_classes = pred_classes.squeeze(0)  # shape: (num_steps,)

        # 转为 Python list（便于打印或比较）
        pred_list = pred_classes.cpu().tolist()

    print("\n" + "="*50)
    print("Prompt:")
    print(EXAMPLE_PROMPT)
    print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
    print("Predicted Action:", pred_list)
    print("="*50)

    if pred_list == GROUND_TRUTH_ACTION:
        print("✅ Prediction matches ground truth!")
    else:
        print("❌ Prediction differs from ground truth.")


if __name__ == "__main__":
    main()

Total frames: 5884
Using device: cuda

Prompt:
Observation Grid:
[0,0]: depth=1.71, sem=wall; [0,1]: depth=0.75, sem=window; [0,2]: depth=0.00, sem=void; [0,3]: depth=0.00, sem=void; [0,4]: depth=0.00, sem=void; [0,5]: depth=0.00, sem=void
[1,0]: depth=1.63, sem=wall; [1,1]: depth=0.74, sem=window; [1,2]: depth=0.00, sem=void; [1,3]: depth=0.00, sem=void; [1,4]: depth=0.00, sem=void; [1,5]: depth=0.00, sem=void
[2,0]: depth=1.62, sem=wall; [2,1]: depth=1.07, sem=window; [2,2]: depth=3.68, sem=void; [2,3]: depth=1.50, sem=void; [2,4]: depth=0.00, sem=void; [2,5]: depth=0.00, sem=void
[3,0]: depth=1.60, sem=wall; [3,1]: depth=1.25, sem=window; [3,2]: depth=6.75, sem=window; [3,3]: depth=3.86, sem=wall; [3,4]: depth=0.00, sem=void; [3,5]: depth=0.00, sem=void
[4,0]: depth=1.34, sem=floor; [4,1]: depth=1.08, sem=window; [4,2]: depth=1.23, sem=void; [4,3]: depth=0.70, sem=void; [4,4]: depth=0.00, sem=void; [4,5]: depth=0.00, sem=void
[5,0]: depth=0.82, sem=floor; [5,1]: depth=0.75, sem=wind

### 4. Action chunk inference with bigbird (using class api)

In [2]:
from l2am.inference_api import ActionChunkPredictor
from l2am.dataset_utils import prepare_text_samples_batch_chunk_v1
from datasets import load_dataset

def main():
    # ======================
    # 配置
    # ======================
    MODEL_CHECKPOINT = "data/l2a_bigbird_action_classifier_chunk4/checkpoint-251000"
    HF_CACHE_DIR = "data/hf_model_cache"
    MAX_LENGTH = 1024
    NUM_CHUNK = 4
    NUM_LABELS = 4

    # 加载测试样本
    data_path = "data/l2am_r2r/episodes_part_0109.json"
    raw_ds = load_dataset("json", data_files=data_path)["train"]
    frame_ds = raw_ds.map(
        prepare_text_samples_batch_chunk_v1,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=16,
    )
    print(f"Total frames: {len(frame_ds)}")

    # 初始化预测器
    predictor = ActionChunkPredictor(
        model_checkpoint=MODEL_CHECKPOINT,
        hf_cache_dir=HF_CACHE_DIR,
        num_labels=NUM_LABELS,
        num_steps=NUM_CHUNK,
        max_length=MAX_LENGTH,
    )
    
    test_frame_id = 35
    EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
    GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']

    # 推理
    pred_action = predictor.predict(EXAMPLE_PROMPT)
    pred_action_clean = predictor.predict_clean(EXAMPLE_PROMPT) # 自动处理action chunk中的特殊补全标记

    # 输出结果
    print("\n" + "=" * 50)
    print("Prompt:")
    print(EXAMPLE_PROMPT)
    print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
    print("Predicted Action:     ", pred_action)
    print("Predicted Action (Clean):     ", pred_action_clean)
    print("=" * 50)

    if pred_action == GROUND_TRUTH_ACTION:
        print("✅ Prediction matches ground truth!")
    else:
        print("❌ Prediction differs from ground truth.")


if __name__ == "__main__":
    main()

Total frames: 1110
[ActionChunkPredictor] Using device: cuda
Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier_chunk4/checkpoint-251000/pytorch_model.bin

Prompt:
Observation Grid:
[0,0]: depth=2.06, sem=ceiling; [0,1]: depth=2.06, sem=ceiling; [0,2]: depth=2.06, sem=ceiling; [0,3]: depth=1.65, sem=ceiling; [0,4]: depth=0.94, sem=column; [0,5]: depth=0.94, sem=wall
[1,0]: depth=2.63, sem=wall; [1,1]: depth=2.91, sem=wall; [1,2]: depth=2.96, sem=wall; [1,3]: depth=2.38, sem=column; [1,4]: depth=0.94, sem=column; [1,5]: depth=0.84, sem=wall
[2,0]: depth=3.59, sem=wall; [2,1]: depth=5.57, sem=wall; [2,2]: depth=4.36, sem=wall; [2,3]: depth=3.95, sem=wall; [2,4]: depth=0.94, sem=column; [2,5]: depth=0.95, sem=wall
[3,0]: depth=2.90, sem=wall; [3,1]: depth=3.46, sem=floor; [3,2]: depth=3.16, sem=floor; [3,3]: depth=2.88, sem=column; [3,4]: depth=0.95, sem=column; [3,5]: depth=0.95, sem=wall
[4,0]: depth=1.38, sem=floor; [4,1]: depth=1.38, sem=floor; [4,2]: depth=1.3

#### 4.1 Cases

In [1]:
from l2am.inference_api import ActionChunkPredictor
from l2am.dataset_utils import prepare_text_samples_batch_chunk_v1
from datasets import load_dataset


MODEL_CHECKPOINT = "/mnt/slurmfs-4090node1/homes/dpeng108/l2am_ws/outputs/l2a_bigbird_action_classifier3_chunk4/checkpoint-287500"
HF_CACHE_DIR = "data/hf_model_cache"
MAX_LENGTH = 1024
NUM_CHUNK = 4
NUM_LABELS = 4

# 加载测试样本
data_path = "data/l2am_r2r/episodes_part_0109.json"
raw_ds = load_dataset("json", data_files=data_path)["train"]
frame_ds = raw_ds.map(
    prepare_text_samples_batch_chunk_v1,
    batched=True,
    remove_columns=raw_ds.column_names,
    desc="Building text prompts",
    num_proc=16,
    load_from_cache_file=False,  # ← 关键！强制重新计算
)
print(f"Total frames: {len(frame_ds)}")

# 初始化预测器
predictor = ActionChunkPredictor(
    model_checkpoint=MODEL_CHECKPOINT,
    hf_cache_dir=HF_CACHE_DIR,
    num_labels=NUM_LABELS,
    num_steps=NUM_CHUNK,
    max_length=MAX_LENGTH,
)

# 测试另一个样本
test_frame_id = 2
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']

# 推理
pred_action = predictor.predict(EXAMPLE_PROMPT)
pred_action_clean = predictor.predict_clean(EXAMPLE_PROMPT) # 自动处理action chunk中的特殊补全标记

# 输出结果
print("\n" + "=" * 50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:     ", pred_action)
print("Predicted Action (Clean):     ", pred_action_clean)
print("=" * 50)

if pred_action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")


  from .autonotebook import tqdm as notebook_tqdm
Building text prompts (num_proc=16): 100%|██████████| 19/19 [00:00<00:00, 29.37 examples/s]


Total frames: 1110
[ActionChunkPredictor] Using device: cuda
Loading weights from pytorch_model.bin: /mnt/slurmfs-4090node1/homes/dpeng108/l2am_ws/outputs/l2a_bigbird_action_classifier3_chunk4/checkpoint-287500/pytorch_model.bin

Prompt:
Observation Grid:
[0,0]: depth=2.57, sem=beam; [0,1]: depth=2.58, sem=ceiling; [0,2]: depth=2.62, sem=ceiling; [0,3]: depth=2.56, sem=beam; [0,4]: depth=2.50, sem=ceiling; [0,5]: depth=1.84, sem=stairs
[1,0]: depth=4.11, sem=beam; [1,1]: depth=4.21, sem=ceiling; [1,2]: depth=4.11, sem=ceiling; [1,3]: depth=3.82, sem=beam; [1,4]: depth=4.01, sem=ceiling; [1,5]: depth=2.70, sem=stairs
[2,0]: depth=4.77, sem=floor; [2,1]: depth=4.83, sem=fireplace; [2,2]: depth=4.46, sem=fireplace; [2,3]: depth=3.84, sem=objects; [2,4]: depth=1.95, sem=railing; [2,5]: depth=2.70, sem=stairs
[3,0]: depth=3.23, sem=wall; [3,1]: depth=3.22, sem=wall; [3,2]: depth=3.21, sem=wall; [3,3]: depth=2.54, sem=objects; [3,4]: depth=1.73, sem=railing; [3,5]: depth=2.74, sem=railing
[4

In [2]:
# 测试另一个样本
test_frame_id = 40
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']

# 推理
pred_action = predictor.predict(EXAMPLE_PROMPT)
pred_action_clean = predictor.predict_clean(EXAMPLE_PROMPT) # 自动处理action chunk中的特殊补全标记

# 输出结果
print("\n" + "=" * 50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:     ", pred_action)
print("Predicted Action (Clean):     ", pred_action_clean)
print("=" * 50)

if pred_action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")


Prompt:
Observation Grid:
[0,0]: depth=1.84, sem=wall; [0,1]: depth=1.71, sem=wall; [0,2]: depth=1.74, sem=wall; [0,3]: depth=1.85, sem=wall; [0,4]: depth=1.77, sem=wall; [0,5]: depth=1.34, sem=wall
[1,0]: depth=2.08, sem=wall; [1,1]: depth=2.81, sem=ceiling; [1,2]: depth=1.78, sem=wall; [1,3]: depth=3.00, sem=ceiling; [1,4]: depth=2.43, sem=ceiling; [1,5]: depth=1.34, sem=wall
[2,0]: depth=2.17, sem=wall; [2,1]: depth=4.82, sem=wall; [2,2]: depth=1.78, sem=wall; [2,3]: depth=4.91, sem=door; [2,4]: depth=3.79, sem=wall; [2,5]: depth=1.34, sem=wall
[3,0]: depth=2.16, sem=wall; [3,1]: depth=3.45, sem=floor; [3,2]: depth=1.78, sem=wall; [3,3]: depth=3.27, sem=floor; [3,4]: depth=2.80, sem=wall; [3,5]: depth=1.35, sem=wall
[4,0]: depth=1.47, sem=floor; [4,1]: depth=1.46, sem=floor; [4,2]: depth=1.42, sem=floor; [4,3]: depth=1.47, sem=floor; [4,4]: depth=1.44, sem=floor; [4,5]: depth=1.25, sem=wall
[5,0]: depth=0.86, sem=floor; [5,1]: depth=0.86, sem=floor; [5,2]: depth=0.86, sem=floor; [5

In [3]:
# 测试另一个样本
test_frame_id = 2
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']

# 推理
pred_action = predictor.predict(EXAMPLE_PROMPT)
pred_action_clean = predictor.predict_clean(EXAMPLE_PROMPT) # 自动处理action chunk中的特殊补全标记

# 输出结果
print("\n" + "=" * 50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:     ", pred_action)
print("Predicted Action (Clean):     ", pred_action_clean)
print("=" * 50)

if pred_action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")


Prompt:
Observation Grid:
[0,0]: depth=2.57, sem=beam; [0,1]: depth=2.58, sem=ceiling; [0,2]: depth=2.62, sem=ceiling; [0,3]: depth=2.56, sem=beam; [0,4]: depth=2.50, sem=ceiling; [0,5]: depth=1.84, sem=stairs
[1,0]: depth=4.11, sem=beam; [1,1]: depth=4.21, sem=ceiling; [1,2]: depth=4.11, sem=ceiling; [1,3]: depth=3.82, sem=beam; [1,4]: depth=4.01, sem=ceiling; [1,5]: depth=2.70, sem=stairs
[2,0]: depth=4.77, sem=floor; [2,1]: depth=4.83, sem=fireplace; [2,2]: depth=4.46, sem=fireplace; [2,3]: depth=3.84, sem=objects; [2,4]: depth=1.95, sem=railing; [2,5]: depth=2.70, sem=stairs
[3,0]: depth=3.23, sem=wall; [3,1]: depth=3.22, sem=wall; [3,2]: depth=3.21, sem=wall; [3,3]: depth=2.54, sem=objects; [3,4]: depth=1.73, sem=railing; [3,5]: depth=2.74, sem=railing
[4,0]: depth=1.15, sem=wall; [4,1]: depth=1.15, sem=wall; [4,2]: depth=1.15, sem=wall; [4,3]: depth=1.14, sem=wall; [4,4]: depth=1.19, sem=wall; [4,5]: depth=1.62, sem=stairs
[5,0]: depth=0.66, sem=wall; [5,1]: depth=0.66, sem=wall

In [4]:
# 测试另一个样本
test_frame_id = 3
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']

# 推理
pred_action = predictor.predict(EXAMPLE_PROMPT)
pred_action_clean = predictor.predict_clean(EXAMPLE_PROMPT) # 自动处理action chunk中的特殊补全标记

# 输出结果
print("\n" + "=" * 50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:     ", pred_action)
print("Predicted Action (Clean):     ", pred_action_clean)
print("=" * 50)

if pred_action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")


Prompt:
Observation Grid:
[0,0]: depth=2.57, sem=ceiling; [0,1]: depth=2.58, sem=ceiling; [0,2]: depth=2.57, sem=ceiling; [0,3]: depth=2.61, sem=ceiling; [0,4]: depth=2.57, sem=ceiling; [0,5]: depth=2.57, sem=beam
[1,0]: depth=3.99, sem=ceiling; [1,1]: depth=4.37, sem=beam; [1,2]: depth=4.31, sem=ceiling; [1,3]: depth=3.82, sem=ceiling; [1,4]: depth=3.76, sem=beam; [1,5]: depth=3.26, sem=ceiling
[2,0]: depth=4.72, sem=wall; [2,1]: depth=5.81, sem=floor; [2,2]: depth=5.25, sem=fireplace; [2,3]: depth=3.84, sem=fireplace; [2,4]: depth=3.93, sem=objects; [2,5]: depth=1.41, sem=railing
[3,0]: depth=2.09, sem=chair; [3,1]: depth=3.39, sem=wall; [3,2]: depth=3.37, sem=wall; [3,3]: depth=2.92, sem=wall; [3,4]: depth=2.51, sem=objects; [3,5]: depth=1.41, sem=railing
[4,0]: depth=1.14, sem=wall; [4,1]: depth=1.14, sem=wall; [4,2]: depth=1.15, sem=wall; [4,3]: depth=1.15, sem=wall; [4,4]: depth=1.14, sem=wall; [4,5]: depth=1.15, sem=wall
[5,0]: depth=0.66, sem=wall; [5,1]: depth=0.66, sem=wall;

In [5]:
# 测试另一个样本
test_frame_id = 4
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']

# 推理
pred_action = predictor.predict(EXAMPLE_PROMPT)
pred_action_clean = predictor.predict_clean(EXAMPLE_PROMPT) # 自动处理action chunk中的特殊补全标记

# 输出结果
print("\n" + "=" * 50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:     ", pred_action)
print("Predicted Action (Clean):     ", pred_action_clean)
print("=" * 50)

if pred_action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")


Prompt:
Observation Grid:
[0,0]: depth=2.61, sem=ceiling; [0,1]: depth=2.61, sem=ceiling; [0,2]: depth=2.60, sem=ceiling; [0,3]: depth=2.62, sem=ceiling; [0,4]: depth=2.61, sem=ceiling; [0,5]: depth=2.58, sem=ceiling
[1,0]: depth=4.02, sem=ceiling; [1,1]: depth=4.45, sem=beam; [1,2]: depth=4.25, sem=ceiling; [1,3]: depth=3.76, sem=ceiling; [1,4]: depth=3.25, sem=objects; [1,5]: depth=2.18, sem=railing
[2,0]: depth=4.78, sem=wall; [2,1]: depth=5.46, sem=floor; [2,2]: depth=4.92, sem=fireplace; [2,3]: depth=3.88, sem=fireplace; [2,4]: depth=3.89, sem=objects; [2,5]: depth=1.45, sem=railing
[3,0]: depth=2.13, sem=chair; [3,1]: depth=3.28, sem=wall; [3,2]: depth=3.20, sem=wall; [3,3]: depth=2.91, sem=wall; [3,4]: depth=2.46, sem=objects; [3,5]: depth=1.24, sem=railing
[4,0]: depth=1.10, sem=wall; [4,1]: depth=1.10, sem=wall; [4,2]: depth=1.10, sem=wall; [4,3]: depth=1.10, sem=wall; [4,4]: depth=1.10, sem=wall; [4,5]: depth=1.02, sem=wall
[5,0]: depth=0.64, sem=wall; [5,1]: depth=0.64, sem

In [6]:
# 测试另一个样本
test_frame_id = 5
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']

# 推理
pred_action = predictor.predict(EXAMPLE_PROMPT)
pred_action_clean = predictor.predict_clean(EXAMPLE_PROMPT) # 自动处理action chunk中的特殊补全标记

# 输出结果
print("\n" + "=" * 50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:     ", pred_action)
print("Predicted Action (Clean):     ", pred_action_clean)
print("=" * 50)

if pred_action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")


Prompt:
Observation Grid:
[0,0]: depth=2.62, sem=ceiling; [0,1]: depth=2.61, sem=ceiling; [0,2]: depth=2.61, sem=ceiling; [0,3]: depth=2.61, sem=ceiling; [0,4]: depth=2.52, sem=ceiling; [0,5]: depth=2.57, sem=ceiling
[1,0]: depth=4.02, sem=wall; [1,1]: depth=4.42, sem=beam; [1,2]: depth=4.12, sem=fireplace; [1,3]: depth=3.74, sem=beam; [1,4]: depth=1.75, sem=objects; [1,5]: depth=2.80, sem=railing
[2,0]: depth=4.96, sem=wall; [2,1]: depth=5.18, sem=floor; [2,2]: depth=4.59, sem=fireplace; [2,3]: depth=4.17, sem=fireplace; [2,4]: depth=2.17, sem=objects; [2,5]: depth=3.23, sem=railing
[3,0]: depth=2.32, sem=chair; [3,1]: depth=3.19, sem=wall; [3,2]: depth=3.09, sem=wall; [3,3]: depth=3.02, sem=wall; [3,4]: depth=1.68, sem=objects; [3,5]: depth=1.97, sem=railing
[4,0]: depth=1.10, sem=wall; [4,1]: depth=1.10, sem=wall; [4,2]: depth=1.11, sem=wall; [4,3]: depth=1.11, sem=wall; [4,4]: depth=1.07, sem=wall; [4,5]: depth=1.02, sem=wall
[5,0]: depth=0.64, sem=wall; [5,1]: depth=0.64, sem=wal

In [None]:
# 测试另一个样本
test_frame_id = 6
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action_chunk']

# 推理
pred_action = predictor.predict(EXAMPLE_PROMPT)
pred_action_clean = predictor.predict_clean(EXAMPLE_PROMPT) # 自动处理action chunk中的特殊补全标记

# 输出结果
print("\n" + "=" * 50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:     ", pred_action)
print("Predicted Action (Clean):     ", pred_action_clean)
print("=" * 50)

if pred_action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")


Prompt:
Observation Grid:
[0,0]: depth=2.63, sem=ceiling; [0,1]: depth=2.62, sem=ceiling; [0,2]: depth=2.61, sem=ceiling; [0,3]: depth=2.61, sem=ceiling; [0,4]: depth=1.56, sem=objects; [0,5]: depth=2.59, sem=ceiling
[1,0]: depth=4.00, sem=wall; [1,1]: depth=4.30, sem=beam; [1,2]: depth=3.97, sem=fireplace; [1,3]: depth=3.71, sem=fireplace; [1,4]: depth=1.10, sem=objects; [1,5]: depth=2.98, sem=objects
[2,0]: depth=4.93, sem=column; [2,1]: depth=4.98, sem=floor; [2,2]: depth=4.26, sem=fireplace; [2,3]: depth=4.53, sem=fireplace; [2,4]: depth=1.56, sem=objects; [2,5]: depth=3.20, sem=objects
[3,0]: depth=2.58, sem=wall; [3,1]: depth=3.12, sem=wall; [3,2]: depth=2.97, sem=wall; [3,3]: depth=3.08, sem=wall; [3,4]: depth=1.26, sem=objects; [3,5]: depth=2.02, sem=objects
[4,0]: depth=1.10, sem=wall; [4,1]: depth=1.11, sem=wall; [4,2]: depth=1.11, sem=wall; [4,3]: depth=1.11, sem=wall; [4,4]: depth=1.01, sem=objects; [4,5]: depth=1.07, sem=wall
[5,0]: depth=0.64, sem=wall; [5,1]: depth=0.64

: 