### Use `get_or_create_dataset` for prompt fomulation

In [1]:
from l2am.inference_api import L2AMActionClassifier
from l2am.dataset_utils import get_or_create_dataset
# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-461500",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)

  from .autonotebook import tqdm as notebook_tqdm


[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-461500/pytorch_model.bin


In [5]:
# Step 1: 加载数据集
DATA_DIR = "data/l2am_r2r"
CACHE_DIR = "data/cache/train_frames1_eval"
ds = get_or_create_dataset(DATA_DIR, CACHE_DIR)
# Step 2: 划分训练/验证集
ds = ds.train_test_split(test_size=0.05, seed=42)
train_ds = ds["train"]
eval_ds = ds["test"]

Loading cached dataset from data/cache/train_frames1_eval


In [4]:
print(train_ds)
print(eval_ds)

Dataset({
    features: ['prompt', 'action'],
    num_rows: 599681
})
Dataset({
    features: ['prompt', 'action'],
    num_rows: 31563
})


In [6]:
# 批量预测
frame_ds = eval_ds
num_batch = int(len(frame_ds)/100)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch_fast(prompts=EXAMPLE_PROMPTS, batch_size=128)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")


Batch Prediction Results:
Sample 0: Predicted Action = 2, Ground Truth Action = 2
Sample 1: Predicted Action = 3, Ground Truth Action = 3
Sample 2: Predicted Action = 3, Ground Truth Action = 3
Sample 3: Predicted Action = 1, Ground Truth Action = 1
Sample 4: Predicted Action = 2, Ground Truth Action = 2
Sample 5: Predicted Action = 2, Ground Truth Action = 2
Sample 6: Predicted Action = 1, Ground Truth Action = 1
Sample 7: Predicted Action = 2, Ground Truth Action = 2
Sample 8: Predicted Action = 3, Ground Truth Action = 3
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 1, Ground Truth Action = 1
Sample 11: Predicted Action = 2, Ground Truth Action = 2
Sample 12: Predicted Action = 0, Ground Truth Action = 0
Sample 13: Predicted Action = 1, Ground Truth Action = 1
Sample 14: Predicted Action = 1, Ground Truth Action = 1
Sample 15: Predicted Action = 1, Ground Truth Action = 1
Sample 16: Predicted Action = 1, Ground Truth Action = 1
Sample 17: Pre

In [13]:
# 批量预测
frame_ds = eval_ds
num_batch = int(len(frame_ds)/100)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(prompts=EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")


Batch Prediction Results:
Sample 0: Predicted Action = 2, Ground Truth Action = 2
Sample 1: Predicted Action = 3, Ground Truth Action = 3
Sample 2: Predicted Action = 3, Ground Truth Action = 3
Sample 3: Predicted Action = 1, Ground Truth Action = 1
Sample 4: Predicted Action = 2, Ground Truth Action = 2
Sample 5: Predicted Action = 2, Ground Truth Action = 2
Sample 6: Predicted Action = 1, Ground Truth Action = 1
Sample 7: Predicted Action = 2, Ground Truth Action = 2
Sample 8: Predicted Action = 3, Ground Truth Action = 3
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 1, Ground Truth Action = 1
Sample 11: Predicted Action = 2, Ground Truth Action = 2
Sample 12: Predicted Action = 0, Ground Truth Action = 0
Sample 13: Predicted Action = 1, Ground Truth Action = 1
Sample 14: Predicted Action = 1, Ground Truth Action = 1
Sample 15: Predicted Action = 1, Ground Truth Action = 1
Sample 16: Predicted Action = 1, Ground Truth Action = 1
Sample 17: Pre

In [12]:
# 单次预测
test_frame_id = 3  # 选择要测试的样本 ID
EXAMPLE_PROMPT = frame_ds[test_frame_id]['prompt']
GROUND_TRUTH_ACTION = frame_ds[test_frame_id]['action']

prompt = EXAMPLE_PROMPT
action = classifier.predict(prompt)
print("\n" + "="*50)
print("Prompt:")
print(EXAMPLE_PROMPT)
print("\nGround Truth Action:", GROUND_TRUTH_ACTION)
print("Predicted Action:", action)
print("="*50)

if action == GROUND_TRUTH_ACTION:
    print("✅ Prediction matches ground truth!")
else:
    print("❌ Prediction differs from ground truth.")


Prompt:
Observation Grid:
[0,0]: depth=4.84, sem=wall; [0,1]: depth=5.71, sem=misc; [0,2]: depth=5.03, sem=wall; [0,3]: depth=4.19, sem=wall; [0,4]: depth=2.13, sem=door; [0,5]: depth=1.40, sem=door
[1,0]: depth=4.39, sem=wall; [1,1]: depth=4.74, sem=wall; [1,2]: depth=4.50, sem=wall; [1,3]: depth=3.53, sem=wall; [1,4]: depth=2.41, sem=door; [1,5]: depth=1.40, sem=door
[2,0]: depth=2.12, sem=sofa; [2,1]: depth=4.90, sem=sofa; [2,2]: depth=4.48, sem=wall; [2,3]: depth=3.51, sem=fireplace; [2,4]: depth=2.43, sem=door; [2,5]: depth=1.40, sem=door
[3,0]: depth=0.47, sem=sofa; [3,1]: depth=0.58, sem=sofa; [3,2]: depth=2.23, sem=sofa; [3,3]: depth=2.96, sem=fireplace; [3,4]: depth=2.41, sem=door; [3,5]: depth=1.40, sem=door
[4,0]: depth=0.47, sem=sofa; [4,1]: depth=0.62, sem=sofa; [4,2]: depth=1.18, sem=floor; [4,3]: depth=1.42, sem=floor; [4,4]: depth=1.41, sem=floor; [4,5]: depth=1.27, sem=door
[5,0]: depth=0.48, sem=sofa; [5,1]: depth=0.63, sem=sofa; [5,2]: depth=0.81, sem=floor; [5,3]: 

### Use `prepare_text_samples_batch` for prompt fomulation

In [2]:
from l2am.inference_api import L2AMActionClassifier
from l2am.dataset_utils import get_or_create_dataset
# 初始化分类器（只需一次）
classifier = L2AMActionClassifier(
    model_checkpoint="data/l2a_bigbird_action_classifier1/checkpoint-461500",
    model_name="google/bigbird-roberta-base",
    hf_cache_dir="data/hf_model_cache",
    max_length=1024,
    num_labels=4,
    device=None
)
classifier.model.model.config.attention_type = "original_full"

[L2AMActionClassifier] Using device: cuda


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[L2AMActionClassifier] Loading weights from pytorch_model.bin: data/l2a_bigbird_action_classifier1/checkpoint-461500/pytorch_model.bin


In [2]:
print("Type of classifier.model.model:")
print(type(classifier.model.model))

print("\nModel config:")
print(classifier.model.model.config)

print("\nHas 'bigbird'?")
print(hasattr(classifier.model.model, 'bigbird'))

print("\nHas 'roberta'?")
print(hasattr(classifier.model.model, 'roberta'))

Type of classifier.model.model:
<class 'transformers.models.big_bird.modeling_big_bird.BigBirdForSequenceClassification'>

Model config:
BigBirdConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "google/bigbird-roberta-base",
  "architectures": [
    "BigBirdForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "action_0",
    "1": "action_1",
    "2": "action_2",
    "3": "action_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "action_0": 0,
    "action_1": 1,
    "action_2": 2,
    "action_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blo

In [3]:
# 从 dataset 准备一个样本进行测试，构建prompt示例代码如下：
from l2am.dataset_utils import prepare_text_samples_batch
from datasets import load_dataset, Dataset
data_path = "data/l2am_r2r/episodes_part_0008.json"
raw_ds = load_dataset("json", data_files=data_path, split="train")
# 应用转换
ds = raw_ds.map(
        prepare_text_samples_batch,
        batched=True,
        remove_columns=raw_ds.column_names,
        desc="Building text prompts",
        num_proc=4,  # 并行加速（可选）
        load_from_cache_file=False  # ← 关键！强制重新计算
    )
print(f"Total frames: {len(ds)}")
ds = ds.train_test_split(test_size=0.05, seed=42)
train_ds = ds["train"]
eval_ds = ds["test"]

Building text prompts (num_proc=4): 100%|██████████| 100/100 [00:00<00:00, 113.31 examples/s]

Total frames: 5884





In [2]:
print(train_ds)
print(eval_ds)

Dataset({
    features: ['prompt', 'action'],
    num_rows: 5589
})
Dataset({
    features: ['prompt', 'action'],
    num_rows: 295
})


In [None]:
# 批量预测
frame_ds = train_ds
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
# actions = classifier.predict_batch(prompts=EXAMPLE_PROMPTS)
actions = classifier.predict_batch_fast(prompts=EXAMPLE_PROMPTS, batch_size=128)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

Input ids are automatically padded from 717 to 768 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 727 to 768 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 725 to 768 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 716 to 768 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 713 to 768 to be a multiple of `config.block_size`: 64



Batch Prediction Results:
Sample 0: Predicted Action = 3, Ground Truth Action = 3
Sample 1: Predicted Action = 1, Ground Truth Action = 1
Sample 2: Predicted Action = 1, Ground Truth Action = 1
Sample 3: Predicted Action = 1, Ground Truth Action = 1
Sample 4: Predicted Action = 1, Ground Truth Action = 1
Sample 5: Predicted Action = 1, Ground Truth Action = 1
Sample 6: Predicted Action = 1, Ground Truth Action = 1
Sample 7: Predicted Action = 3, Ground Truth Action = 3
Sample 8: Predicted Action = 1, Ground Truth Action = 1
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 3, Ground Truth Action = 3
Sample 11: Predicted Action = 1, Ground Truth Action = 1
Sample 12: Predicted Action = 1, Ground Truth Action = 1
Sample 13: Predicted Action = 1, Ground Truth Action = 2
Sample 14: Predicted Action = 1, Ground Truth Action = 1
Sample 15: Predicted Action = 1, Ground Truth Action = 2
Sample 16: Predicted Action = 1, Ground Truth Action = 1
Sample 17: Pre

: 

In [4]:
# 批量预测
frame_ds = eval_ds
num_batch = int(len(frame_ds)/1)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
actions = classifier.predict_batch(prompts=EXAMPLE_PROMPTS)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")

Attention type 'block_sparse' is not possible if sequence_length: 628 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...



Batch Prediction Results:
Sample 0: Predicted Action = 1, Ground Truth Action = 1
Sample 1: Predicted Action = 1, Ground Truth Action = 1
Sample 2: Predicted Action = 1, Ground Truth Action = 1
Sample 3: Predicted Action = 1, Ground Truth Action = 1
Sample 4: Predicted Action = 1, Ground Truth Action = 1
Sample 5: Predicted Action = 2, Ground Truth Action = 2
Sample 6: Predicted Action = 2, Ground Truth Action = 2
Sample 7: Predicted Action = 1, Ground Truth Action = 1
Sample 8: Predicted Action = 1, Ground Truth Action = 1
Sample 9: Predicted Action = 3, Ground Truth Action = 3
Sample 10: Predicted Action = 1, Ground Truth Action = 1
Sample 11: Predicted Action = 1, Ground Truth Action = 1
Sample 12: Predicted Action = 1, Ground Truth Action = 1
Sample 13: Predicted Action = 1, Ground Truth Action = 1
Sample 14: Predicted Action = 2, Ground Truth Action = 2
Sample 15: Predicted Action = 1, Ground Truth Action = 1
Sample 16: Predicted Action = 3, Ground Truth Action = 3
Sample 17: Pre

In [5]:
# 批量预测
frame_ds = train_ds
num_batch = int(len(frame_ds)/10)
EXAMPLE_PROMPTS = [
    frame_ds[i]['prompt'] for i in range(num_batch)
]
GROUND_TRUTH_ACTIONS = [
    frame_ds[i]['action'] for i in range(num_batch)
]
# actions = classifier.predict_batch(prompts=EXAMPLE_PROMPTS)
actions = classifier.predict_batch_fast(prompts=EXAMPLE_PROMPTS, batch_size=128)
# 统计正确率
correct_count = sum([1 for pred, gt in zip(actions, GROUND_TRUTH_ACTIONS) if pred == gt])
total_count = len(GROUND_TRUTH_ACTIONS)
accuracy = correct_count / total_count * 100.0
print("\n" + "="*50)
print("Batch Prediction Results:")
for i, (pred, gt) in enumerate(zip(actions, GROUND_TRUTH_ACTIONS)):
    print(f"Sample {i}: Predicted Action = {pred}, Ground Truth Action = {gt}")
print(f"Accuracy: {accuracy:.2f}% ({correct_count}/{total_count})")


Batch Prediction Results:
Sample 0: Predicted Action = 3, Ground Truth Action = 3
Sample 1: Predicted Action = 1, Ground Truth Action = 1
Sample 2: Predicted Action = 1, Ground Truth Action = 1
Sample 3: Predicted Action = 1, Ground Truth Action = 1
Sample 4: Predicted Action = 1, Ground Truth Action = 1
Sample 5: Predicted Action = 1, Ground Truth Action = 1
Sample 6: Predicted Action = 1, Ground Truth Action = 1
Sample 7: Predicted Action = 3, Ground Truth Action = 3
Sample 8: Predicted Action = 1, Ground Truth Action = 1
Sample 9: Predicted Action = 1, Ground Truth Action = 1
Sample 10: Predicted Action = 3, Ground Truth Action = 3
Sample 11: Predicted Action = 1, Ground Truth Action = 1
Sample 12: Predicted Action = 1, Ground Truth Action = 1
Sample 13: Predicted Action = 2, Ground Truth Action = 2
Sample 14: Predicted Action = 1, Ground Truth Action = 1
Sample 15: Predicted Action = 2, Ground Truth Action = 2
Sample 16: Predicted Action = 1, Ground Truth Action = 1
Sample 17: Pre