In [None]:
%load_ext autoreload
%autoreload 2

import torch
import pprint
from pathlib import Path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'dataset'

In [4]:
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.policies.factory import make_policy
from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy

In [10]:
repo_id = "lerobot/svla_so100_pickplace"

fps = 30

dataset = LeRobotDataset(
    repo_id, 
    episodes=[0],
    delta_timestamps={"action": [i / fps for i in range(10)]}
)

print(f"Dataset loaded: {len(dataset)} frames")

pprint.pprint(dataset.meta.stats["action"])



Dataset loaded: 19631 frames
{'count': array([19631]),
 'max': array([ 72.7734375 , 179.47265625, 164.61914062,  96.59179688,
       123.57421875,  34.9307785 ]),
 'mean': array([ 14.51138199, 146.44867041, 143.31572513,  62.96079529,
        85.83100241,   7.78159506]),
 'min': array([-37.17773438,  48.8671875 ,  40.95703125,   9.66796875,
        56.25      ,   0.        ]),
 'std': array([27.98693199, 34.98953716, 21.46425995, 16.91135693, 12.47836367,
        9.54603304])}


In [11]:
policy_cfg = SmolVLAConfig(
    n_action_steps=10,
    chunk_size=10,
    pretrained_path="lerobot/smolvla_base",
    empty_cameras=0
)

print("Building Policy...")
policy = make_policy(
    cfg=policy_cfg,
    ds_meta=dataset.meta
)



Building Policy...
Reducing the number of VLM layers to 16 ...


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy.to(device)
print(f"device: {device}")

device: cuda


In [13]:
from transformers import AutoTokenizer

tokenizer_path = "HuggingFaceTB/SmolVLM-Instruct"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [14]:
item = dataset[0]
print(item["task"])

encoded_text = tokenizer( 
    item["task"],
    padding="max_length",
    max_length=64,
    truncation=True,
    return_tensors="pt"
)

# print(item.items())
batch = {k: v.unsqueeze(0).to(device) for k, v in item.items() if isinstance(v, torch.Tensor)}

batch["observation.language.tokens"] = encoded_text["input_ids"].to(device)
batch["observation.language.attention_mask"] = encoded_text["attention_mask"].to(device).bool()

print("Text tokenized shape:", batch["observation.language.tokens"].shape)
print("Lang Mask Shape:  ", batch["observation.language.attention_mask"].shape)
print("Batch keys:", batch.keys())
print(f"Image shape: {batch['observation.images.top'].shape}")
print(f"Action shape: {batch['action'].shape}")



Pick up the cube and place it in the box.
Text tokenized shape: torch.Size([1, 64])
Lang Mask Shape:   torch.Size([1, 64])
Batch keys: dict_keys(['observation.images.top', 'observation.images.wrist', 'action', 'observation.state', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_index', 'action_is_pad', 'observation.language.tokens', 'observation.language.attention_mask'])
Image shape: torch.Size([1, 3, 480, 640])
Action shape: torch.Size([1, 10, 6])


In [None]:
# data processing

def normalize_data(data, mean, std):
    return (data - mean) / std

action_mean = torch.from_numpy(dataset.meta.stats["action"]["mean"]).to(device)
action_std = torch.from_numpy(dataset.meta.stats["action"]["std"]).to(device)

batch["action"] = normalize_data(batch["action"], action_mean, action_std)

if "observation.state" in batch:
    state_mean = torch.from_numpy(dataset.meta.stats["observation.state"]["mean"]).float().to(device)
    state_std = torch.from_numpy(dataset.meta.stats["observation.state"]["std"]).float().to(device)
    batch["observation.state"] = normalize_data(batch["observation.state"], state_mean, state_std)


RuntimeError: mat1 and mat2 must have the same dtype, but got Double and Float

In [16]:
policy.train()

loss, output_dict = policy.forward(batch)

print(f"Loss: {loss.item()}")
print("Output dict keys:", output_dict.keys())

RuntimeError: mat1 and mat2 must have the same dtype, but got Double and Float

In [None]:
policy.eval() # 切换到评估模式

with torch.no_grad():
    # select_action 通常只需要 observation，不需要 ground truth action
    # 但传入整个 batch 也没问题，它会自动挑它需要的 key
    action = policy.select_action(batch)

print(f"Generated Action Shape: {action.shape}") 
# 预期输出: [1, 10, 14] 或者 [1, 14] 
# (取决于 chunk_size 和 n_action_steps 的配置，SmolVLA 通常输出一个 chunk)

# 简单检查一下数值是否“疯了”（比如全是 NaN 或者极大值）
print("Action sample:", action[0, :2]) # 打印前两步动作
if torch.isnan(action).any():
    print("❌ ALERT: Action contains NaN!")
else:
    print("✅ Action looks valid (numerical-wise).")