In [2]:
%load_ext autoreload
%autoreload 2

import torch
import pprint
from pathlib import Path

In [3]:
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.policies.factory import make_policy
from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy

In [5]:
repo_id = "lerobot/svla_so100_pickplace"

fps = 30

dataset = LeRobotDataset(
    repo_id, 
    episodes=[0],
    delta_timestamps={"action": [i / fps for i in range(10)]}
)

print(f"Dataset loaded: {len(dataset)} frames")

pprint.pprint(dataset.meta.stats["action"])



Dataset loaded: 19631 frames
{'count': array([19631]),
 'max': array([ 72.7734375 , 179.47265625, 164.61914062,  96.59179688,
       123.57421875,  34.9307785 ]),
 'mean': array([ 14.51138199, 146.44867041, 143.31572513,  62.96079529,
        85.83100241,   7.78159506]),
 'min': array([-37.17773438,  48.8671875 ,  40.95703125,   9.66796875,
        56.25      ,   0.        ]),
 'std': array([27.98693199, 34.98953716, 21.46425995, 16.91135693, 12.47836367,
        9.54603304])}


In [6]:
policy_cfg = SmolVLAConfig(
    n_action_steps=10,
    chunk_size=10,
    pretrained_path="Jill111/my_smolvla",
    empty_cameras=0
)

print("Building Policy...")
policy = make_policy(
    cfg=policy_cfg,
    ds_meta=dataset.meta
)



Building Policy...
Reducing the number of VLM layers to 16 ...


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy.to(device)
print(f"device: {device}")

device: cuda


In [8]:
from transformers import AutoTokenizer

tokenizer_path = "HuggingFaceTB/SmolVLM-Instruct"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [9]:
item = dataset[0]
print(item["task"])

encoded_text = tokenizer( 
    item["task"],
    padding="max_length",
    max_length=64,
    truncation=True,
    return_tensors="pt"
)

# print(item.items())
batch = {k: v.unsqueeze(0).to(device) for k, v in item.items() if isinstance(v, torch.Tensor)}

batch["observation.language.tokens"] = encoded_text["input_ids"].to(device)
batch["observation.language.attention_mask"] = encoded_text["attention_mask"].to(device).bool()

print("Text tokenized shape:", batch["observation.language.tokens"].shape)
print("Lang Mask Shape:  ", batch["observation.language.attention_mask"].shape)
print("Batch keys:", batch.keys())
print(f"Image shape: {batch['observation.images.top'].shape}")
print(f"Action shape: {batch['action'].shape}")



Pick up the cube and place it in the box.
Text tokenized shape: torch.Size([1, 64])
Lang Mask Shape:   torch.Size([1, 64])
Batch keys: dict_keys(['observation.images.top', 'observation.images.wrist', 'action', 'observation.state', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_index', 'action_is_pad', 'observation.language.tokens', 'observation.language.attention_mask'])
Image shape: torch.Size([1, 3, 480, 640])
Action shape: torch.Size([1, 10, 6])


In [10]:
# data processing

def normalize_data(data, mean, std):
    return (data - mean) / std

action_mean = torch.from_numpy(dataset.meta.stats["action"]["mean"]).float().to(device)
action_std = torch.from_numpy(dataset.meta.stats["action"]["std"]).float().to(device)

# Convert the data in the batch to float 32

batch["action"] = batch["action"].float()
if "observation.state" in batch:
    batch["observation.state"] = batch["observation.state"].float()

batch["action"] = normalize_data(batch["action"], action_mean, action_std)

if "observation.state" in batch:
    state_mean = torch.from_numpy(dataset.meta.stats["observation.state"]["mean"]).float().to(device)
    state_std = torch.from_numpy(dataset.meta.stats["observation.state"]["std"]).float().to(device)
    batch["observation.state"] = normalize_data(batch["observation.state"], state_mean, state_std)



In [11]:
policy.train()

loss, output_dict = policy.forward(batch)

print(f"Loss: {loss.item()}")
print("Output dict keys:", output_dict.keys())

Loss: 0.029459310695528984
Output dict keys: dict_keys(['losses_after_forward', 'losses_after_rm_padding', 'loss'])


In [12]:
policy.eval() 

with torch.no_grad():

    action = policy.select_action(batch)

print(f"Generated Action Shape: {action.shape}") 

print("Action sample:", action[0, :2]) 
if torch.isnan(action).any():
    print("❌ ALERT: Action contains NaN!")
else:
    print("✅ Action looks valid (numerical-wise).")

Generated Action Shape: torch.Size([1, 6])
Action sample: tensor([-0.3268,  0.8102], device='cuda:0')
✅ Action looks valid (numerical-wise).


In [13]:
def to_gpu(data):
    if isinstance(data, dict):
        return {k: to_gpu(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [to_gpu(i) for i in data]
    elif isinstance(data, tuple):
        return tuple([to_gpu(i) for i in data])
    elif torch.is_tensor(data) and data.device != device:
        return data.to(device)
    else:
        return data

In [14]:
# training

policy = policy.to(device)
batch = to_gpu(batch) 

optimizer = torch.optim.AdamW(policy.parameters(), lr=1e-4)
policy.train()

print("Start dummy training loop...")

loss_list = []
steps = []

for i in range(40):
    
    loss, _ = policy.forward(batch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_list.append(loss.item())
    steps.append(i)

    print(f"Step {i}, Loss: {loss.item():.4f}")

Start dummy training loop...
Step 0, Loss: 0.0316
Step 1, Loss: 0.5477
Step 2, Loss: 0.1490
Step 3, Loss: 0.0552
Step 4, Loss: 0.2580
Step 5, Loss: 0.0783
Step 6, Loss: 0.1517
Step 7, Loss: 0.1570
Step 8, Loss: 0.2996
Step 9, Loss: 0.1385
Step 10, Loss: 0.1314
Step 11, Loss: 0.1249
Step 12, Loss: 0.1230
Step 13, Loss: 0.1000
Step 14, Loss: 0.2439
Step 15, Loss: 0.0629
Step 16, Loss: 0.2381
Step 17, Loss: 0.3264
Step 18, Loss: 0.1327
Step 19, Loss: 0.1289
Step 20, Loss: 0.1141
Step 21, Loss: 0.1663
Step 22, Loss: 0.1360
Step 23, Loss: 0.1040
Step 24, Loss: 0.0727
Step 25, Loss: 0.0933
Step 26, Loss: 0.0916
Step 27, Loss: 0.0629
Step 28, Loss: 0.0790
Step 29, Loss: 0.0946
Step 30, Loss: 0.0396
Step 31, Loss: 0.1032
Step 32, Loss: 0.0767
Step 33, Loss: 0.0609
Step 34, Loss: 0.0544
Step 35, Loss: 0.0728
Step 36, Loss: 0.0552
Step 37, Loss: 0.1494
Step 38, Loss: 0.0541
Step 39, Loss: 0.0531


In [15]:
import matplotlib.pyplot as plt

plt.plot(step, loss_list)
plt.xlabel("Training Step")
plt.ylabel("Loss")
plt.title("Loss Curve")
plt.grid(True)
plt.show()

NameError: name 'step' is not defined