In [1]:
from pathlib import Path
import torch

from lerobot.configs.types import FeatureType
from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
from lerobot.datasets.utils import dataset_to_policy_features
from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
from lerobot.policies.smolvla.processor_smolvla import make_smolvla_pre_post_processors

device = "cuda" if torch.cuda.is_available() else "cpu"

device

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [2]:
output_dir = Path("outputs/smolvla_so101_finetune_pickplace_hugginface")
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
datasett_id = "lerobot/svla_so101_pickplace"

dataset_meta = LeRobotDatasetMetadata(repo_id=datasett_id)

dataset_meta.total_episodes, dataset_meta.total_frames, dataset_meta.camera_keys

(50, 11939, ['observation.images.up', 'observation.images.side'])

In [None]:
features = dataset_to_policy_features(dataset_meta.features)

features

{'action': PolicyFeature(type=<FeatureType.ACTION: 'ACTION'>, shape=(6,)),
 'observation.state': PolicyFeature(type=<FeatureType.STATE: 'STATE'>, shape=(6,)),
 'observation.images.up': PolicyFeature(type=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 480, 640)),
 'observation.images.side': PolicyFeature(type=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 480, 640))}

In [None]:
output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
input_features = {key: ft for key, ft in features.items() if key not in output_features}

In [None]:
cfg = SmolVLAConfig(
    input_features=input_features,
    output_features=output_features,

    n_obs_steps = 1,
    chunk_size=50,

    freeze_vision_encoder=True,
    train_expert_only=True,
    train_state_proj=True,

    optimizer_lr=1e-4,
    optimizer_weight_decay=1e-10,
    optimizer_grad_clip_norm=10,

    scheduler_warmup_steps=1000,
    scheduler_decay_steps=30000,

    device=device,
)

In [None]:
model_id = "lerobot/smolvla_base"

policy = SmolVLAPolicy.from_pretrained(
    model_id,
    config=cfg
)

preprocessor, postprocessor = make_smolvla_pre_post_processors(cfg, dataset_stats=dataset_meta.stats)

policy.train()
policy.to(device)

Reducing the number of VLM layers to 16 ...


SmolVLAPolicy(
  (model): VLAFlowMatching(
    (vlm_with_expert): SmolVLMWithExpertModel(
      (vlm): SmolVLMForConditionalGeneration(
        (model): SmolVLMModel(
          (vision_model): SmolVLMVisionTransformer(
            (embeddings): SmolVLMVisionEmbeddings(
              (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
              (position_embedding): Embedding(1024, 768)
            )
            (encoder): SmolVLMEncoder(
              (layers): ModuleList(
                (0-11): 12 x SmolVLMEncoderLayer(
                  (self_attn): SmolVLMVisionAttention(
                    (k_proj): Linear(in_features=768, out_features=768, bias=True)
                    (v_proj): Linear(in_features=768, out_features=768, bias=True)
                    (q_proj): Linear(in_features=768, out_features=768, bias=True)
                    (out_proj): Linear(in_features=768, out_features=768, bias=True)
                  )
                  (laye

In [None]:
def make_delta_timestamps(delta_indices: list[int] | None, fps: int) -> list[float]:
    """Конвертирует индексы фреймов в временные метки"""
    if delta_indices is None:
        return [0]
    return [i / fps for i in delta_indices]

In [None]:
delta_timestamps = {
        "action": make_delta_timestamps(
            list(range(cfg.chunk_size)),
            dataset_meta.fps
        ),
    }

delta_timestamps |= {
    k: make_delta_timestamps([0], dataset_meta.fps)
    for k in cfg.image_features
}

Now we load dataset and configuring data_loader

In [None]:
dataset = LeRobotDataset(datasett_id, delta_timestamps=delta_timestamps)

dataset

LeRobotDataset({
    Repository ID: 'lerobot/svla_so101_pickplace',
    Number of selected episodes: '50',
    Number of selected samples: '11939',
    Features: '['action', 'observation.state', 'observation.images.up', 'observation.images.side', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_index']',
})',

setup interpreter from config

In [None]:
optimizer = cfg.get_optimizer_preset().build(policy.parameters())

In [None]:
batch_size = 8
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory_device=device
)

Configure Training

In [None]:
epochs = 1000
log_freq = 10
save_freq = 500

In [31]:
step = 0
done = False

while not done:
    for batch in data_loader:
        batch = preprocessor(batch)

        loss, loss_dict = policy.forward(batch)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(
            policy.parameters(),
            cfg.optimizer_grad_clip_norm
        )

        optimizer.step()
        optimizer.zero_grad()

        if step % log_freq == 0:
            print(f"Step {step}/{epochs} | Loss: {loss.item():.4f}")
            if loss_dict:
                for k, v in loss_dict.items():
                    if isinstance(v, torch.Tensor):
                        if v.numel() == 1:
                            print(f"  {k}: {v.item():.4f}")
                        else:
                            print(f"  {k}: mean={v.mean().item():.4f}, shape={v.shape}")
                    else:
                        print(f"  {k}: {v:.4f}")

        
        step += 1
        if step >= epochs:
            done = True
            break

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 