# Implements OpenVLA

In [14]:
# Install minimal dependencies (`torch`, `transformers`, `timm`, `tokenizers`, ...)
# > pip install -r https://raw.githubusercontent.com/openvla/openvla/main/requirements-min.txt
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image

import torch

# Load Processor & VLA
processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b", 
    attn_implementation="flash_attention_2",  # [Optional] Requires `flash_attn`
    torch_dtype=torch.bfloat16, 
    low_cpu_mem_usage=True, 
    trust_remote_code=True
).to("cuda:0")

# Grab image input & format prompt
image: Image.Image = get_from_camera(...)
prompt = "In: What action should the robot take to {<INSTRUCTION>}?\nOut:"

# Predict Action (7-DoF; un-normalize for BridgeData V2)
inputs = processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)
action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)

# Execute...
robot.act(action, ...)

A new version of the following files was downloaded from https://huggingface.co/openvla/openvla-7b:
- processing_prismatic.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openvla/openvla-7b:
- configuration_prismatic.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openvla/openvla-7b:
- modeling_prismatic.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading shards: 100%|██████████| 3/3 [18:43<00:00, 374.42s/it]


ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.

# Explore Dataset

In [None]:
import tensorflow_datasets as tfds

# 예시: 가장 유명한 데이터셋 중 하나인 'fractal20220825_data' (RT-1 데이터)
ds = tfds.load('fractal20220825_data', split='train', data_dir='YOUR_DATA_PATH')
for episode in ds.take(1):
    for step in episode['steps'].take(1):
        print(step['observation']['image'])  # 이미지 데이터 확인
        print(step['action'])               # 액션 값 확인
        print(step['observation']['natural_language_instruction']) # 언어 명령 확인

# RLDS (Robot Learning Dataset)

In [3]:
import tensorflow_datasets as tfds
import numpy as np
from PIL import Image

# 1. 데이터셋 경로 설정 (tfrecord 파일이 들어있는 '상위 폴더' 경로를 넣으세요)
DATA_PATH = "./data/" 

# 2. RLDS 데이터셋 로드
# 해당 경로에 metadata가 포함되어 있어야 작동합니다.
builder = tfds.builder_from_directory(DATA_PATH)
ds = builder.as_dataset(split='train')

# 3. 데이터 확인하기 (첫 번째 에피소드의 첫 번째 스텝)
for episode in ds.take(1):
    steps = list(episode['steps'])
    first_step = steps[0]
    
    # 이미지 데이터 추출
    image = first_step['observation']['image'].numpy()
    instruction = first_step['observation']['natural_language_instruction'].numpy().decode('utf-8')
    action = first_step['action'].numpy()
    
    print(f"Task: {instruction}")
    print(f"Action (Joint velocities/Pose): {action}")
    
    # 이미지 시각화
    display(Image.fromarray(image))

2026-01-07 10:26:38.421542: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-07 10:26:38.556332: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-07 10:26:38.556407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-07 10:26:38.577467: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-07 10:26:38.633800: I tensorflow/core/platform/cpu_feature_guar

NotFoundError: {{function_node __wrapped__IteratorGetNext_output_types_8_device_/job:localhost/replica:0/task:0/device:CPU:0}} data/bridge_dataset-train.tfrecord-00000-of-01024; No such file or directory [Op:IteratorGetNext] name: 

# Make Dataloader

For training, the raw data needs to be converted into a format that is compatible with a data loader. 

In [1]:
import pickle

with open('./data/bridge_dataset_scripted_6_18/2022-12-08_pnp_rigid_objects/2022-12-08_15-22-17/raw/traj_group0/traj0/obs_dict.pkl', 'rb') as f:
    obs = pickle.load(f)

In [2]:
with open('./data/bridge_dataset_scripted_6_18/2022-12-08_pnp_rigid_objects/2022-12-08_15-22-17/raw/traj_group0/traj0/policy_out.pkl', 'rb') as f:
    policy = pickle.load(f)

In [3]:
import os
img_pth = './data/bridge_dataset_scripted_6_18/2022-12-08_pnp_rigid_objects/2022-12-08_15-22-17/raw/traj_group0/traj0/images0'

len(os.listdir(img_pth))

50

# Explore processor

In [1]:
# Install minimal dependencies (`torch`, `transformers`, `timm`, `tokenizers`, ...)
# > pip install -r https://raw.githubusercontent.com/openvla/openvla/main/requirements-min.txt
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image

import torch

# Load Processor & VLA
processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
# vla = AutoModelForVision2Seq.from_pretrained(
#     "openvla/openvla-7b",
#     attn_implementation="flash_attention_2",  # [Optional] Requires `flash_attn`
#     torch_dtype=torch.bfloat16,
#     low_cpu_mem_usage=True,
#     trust_remote_code=True
# ).to("cuda:0")

# 2. 입력 데이터 준비 (이미지 + 명령어)
image = Image.open("data/bridge_dataset_scripted_6_18/sweep_12-03/2022-12-04_14-56-20/raw/traj_group0/traj0/images0/im_0.jpg") 
# 로봇 카메라 이미지
prompt = "In order to pick up the can, the robot should" # Bridge 데이터셋 스타일 프롬프트

# 3. 추론 실행
inputs = processor(prompt, image, return_tensors="pt").to("cuda", dtype=torch.bfloat16)
# action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)

# print(f"Predicted Action: {action}") # [x, y, z, roll, pitch, yaw, gripper]

  from .autonotebook import tqdm as notebook_tqdm
2026-01-09 14:48:59.862285: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-09 14:48:59.887553: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-09 14:48:59.887585: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-09 14:48:59.888267: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-09 14:48:59.8

In [9]:
import os
import pickle

print(f'image length:{len(os.listdir("./data/bridge_dataset_scripted_6_18/sweep_12-03/2022-12-04_14-56-20/raw/traj_group0/traj0/images0"))}')

with open('./data/bridge_dataset_scripted_6_18/sweep_12-03/2022-12-04_14-56-20/raw/traj_group0/traj0/obs_dict.pkl', 'rb') as f:
    obs = pickle.load(f)

with open('./data/bridge_dataset_scripted_6_18/sweep_12-03/2022-12-04_14-56-20/raw/traj_group0/traj0/policy_out.pkl', 'rb') as f:
    policy = pickle.load(f)

image length:30


In [12]:
len(obs['joint_effort']), obs.keys()

(30,
 dict_keys(['joint_effort', 'qpos', 'qvel', 'full_state', 'state', 'desired_state', 'time_stamp', 'eef_transform', 'high_bound', 'low_bound', 'env_done', 't_get_obs']))

In [16]:
len(policy[-1]['actions']), policy[-1]

(7,
 {'actions': array([-0.00148566, -0.00214222, -0.00269785, -0.00634548,  0.01319404,
          0.02281726, -0.0005469 ])})

In [18]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])