# Implements OpenVLA

In [None]:
# Install minimal dependencies (`torch`, `transformers`, `timm`, `tokenizers`, ...)
# > pip install -r https://raw.githubusercontent.com/openvla/openvla/main/requirements-min.txt
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image

import torch

# Checkpoint identifier and target device, shared by every call below
MODEL_ID = "openvla/openvla-7b"
DEVICE = "cuda:0"

# Build the processor first, then the VLA model, and move the model to DEVICE
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model_kwargs = dict(
    attn_implementation="flash_attention_2",  # [Optional] Requires `flash_attn`
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
vla = AutoModelForVision2Seq.from_pretrained(MODEL_ID, **model_kwargs).to(DEVICE)

# Fetch one camera frame and write the text prompt.
# NOTE(review): `get_from_camera` is not defined in this file, and
# `{<INSTRUCTION>}` looks like a placeholder for the task description —
# both presumably need to be filled in before this cell can run.
image: Image.Image = get_from_camera(...)
prompt = "In: What action should the robot take to {<INSTRUCTION>}?\nOut:"

# Predict the action (7-DoF; un-normalized for BridgeData V2 via "bridge_orig")
inputs = processor(prompt, image).to(DEVICE, dtype=torch.bfloat16)
action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)

# Hand the predicted action to the robot (`robot` is not defined in this file)
robot.act(action, ...)

# Explore Dataset

In [None]:
import tensorflow_datasets as tfds

# Example: 'fractal20220825_data' (the RT-1 data), one of the best-known datasets
ds = tfds.load('fractal20220825_data', split='train', data_dir='YOUR_DATA_PATH')

# Look at just the first step of the first episode
first_episode_only = ds.take(1)
for episode in first_episode_only:
    first_step_only = episode['steps'].take(1)
    for step in first_step_only:
        observation = step['observation']
        print(observation['image'])  # inspect the image data
        print(step['action'])  # inspect the action values
        print(observation['natural_language_instruction'])  # inspect the language instruction

# RLDS (Reinforcement Learning Datasets)

In [None]:
import tensorflow_datasets as tfds
import numpy as np
from PIL import Image

# 1. Dataset location.
# This must be the DIRECTORY that holds the *.tfrecord shards together with
# the TFDS metadata files, not the path of an individual shard file:
# `tfds.builder_from_directory` reads the metadata from that directory.
DATA_PATH = "/workspace/openvla-LoRA/data"

# 2. Load the RLDS dataset.
# Only works when the metadata is present in DATA_PATH.
builder = tfds.builder_from_directory(DATA_PATH)
ds = builder.as_dataset(split='train')

# 3. Inspect the data (first step of the first episode).
for episode in ds.take(1):
    # `take(1)` reads only the first step instead of materializing every step
    # of the episode into a Python list just to index element 0.
    for first_step in episode['steps'].take(1):
        # NOTE(review): the feature keys below are assumed to match this
        # dataset's schema — confirm against `builder.info.features`.
        observation = first_step['observation']

        # Pull the tensors out as NumPy values / Python strings
        image = observation['image'].numpy()
        instruction = observation['natural_language_instruction'].numpy().decode('utf-8')
        action = first_step['action'].numpy()

        print(f"Task: {instruction}")
        print(f"Action (Joint velocities/Pose): {action}")

        # Show the image; `display` is not imported here and is presumably
        # the IPython/Jupyter built-in.
        display(Image.fromarray(image))