# Build Dataloader

In [8]:
import os
import pickle
import pandas as pd

df = pd.read_csv("./instruction.csv")
idx, row = next(df.iterrows())
idx, row

(0,
 path           ./data/scripted_raw/2022-12-08_pnp_rigid_objec...
 category                            2022-12-08_pnp_rigid_objects
 instruction                                                  NaN
 anno                                                       False
 Name: 0, dtype: object)

In [16]:
os.path.join(row["path"], "images0")

with open(os.path.join(row["path"], "policy_out.pkl"), "rb") as f:
    raw_data = pickle.load(f)

if type(raw_data) is not list:
    raise Exception

assert len(raw_data) == (len(os.listdir(os.path.join(row["path"], "images0"))) - 1)

In [None]:
import pandas as pd

df = pd.read_csv('./instruction.csv', )
df.head()

In [None]:
import os

os.path.exists(df.iloc[0, 0])

In [None]:
for idx, row in df.iterrows():
    if not os.path.exists(row['path']):
        raise Exception

In [None]:
from PIL import Image

TRAJ_PATH = "./data/scripted_raw/sweep_12-03/2022-12-04_14-56-20/raw/traj_group0/traj0"
INST = "In order to pick up the can, the robot should"

image = Image.open(os.path.join(TRAJ_PATH, f"images0/im_0.jpg")).convert("RGB")


In [None]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
inputs = processor(INST, image, return_tensors="pt")

In [None]:
inputs['input_ids'].shape, inputs['attention_mask'].shape, inputs['pixel_values'].shape

In [None]:
import pickle

with open(os.path.join(TRAJ_PATH, "policy_out.pkl"), "rb") as f:
    raw_data = pickle.load(f)

In [None]:
raw_data[0]['actions']
raw_data

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import LoraConfig, get_peft_model

class Traj:
    def __init__(self, traj_dir):
        self.path = traj_dir
        self.img_dir = os.path.join(traj_dir, "images0")
        self.img_len = len(os.listdir(os.path.join(traj_dir, "images0")))
        with open(os.path.join(traj_dir, "policy_out.pkl"), "rb") as f:
            raw_data = pickle.load(f)
        self.actions = [d['actions'] for d in raw_data]

        assert len(self.actions) == (self.img_len - 1)
    
    def __len__(self):
        return len(self.actions)
    
    def getitem(self, idx):
        return os.path.join(self.img_dir, f"im_{idx}.jpg"), self.actions[idx]
    
    def getitems(self):
        ims = []
        for idx in range(self.img_len - 1):
            im, _ = self.getitem(idx)
            ims.append(im)
        return ims, self.actions

class BridgeDatasetV2(Dataset):
    def __init__(self, traj_dirs, instructions, processor, vla_config):
        self.processor = processor
        self.instructions = instructions
        self.vla_config = vla_config
        self.traj_dirs = traj_dirs
        self.trajs = self.load_trajs()

        self.ims = []
        self.actions = []

        for traj in self.trajs:
            I, A = traj.getitems()
            self.ims.extend(I)
            self.actions.extend(A)
        print(f"initialize BridgeDatasetV2, number of trajectories: {len(self.trajs)}, total sample size: {len(self.actions)}.")

    def load_trajs(self, ):
        trajs = []
        for traj_dir in self.traj_dirs:
            obj = Traj(traj_dir)
            trajs.append(obj)
        return trajs

    def __len__(self):
        return len(self.actions)
    
    def __getitem__(self, idx):
        image = Image.open(self.ims[idx]).convert("RGB")
        inputs = self.processor("", image, return_tensors="pt")

        raw_action = np.array(self.actions[idx], dtype=np.float32)
        bin_indices = np.clip((raw_action + 1.0) / 2.0 * 255, 0, 255).astype(np.int32)

        action_token_ids = torch.tensor(bin_indices + 31000, dtype=torch.long)
        input_ids = inputs["input_ids"].squeeze(0)
        labels = torch.full_like(input_ids, -100)
        labels[-7:] = action_token_ids

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "labels": labels
        }

processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)

# 1. 모델 로드
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

config = LoraConfig(
    r=32,                         # Rank
    lora_alpha=64,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], # Attention 레이어 타겟
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 3. LoRA 모델로 변환
vla = get_peft_model(vla, config)
print(f"trainable parameters: {vla.print_trainable_parameters()}")



2026-01-13 04:19:33.661262: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-13 04:19:33.787156: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-13 04:19:33.787200: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-13 04:19:33.805183: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-13 04:19:33.841262: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 33,554,432 || all params: 7,574,791,616 || trainable%: 0.4430
trainable parameters: None


In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import LoraConfig, get_peft_model

class Traj:
    def __init__(self, traj_dir, instruction):
        self.path = traj_dir
        self.instruction = instruction
        self.img_dir = os.path.join(traj_dir, "images0")
        self.img_len = len(os.listdir(os.path.join(traj_dir, "images0")))
        with open(os.path.join(traj_dir, "policy_out.pkl"), "rb") as f:
            raw_data = pickle.load(f)
        self.actions = [d['actions'] for d in raw_data]

        assert len(self.actions) == (self.img_len - 1)
    
    def __len__(self):
        return len(self.actions)
    
    def getitem(self, idx):
        return os.path.join(self.img_dir, f"im_{idx}.jpg"), self.instruction
    
    def getitems(self):
        ims = []
        inst = []
        for idx in range(self.img_len - 1):
            i, s = self.getitem(idx)
            ims.append(i)
            inst.append(s)
        return ims, self.actions, inst

class BridgeDatasetV2(Dataset):
    def __init__(self, traj_dirs, instructions, processor, vla_config):
        self.processor = processor
        self.instructions = instructions
        self.vla_config = vla_config
        self.traj_dirs = traj_dirs
        self.trajs = self.load_trajs()

        self.ims = []
        self.actions = []
        self.INST = []

        for traj in self.trajs:
            I, A, inst = traj.getitems()
            self.ims.extend(I)
            self.actions.extend(A)
            self.INST.extend(inst)
        print(f"initialize BridgeDatasetV2, number of trajectories: {len(self.trajs)}, total sample size: {len(self.actions)}.")

    def load_trajs(self, ):
        trajs = []
        for traj_dir, inst in zip(self.traj_dirs, self.instructions):
            obj = Traj(traj_dir, inst)
            trajs.append(obj)
        return trajs

    def __len__(self):
        return len(self.actions)
    
    def __getitem__(self, idx):
        image = Image.open(self.ims[idx]).convert("RGB")
        # 1. padding="max_length"를 제거하고 실제 길이만큼만 가져옵니다.
        inputs = self.processor(self.INST[idx], image, return_tensors="pt")

        raw_action = np.array(self.actions[idx], dtype=np.float32)
        bin_indices = np.clip((raw_action + 1.0) / 2.0 * 255, 0, 255).astype(np.int32)
        action_token_ids = torch.tensor(bin_indices + 31000, dtype=torch.long)

        input_ids = inputs["input_ids"].squeeze(0) # [실제 길이]
        
        # 2. 레이블을 input_ids와 동일한 길이로 만듭니다.
        labels = torch.full_like(input_ids, -100)
        
        # 3. 맨 뒤 7개에 액션 주입 (이때 input_ids 끝에 액션이 바로 붙음)
        labels[-7:] = action_token_ids

        return {
            "input_ids": input_ids,
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "labels": labels
        }
    
processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)

# 1. 모델 로드
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

config = LoraConfig(
    r=32,                         # Rank
    lora_alpha=64,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], # Attention 레이어 타겟
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 3. LoRA 모델로 변환
vla = get_peft_model(vla, config)
print(f"trainable parameters: {vla.print_trainable_parameters()}")

2026-01-13 06:50:53.139088: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-13 06:50:53.165954: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-13 06:50:53.165981: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-13 06:50:53.166818: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-13 06:50:53.171442: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 33,554,432 || all params: 7,574,791,616 || trainable%: 0.4430
trainable parameters: None


In [8]:
df = pd.read_csv('./instruction.csv', )

train_dataset = BridgeDatasetV2(
    traj_dirs=df['path'].to_list(),
    instructions=df['instruction'].to_list(),
    processor=processor,
    vla_config=vla.config
)
def collate_fn(batch):
    from torch.nn.utils.rnn import pad_sequence
    
    input_ids = [item["input_ids"] for item in batch]
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    labels = [item["labels"] for item in batch]
    
    # input_ids 패딩 (tokenizer의 pad_token_id 사용, 보통 0 또는 1)
    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=processor.tokenizer.pad_token_id)
    
    # labels 패딩 (학습 무시 값인 -100으로 패딩)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    
    return {
        "input_ids": padded_input_ids,
        "pixel_values": pixel_values,
        "labels": padded_labels
    }

train_dataloader = DataLoader(
    train_dataset, 
    batch_size=16, 
    shuffle=True, 
    collate_fn=collate_fn
)

print(f"DataLoader Length: {len(train_dataset)}")
batch = next(iter(train_dataloader))
print(f"Input IDs shape: {batch['input_ids'].shape}")     # [BS, Seq_Len]
print(f"Pixel Values shape: {batch['pixel_values'].shape}") # [BS, 3, 224, 224]
print(f"Labels shape: {batch['labels'].shape}")           # [BS, 7] (7 action tokens)


initialize BridgeDatasetV2, number of trajectories: 8879, total sample size: 415160.
DataLoader Length: 415160
Input IDs shape: torch.Size([16, 17])
Pixel Values shape: torch.Size([16, 6, 224, 224])
Labels shape: torch.Size([16, 17])


In [15]:
vla.eval()
device = next(vla.parameters()).device


In [17]:
vla.eval()
device = next(vla.parameters()).device

with open(os.path.join(df["path"].to_list()[0], "policy_out.pkl"), "rb") as f:
    raw_data = pickle.load(f)

actions = [d['actions'] for d in raw_data]

im_pth = os.path.join(df["path"].to_list()[0], f"images0/im_{0}.jpg")

image = Image.open(im_pth).convert("RGB")
inputs = processor("In order to pick up the object, the robot should", image, return_tensors="pt", ).to(device, dtype=torch.bfloat16)
input_ids = inputs["input_ids"].squeeze(0)
labels = torch.full_like(input_ids, -100)

# processor("", image, return_tensors="pt")['pixel_values'].size()

raw_action = np.array(actions[0], dtype=np.float32)
bin_indices = np.clip((raw_action + 1.0) / 2.0 * 255, 0, 255).astype(np.int32)
action_token_ids = torch.tensor(bin_indices + 31000, dtype=torch.long)
labels[-7:] = action_token_ids
labels
# inputs['pixel_values'].shape

tensor([ -100,  -100,  -100,  -100,  -100, 31125, 31123, 31126, 31127, 31127,
        31119, 31254])

In [40]:
universal_instructions = [
            "In order to pick up the object, the robot should",
            "To move the object to a new location, the robot must",
            "In order to grasp and relocate the item, the robot should",
            "To manipulate the objects in front of it, the robot must",
            "In order to complete the task of moving the utensils, the robot should"
        ]

# for idx, row in df.iterrows():
#     df.iloc[idx, ][['instruction']] = universal_instructions[idx % 5]

df['instruction'] = [universal_instructions[i % len(universal_instructions)] for i in range(df.shape[0])]

df.to_csv('instruction.csv', encoding='utf-8', index=False)

In [18]:
vla.predict_action(**inputs, unnorm_key="bridge_orig")

RuntimeError: q must be on CUDA

In [7]:
batch['input_ids'].size()

torch.Size([2, 13])