# Build Dataloader

In [1]:
import pandas as pd

df = pd.read_csv('./instruction.csv', )
df.head()

Unnamed: 0,path,category,instruction,anno
0,./data/scripted_raw/2022-12-08_pnp_rigid_objec...,2022-12-08_pnp_rigid_objects,,False
1,./data/scripted_raw/2022-12-08_pnp_rigid_objec...,2022-12-08_pnp_rigid_objects,,False
2,./data/scripted_raw/2022-12-08_pnp_rigid_objec...,2022-12-08_pnp_rigid_objects,,False
3,./data/scripted_raw/2022-12-08_pnp_rigid_objec...,2022-12-08_pnp_rigid_objects,,False
4,./data/scripted_raw/2022-12-08_pnp_rigid_objec...,2022-12-08_pnp_rigid_objects,,False


In [3]:
import os

os.path.exists(df.iloc[0, 0])

True

In [8]:
for idx, row in df.iterrows():
    if not os.path.exists(row['path']):
        raise Exception

In [None]:
from PIL import Image

TRAJ_PATH = "./data/scripted_raw/sweep_12-03/2022-12-04_14-56-20/raw/traj_group0/traj0"
INST = "In order to pick up the can, the robot should"

image = Image.open(os.path.join(TRAJ_PATH, f"images0/im_0.jpg")).convert("RGB")


In [12]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
inputs = processor(INST, image, return_tensors="pt")



In [14]:
inputs['input_ids'].shape, inputs['attention_mask'].shape, inputs['pixel_values'].shape

(torch.Size([1, 12]), torch.Size([1, 12]), torch.Size([1, 6, 224, 224]))

In [15]:
import pickle

with open(os.path.join(TRAJ_PATH, "policy_out.pkl"), "rb") as f:
    raw_data = pickle.load(f)

In [18]:
raw_data[0]['actions']
raw_data

[{'actions': array([-0.02172307,  0.04577763, -0.00310825, -0.00611765, -0.02040617,
         -0.19583636,  0.99728119])},
 {'actions': array([-0.01741809,  0.05146871, -0.00563831, -0.01201281, -0.04797513,
         -0.19332831,  0.98917208])},
 {'actions': array([-0.01222756,  0.04357178, -0.00229091,  0.00568858, -0.04464969,
         -0.14133196,  1.        ])},
 {'actions': array([-0.02752886,  0.0356793 ,  0.01062104,  0.01323281, -0.01837164,
         -0.12258763,  1.        ])},
 {'actions': array([-0.00523159,  0.0306144 ,  0.00577714,  0.02011717, -0.00497684,
         -0.09514776,  1.        ])},
 {'actions': array([-8.97468404e-03,  1.92724973e-02, -4.68816489e-04,  1.03511757e-02,
          1.21088095e-02, -7.63940667e-02,  9.97315548e-01])},
 {'actions': array([-1.03006529e-02,  1.77663997e-02,  1.90194032e-04,  1.11419430e-02,
          8.67873884e-03, -5.00243020e-02,  9.97342255e-01])},
 {'actions': array([-9.05113337e-03,  7.17515094e-03,  3.28887577e-04,  3.42727377e

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import LoraConfig, get_peft_model

class Traj:
    def __init__(self, traj_dir):
        self.path = traj_dir
        self.img_dir = os.path.join(traj_dir, "images0")
        self.img_len = len(os.listdir(os.path.join(traj_dir, "images0")))
        with open(os.path.join(traj_dir, "policy_out.pkl"), "rb") as f:
            raw_data = pickle.load(f)
        self.actions = [d['actions'] for d in raw_data]

        assert len(self.actions) == (self.img_len - 1)
    
    def __len__(self):
        return len(self.actions)
    
    def getitem(self, idx):
        return Image.open(os.path.join(self.img_dir, f"im_{idx}.jpg")).convert("RGB"), self.actions[idx]
    
    def getitems(self):
        ims = []
        for idx in range(self.img_len - 1):
            im, _ = self.getitem(idx)
            ims.append(im)
        return ims, self.actions

class BridgeDatasetV2(Dataset):
    def __init__(self, traj_dirs, instructions, processor, vla_config):
        self.processor = processor
        self.instructions = instructions
        self.vla_config = vla_config
        self.traj_dirs = traj_dirs
        self.trajs, self.total_len = self.load_trajs()

        self.ims = []
        self.actions = []

        for traj in self.trajs:
            I, A = traj.getitems()
            self.ims.extend(I)
            self.actions.extend(A)

    def load_trajs(self, ):
        trajs = []
        cnt = 0
        for traj_dir in self.traj_dirs:
            obj = Traj(traj_dir)
            trajs.append(obj)
            cnt += len(obj)
        return trajs, cnt

    def __len__(self):
        return len(self.actions)
    
    def __getitem__(self, idx):
        image = self.ims[idx]
        inputs = self.processor("", image, return_tensors="pt")

        raw_action = np.array(self.actions[idx], dtype=np.float32)
        bin_indices = np.clip((raw_action + 1.0) / 2.0 * 255, 0, 255).astype(np.int32)

        action_token_ids = torch.tensor(bin_indices + 31000, dtype=torch.long)
        input_ids = inputs["input_ids"].squeeze(0)
        labels = torch.full_like(input_ids, -100)
        labels[-7:] = action_token_ids

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "labels": labels
        }

processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)

# 1. 모델 로드
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

config = LoraConfig(
    r=32,                         # Rank
    lora_alpha=64,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], # Attention 레이어 타겟
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 3. LoRA 모델로 변환
vla = get_peft_model(vla, config)
print(f"trainable parameters: {vla.print_trainable_parameters()}")

df = pd.read_csv('./instruction.csv', )

train_dataset = BridgeDatasetV2(
    traj_dirs=df['path'].to_list(),
    instructions=df['instruction'].to_list(),
    processor=processor,
    vla_config=vla.config
)
def collate_fn(batch):
    return {
        "input_ids": torch.stack([item["input_ids"] for item in batch]),
        "pixel_values": torch.stack([item["pixel_values"] for item in batch]),
        "labels": torch.stack([item["labels"] for item in batch])
    }

train_dataloader = DataLoader(
    train_dataset, 
    batch_size=2, 
    shuffle=True, 
    collate_fn=collate_fn
)

print(f"DataLoader Length: {len(train_dataset)}")
batch = next(iter(train_dataloader))
print(f"Input IDs shape: {batch['input_ids'].shape}")     # [BS, Seq_Len]
print(f"Pixel Values shape: {batch['pixel_values'].shape}") # [BS, 3, 224, 224]
print(f"Labels shape: {batch['labels'].shape}")           # [BS, 7] (7 action tokens)


  from .autonotebook import tqdm as notebook_tqdm
2026-01-12 14:41:04.122584: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-12 14:41:04.147860: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-12 14:41:04.147890: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-12 14:41:04.148576: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-12 14:41:04.1

ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.