In [None]:
%%bash
# Install the Python packages used by this notebook: OpenVLA core dependencies,
# LIBERO simulation dependencies, and the Prismatic extras.
#
# Requirements containing shell metacharacters are quoted. Unquoted, bash parses
# `torch>=2.2.0` as `pip install torch` with stdout redirected to a file named
# "=2.2.0", so the version constraint is silently dropped; `imageio[ffmpeg]` is a
# glob pattern that could match a file in the working directory.
echo "Installing OpenVLA and dependencies..."

echo "Installing core dependencies..."
pip -q install "torch>=2.2.0" "torchvision>=0.16.0" torchaudio
pip -q install transformers==4.40.1
pip -q install timm==0.9.10
pip -q install tokenizers==0.19.1
pip -q install accelerate
pip -q install einops
pip -q install pillow
pip -q install numpy

echo "Installing LIBERO dependencies..."
pip -q install "imageio[ffmpeg]"
pip -q install robosuite==1.4.1
pip -q install h5py
pip -q install easydict
pip -q install cloudpickle
pip -q install gym

# Prismatic
pip -q install draccus
pip -q install tensorflow-graphics

pip -q install git+https://github.com/StanfordVL/bddl.git

Installing OpenVLA and dependencies...
Installing core dependencies...
Installing LIBERO dependencies...


In [None]:
%%bash
# Fetch the OpenVLA sources, install them in editable mode, and expose the
# `prismatic` package next to the notebook.

# Clone OpenVLA repository (skipped when a previous run already cloned it)
if [ ! -d "openvla" ]; then
    echo "Cloning OpenVLA repository..."
    git clone https://github.com/openvla/openvla.git
fi

# Install OpenVLA in editable mode.
# The subshell leaves the cell's working directory untouched, and `&&` ensures pip
# never runs `install -e .` in the notebook directory when `cd` fails.
# NOTE(review): the recorded output of this cell shows this install failing on the
# tensorflow==2.15.0 pin; confirm whether the editable install is still required or
# whether the symlink below is what actually makes `prismatic` importable.
echo "Installing OpenVLA..."
(cd openvla && pip install -e .)

# pip -q install -r experiments/robot/libero/libero_requirements.txt

# Create symbolic link for prismatic module imports from this script and from within prismatic itself.
# -f replaces a link left by an earlier run; -n stops ln from following that existing
# link and creating a nested `prismatic/prismatic` link inside the repository.
ln -sfn openvla/prismatic prismatic

pip -q install --no-deps --force-reinstall git+https://github.com/moojink/dlimp_openvla

Installing OpenVLA...
Obtaining file:///content/openvla
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Collecting dlimp@ git+https://github.com/moojink/dlimp_openvla (from openvla==0.0.3)
  Cloning https://github.com/moojink/dlimp_openvla to /tmp/pip-install-qvicw8vp/dlimp_352c2bf8da824e49aacca1d1fce03a2d
  Resolved https://github.com/moojink/dlimp_openvla to commit 040105d256bd28866cc6620621a3d5f7b6b91b46
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting draccus==0.8.0 (from openvla

  Running command git clone --filter=blob:none --quiet https://github.com/moojink/dlimp_openvla /tmp/pip-install-qvicw8vp/dlimp_352c2bf8da824e49aacca1d1fce03a2d
ERROR: Could not find a version that satisfies the requirement tensorflow==2.15.0 (from openvla) (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.18.1, 2.19.0rc0, 2.19.0, 2.19.1, 2.20.0rc0, 2.20.0)
ERROR: No matching distribution found for tensorflow==2.15.0


In [None]:
%%bash
# Clone and install LIBERO, then patch its init-state loader.
if [ ! -d "LIBERO" ]; then
    echo "Cloning LIBERO repository..."
    git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git
fi

# Run the editable install inside a subshell: the working directory of the cell is
# unchanged afterwards, and pip is skipped (instead of running in the notebook
# directory) when the LIBERO checkout is missing.
(cd LIBERO && pip -q install -e .)

# Fix the pytorch weights_only issue in init states:
# the grep guard makes the patch a no-op once it has been applied.
grep -q "weights_only=False" ./LIBERO/libero/libero/benchmark/__init__.py || \
sed -i 's/init_states = torch.load(init_states_path)/init_states = torch.load(init_states_path, weights_only=False)/' \
  ./LIBERO/libero/libero/benchmark/__init__.py

In [None]:
%%bash
# Install flash attention for google colab env

# Prebuilt wheel pinned to the runtime this notebook was run on
# (CUDA 12, torch 2.9, CPython 3.12, linux x86_64 - as encoded in the file name).
FLASH_ATTN_WHEEL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl"

# Only install when flash-attn is not already present.
if ! pip show flash-attn &> /dev/null; then
    # Install the pinned wheel first; installing from PyPI and then the wheel
    # performs the work twice. Fall back to the PyPI package only when the wheel
    # does not fit this runtime.
    pip -q install "$FLASH_ATTN_WHEEL" || pip install flash-attn --no-build-isolation
fi

# Print flash attention version
pip show flash-attn

Name: flash_attn
Version: 2.8.3
Summary: Flash Attention: Fast and Memory-Efficient Exact Attention
Home-page: https://github.com/Dao-AILab/flash-attention
Author: Tri Dao
Author-email: tri@tridao.me
License: 
Location: /usr/local/lib/python3.12/dist-packages
Requires: einops, torch
Required-by: 


In [None]:
import os
import torch
import random
import numpy as np

In [None]:
# Config for VLA Evaluation

# One row per LIBERO suite: (config key, checkpoint path, benchmark name, step budget).
_SUITE_ROWS = (
    # Longest training demo has 193 steps
    ("SPATIAL", "openvla/openvla-7b-finetuned-libero-spatial", "libero_spatial", 220),
    # Longest training demo has 254 steps
    ("OBJECT", "openvla/openvla-7b-finetuned-libero-object", "libero_object", 280),
    # Longest training demo has 270 steps
    ("GOAL", "openvla/openvla-7b-finetuned-libero-goal", "libero_goal", 300),
    # Longest training demo has 505 steps
    ("10", "openvla/openvla-7b-finetuned-libero-10", "libero_10", 520),
    # Longest training demo has 373 steps
    ("90", "openvla/openvla-7b-finetuned-libero-90", "libero_90", 400),
)

# Model configurations: shared flags first, then one entry per suite.
_model_section = {
    "FAMILY": "openvla",
    # Set True if model was fine-tuned with image augmentations
    "CENTER_CROP": True,
}
for _key, _checkpoint, _suite, _budget in _SUITE_ROWS:
    _model_section[_key] = {
        "CHECKPOINT_PATH": _checkpoint,
        "TASK": _suite,
        "MAX_STEPS": _budget,
    }

# Environment parameters
_env_section = {
    # Steps to wait for objects to stabilize in sim
    "NUM_STEPS_WAIT": 10,
    # Number of episodes to run per task
    "NUM_TRIALS_PER_TASK": 5,
    # Only run first 5 tasks (for proof of concept)
    "NUM_TASKS_TO_RUN": 5,
    # Camera resolution for LIBERO environment
    "RESOLUTION": 256,
    # Image size expected by OpenVLA model
    "RESIZE_SIZE": 224,
}

# Logging parameters
_logging_section = {
    "LOCAL_LOG_DIR": "./results/logs",
    # Whether to save rollout videos
    "SAVE_VIDEOS": True,
    # Directory for rollout videos
    "VIDEO_DIR": "./results/rollouts",
    # Whether to log to Weights & Biases
    "USE_WANDB": False,
    "WANDB_PROJECT": "openvla-libero-eval",
    "WANDB_ENTITY": "your-entity",
}

config = {
    "SEED": 2048,
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    "DEVICE": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "MODEL": _model_section,
    "ENV": _env_section,
    "LOGGING": _logging_section,
}

In [None]:
# %%bash

# if [ ! -d "./downloads/libero_object" ]; then
# 	echo "Downloading libero_object dataset..."
# 	python /content/LIBERO/benchmark_scripts/download_libero_datasets.py --datasets libero_object --download-dir ./downloads/
# fi

# if [ ! -d "./downloads/libero_spatial" ]; then
# 	echo "Downloading libero_spatial dataset..."
# 	python /content/LIBERO/benchmark_scripts/download_libero_datasets.py --datasets libero_spatial --download-dir ./downloads/
# fi

In [None]:
%%bash
# Create robosuite's private macros file, auto-answering "y" to the overwrite prompt.
#
# The package directory is resolved through the active interpreter instead of a
# hard-coded python3.12 dist-packages path, so the cell keeps working when the
# runtime's Python version or install location changes. find_spec locates the
# top-level package without importing it, so nothing else is printed into the
# captured path.
ROBOSUITE_DIR="$(python -c 'import importlib.util, os; print(os.path.dirname(importlib.util.find_spec("robosuite").origin))')"

printf "y\n" | python "${ROBOSUITE_DIR}/scripts/setup_macros.py"

/usr/local/lib/python3.12/dist-packages/robosuite/macros_private.py already exists! 
overwrite? (y/n)
REMOVING
copied /usr/local/lib/python3.12/dist-packages/robosuite/macros.py
to /usr/local/lib/python3.12/dist-packages/robosuite/macros_private.py


# Utils


In [None]:
from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
from transformers import (
    AutoConfig,
    AutoImageProcessor,
    AutoModelForVision2Seq,
    AutoProcessor,
)
from prismatic.extern.hf.processing_prismatic import (
    PrismaticImageProcessor,
    PrismaticProcessor,
)

import json
import math
from PIL import Image
import tensorflow as tf
from libero.libero import benchmark
from libero.libero import get_libero_path
from libero.libero.envs import OffScreenRenderEnv


def setSeed(seed):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator, and torch (CPU and all CUDA
    devices), switches cuDNN to deterministic mode with autotuning disabled, and
    stores the seed in the PYTHONHASHSEED environment variable.

    :param seed: Integer seed applied to all generators.
    """
    # Framework generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # NumPy and the standard library
    np.random.seed(seed)
    random.seed(seed)

    # cuDNN: deterministic kernels, no benchmark-driven algorithm selection
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    os.environ["PYTHONHASHSEED"] = f"{seed}"


def getModel(task):
    """
    Load an OpenVLA model from a checkpoint and move it to the configured device.

    The OpenVLA config, image processor, processor and model classes are registered
    with the HuggingFace Auto classes before loading. If a `dataset_statistics.json`
    file exists inside `checkpoint_path` (i.e. the checkpoint is a local directory),
    its contents replace `vla.norm_stats`.

    :param task: Suite config dict; only its 'CHECKPOINT_PATH' key is read.
    :return: The loaded model, placed on `config["DEVICE"]`.
    """
    # Register OpenVLA model components to HuggingFace AutoClasses
    AutoConfig.register("openvla", OpenVLAConfig)
    AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
    AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
    AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)

    checkpoint_path = task["CHECKPOINT_PATH"]

    # Load model with Flash Attention 2 for faster inference
    vla = AutoModelForVision2Seq.from_pretrained(
        checkpoint_path,
        attn_implementation="flash_attention_2",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    ).to(config["DEVICE"])

    # Load dataset statistics for action un-normalization (critical!)
    dataset_statistics_path = os.path.join(checkpoint_path, "dataset_statistics.json")
    if os.path.isfile(dataset_statistics_path):
        with open(dataset_statistics_path, "r") as f:
            norm_stats = json.load(f)
        vla.norm_stats = norm_stats
        print(f"✓ Loaded dataset statistics from {dataset_statistics_path}")
    elif not getattr(vla, "norm_stats", None):
        # Warn only when the model has no statistics at all. For a Hub repo id the
        # local file never exists, yet the loaded model can already carry
        # `norm_stats`; warning unconditionally in that case is a false alarm.
        print(
            "WARNING: No local dataset_statistics.json file found!\n"
            "This may cause errors when calling predict_action() due to missing unnorm_key."
        )

    return vla


def getProcessor(checkpoint_path):
    """
    Load the processor that belongs to a checkpoint.

    :param checkpoint_path: Checkpoint identifier forwarded to
        `AutoProcessor.from_pretrained` (remote code is trusted).
    :return: The processor instance returned by HuggingFace.
    """
    processor = AutoProcessor.from_pretrained(checkpoint_path, trust_remote_code=True)
    return processor


def initLIBEROEnv(task):
    """
    Instantiate the LIBERO task suite named in a suite config.

    :param task: Suite config dict; its 'TASK' key selects the benchmark entry.
    :return: Tuple of (task suite instance, number of tasks in the suite).
    """
    suite_name = task["TASK"]

    # The benchmark dict maps suite names to factories; call the one we need.
    suite_factory = benchmark.get_benchmark_dict()[suite_name]
    task_suite = suite_factory()

    print(f"✓ Initialized LIBERO task suite: {suite_name} with {task_suite.n_tasks} tasks")
    return task_suite, task_suite.n_tasks


def getLIBEROEnv(suiteTask, resolution):
    """
    Build an off-screen LIBERO environment for one task of a suite.

    :param suiteTask: Task object from a LIBERO suite; its `problem_folder`,
        `bddl_file` and `language` attributes are read.
    :param resolution: Value used for both the camera height and the camera width.
    :return: Tuple of (environment, task language instruction).
    """
    # Locate the task's BDDL file under LIBERO's configured bddl_files directory.
    bddl_root = get_libero_path("bddl_files")
    bddl_path = os.path.join(bddl_root, suiteTask.problem_folder, suiteTask.bddl_file)

    env = OffScreenRenderEnv(
        bddl_file_name=bddl_path,
        camera_heights=resolution,
        camera_widths=resolution,
    )

    # Seed affects object positions even with fixed initial states
    env.seed(0)

    return env, suiteTask.language


def resizeImage(img, resize_size):
    """
    Round-trip an image through JPEG and resize it with Lanczos3.

    The image is JPEG-encoded, decoded back to uint8, resized with antialiasing,
    then rounded, clipped to [0, 255] and cast to uint8. The original author notes
    that this mirrors the preprocessing of the Octo/OpenVLA dataloader.

    :param img: Numpy array of shape (H, W, 3)
    :param resize_size: Target size (height, width) tuple
    :return: Resized image as numpy array
    """
    # JPEG round trip (as done in the RLDS dataset builder, per the original author)
    jpeg_bytes = tf.image.encode_jpeg(img)
    decoded = tf.io.decode_image(jpeg_bytes, expand_animations=False, dtype=tf.uint8)

    # Lanczos3 with antialiasing yields floats; bring them back to valid uint8 pixels
    resized = tf.image.resize(decoded, resize_size, method="lanczos3", antialias=True)
    pixels = tf.cast(tf.clip_by_value(tf.round(resized), 0, 255), tf.uint8)

    return pixels.numpy()


def getLIBEROImg(obs):
    """
    Pull the agent-view camera frame out of an observation and preprocess it.

    The frame is flipped along both image axes (a 180 degree rotation, which the
    original author marks as critical for matching the training data) and then
    resized to the square size in `config["ENV"]["RESIZE_SIZE"]` via `resizeImage`.

    :param obs: Observation dict from LIBERO environment; 'agentview_image' is read.
    :return: Preprocessed image as numpy array
    """
    side = config["ENV"]["RESIZE_SIZE"]

    # Reverse rows and columns: equivalent to rotating the image by 180 degrees.
    rotated = obs["agentview_image"][::-1, ::-1]

    return resizeImage(rotated, (side, side))


def getLIBEROAction(task, model, obs, task_label, processor):
    """
    Query the VLA model to predict an action for the current observation.

    :param task: Suite config dict; its 'TASK' value is the base un-normalization key.
    :param model: Loaded OpenVLA model exposing `predict_action`.
    :param obs: Observation dict with 'full_image' key (image array).
    :param task_label: Natural language task description
    :param processor: HuggingFace processor called as `processor(prompt, image)`.
    :return: The action returned by `model.predict_action` (the evaluation loop
        treats it as a NumPy array whose last entry is the gripper command).
    """
    img = Image.fromarray(obs["full_image"])
    img = img.convert("RGB")

    # Apply center crop if model was trained with image augmentations
    if config["MODEL"]["CENTER_CROP"]:
        batch_size = 1
        crop_scale = 0.9

        # Convert to TF Tensor and record original data type
        img = tf.convert_to_tensor(np.array(img))
        orig_dtype = img.dtype

        # Convert to float32 with values in [0, 1]
        img = tf.image.convert_image_dtype(img, tf.float32)

        # Center crop and resize back to original size
        img = crop_and_resize(img, crop_scale, batch_size)

        # Convert back to original data type
        img = tf.clip_by_value(img, 0, 1)
        img = tf.image.convert_image_dtype(img, orig_dtype, saturate=True)

        # Convert back to PIL Image
        img = Image.fromarray(img.numpy())
        img = img.convert("RGB")

    # Build prompt in OpenVLA format
    prompt = f"In: What action should the robot take to {task_label.lower()}?\nOut:"

    # Process inputs
    inputs = processor(prompt, img).to(config["DEVICE"], dtype=torch.bfloat16)

    # Resolve the key used to look up dataset statistics for un-normalization.
    # Some checkpoints store their statistics under "<suite>_no_noops"; fall back to
    # that name when the plain suite name is absent, so the key sent to the model is
    # the one that actually exists in `model.norm_stats`.
    unnorm_key = task["TASK"]
    norm_stats = getattr(model, "norm_stats", None)
    if norm_stats is not None:
        fallback_key = f"{unnorm_key}_no_noops"
        if unnorm_key not in norm_stats and fallback_key in norm_stats:
            unnorm_key = fallback_key

    # Predict action using the model's built-in method
    action = model.predict_action(**inputs, unnorm_key=unnorm_key, do_sample=False)

    return action

Do you want to specify a custom path for the dataset folder? (Y/N): ./downloads
Initializing the default config file...
The following information is stored in the config file: /root/.libero/config.yaml
benchmark_root: /content/LIBERO/libero/libero
bddl_files: /content/LIBERO/libero/libero/./bddl_files
init_states: /content/LIBERO/libero/libero/./init_files
datasets: /content/LIBERO/libero/libero/../datasets
assets: /content/LIBERO/libero/libero/./assets


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# Copied over general utility functions
import time
import imageio

# Run timestamps used in log and video file names. Both are formatted from a single
# clock reading so DATE_TIME always begins with DATE, even when the cell executes
# right at midnight (two separate strftime() calls could straddle the date change).
_STAMP_SOURCE = time.localtime()
DATE = time.strftime("%Y_%m_%d", _STAMP_SOURCE)
DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S", _STAMP_SOURCE)

def crop_and_resize(image, crop_scale, batch_size):
    """
    Center-crops an image to have area `crop_scale` * (original image area), and then resizes back
    to original size. We use the same logic seen in the `dlimp` RLDS datasets wrapper to avoid
    distribution shift at test time.

    The output height and width are taken from the input image, so the "resizes back
    to original size" contract holds for any input resolution (a 224x224 input still
    yields a 224x224 output).

    Args:
        image: TF Tensor of shape (batch_size, H, W, C) or (H, W, C) and datatype tf.float32 with
               values between [0,1].
        crop_scale: The area of the center crop with respect to the original image. A scalar is
                    applied to every image in the batch; a tensor of shape (batch_size,) gives
                    one value per image.
        batch_size: Batch size.

    Returns:
        TF Tensor with the same rank as `image`, holding the cropped-and-resized image(s).
    """
    # Convert from 3D Tensor (H, W, C) to 4D Tensor (batch_size, H, W, C)
    assert image.shape.ndims == 3 or image.shape.ndims == 4
    expanded_dims = False
    if image.shape.ndims == 3:
        image = tf.expand_dims(image, axis=0)
        expanded_dims = True

    # Side length of the crop as a fraction of the image side: sqrt(area fraction).
    # reshape(-1) + broadcast_to lets a scalar crop_scale serve any batch size,
    # whereas reshaping a scalar straight to (batch_size,) only works for 1.
    crop_side = tf.clip_by_value(tf.sqrt(tf.cast(crop_scale, tf.float32)), 0, 1)
    crop_side = tf.broadcast_to(tf.reshape(crop_side, (-1,)), (batch_size,))
    new_heights = crop_side
    new_widths = crop_side

    # Get bounding box representing crop, as normalized [y1, x1, y2, x2] rows
    height_offsets = (1 - new_heights) / 2
    width_offsets = (1 - new_widths) / 2
    bounding_boxes = tf.stack(
        [
            height_offsets,
            width_offsets,
            height_offsets + new_heights,
            width_offsets + new_widths,
        ],
        axis=1,
    )

    # Crop and then resize back up to the input's own (H, W)
    target_size = tf.shape(image)[1:3]
    image = tf.image.crop_and_resize(
        image, bounding_boxes, tf.range(batch_size), target_size
    )

    # Convert back to 3D Tensor (H, W, C)
    if expanded_dims:
        image = image[0]

    return image


def quat2axisangle(quat):
    """
    Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
    Copied again from OpenVLA repo: https://github.com/openvla/openvla/blob/main/experiments/robot/libero/libero_utils.py

    Converts quaternion to axis-angle format.
    Returns a unit vector direction scaled by its angle in radians.

    Unlike the copied original, the input is never modified: the scalar part is
    clipped on a local value, so passing an observation array does not alter it.

    Args:
        quat (np.array): (x,y,z,w) vec4 float angles (any array-like is accepted)

    Returns:
        np.array: (ax,ay,az) axis-angle exponential coordinates
    """
    quat = np.asarray(quat)

    # Clip the scalar part into acos' domain without writing back into `quat`
    w = np.clip(quat[3], -1.0, 1.0)

    den = np.sqrt(1.0 - w * w)
    if math.isclose(den, 0.0):
        # This is (close to) a zero degree rotation, immediately return
        return np.zeros(3)

    return (quat[:3] * 2.0 * math.acos(w)) / den


def normalize_gripper_action(action, binarize=True):
    """
    Rescale the gripper command (last entry of the action vector) from [0,1] to [-1,+1].

    The original author notes this is needed for some environments (not Bridge)
    because the dataset wrapper standardizes gripper actions to [0,1] and, unlike the
    other action dimensions, does not normalize them to [-1,+1].

    Formula: y = 2 * (x - orig_low) / (orig_high - orig_low) - 1

    The array is modified in place and also returned. With `binarize=True` the
    rescaled value is replaced by its sign.
    """
    gripper = (Ellipsis, -1)  # index of the gripper entry along the last axis
    low, high = 0.0, 1.0

    # Linear map [low, high] -> [-1, +1], applied to the gripper entry only.
    action[gripper] = 2 * (action[gripper] - low) / (high - low) - 1

    if not binarize:
        return action

    # Snap to -1 or +1.
    action[gripper] = np.sign(action[gripper])
    return action


def invert_gripper_action(action):
    """
    Negate the gripper command (last entry of the action vector) in place.

    The original author notes this is needed for environments where -1 = open and
    +1 = close, because the RLDS dataloader aligns gripper actions such that
    0 = close, 1 = open. The modified array is also returned.
    """
    gripper = (Ellipsis, -1)  # index of the gripper entry along the last axis
    flipped = action[gripper] * -1.0
    action[gripper] = flipped
    return action

def save_rollout_video(rollout_images, idx, success, task_description, log_file=None):
    """
    Saves an MP4 replay of an episode.

    The file is written to `<VIDEO_DIR>/<DATE>/` (created if needed) and its name
    encodes the run timestamp, episode index, success flag, and a sanitized,
    50-character-truncated task description.

    :param rollout_images: Iterable of frames appended to the video in order.
    :param idx: Episode index used in the file name.
    :param success: Success flag used in the file name.
    :param task_description: Task text; lower-cased, with spaces, newlines and dots
        replaced by underscores.
    :param log_file: Optional open text file that also receives the "saved" message.
    :return: Path of the written MP4 file.
    """
    # Single quotes inside the f-string: reusing the enclosing double quotes is a
    # SyntaxError on Python versions before 3.12.
    rollout_dir = f"{config['LOGGING']['VIDEO_DIR']}/{DATE}"
    os.makedirs(rollout_dir, exist_ok=True)
    processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50]
    mp4_path = f"{rollout_dir}/{DATE_TIME}--episode={idx}--success={success}--task={processed_task_description}.mp4"
    video_writer = imageio.get_writer(mp4_path, fps=30)
    try:
        for img in rollout_images:
            video_writer.append_data(img)
    finally:
        # Always release the writer, even when a frame cannot be encoded.
        video_writer.close()
    print(f"Saved rollout MP4 at path {mp4_path}")
    if log_file is not None:
        log_file.write(f"Saved rollout MP4 at path {mp4_path}\n")
    return mp4_path

In [None]:
# Evaluation driver: loads the model and the selected LIBERO suite, rolls out
# NUM_TRIALS_PER_TASK episodes for each of the first NUM_TASKS_TO_RUN tasks, and
# reports success rates to stdout, a log file, and optionally Weights & Biases.
import time

# Initialize logging
# Both stamps come from one clock reading so DATE_TIME always begins with DATE.
_run_started = time.localtime()
DATE = time.strftime("%Y_%m_%d", _run_started)
DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S", _run_started)

# Set seed for reproducibility
setSeed(config["SEED"])

# Select task suite (change this to test different suites)
TASK = config["MODEL"]["OBJECT"]  # Other keys: "SPATIAL", "GOAL", "10", "90"

# Load model and processor
print(f"Loading model from {TASK['CHECKPOINT_PATH']}...")
model = getModel(TASK)
processor = getProcessor(TASK["CHECKPOINT_PATH"])

# Initialize LIBERO environment
LIBERO_env, num_tasks = initLIBEROEnv(TASK)
num_tasks = min(config["ENV"]["NUM_TASKS_TO_RUN"], num_tasks) # Limit the number of tasks for faster execution.

# Create logging directory and file
os.makedirs(config["LOGGING"]["LOCAL_LOG_DIR"], exist_ok=True)
run_id = f"EVAL-{TASK['TASK']}-{DATE_TIME}"
log_filepath = os.path.join(config["LOGGING"]["LOCAL_LOG_DIR"], f"{run_id}.txt")
log_file = open(log_filepath, "w")

total_episodes, total_successes = 0, 0
# Defined up front so the final summary works even when no task is run.
total_success_rate = 0.0

# The try/finally guarantees the log file is closed however the run ends
# (normal completion, the unnorm_key error below, or an unexpected exception).
try:
    print(f"✓ Logging to: {log_filepath}")
    log_file.write(f"Evaluation Run ID: {run_id}\n")
    log_file.write(f"Task Suite: {TASK['TASK']}\n")
    log_file.write(f"Checkpoint: {TASK['CHECKPOINT_PATH']}\n")
    log_file.write(f"Seed: {config['SEED']}\n")
    log_file.write(f"Center Crop: {config['MODEL']['CENTER_CROP']}\n")
    log_file.write(f"Num Trials per Task: {config['ENV']['NUM_TRIALS_PER_TASK']}\n")
    log_file.write("="*80 + "\n\n")
    log_file.flush()

    # Initialize W&B logging (optional)
    if config["LOGGING"]["USE_WANDB"]:
        import wandb
        wandb.init(
            entity=config["LOGGING"]["WANDB_ENTITY"],
            project=config["LOGGING"]["WANDB_PROJECT"],
            name=run_id,
            config=config,
        )
        print("✓ Weights & Biases logging enabled")

    # Verify unnorm_key exists in model's norm_stats.
    # NOTE: this block only validates the key; the action query in the loop below
    # receives TASK, not `unnorm_key`.
    unnorm_key = TASK["TASK"]
    if hasattr(model, 'norm_stats'):
        # Handle modified dataset names (e.g., with "_no_noops" suffix)
        if unnorm_key not in model.norm_stats and f"{unnorm_key}_no_noops" in model.norm_stats:
            unnorm_key = f"{unnorm_key}_no_noops"
            print(f"⚠ Using modified unnorm_key: {unnorm_key}")

        if unnorm_key not in model.norm_stats:
            print(f"ERROR: unnorm_key '{unnorm_key}' not found in model.norm_stats!")
            print(f"Available keys: {list(model.norm_stats.keys())}")
            log_file.write(f"ERROR: unnorm_key '{unnorm_key}' not found!\n")
            raise ValueError(f"Action un-norm key {unnorm_key} not found in VLA norm_stats!")
        else:
            print(f"✓ Verified unnorm_key: {unnorm_key}")

    # Start evaluation loop
    print(f"\n{'='*80}")
    print(f"Starting evaluation on {num_tasks} tasks from {TASK['TASK']} suite")
    print(f"{'='*80}\n")

    for task_id in range(num_tasks):
        # Get task info and initial states
        task_info = LIBERO_env.get_task(task_id)
        initial_states = LIBERO_env.get_task_init_states(task_id)

        # Initialize environment for this specific task
        env, task_description = getLIBEROEnv(
            task_info, resolution=config["ENV"]["RESOLUTION"]
        )

        print(f"\n{'─'*80}")
        print(f"Task {task_id + 1}/{num_tasks}: {task_description}")
        print(f"{'─'*80}")
        log_file.write(f"\nTask {task_id + 1}/{num_tasks}: {task_description}\n")
        log_file.write(f"{'─'*80}\n")

        # Run multiple episodes for this task
        task_episodes, task_successes = 0, 0

        for episode in range(config["ENV"]["NUM_TRIALS_PER_TASK"]):
            print(f"\n  Episode {episode + 1}/{config['ENV']['NUM_TRIALS_PER_TASK']}...", end=" ")
            log_file.write(f"\n  Episode {episode + 1}: ")

            # Reset environment and set initial state
            env.reset()
            obs = env.set_init_state(initial_states[episode])

            # Episode variables
            t = 0
            replay_images = []
            max_steps = TASK["MAX_STEPS"]
            wait_steps = config["ENV"]["NUM_STEPS_WAIT"]
            done = False

            # Best-effort rollout: an exception ends this episode (counted as a
            # failure unless `done` was already set) but not the whole evaluation.
            try:
                while t < max_steps + wait_steps:
                    # Wait for objects to stabilize (no-op actions)
                    if t < wait_steps:
                        obs, reward, done, info = env.step([0, 0, 0, 0, 0, 0, -1])
                        t += 1
                        continue

                    # Get preprocessed image
                    img = getLIBEROImg(obs)
                    replay_images.append(img)

                    # Prepare observation dict (includes proprio state for reference)
                    observation = {
                        "full_image": img,
                        "state": np.concatenate(
                            (
                                obs["robot0_eef_pos"],
                                quat2axisangle(obs["robot0_eef_quat"]),
                                obs["robot0_gripper_qpos"],
                            )
                        ),
                    }

                    # Query model for action
                    action = getLIBEROAction(
                        TASK, model, observation, task_description, processor
                    )

                    # Normalize gripper action [0,1] -> [-1,+1] and binarize
                    action = normalize_gripper_action(action, binarize=True)

                    # Invert gripper action sign (OpenVLA convention: 0=close, 1=open)
                    action = invert_gripper_action(action)

                    # Execute action in environment
                    obs, reward, done, info = env.step(action.tolist())

                    if done:
                        task_successes += 1
                        total_successes += 1
                        break

                    t += 1

            except Exception as e:
                print(f"❌ Exception: {e}")
                log_file.write(f"Exception: {e}\n")

            # Update counters
            task_episodes += 1
            total_episodes += 1

            # Save rollout video
            if config["LOGGING"]["SAVE_VIDEOS"]:
                save_rollout_video(
                    replay_images,
                    total_episodes,
                    success=done,
                    task_description=task_description,
                    log_file=log_file,
                )

            # Log episode result
            result_emoji = "✓" if done else "✗"
            print(f"{result_emoji} {'Success' if done else 'Failed'}")
            log_file.write(f"{'Success' if done else 'Failed'}\n")
            log_file.flush()

        # Release this task's simulator/renderer before the next task builds its own.
        env.close()

        # Log task summary (rates fall back to 0.0 when no episode was run)
        task_success_rate = task_successes / task_episodes if task_episodes else 0.0
        total_success_rate = total_successes / total_episodes if total_episodes else 0.0

        print(f"\n  Task Success Rate: {task_success_rate:.1%} ({task_successes}/{task_episodes})")
        print(f"  Total Success Rate: {total_success_rate:.1%} ({total_successes}/{total_episodes})")

        log_file.write(f"\n  Task Success Rate: {task_success_rate:.1%} ({task_successes}/{task_episodes})\n")
        log_file.write(f"  Total Success Rate: {total_success_rate:.1%} ({total_successes}/{total_episodes})\n")
        log_file.flush()

        # Log to W&B
        if config["LOGGING"]["USE_WANDB"]:
            wandb.log({
                f"success_rate/{task_description}": task_success_rate,
                f"num_episodes/{task_description}": task_episodes,
            })

    # Final summary
    print(f"\n{'='*80}")
    print("EVALUATION COMPLETE")
    print(f"{'='*80}")
    print(f"Total Episodes: {total_episodes}")
    print(f"Total Successes: {total_successes}")
    print(f"Overall Success Rate: {total_success_rate:.1%}")
    print(f"{'='*80}\n")

    log_file.write(f"\n{'='*80}\n")
    log_file.write("EVALUATION COMPLETE\n")
    log_file.write(f"{'='*80}\n")
    log_file.write(f"Total Episodes: {total_episodes}\n")
    log_file.write(f"Total Successes: {total_successes}\n")
    log_file.write(f"Overall Success Rate: {total_success_rate:.1%}\n")
    log_file.write(f"{'='*80}\n")
finally:
    log_file.close()

print(f"✓ Log file saved to: {log_filepath}")

# Push final metrics to W&B
if config["LOGGING"]["USE_WANDB"]:
    wandb.log({
        "success_rate/total": total_success_rate,
        "num_episodes/total": total_episodes,
    })
    wandb.save(log_filepath)
    print("✓ Metrics pushed to Weights & Biases")
    wandb.finish()

Loading model from openvla/openvla-7b-finetuned-libero-object...


config.json: 0.00B [00:00, ?B/s]



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

This may cause errors when calling predict_action() due to missing unnorm_key.


preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

[info] using task orders [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
✓ Initialized LIBERO task suite: libero_object with 10 tasks
✓ Logging to: ./results/logs/EVAL-libero_object-2026_01_13-18_18_57.txt
✓ Verified unnorm_key: libero_object

Starting evaluation on 5 tasks from libero_object suite


────────────────────────────────────────────────────────────────────────────────
Task 1/5: pick up the alphabet soup and place it in the basket
────────────────────────────────────────────────────────────────────────────────

  Episode 1/5... Saved rollout MP4 at path ./results/rollouts/2026_01_13/2026_01_13-18_18_57--episode=1--success=False--task=pick_up_the_alphabet_soup_and_place_it_in_the_bask.mp4
✗ Failed

  Episode 2/5... Saved rollout MP4 at path ./results/rollouts/2026_01_13/2026_01_13-18_18_57--episode=2--success=True--task=pick_up_the_alphabet_soup_and_place_it_in_the_bask.mp4
✓ Success

  Episode 3/5... Saved rollout MP4 at path ./results/rollouts/2026_01_13/2026_01_13-18_18_57--episode=3--succ