In [1]:
# First, define the training and test set

import random

# Total number of indices
total_indices = 330

# Set random seed for reproducibility
random.seed(0)

GT_index = 15

# Calculate the size of the test set
test_size = int(total_indices * 0.2)

# Create a list of all indices
all_indices = list(range(total_indices))

# Sample the test set indices
test_indices = random.sample(all_indices, test_size)

if not GT_index in test_indices:
    test_indices.append(GT_index)
    
# Get the training set by excluding the test indices
train_indices = [idx for idx in all_indices if idx not in test_indices]

train_indices.sort()
test_indices.sort()

# Print the results
print("Test Set:", test_indices)
print("Train Set:", train_indices)


Test Set: [0, 7, 15, 20, 31, 32, 37, 41, 46, 47, 48, 50, 51, 55, 71, 72, 75, 97, 104, 111, 113, 122, 124, 128, 132, 133, 144, 149, 154, 155, 158, 161, 163, 166, 169, 170, 181, 183, 197, 204, 207, 215, 222, 226, 229, 241, 244, 248, 250, 252, 258, 260, 261, 266, 272, 278, 280, 282, 286, 290, 298, 308, 312, 313, 316, 320, 327]
Train Set: [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 38, 39, 40, 42, 43, 44, 45, 49, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 112, 114, 115, 116, 117, 118, 119, 120, 121, 123, 125, 126, 127, 129, 130, 131, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 150, 151, 152, 153, 156, 157, 159, 160, 162, 164, 165, 167, 168, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 184, 185, 186, 187, 18

In [2]:
# Scaling factors from 0.1x to 1.0x
scaling_factors = [i / 10 for i in range(1, 11)]

# Generate scaled train and test sets
scaled_train_test_sets = []
for scale in scaling_factors:
    # Scale the train and test set sizes
    scaled_train_size = int(len(train_indices) * scale)
    scaled_test_size = int(len(test_indices) * scale)
    
    # Sample scaled train and test sets deterministically
    random.seed(0)  
    scaled_train_indices = random.sample(train_indices, scaled_train_size)
    # scaled_test_indices = random.sample(test_indices, scaled_test_size)
    scaled_test_indices = test_indices  # the same test indices for all train set

    scaled_train_indices.sort()
    scaled_test_indices.sort()
    
    # Store the scaled train and test sets
    scaled_train_test_sets.append((scaled_train_indices, scaled_test_indices))

# Output the scaled train and test sets
for i, (scaled_train, scaled_test) in enumerate(scaled_train_test_sets):
    print(f"Scaling Factor {scaling_factors[i]:.1f}x:")
    print(f"  Train Set ({len(scaled_train)}): {scaled_train}")
    print(f"  Test Set ({len(scaled_test)}): {scaled_test}")
    print()

Scaling Factor 0.1x:
  Train Set (26): [24, 28, 43, 69, 81, 89, 110, 160, 165, 172, 187, 193, 196, 223, 238, 243, 254, 257, 269, 287, 292, 305, 310, 315, 323, 324]
  Test Set (67): [0, 7, 15, 20, 31, 32, 37, 41, 46, 47, 48, 50, 51, 55, 71, 72, 75, 97, 104, 111, 113, 122, 124, 128, 132, 133, 144, 149, 154, 155, 158, 161, 163, 166, 169, 170, 181, 183, 197, 204, 207, 215, 222, 226, 229, 241, 244, 248, 250, 252, 258, 260, 261, 266, 272, 278, 280, 282, 286, 290, 298, 308, 312, 313, 316, 320, 327]

Scaling Factor 0.2x:
  Train Set (52): [18, 22, 24, 28, 29, 43, 45, 66, 69, 81, 83, 89, 96, 98, 102, 109, 110, 137, 139, 147, 150, 160, 165, 167, 172, 177, 179, 187, 192, 193, 194, 196, 202, 217, 223, 231, 238, 243, 254, 256, 257, 269, 271, 287, 289, 292, 301, 305, 310, 315, 323, 324]
  Test Set (67): [0, 7, 15, 20, 31, 32, 37, 41, 46, 47, 48, 50, 51, 55, 71, 72, 75, 97, 104, 111, 113, 122, 124, 128, 132, 133, 144, 149, 154, 155, 158, 161, 163, 166, 169, 170, 181, 183, 197, 204, 207, 215, 222, 226

In [3]:
import os
import re

def check_record_status(logs_path):
    completed_records = set()
    completed_names = set()
    missing_h5_record = set()
    incomplete_records = set()

    # Traverse all sub-folders in logs/rsl_rl
    for subdir in os.listdir(logs_path):
        subdir_path = os.path.join(logs_path, subdir)
        if os.path.isdir(subdir_path):
            # Match prefix name
            match = re.match(r"(Genhumanoid\d+)_", subdir)
            if match:
                prefix_name = match.group(1)
                # Get all timestamp sub-folders
                time_subdirs = [
                    d for d in os.listdir(subdir_path)
                    if os.path.isdir(os.path.join(subdir_path, d))
                ]
                if time_subdirs:
                    # Sort to find the latest folder
                    latest_subdir = sorted(time_subdirs)[-1]
                    latest_subdir_path = os.path.join(subdir_path, latest_subdir)
                    h5py_record_path = os.path.join(latest_subdir_path, "h5py_record")
                    if os.path.exists(h5py_record_path):
                        obs_file = os.path.join(h5py_record_path, "obs_actions_00004.h5")
                        if os.path.exists(obs_file):
                            completed_records.add(prefix_name)
                            completed_names.add(subdir)
                        else:
                            incomplete_records.add(prefix_name)
                    else:
                        missing_h5_record.add(prefix_name)

    # Sort based on the results
    def sort_by_number(prefix_list):
        return sorted(prefix_list, key=lambda x: int(re.search(r"\d+", x).group()))

    return {
        "Completed Records": sort_by_number(list(completed_records)),
        "Completed Records Names": sort_by_number(list(completed_names)),
        "Missing h5py_record": sort_by_number(list(missing_h5_record)),
        "Incomplete Records": sort_by_number(list(incomplete_records))
    }

# Example
logs_path = "/home/liudai/hdd_0/projects/cross_em/data/logs_all_v0/genhumanoid/logs/rsl_rl" # Modify to the actual directory
record_status = check_record_status(logs_path)

# Output the results
print("Completed Records:")
print(record_status["Completed Records"])
print("len:", len(record_status["Completed Records"]))

print("\nFailed Records:")
print(record_status["Missing h5py_record"])
print("\nIncomplete Records:")
print(record_status["Incomplete Records"])


Completed Records:
['Genhumanoid1', 'Genhumanoid2', 'Genhumanoid3', 'Genhumanoid4', 'Genhumanoid5', 'Genhumanoid6', 'Genhumanoid7', 'Genhumanoid8', 'Genhumanoid9', 'Genhumanoid10', 'Genhumanoid11', 'Genhumanoid12', 'Genhumanoid13', 'Genhumanoid14', 'Genhumanoid15', 'Genhumanoid16', 'Genhumanoid17', 'Genhumanoid18', 'Genhumanoid19', 'Genhumanoid20', 'Genhumanoid21', 'Genhumanoid22', 'Genhumanoid23', 'Genhumanoid24', 'Genhumanoid25', 'Genhumanoid26', 'Genhumanoid27', 'Genhumanoid28', 'Genhumanoid29', 'Genhumanoid30', 'Genhumanoid31', 'Genhumanoid32', 'Genhumanoid33', 'Genhumanoid34', 'Genhumanoid35', 'Genhumanoid36', 'Genhumanoid37', 'Genhumanoid38', 'Genhumanoid39', 'Genhumanoid40', 'Genhumanoid41', 'Genhumanoid42', 'Genhumanoid43', 'Genhumanoid44', 'Genhumanoid45', 'Genhumanoid46', 'Genhumanoid47', 'Genhumanoid48', 'Genhumanoid49', 'Genhumanoid50', 'Genhumanoid51', 'Genhumanoid52', 'Genhumanoid53', 'Genhumanoid54', 'Genhumanoid55', 'Genhumanoid56', 'Genhumanoid57', 'Genhumanoid58', 'Ge

In [4]:
# Assuming `record_status` is already generated using the provided code
completed_records = set(record_status["Completed Records"])
completed_folder_names = set(record_status["Completed Records Names"])

# Store folder names for completed and missing records
detailed_results = []

# Verbose mode: Track unavailable jobs for train and test sets
verbose_results = []

# Check for missing records in each test set
results = []
for i, (train_set, test_set) in enumerate(scaled_train_test_sets):
    # Map indices to detailed folder names
    train_set_names = [name for name in completed_folder_names if int(name.split("_")[0].replace("Genhumanoid", "")) in train_set]
    test_set_names = [name for name in completed_folder_names if int(name.split("_")[0].replace("Genhumanoid", "")) in test_set]

    train_set_names = sorted(
        train_set_names,
        key=lambda x: int(x.split("_")[0].replace("Genhumanoid", ""))
    )
    test_set_names = sorted(
        test_set_names,
        key=lambda x: int(x.split("_")[0].replace("Genhumanoid", ""))
    )
    
    # Find missing and completed records for this train and test set
    missing_train_records = set(train_set_names) - completed_folder_names
    missing_test_records = set(test_set_names) - completed_folder_names
    
    # Collect folder names
    completed_folders = [name for name in (test_set_names + train_set_names) if name in completed_folder_names]
    missing_folders = [name for name in (test_set_names + train_set_names) if name not in completed_folder_names]
    
    # Store results for this scaling factor
    results.append({
        "Scaling Factor": scaling_factors[i],
        "Total Train Records": len(train_set_names),
        "Total Test Records": len(test_set_names),
        "Unfinished Train Jobs": len(missing_train_records),
        "Unfinished Test Jobs": len(missing_test_records),
        "Completed Records": len(completed_folders),
        "Missing Records": len(missing_folders),
        "Completed Folder Names": sorted(completed_folders),
        "Missing Folder Names": sorted(missing_folders),
    })

    # Store train and test sets for later use
    detailed_results.append({
        "Scaling Factor": scaling_factors[i],
        "Train Set Names": train_set_names,
        "Test Set Names": test_set_names,
        "Completed Folder Names": completed_folders,
        "Missing Folder Names": missing_folders,
    })

    # Verbose results: Track unavailable train/test jobs
    verbose_results.append({
        "Scaling Factor": scaling_factors[i],
        "Unfinished Train Jobs": len(missing_train_records),
        "Unfinished Test Jobs": len(missing_test_records),
        "Unavailable Train Ratio": f"{len(missing_train_records)}/{len(train_set_names)}",
        "Unavailable Test Ratio": f"{len(missing_test_records)}/{len(test_set_names)}",
    })

# Output results
# for result in results:
#     print(f"Scaling Factor: {result['Scaling Factor']:.1f}x")
#     print(f"  Total Train Records: {result['Total Train Records']}")
#     print(f"  Total Test Records: {result['Total Test Records']}")
#     print(f"  Completed Records: {result['Completed Records']}")
#     print(f"  Missing Records: {result['Missing Records']}")
#     if result["Missing Folder Names"]:
#         print(f"  Missing Folder Names: {', '.join(result['Missing Folder Names'])}")
#     print()

# Output verbose results with train/test availability ratios
for verbose in verbose_results:
    print(f"Scaling Factor: {verbose['Scaling Factor']:.1f}x")
    print(f"  Unfinished Train Jobs: {verbose['Unfinished Train Jobs']} ({verbose['Unavailable Train Ratio']})")
    print(f"  Unfinished Test Jobs: {verbose['Unfinished Test Jobs']} ({verbose['Unavailable Test Ratio']})")
    print()


Scaling Factor: 0.1x
  Unfinished Train Jobs: 0 (0/26)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 0.2x
  Unfinished Train Jobs: 0 (0/50)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 0.3x
  Unfinished Train Jobs: 0 (0/75)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 0.4x
  Unfinished Train Jobs: 0 (0/99)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 0.5x
  Unfinished Train Jobs: 0 (0/123)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 0.6x
  Unfinished Train Jobs: 0 (0/148)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 0.7x
  Unfinished Train Jobs: 0 (0/171)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 0.8x
  Unfinished Train Jobs: 0 (0/193)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 0.9x
  Unfinished Train Jobs: 0 (0/215)
  Unfinished Test Jobs: 0 (0/60)

Scaling Factor: 1.0x
  Unfinished Train Jobs: 0 (0/241)
  Unfinished Test Jobs: 0 (0/60)



In [5]:
# detailed_results

In [6]:
import os

# Configuration
output_folder = "jobs_scaling_factors"  # Folder to store YAML files
logs_path = "../logs/rsl_rl"  # Logs directory path for status check
yaml_template = """apiVersion: batch/v1
kind: Job
metadata:
  name: {job_name}
  namespace: ucsd-haosulab
spec:
  ttlSecondsAfterFinished: 604800
  template:
    metadata:
      labels:
        nautilus.io/rl: "true"
    spec:
      containers:
        - name: gpu-container
          image: albert01102/cuda12.4.1_ubuntu22.04_embodiment:isaac-v1.1-nodisplay
          command:
            - "/bin/bash"
            - "-c"
          args:
            - |
              cd /bai-fast-vol/code/embodiment-scaling-law && {command}
          resources:
            requests:
              cpu: "16"
              memory: "160Gi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "32"
              memory: "192Gi"
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: bai-fast-vol
              mountPath: /bai-fast-vol
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: bai-fast-vol
          persistentVolumeClaim:
            claimName: bai-fast-vol
      restartPolicy: Never
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu.product
                    operator: In
                    values:
                      - NVIDIA-GeForce-RTX-4090
                      - NVIDIA-GeForce-RTX-3090
                      - NVIDIA-A100-80GB-PCIe-MIG-1g.10gb   # threaded multi-instance GPU
                      - NVIDIA-A100-PCIE-40GB
                      - NVIDIA-A100-80GB-PCIe
                      - NVIDIA-A100-SXM4-80GB
                      - NVIDIA-RTX-A6000  # 10% weaker than RTX 3090
                      - NVIDIA-A40    # 20% weaker than RTX 3090
  backoffLimit: 0
"""

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Generate YAML files for each scaling factor
for i, result in enumerate(detailed_results):  # Use detailed_results instead of scaled_train_test_sets
    train_set = result["Train Set Names"]
    test_set = result["Test Set Names"]
    # print(train_set)
    
    # Use the long detailed folder names for train and test sets
    train_set_str = " ".join(train_set)
    test_set_str = " ".join(test_set)
    
    # Command to execute training for this scaling factor
    command = (
        f"/workspace/isaaclab/isaaclab.sh -p scripts/rsl_rl/run_distillation.py "
        f"--train_set {train_set_str} "
        f"--test_set {test_set_str} "
        f"--model urma "
        f"--exp_name scaling_factor_{result['Scaling Factor']:.1f}_v3 "
        f"--batch_size 256 "
        f"--lr 3e-4 "
        f"--num_workers 0 "
        f"--max_files_in_memory 8 "
        f"--num_epochs 10 "
        f"--gradient_acc_steps 1 > ../logs_distillation/scaling_factor_{result['Scaling Factor']:.1f}_0107"
    )
    
    # Job name
    job_name = f"bai-distillation-scaling-{i+1}-v3"
    
    # Generate YAML content
    yaml_content = yaml_template.format(job_name=job_name, command=command)
    
    # Write to YAML file
    yaml_file = os.path.join(output_folder, f"{job_name}.yaml")
    with open(yaml_file, "w") as f:
        f.write(yaml_content)

print(f"Generated {len(detailed_results)} job YAML files in '{output_folder}'")


Generated 10 job YAML files in 'jobs_scaling_factors'


In [7]:
# Paths for submission and deletion scripts
submission_script = os.path.join(output_folder, "submit_all_jobs.sh")
deletion_script = os.path.join(output_folder, "delete_all_jobs.sh")

# Get all job files generated
job_files = [f for f in os.listdir(output_folder) if f.endswith(".yaml")]

# Generate submission script
with open(submission_script, "w") as f:
    f.write("#!/bin/bash\n\n")
    for job_file in job_files:
        f.write(f"kubectl create -f {os.path.join(output_folder, job_file)}\n")

# Make the submission script executable
os.chmod(submission_script, 0o755)

# Generate deletion script
with open(deletion_script, "w") as f:
    f.write("#!/bin/bash\n\n")
    for job_file in job_files:
        job_name = job_file.replace(".yaml", "")  # Extract job name from the file name
        f.write(f"kubectl delete job {job_name}\n")

# Make the deletion script executable
os.chmod(deletion_script, 0o755)

print(f"Submission script: {submission_script}")
print(f"Deletion script: {deletion_script}")


Submission script: jobs_scaling_factors/submit_all_jobs.sh
Deletion script: jobs_scaling_factors/delete_all_jobs.sh


In [8]:
!tar -cvf jobs_scaling_factors.tar jobs_scaling_factors

jobs_scaling_factors/
jobs_scaling_factors/bai-distillation-scaling-3-v3.yaml
jobs_scaling_factors/gen_distillation_commands.ipynb
jobs_scaling_factors/bai-distillation-scaling-10-v3.yaml
jobs_scaling_factors/submit_all_jobs.sh
jobs_scaling_factors/bai-distillation-scaling-8-v3.yaml
jobs_scaling_factors/bai-distillation-scaling-5-v3.yaml
jobs_scaling_factors/bai-distillation-scaling-4-v3.yaml
jobs_scaling_factors/bai-distillation-scaling-7-v3.yaml
jobs_scaling_factors/.ipynb_checkpoints/
jobs_scaling_factors/.ipynb_checkpoints/bai-distillation-scaling-2-v3-checkpoint.yaml
jobs_scaling_factors/.ipynb_checkpoints/gen_distillation_commands-checkpoint.ipynb
jobs_scaling_factors/.ipynb_checkpoints/bai-distillation-scaling-10-v3-checkpoint.yaml
jobs_scaling_factors/bai-distillation-scaling-1-v3.yaml
jobs_scaling_factors/bai-distillation-scaling-2-v3.yaml
jobs_scaling_factors/delete_all_jobs.sh
jobs_scaling_factors/bai-distillation-scaling-9-v3.yaml
jobs_scaling_factors/bai-distillation-scali