In [1]:
# First, define the training and test set

import random

# Total number of indices
total_indices = 308

# Set random seed for reproducibility
random.seed(0)

# Calculate the size of the test set
test_size = int(total_indices * 0.2)

# Create a list of all indices
all_indices = list(range(total_indices))

# Sample the test set indices
test_indices = random.sample(all_indices, test_size)

# Get the training set by excluding the test indices
train_indices = [idx for idx in all_indices if idx not in test_indices]

# Print the results
print("Test Set:", test_indices)
print("Train Set:", train_indices)


Test Set: [197, 215, 20, 132, 261, 248, 207, 155, 244, 183, 298, 111, 258, 71, 144, 48, 128, 272, 75, 158, 50, 37, 169, 241, 286, 51, 181, 222, 161, 104, 282, 226, 266, 133, 31, 280, 7, 47, 204, 0, 252, 170, 124, 166, 32, 97, 290, 113, 122, 72, 278, 229, 46, 41, 163, 260, 250, 55, 154, 149, 63]
Train Set: [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 38, 39, 40, 42, 43, 44, 45, 49, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 112, 114, 115, 116, 117, 118, 119, 120, 121, 123, 125, 126, 127, 129, 130, 131, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 150, 151, 152, 153, 156, 157, 159, 160, 162, 164, 165, 167, 168, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 19

In [2]:
# Scaling factors from 0.1x to 1.0x
scaling_factors = [i / 10 for i in range(1, 11)]

# Generate scaled train and test sets
scaled_train_test_sets = []
for scale in scaling_factors:
    # Scale the train and test set sizes
    scaled_train_size = int(len(train_indices) * scale)
    scaled_test_size = int(len(test_indices) * scale)
    
    # Sample scaled train and test sets deterministically
    random.seed(int(scale * 100))  # Change seed based on scale for reproducibility
    scaled_train_indices = random.sample(train_indices, scaled_train_size)
    # scaled_test_indices = random.sample(test_indices, scaled_test_size)
    scaled_test_indices = test_indices  # the same test indices for all train set
    
    # Store the scaled train and test sets
    scaled_train_test_sets.append((scaled_train_indices, scaled_test_indices))

# Output the scaled train and test sets
for i, (scaled_train, scaled_test) in enumerate(scaled_train_test_sets):
    print(f"Scaling Factor {scaling_factors[i]:.1f}x:")
    print(f"  Train Set ({len(scaled_train)}): {scaled_train}")
    print(f"  Test Set ({len(scaled_test)}): {scaled_test}")
    print()

Scaling Factor 0.1x:
  Train Set (24): [184, 10, 135, 151, 185, 4, 66, 145, 259, 153, 263, 88, 208, 257, 53, 306, 167, 297, 101, 22, 80, 235, 112, 13]
  Test Set (61): [197, 215, 20, 132, 261, 248, 207, 155, 244, 183, 298, 111, 258, 71, 144, 48, 128, 272, 75, 158, 50, 37, 169, 241, 286, 51, 181, 222, 161, 104, 282, 226, 266, 133, 31, 280, 7, 47, 204, 0, 252, 170, 124, 166, 32, 97, 290, 113, 122, 72, 278, 229, 46, 41, 163, 260, 250, 55, 154, 149, 63]

Scaling Factor 0.2x:
  Train Set (49): [291, 230, 217, 249, 242, 283, 307, 45, 83, 213, 201, 271, 301, 28, 279, 101, 184, 56, 8, 129, 127, 22, 29, 38, 99, 148, 186, 141, 285, 67, 65, 277, 199, 216, 103, 102, 135, 26, 268, 165, 256, 153, 126, 23, 66, 289, 79, 10, 270]
  Test Set (61): [197, 215, 20, 132, 261, 248, 207, 155, 244, 183, 298, 111, 258, 71, 144, 48, 128, 272, 75, 158, 50, 37, 169, 241, 286, 51, 181, 222, 161, 104, 282, 226, 266, 133, 31, 280, 7, 47, 204, 0, 252, 170, 124, 166, 32, 97, 290, 113, 122, 72, 278, 229, 46, 41, 163, 26

In [3]:
import os
import re

def check_record_status(logs_path):
    completed_records = set()
    completed_names = set()
    missing_h5_record = set()
    incomplete_records = set()
    
    # traverse all sub-folders in logs/rsl_rl
    for subdir in os.listdir(logs_path):
        subdir_path = os.path.join(logs_path, subdir)
        if os.path.isdir(subdir_path):
            # Traverse  the latest folder
            match = re.match(r"(Gendog\d+)_", subdir)
            # import ipdb; ipdb.set_trace()
            if match:
                prefix_name = match.group(1)
                # Traverse sub-folders of all time stamps
                time_subdirs = [d for d in os.listdir(subdir_path) if os.path.isdir(os.path.join(subdir_path, d))]
                record_found = False
                for time_subdir in time_subdirs:
                    time_subdir_path = os.path.join(subdir_path, time_subdir)
                    h5py_record_path = os.path.join(time_subdir_path, "h5py_record")
                    if os.path.exists(h5py_record_path):
                        obs_file = os.path.join(h5py_record_path, "obs_actions_00002.h5")
                        if os.path.exists(obs_file):
                            completed_records.add(prefix_name)
                            completed_names.add(subdir)
                            record_found = True
                            break

                if not record_found:
                    if not any(os.path.exists(os.path.join(os.path.join(subdir_path, ts), "h5py_record")) for ts in time_subdirs):
                        missing_h5_record.add(prefix_name)
                    else:
                        incomplete_records.add(prefix_name)

    # Sort based on the results
    def sort_by_number(prefix_list):
        return sorted(prefix_list, key=lambda x: int(re.search(r"\d+", x).group()))

    return {
        "Completed Records": sort_by_number(list(completed_records)),
        "Completed Records Names": sort_by_number(list(completed_names)),
        "Missing h5py_record": sort_by_number(list(missing_h5_record)),
        "Incomplete Records": sort_by_number(list(incomplete_records))
    }

# Example
logs_path = "../logs/rsl_rl"  # Modify to the actual directory
record_status = check_record_status(logs_path)

# Output the results
print("Completed Records:")
print(record_status["Completed Records"])
print("len:", len(record_status["Completed Records"]))

print("\nFailed Records:")
print(record_status["Missing h5py_record"])
print("\nIncomplete Records:")
print(record_status["Incomplete Records"])


Completed Records:
['Gendog0', 'Gendog1', 'Gendog2', 'Gendog3', 'Gendog4', 'Gendog5', 'Gendog6', 'Gendog7', 'Gendog8', 'Gendog9', 'Gendog10', 'Gendog11', 'Gendog12', 'Gendog13', 'Gendog14', 'Gendog15', 'Gendog16', 'Gendog17', 'Gendog18', 'Gendog19', 'Gendog20', 'Gendog21', 'Gendog22', 'Gendog23', 'Gendog24', 'Gendog25', 'Gendog26', 'Gendog27', 'Gendog28', 'Gendog29', 'Gendog30', 'Gendog31', 'Gendog32', 'Gendog33', 'Gendog34', 'Gendog35', 'Gendog36', 'Gendog37', 'Gendog38', 'Gendog39', 'Gendog40', 'Gendog41', 'Gendog42', 'Gendog43', 'Gendog44', 'Gendog45', 'Gendog46', 'Gendog47', 'Gendog48', 'Gendog49', 'Gendog54', 'Gendog55', 'Gendog56', 'Gendog57', 'Gendog58', 'Gendog59', 'Gendog60', 'Gendog61', 'Gendog63', 'Gendog64', 'Gendog66', 'Gendog67', 'Gendog68', 'Gendog70', 'Gendog71', 'Gendog72', 'Gendog74', 'Gendog75', 'Gendog76', 'Gendog78', 'Gendog79', 'Gendog80', 'Gendog82', 'Gendog83', 'Gendog84', 'Gendog85', 'Gendog86', 'Gendog87', 'Gendog88', 'Gendog89', 'Gendog90', 'Gendog91', 'Gendo

In [4]:
# Assuming `record_status` is already generated using the provided code
completed_records = set(record_status["Completed Records"])
completed_folder_names = record_status["Completed Records Names"]

# Store folder names for completed and missing records
detailed_results = []

# Check for missing records in each test set
results = []
for i, (train_set, test_set) in enumerate(scaled_train_test_sets):
    # Map indices to detailed folder names
    train_set_names = [name for name in completed_folder_names if int(name.split("_")[0].replace("Gendog", "")) in train_set]
    test_set_names = [name for name in completed_folder_names if int(name.split("_")[0].replace("Gendog", "")) in test_set]
    
    # Find missing and completed records for this test set
    missing_records = set(test_set_names) - completed_records
    completed_in_test = set(test_set_names) & completed_records
    
    # Collect folder names
    completed_folders = [name for name in test_set_names if name in completed_records]
    missing_folders = [name for name in test_set_names if name not in completed_records]
    
    # Store results for this scaling factor
    results.append({
        "Scaling Factor": scaling_factors[i],
        "Total Test Records": len(test_set_names),
        "Completed Records": len(completed_folders),
        "Missing Records": len(missing_folders),
        "Completed Folder Names": sorted(completed_folders),
        "Missing Folder Names": sorted(missing_folders),
    })

    # Store train and test sets for later use
    detailed_results.append({
        "Scaling Factor": scaling_factors[i],
        "Train Set Names": train_set_names,  # Store train set folder names
        "Test Set Names": test_set_names,    # Store test set folder names
        "Completed Folder Names": completed_folders,
        "Missing Folder Names": missing_folders,
    })

# Output results
for result in results:
    print(f"Scaling Factor: {result['Scaling Factor']:.1f}x")
    print(f"  Total Test Records: {result['Total Test Records']}")
    print(f"  Completed Records: {result['Completed Records']}")
    print(f"  Missing Records: {result['Missing Records']}")
    if result["Missing Folder Names"]:
        print(f"  Missing Folder Names: {', '.join(result['Missing Folder Names'])}")
    print()


Scaling Factor: 0.1x
  Total Test Records: 48
  Completed Records: 0
  Missing Records: 48
  Missing Folder Names: Gendog0_gendog__KneeNum_fl0_fr0_rl0_rr0__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_scale_all_1_0, Gendog104_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl1_fr0_rl1_rr0_0_8__Geo_lengthen_thigh_0_8, Gendog111_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_scale_all_1_2, Gendog113_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_lengthen_thigh_1_6, Gendog124_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl1_fr0_rl0_rr0_1_2__Geo_lengthen_thigh_1_6, Gendog128_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl1_fr0_rl0_rr0_1_2__Geo_lengthen_calf_1_6, Gendog132_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl1_fr0_rl0_rr0_0_8__Geo_scale_all_1_0, Gendog144_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl1_fr1_rl0_rr0_1_2__Geo_scale_all_1_2, Gendog170_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl0_fr0_rl1_rr1_1

In [5]:
detailed_results

[{'Scaling Factor': 0.1,
  'Train Set Names': ['Gendog4_gendog__KneeNum_fl0_fr0_rl0_rr0__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_lengthen_thigh_1_2',
   'Gendog10_gendog__KneeNum_fl0_fr0_rl0_rr0__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_lengthen_calf_0_4',
   'Gendog13_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_scale_all_0_8',
   'Gendog22_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl1_fr0_rl0_rr0_1_2__Geo_scale_all_1_0',
   'Gendog66_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl0_fr0_rl1_rr1_1_2__Geo_scale_all_1_0',
   'Gendog80_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl0_fr0_rl1_rr1_0_8__Geo_lengthen_thigh_1_6',
   'Gendog88_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl1_fr0_rl1_rr0_1_2__Geo_scale_all_1_0',
   'Gendog101_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl1_fr0_rl1_rr0_0_8__Geo_scale_all_0_8',
   'Gendog112_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_scale_all_0_8',
   'Gendog135_

In [6]:
import os

# Configuration
output_folder = "jobs_scaling_factors"  # Folder to store YAML files
logs_path = "../logs/rsl_rl"  # Logs directory path for status check
yaml_template = """apiVersion: batch/v1
kind: Job
metadata:
  name: {job_name}
  namespace: ucsd-haosulab
spec:
  ttlSecondsAfterFinished: 604800
  template:
    metadata:
      labels:
        nautilus.io/rl: "true"
    spec:
      containers:
        - name: gpu-container
          image: albert01102/cuda12.4.1_ubuntu22.04_embodiment:isaac-v1.1-nodisplay
          command:
            - "/bin/bash"
            - "-c"
          args:
            - |
              cd /bai-fast-vol/code/embodiment-scaling-law && {command}
          resources:
            requests:
              cpu: "16"
              memory: "64Gi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "16"
              memory: "64Gi"
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: bai-fast-vol
              mountPath: /bai-fast-vol
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: bai-fast-vol
          persistentVolumeClaim:
            claimName: bai-fast-vol
      restartPolicy: Never
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu.product
                    operator: In
                    values:
                      - NVIDIA-GeForce-RTX-4090
                      - NVIDIA-GeForce-RTX-3090
                      - NVIDIA-RTX-A6000
                      - NVIDIA-A10
  backoffLimit: 0
"""

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Generate YAML files for each scaling factor
for i, result in enumerate(detailed_results):  # Use detailed_results instead of scaled_train_test_sets
    train_set = result["Train Set Names"]
    test_set = result["Test Set Names"]
    print(train_set)
    
    # Use the long detailed folder names for train and test sets
    train_set_str = " ".join(train_set)
    test_set_str = " ".join(test_set)
    
    # Command to execute training for this scaling factor
    command = (
        f"/workspace/isaaclab/isaaclab.sh -p scripts/rsl_rl/run_distillation.py "
        f"--train_set {train_set_str} "
        f"--test_set {test_set_str} "
        f"--model urma "
        f"--exp_name scaling_factor_{result['Scaling Factor']:.1f} "
        f"--batch_size 512 "
        f"--lr 3e-4 "
        f"--num_workers 16 "
        f"--num_epochs 50 "
        f"--gradient_acc_steps 2"
    )
    
    # Job name
    job_name = f"bai-distillation-scaling-{i+1}"
    
    # Generate YAML content
    yaml_content = yaml_template.format(job_name=job_name, command=command)
    
    # Write to YAML file
    yaml_file = os.path.join(output_folder, f"{job_name}.yaml")
    with open(yaml_file, "w") as f:
        f.write(yaml_content)

print(f"Generated {len(detailed_results)} job YAML files in '{output_folder}'")


['Gendog4_gendog__KneeNum_fl0_fr0_rl0_rr0__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_lengthen_thigh_1_2', 'Gendog10_gendog__KneeNum_fl0_fr0_rl0_rr0__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_lengthen_calf_0_4', 'Gendog13_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_scale_all_0_8', 'Gendog22_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl1_fr0_rl0_rr0_1_2__Geo_scale_all_1_0', 'Gendog66_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl0_fr0_rl1_rr1_1_2__Geo_scale_all_1_0', 'Gendog80_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl0_fr0_rl1_rr1_0_8__Geo_lengthen_thigh_1_6', 'Gendog88_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl1_fr0_rl1_rr0_1_2__Geo_scale_all_1_0', 'Gendog101_gendog__KneeNum_fl1_fr1_rl1_rr1__ScaleJointLimit_fl1_fr0_rl1_rr0_0_8__Geo_scale_all_0_8', 'Gendog112_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl0_fr0_rl0_rr0_1_0__Geo_scale_all_0_8', 'Gendog135_gendog__KneeNum_fl2_fr2_rl2_rr2__ScaleJointLimit_fl1_fr0_rl0_rr0_0_8__Geo

In [7]:
# Paths for submission and deletion scripts
submission_script = os.path.join(output_folder, "submit_all_jobs.sh")
deletion_script = os.path.join(output_folder, "delete_all_jobs.sh")

# Get all job files generated
job_files = [f for f in os.listdir(output_folder) if f.endswith(".yaml")]

# Generate submission script
with open(submission_script, "w") as f:
    f.write("#!/bin/bash\n\n")
    for job_file in job_files:
        f.write(f"kubectl create -f {os.path.join(output_folder, job_file)}\n")

# Make the submission script executable
os.chmod(submission_script, 0o755)

# Generate deletion script
with open(deletion_script, "w") as f:
    f.write("#!/bin/bash\n\n")
    for job_file in job_files:
        job_name = job_file.replace(".yaml", "")  # Extract job name from the file name
        f.write(f"kubectl delete job {job_name}\n")

# Make the deletion script executable
os.chmod(deletion_script, 0o755)

print(f"Submission script: {submission_script}")
print(f"Deletion script: {deletion_script}")


Submission script: jobs_scaling_factors/submit_all_jobs.sh
Deletion script: jobs_scaling_factors/delete_all_jobs.sh
