In [3]:
#!/usr/bin/env python3
"""
Extract 10 stratified episodes per task, treating every task independently.

For each task, episodes are ranked by train_return (highest first), split into
10 performance tiers, and 1 episode (the middle one of the tier) is selected.
Total: 60 episodes (10 per task × 6 tasks).
"""

import json
import shutil
from pathlib import Path
import numpy as np

# Configuration
# All per-task source folders and the output folder share this root directory.
_PRETRAIN_DATA_ROOT = "/storage/ssd1/richtsai1103/vid2act/pretrain/metaworld/pretrain_data"

# Folder names of the individual tasks, in processing order.
_TASK_FOLDER_NAMES = (
    "dreamer_button-press-topdown",
    "dreamer_door-open",
    "dreamer_drawer-close",
    "dreamer_peg-insert-side",
    "dreamer_pick-place",
    "dreamer_push",
)

# Absolute path of every task folder that is scanned for episodes.
SOURCE_TASK_DIRS = [f"{_PRETRAIN_DATA_ROOT}/{folder}" for folder in _TASK_FOLDER_NAMES]

# Destination root; one sub-folder per task is created below it.
OUTPUT_FOLDER = f"{_PRETRAIN_DATA_ROOT}/stratified_per_task_10"

# Number of performance tiers each task is split into.
N_TIERS_PER_TASK = 10
# Episodes per tier: 1 per tier → 10 total per task.
EPISODES_PER_TIER = 1

def _load_episode_returns(metrics_file):
    """Read ``(episode_index, train_return)`` pairs from a metrics.jsonl file.

    Only records that carry both ``train_episodes`` and ``train_return`` are
    kept. The episode index is ``train_episodes - 1`` (zero-based).
    Blank lines are ignored so a trailing newline does not abort the run.
    """
    episodes = []
    with open(metrics_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            if "train_episodes" in data and "train_return" in data:
                episodes.append((int(data["train_episodes"]) - 1, data["train_return"]))
    return episodes


def _select_from_tiers(episodes, n_tiers, per_tier):
    """Return episode indices picked from the middle of each performance tier.

    Episodes are ranked by return (descending, stable for ties) and cut into
    ``n_tiers`` contiguous tiers whose sizes differ by at most one. Tiers that
    end up empty (fewer episodes than tiers) are skipped, so every available
    episode can still be selected instead of collapsing into the last tier.
    """
    ranked = sorted(episodes, key=lambda item: item[1], reverse=True)
    total = len(ranked)
    selected = []
    for tier_idx in range(n_tiers):
        # Balanced boundaries; identical to fixed-size tiers when total % n_tiers == 0.
        tier = ranked[tier_idx * total // n_tiers:(tier_idx + 1) * total // n_tiers]
        if not tier:
            continue
        take = min(per_tier, len(tier))
        # Centered window; for take == 1 this is the element at len(tier) // 2.
        first = (len(tier) - take + 1) // 2
        selected.extend(ep_idx for ep_idx, _ in tier[first:first + take])
    return selected


def main(source_task_dirs=None, output_folder=None, n_tiers=None, episodes_per_tier=None):
    """Copy a per-task stratified sample of episode files into the output folder.

    For every task directory the function reads ``metrics.jsonl``, ranks the
    episodes by ``train_return``, splits them into tiers, picks the middle
    episode(s) of each tier and copies the matching ``*.npz`` files from
    ``train_eps`` into ``<output_folder>/<task>``. Tasks missing either the
    metrics file or the ``train_eps`` folder are reported and skipped.

    Args:
        source_task_dirs: Iterable of task directories; defaults to
            ``SOURCE_TASK_DIRS``.
        output_folder: Destination root; defaults to ``OUTPUT_FOLDER``.
        n_tiers: Tiers per task; defaults to ``N_TIERS_PER_TASK``.
        episodes_per_tier: Episodes taken from each tier; defaults to
            ``EPISODES_PER_TIER``.

    Raises:
        ValueError: If ``n_tiers`` or ``episodes_per_tier`` is smaller than 1.
    """
    if source_task_dirs is None:
        source_task_dirs = SOURCE_TASK_DIRS
    if output_folder is None:
        output_folder = OUTPUT_FOLDER
    if n_tiers is None:
        n_tiers = N_TIERS_PER_TASK
    if episodes_per_tier is None:
        episodes_per_tier = EPISODES_PER_TIER
    if n_tiers < 1 or episodes_per_tier < 1:
        raise ValueError("n_tiers and episodes_per_tier must both be >= 1")

    print("Per-Task Stratified Extraction")
    print(f"Configuration: Each task gets {n_tiers * episodes_per_tier} episodes\n")

    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    total_copied = 0

    for task_dir in source_task_dirs:
        task_name = Path(task_dir).name
        # Drop the first underscore-separated token (the 'dreamer_' prefix).
        task_short = "_".join(task_name.split("_")[1:])

        metrics_file = Path(task_dir) / "metrics.jsonl"
        train_eps_folder = Path(task_dir) / "train_eps"

        if not metrics_file.exists() or not train_eps_folder.exists():
            print(f"⚠ {task_short}: Missing files, skipping")
            continue

        episodes = _load_episode_returns(metrics_file)

        print(f"{task_short}:")
        print(f"  Total episodes: {len(episodes)}")
        plural = "" if episodes_per_tier == 1 else "s"
        print(f"  Extracting {episodes_per_tier} episode{plural} from each of {n_tiers} tiers")

        selected_episodes = _select_from_tiers(episodes, n_tiers, episodes_per_tier)

        # Copy selected episodes
        task_output_folder = output_path / task_short
        task_output_folder.mkdir(parents=True, exist_ok=True)

        # NOTE(review): an episode index is mapped to a file by its position in
        # the sorted file list — assumes filename order matches episode order;
        # confirm against how train_eps is written.
        all_npz_files = sorted(train_eps_folder.glob("*.npz"))
        copied = 0

        # De-duplicate so a repeated index is neither copied nor counted twice.
        for ep_idx in sorted(set(selected_episodes)):
            # Reject negatives too: a plain "< len" test would let -1 silently
            # pick the last file.
            if not 0 <= ep_idx < len(all_npz_files):
                print(f"  ⚠ Episode index {ep_idx} has no matching .npz file, skipping")
                continue
            src = all_npz_files[ep_idx]
            shutil.copy2(src, task_output_folder / src.name)
            copied += 1

        total_copied += copied
        print(f"  ✓ Copied {copied} episodes\n")

    print(f"TOTAL: {total_copied} episodes across all tasks")
    print(f"Output: {output_path}")

# Entry point: call main() only when this module is run as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()

Per-Task Stratified Extraction
Configuration: Each task gets 10 episodes

button-press-topdown:
  Total episodes: 1740
  Extracting 1 episode from each of 10 tiers
  ✓ Copied 10 episodes

door-open:
  Total episodes: 1740
  Extracting 1 episode from each of 10 tiers
  ✓ Copied 10 episodes

drawer-close:
  Total episodes: 1740
  Extracting 1 episode from each of 10 tiers
  ✓ Copied 10 episodes

peg-insert-side:
  Total episodes: 1740
  Extracting 1 episode from each of 10 tiers
  ✓ Copied 10 episodes

pick-place:
  Total episodes: 1740
  Extracting 1 episode from each of 10 tiers
  ✓ Copied 10 episodes

push:
  Total episodes: 1740
  Extracting 1 episode from each of 10 tiers
  ✓ Copied 10 episodes

TOTAL: 60 episodes across all tasks
Output: /storage/ssd1/richtsai1103/vid2act/pretrain/metaworld/pretrain_data/stratified_per_task_10
