# Manual Thunder Compute DeepRacer Deployment Guide

## Overview

The Thunder Compute CLI provides a `--files-only` mode that generates all necessary configuration files without creating an instance. This guide uses that mode to deploy a DeepRacer training run manually, in four steps:

1. **Generate Configuration Files**: Create all model and environment files locally
2. **Create Instance Manually**: Set up a Thunder Compute instance with custom specifications
3. **Upload Files**: Transfer configuration and model files to the instance
4. **Start Training**: Launch DeepRacer training with your custom configuration

```bash
poetry run thunder-compute deploy-training time-trail \
  -r rewards/centerline_following/reward_function.py \
  --workers 4 \
  --same-race \
  --files-only \
  --algorithm sac
```

This command generates the files for a 4-worker training session that uses the SAC algorithm with a centerline-following reward function.

## 1. Setup Environment and Dependencies

In [None]:
import os
import json
import subprocess
import time
from pathlib import Path
from typing import Dict, List, Optional

from deepracer_research.utils.logger import info, warning, error

# Absolute path to the local checkout of the research project.
# NOTE(review): hard-coded to one developer's machine — adjust before running elsewhere.
PROJECT_ROOT = Path("/Users/bartmlynarkiewicz/msc/notebooks/deepracer-research")
# Model identifier passed to `thunder-compute deploy-training` and used to build
# the models/ and thunder_files/ directory paths in later cells.
MODEL_ID = "time-trail"
# Reward function handed to the CLI via --reward-function.
REWARD_FUNCTION_PATH = PROJECT_ROOT / "rewards/centerline_following/reward_function.py"

# Log the resolved configuration so a wrong path is visible before any CLI call.
info(f"Project Root: {PROJECT_ROOT}")
info(f"Model ID: {MODEL_ID}")
info(f"Reward Function: {REWARD_FUNCTION_PATH}")
info(f"Reward function exists: {REWARD_FUNCTION_PATH.exists()}")

# Make the project root the working directory of this process; raises if the
# directory does not exist.
os.chdir(PROJECT_ROOT)
info(f"Current working directory: {os.getcwd()}")

In [None]:
def verify_thunder_cli():
    """Report whether `poetry run thunder-compute --help` runs successfully.

    Returns:
        True when the command exits with status 0; False when it exits with a
        non-zero status or when running it raises an exception.
    """
    help_cmd = ["poetry", "run", "thunder-compute", "--help"]
    try:
        completed = subprocess.run(
            help_cmd,
            capture_output=True,
            text=True,
            cwd=PROJECT_ROOT,
        )
        # Guard clause: report the failure together with the CLI's stderr.
        if completed.returncode != 0:
            info("Thunder Compute CLI not accessible")
            error(f"Error: {completed.stderr}")
            return False

        info("Thunder Compute CLI is accessible")
        return True
    except Exception as exc:
        error(f"Error checking Thunder Compute CLI: {exc}")
        return False

def check_api_credentials():
    """Check whether the THUNDER_API_TOKEN environment variable is set.

    The token value itself is never logged: notebook output is persisted with
    the notebook, so even a prefix of the secret would end up on disk (and a
    token shorter than the preview length would be written out in full).
    Only the token length is reported.

    Returns:
        True when the variable holds a non-blank value, False otherwise.
    """
    token = os.getenv("THUNDER_API_TOKEN")
    # A whitespace-only value is as unusable as an unset one.
    if token and token.strip():
        info("THUNDER_API_TOKEN is configured")
        info(f"Token length: {len(token.strip())} characters")
        return True

    error("THUNDER_API_TOKEN environment variable not set")
    error("Set it with: export THUNDER_API_TOKEN=your_token_here")
    return False

# Run both prerequisite checks up front so every problem is reported in one pass.
info("Verifying Thunder Compute setup...")
cli_ok = verify_thunder_cli()
creds_ok = check_api_credentials()

if not (cli_ok and creds_ok):
    warning("Please fix the issues above before proceeding")
else:
    info("Thunder Compute environment is ready!")

## 2. Generate Training Configuration Files

```bash
poetry run thunder-compute deploy-training time-trail \
  -r rewards/centerline_following/reward_function.py \
  --workers 4 \
  --same-race \
  --files-only \
  --algorithm sac
```

### Understanding the CLI Flow

The `--files-only` mode in the CLI performs these key steps:
1. **Creates Essential Model Files**: Generates `hyperparameters.json`, `model_metadata.json`, and `reward_function.py`
2. **Creates S3 Bucket**: Sets up an S3 bucket for model storage
3. **Generates Environment Files**: Creates `system.env`, `run.env` for DeepRacer-for-Cloud
4. **Creates Deployment Info**: Saves metadata about the configuration in `deployment_info.json`

In [None]:
def generate_config_files():
    """Run `thunder-compute deploy-training ... --files-only` for MODEL_ID.

    The command is executed from PROJECT_ROOT with a 300-second timeout and
    requests 4 workers, `--same-race` and the SAC algorithm.

    Returns:
        True when the CLI exits with status 0; False on a non-zero exit, a
        timeout, or any other exception raised while running it.
    """
    cli_args = [
        "poetry", "run", "thunder-compute", "deploy-training", MODEL_ID,
        "--reward-function", str(REWARD_FUNCTION_PATH),
        "--workers", "4",
        "--same-race",
        "--files-only",
        "--algorithm", "sac",
    ]

    info("Generating configuration files...")
    info(f"Command: {' '.join(cli_args)}")
    info("This may take a few moments...")

    try:
        proc = subprocess.run(
            cli_args,
            cwd=PROJECT_ROOT,
            capture_output=True,
            text=True,
            timeout=300,
        )

        # Failure path first; the success path falls through below.
        if proc.returncode != 0:
            info("Error generating configuration files")
            info(f"Error: {proc.stderr}")
            info(f"Output: {proc.stdout}")
            return False

        info("Configuration files generated successfully!")
        info("\nCLI Output:")
        info(proc.stdout)
        return True

    except subprocess.TimeoutExpired:
        info("Command timed out after 5 minutes")
        return False
    except Exception as exc:
        info(f"Error running command: {exc}")
        return False

# Generate the files now; `success` is True only when the CLI exited with status 0.
success = generate_config_files()

In [None]:
def inspect_generated_files():
    """Log the contents of the generated model and Thunder Compute directories.

    For each of `models/<MODEL_ID>` and `thunder_files/<MODEL_ID>` under
    PROJECT_ROOT, every entry name is logged. Small files additionally get a
    200-character preview: `.json` files under 2000 bytes in the models
    directory (re-serialized with indentation), and `.env`/`.json` files under
    1000 bytes in the Thunder files directory (raw text).

    A file that cannot be read or parsed does not abort the listing; the
    failure is logged as a warning instead of being silently discarded.

    Returns:
        Tuple `(models_dir, thunder_files_dir)` of the two inspected paths,
        whether or not they exist.
    """
    models_dir = PROJECT_ROOT / "models" / MODEL_ID
    thunder_files_dir = PROJECT_ROOT / "thunder_files" / MODEL_ID

    info("Inspecting generated files...")
    info(f"Models directory: {models_dir}")
    info(f"Thunder files directory: {thunder_files_dir}")

    if models_dir.exists():
        info(f"Model files in {models_dir}:")
        for entry in sorted(models_dir.glob("*")):
            info(f"{entry.name}")
            if entry.suffix == ".json" and entry.stat().st_size < 2000:
                try:
                    with open(entry, 'r', encoding="utf-8") as f:
                        content = json.load(f)
                    info(f"      Preview: {json.dumps(content, indent=2)[:200]}...")
                except (OSError, ValueError) as exc:
                    # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
                    warning(f"      Could not preview {entry.name}: {exc}")
    else:
        info(f"Models directory not found: {models_dir}")

    if thunder_files_dir.exists():
        info(f"Thunder Compute files in {thunder_files_dir}:")
        for entry in sorted(thunder_files_dir.glob("*")):
            info(f"{entry.name}")
            if entry.suffix in [".env", ".json"] and entry.stat().st_size < 1000:
                try:
                    with open(entry, 'r', encoding="utf-8") as f:
                        content = f.read()
                    info(f"      Preview: {content[:200]}...")
                except (OSError, UnicodeDecodeError) as exc:
                    warning(f"      Could not preview {entry.name}: {exc}")
    else:
        info(f"Thunder files directory not found: {thunder_files_dir}")

    return models_dir, thunder_files_dir

# Keep both paths as notebook globals; the upload cells below take them as arguments.
models_dir, thunder_files_dir = inspect_generated_files()

In [None]:
def test_model_metadata_fix():
    """Regenerate files for a scratch model and check for model_metadata.json.

    Runs the `--files-only` CLI for the model ID "test-metadata" with one
    worker, then looks at the most recently modified directory under
    `PROJECT_ROOT / "models"` (which is not necessarily the one named
    "test-metadata") for a `model_metadata.json` file.

    Returns:
        True when the metadata file exists in that directory; a missing
        "sensor" key is logged but still yields True. False when the CLI
        fails, no model directory exists, the file is missing, or any
        exception is raised.
    """
    info("Testing model_metadata.json generation fix...")

    # Same invocation as the main generation step, but for a throwaway model ID.
    cli_args = [
        "poetry", "run", "thunder-compute", "deploy-training", "test-metadata",
        "--reward-function", str(REWARD_FUNCTION_PATH),
        "--workers", "1",
        "--same-race",
        "--files-only",
        "--algorithm", "sac",
    ]

    info(f"Command: {' '.join(cli_args)}")

    try:
        proc = subprocess.run(
            cli_args,
            cwd=PROJECT_ROOT,
            capture_output=True,
            text=True,
            timeout=300,
        )

        if proc.returncode != 0:
            info(f"❌ Error: {proc.stderr}")
            return False

        info("✅ Configuration files generated successfully!")

        # Collect every sub-directory of models/ and pick the newest by mtime.
        candidates = [
            entry for entry in (PROJECT_ROOT / "models").glob("*") if entry.is_dir()
        ]
        if not candidates:
            info("❌ No model directories found")
            return False

        newest_dir = max(candidates, key=lambda entry: entry.stat().st_mtime)
        metadata_path = newest_dir / "model_metadata.json"

        if not metadata_path.exists():
            info(f"❌ model_metadata.json still missing in {newest_dir.name}")
            info(f"Files found: {list(newest_dir.glob('*'))}")
            return False

        info(f"✅ model_metadata.json found in {newest_dir.name}")

        with open(metadata_path, 'r') as handle:
            metadata = json.load(handle)
        info(f"Content: {json.dumps(metadata, indent=2)}")

        # The sensor check is informational only; it does not affect the result.
        if "sensor" in metadata:
            info(f"✅ Sensor field: {metadata['sensor']}")
        else:
            info("❌ Missing sensor field in metadata")

        return True

    except Exception as exc:
        info(f"❌ Exception: {exc}")
        return False

# Run the metadata check; `test_success` is True when model_metadata.json was
# found in the most recently modified models/ sub-directory.
test_success = test_model_metadata_fix()

## 3. Create Thunder Compute Instance

In [None]:
def create_thunder_instance():
    """Request a new Thunder Compute instance and return its UUID.

    Invokes `thunder-compute create` with the "training" preset, 16 CPU cores,
    an "a100-xl" GPU, a disk size of 200 and `--no-wait`, then scans the CLI
    output for the first line containing "UUID:".

    Returns:
        The text following "UUID:" on that line, or None when the CLI fails,
        times out, raises, or prints no non-empty UUID.
    """
    create_cmd = [
        "poetry", "run", "thunder-compute", "create",
        "--preset", "training",
        "--cpu-cores", "16",
        "--gpu-type", "a100-xl",
        "--disk-size", "200",
        "--no-wait",
    ]

    info("Creating Thunder Compute instance...")
    info(f"Command: {' '.join(create_cmd)}")
    info("This will take 2-3 minutes...")

    try:
        proc = subprocess.run(
            create_cmd,
            cwd=PROJECT_ROOT,
            capture_output=True,
            text=True,
            timeout=300,
        )

        if proc.returncode != 0:
            info("Error creating instance")
            info(f"Error: {proc.stderr}")
            info(f"Output: {proc.stdout}")
            return None

        info("Instance creation initiated!")
        info("\nCLI Output:")
        info(proc.stdout)

        # First line mentioning "UUID:" wins; everything after the marker is the ID.
        parsed_uuid = next(
            (
                line.split("UUID:")[-1].strip()
                for line in proc.stdout.split('\n')
                if "UUID:" in line
            ),
            None,
        )

        if not parsed_uuid:
            info("Could not extract instance UUID from output")
            return None

        info(f"\nInstance UUID: {parsed_uuid}")
        return parsed_uuid

    except subprocess.TimeoutExpired:
        info("Command timed out after 5 minutes")
        return None
    except Exception as exc:
        info(f"Error running command: {exc}")
        return None

# The UUID is kept as a notebook global; every later cell keys off it.
instance_uuid = create_thunder_instance()

if not instance_uuid:
    info("No instance UUID available. You may need to check Thunder Compute manually.")
else:
    info(f"Saving instance UUID for later use: {instance_uuid}")

In [None]:
def wait_for_instance_ready(instance_uuid: str, max_wait_minutes: int = 10):
    """Poll `thunder-compute status` until the instance reports SSH readiness.

    The status command is run every 30 seconds. The instance counts as ready
    when the lower-cased output contains both "running" and "ssh ready", and
    as failed when it contains "failed".

    Args:
        instance_uuid: Identifier of the instance to poll; a falsy value
            returns False immediately.
        max_wait_minutes: Polling budget in minutes. It is checked before
            each poll, so a poll that starts just inside the budget can
            finish after it.

    Returns:
        True once the instance is ready; False on a reported failure, a
        missing UUID, or when the budget is exhausted.
    """
    if not instance_uuid:
        info("No instance UUID provided")
        return False

    info(f"Waiting for instance {instance_uuid[:8]}... to be ready (max {max_wait_minutes} minutes)")

    status_cmd = ["poetry", "run", "thunder-compute", "status", instance_uuid]
    began = time.time()
    budget_seconds = max_wait_minutes * 60

    while time.time() - began < budget_seconds:
        try:
            proc = subprocess.run(
                status_cmd,
                cwd=PROJECT_ROOT,
                capture_output=True,
                text=True,
                timeout=30,
            )

            # A non-zero exit is treated as "not ready yet": wait and retry.
            if proc.returncode == 0:
                report = proc.stdout
                lowered = report.lower()

                if "running" in lowered and "ssh ready" in lowered:
                    info("Instance is ready for SSH connections!")
                    return True
                if "failed" in lowered:
                    info("Instance creation failed")
                    info(report)
                    return False

                # Show whatever follows the last "Status" marker, up to the next '|'.
                if 'Status' in report:
                    summary = report.split('Status')[-1].split('|')[0].strip()
                else:
                    summary = 'checking...'
                info(f"Instance status: {summary}")

            time.sleep(30)

        except Exception as exc:
            info(f"Error checking status: {exc}")
            time.sleep(30)

    info(f"Timeout after {max_wait_minutes} minutes")
    return False

# Only poll when instance creation produced a UUID; otherwise explain how to supply one.
if not instance_uuid:
    info("Skipping instance readiness check - no UUID available")
    info("If you have an existing instance, set instance_uuid manually:")
    info("instance_uuid = 'your-instance-uuid-here'")
else:
    info("Checking instance status...")
    is_ready = wait_for_instance_ready(instance_uuid, max_wait_minutes=10)

    if not is_ready:
        info("Instance may still be starting up. You can check status manually with:")
        info(f"   poetry run thunder-compute status {instance_uuid}")
    else:
        info("Instance is ready for file uploads!")

## 4. Upload Model Files to Instance

In [None]:
def upload_model_files(instance_uuid: str, models_dir: Path):
    """Copy the local model directory to the instance with `thunder-compute upload`.

    The destination is
    `~/deepracer-for-cloud/data/minio/bucket/custom_files/<MODEL_ID>`.

    Args:
        instance_uuid: Target instance; a falsy value aborts the upload.
        models_dir: Local directory to upload; it must exist.

    Returns:
        True when the upload command exits with status 0; False on a failed
        precondition, non-zero exit, timeout (300 s) or other exception.
    """
    if not instance_uuid:
        info("No instance UUID provided")
        return False

    if not models_dir.exists():
        info(f"Models directory not found: {models_dir}")
        return False

    destination = f"~/deepracer-for-cloud/data/minio/bucket/custom_files/{MODEL_ID}"

    info(f"Uploading model files from {models_dir}")
    info(f"Target location: {destination}")

    try:
        upload_cmd = [
            "poetry", "run", "thunder-compute", "upload",
            instance_uuid,
            str(models_dir),
            destination,
        ]

        info(f"Command: {' '.join(upload_cmd)}")

        proc = subprocess.run(
            upload_cmd,
            cwd=PROJECT_ROOT,
            capture_output=True,
            text=True,
            timeout=300,
        )

        if proc.returncode != 0:
            info("Error uploading model files")
            info(f"Error: {proc.stderr}")
            info(f"Output: {proc.stdout}")
            return False

        info("Model files uploaded successfully!")
        info(f"Output: {proc.stdout}")
        return True

    except subprocess.TimeoutExpired:
        info("Upload timed out after 5 minutes")
        return False
    except Exception as exc:
        error(f"Error during upload: {exc}")
        return False

# Upload only when both the instance and the local model directory are available;
# otherwise list the reasons and print the equivalent manual command.
if not instance_uuid or not models_dir.exists():
    info("Skipping model file upload:")
    if not instance_uuid:
        info("   - No instance UUID available")
    if not models_dir.exists():
        info(f"   - Models directory not found: {models_dir}")

    info("You can upload manually later with:")
    info(f"   poetry run thunder-compute upload <instance_uuid> {models_dir} ~/deepracer-for-cloud/data/minio/bucket/custom_files/{MODEL_ID}")
    model_upload_success = False
else:
    info("Starting model file upload...")
    model_upload_success = upload_model_files(instance_uuid, models_dir)

## 5. Upload Configuration Files to Instance

In [None]:
def upload_config_files(instance_uuid: str, thunder_files_dir: Path):
    """Copy the generated configuration directory to `~/deepracer-for-cloud`.

    The `.env` and `.json` files in the directory are listed in the log, but
    the upload command is given the whole directory, not the individual files.

    Args:
        instance_uuid: Target instance; a falsy value aborts the upload.
        thunder_files_dir: Local directory holding the generated files; it
            must exist.

    Returns:
        True when the upload command exits with status 0; False on a failed
        precondition, non-zero exit, timeout (300 s) or other exception.
    """
    if not instance_uuid:
        warning("No instance UUID provided")
        return False

    if not thunder_files_dir.exists():
        warning(f"Thunder files directory not found: {thunder_files_dir}")
        return False

    destination = "~/deepracer-for-cloud"

    info(f"Uploading configuration files from {thunder_files_dir}")
    info(f"Target location: {destination}")

    try:
        env_paths = list(thunder_files_dir.glob("*.env"))
        json_paths = list(thunder_files_dir.glob("*.json"))

        info("Files to upload:")
        for listed in sorted(env_paths + json_paths):
            info(f"   {listed.name}")

        upload_cmd = [
            "poetry", "run", "thunder-compute", "upload",
            instance_uuid,
            str(thunder_files_dir),
            destination,
        ]

        info(f"Command: {' '.join(upload_cmd)}")

        proc = subprocess.run(
            upload_cmd,
            cwd=PROJECT_ROOT,
            capture_output=True,
            text=True,
            timeout=300,
        )

        if proc.returncode != 0:
            info(" Error uploading configuration files")
            info(f"Error: {proc.stderr}")
            info(f"Output: {proc.stdout}")
            return False

        info("Configuration files uploaded successfully!")
        info(f" Output: {proc.stdout}")
        return True

    except subprocess.TimeoutExpired:
        info("Upload timed out after 5 minutes")
        return False
    except Exception as exc:
        info(f"Error during upload: {exc}")
        return False

# Upload only when both the instance and the generated config directory are
# available; otherwise list the reasons and print the equivalent manual command.
if not instance_uuid or not thunder_files_dir.exists():
    info("Skipping configuration file upload:")
    if not instance_uuid:
        info("   - No instance UUID available")
    if not thunder_files_dir.exists():
        info(f"   - Thunder files directory not found: {thunder_files_dir}")

    info("You can upload manually later with:")
    info(f"   poetry run thunder-compute upload <instance_uuid> {thunder_files_dir} ~/deepracer-for-cloud")
    config_upload_success = False
else:
    info("Starting configuration file upload...")
    config_upload_success = upload_config_files(instance_uuid, thunder_files_dir)

## 6. Verify File Upload and Structure


In [None]:
def verify_file_structure(instance_uuid: str):
    """Run a series of read-only checks on the instance to confirm the uploads.

    Each check is executed through `thunder-compute exec` with a 60-second
    timeout: directory listings of `~/deepracer-for-cloud` and the model's
    `custom_files` directory, the first lines of `system.env` and `run.env`,
    and the contents of `model_metadata.json`.

    Args:
        instance_uuid: Instance to inspect; a falsy value returns False.

    Returns:
        True only when every check exits with status 0. All checks are run
        even after one fails, so the log shows the complete picture.
    """
    if not instance_uuid:
        info("No instance UUID provided")
        return False

    verification_commands = [
        "ls -la ~/deepracer-for-cloud/",
        "ls -la ~/deepracer-for-cloud/*.env",
        f"ls -la ~/deepracer-for-cloud/data/minio/bucket/custom_files/{MODEL_ID}/",
        "head -10 ~/deepracer-for-cloud/system.env",
        "head -10 ~/deepracer-for-cloud/run.env",
        f"cat ~/deepracer-for-cloud/data/minio/bucket/custom_files/{MODEL_ID}/model_metadata.json",
    ]

    info(f"Verifying file structure on instance {instance_uuid[:8]}...")

    all_success = True
    for cmd in verification_commands:
        # A real newline separates the checks in the log; the previous "\\n"
        # was logged as a literal backslash followed by "n".
        info(f"\n🔍 Running: {cmd}")

        try:
            exec_cmd = ["poetry", "run", "thunder-compute", "exec", instance_uuid, cmd]

            result = subprocess.run(
                exec_cmd,
                cwd=PROJECT_ROOT,
                capture_output=True,
                text=True,
                timeout=60
            )

            if result.returncode == 0:
                info("Success:")
                info(result.stdout)
            else:
                info("Error:")
                info(result.stderr)
                all_success = False

        except subprocess.TimeoutExpired:
            info("Command timed out")
            all_success = False
        except Exception as e:
            info(f"Error executing command: {e}")
            all_success = False

    return all_success

# Verify only after both uploads succeeded; otherwise point at the manual route.
if instance_uuid and model_upload_success and config_upload_success:
    info("Starting file structure verification...")
    verification_success = verify_file_structure(instance_uuid)

    if verification_success:
        info("File verification completed successfully!")
    else:
        info("Some verification checks failed. Please review the output above.")
else:
    info("Skipping verification - uploads may not have completed successfully")
    info("You can verify manually by SSH'ing to the instance:")
    # Fall back to a placeholder so the hint never reads "ssh None".
    info(f"   poetry run thunder-compute ssh {instance_uuid or '<instance_uuid>'}")
    verification_success = False

## 7. Start Training Session

In [None]:
def start_training_session(instance_uuid: str):
    """Run the DeepRacer-for-Cloud stop/init/start scripts on the instance.

    Every step is a separate `thunder-compute exec` subprocess with a
    120-second timeout. A standalone `cd` therefore presumably does not carry
    over to the following steps (TODO confirm against the CLI), so each script
    is invoked with its own `cd ~/deepracer-for-cloud &&` prefix. The
    standalone `cd` is kept as step 1 as a cheap check that the directory
    exists.

    Failures of the earlier steps are logged but do not abort the sequence;
    only the outcome of the final step (`./bin/start.sh`) decides the result.

    Args:
        instance_uuid: Instance to start training on; a falsy value returns
            False.

    Returns:
        True unless the final step exits non-zero, times out or raises.
    """
    if not instance_uuid:
        info("No instance UUID provided")
        return False

    info(f"Starting DeepRacer training on instance {instance_uuid[:8]}...")

    workdir = "~/deepracer-for-cloud"
    training_commands = [
        f"cd {workdir}",
        f"cd {workdir} && ./bin/stop.sh",
        f"cd {workdir} && ./bin/init.sh",
        f"cd {workdir} && ./bin/start.sh",
    ]
    last_step = len(training_commands)

    for i, cmd in enumerate(training_commands, 1):
        info(f"Step {i}: {cmd}")
        succeeded = False

        try:
            exec_cmd = ["poetry", "run", "thunder-compute", "exec", instance_uuid, cmd]

            result = subprocess.run(
                exec_cmd,
                cwd=PROJECT_ROOT,
                capture_output=True,
                text=True,
                timeout=120
            )

            if result.returncode == 0:
                succeeded = True
                info("Success:")
                # Keep the log readable: show at most the first 500 characters.
                stdout = result.stdout
                info(stdout[:500] + "..." if len(stdout) > 500 else stdout)
            else:
                info("Error:")
                info(result.stderr)
                # Single formatted message, matching every other info() call here.
                info(f"Output: {result.stdout}")

        except subprocess.TimeoutExpired:
            info("Command timed out")
        except Exception as e:
            info(f"Error executing command: {e}")

        if not succeeded and i == last_step:
            return False

    return True

def check_training_status(instance_uuid: str):
    """Log container, training-log and GPU status from the instance.

    Runs `docker ps`, a `tail -20` of the training log and `nvidia-smi`
    through `thunder-compute exec`, each with a 60-second timeout, and logs
    their output. Failures are logged and do not stop the remaining checks.

    Args:
        instance_uuid: Instance to inspect; a falsy value is reported and the
            function returns without running anything.

    Returns:
        None. The results are only written to the log.
    """
    # Same guard as the other helpers; slicing None below would raise TypeError.
    if not instance_uuid:
        info("No instance UUID provided")
        return

    status_commands = [
        "docker ps",
        "tail -20 ~/deepracer-for-cloud/data/logs/training/training.log",
        "nvidia-smi",
    ]

    # Real newlines here: "\\n" was logged as a literal backslash followed by "n".
    info(f"\nChecking training status on instance {instance_uuid[:8]}...")

    for cmd in status_commands:
        info(f"\nRunning: {cmd}")

        try:
            exec_cmd = ["poetry", "run", "thunder-compute", "exec", instance_uuid, cmd]

            result = subprocess.run(
                exec_cmd,
                cwd=PROJECT_ROOT,
                capture_output=True,
                text=True,
                timeout=60
            )

            if result.returncode == 0:
                info(result.stdout)
            else:
                info(f"Error: {result.stderr}")

        except Exception as e:
            error(f"Error executing command: {e}")

# Start training only after the uploaded files were verified on the instance.
if instance_uuid and verification_success:
    info("Starting training session...")
    training_started = start_training_session(instance_uuid)

    if training_started:
        # Real newline: "\\n" was logged as a literal backslash followed by "n".
        info("\nTraining session started!")
        info("Waiting 30 seconds for containers to start up...")
        time.sleep(30)
        check_training_status(instance_uuid)
    else:
        info("Failed to start training session")
else:
    info("Skipping training start - prerequisites not met")
    info("You can start training manually by SSH'ing to the instance and running:")
    info("   cd ~/deepracer-for-cloud")
    info("   ./bin/stop.sh")
    info("   ./bin/init.sh")
    info("   ./bin/start.sh")

## 8. Monitor Training Progress

In [None]:
def monitor_training_progress(instance_uuid: str, duration_minutes: int = 5):
    """Periodically log the tail of the instance's training logs.

    Fetches logs with `thunder-compute logs` (60-second timeout) roughly once
    a minute and logs the last 10 lines of each fetch. Monitoring stops once
    less than a full 60-second wait remains before the deadline.

    Args:
        instance_uuid: Instance to monitor; a falsy value is reported and the
            function returns immediately.
        duration_minutes: Length of the monitoring window in minutes.

    Returns:
        None. The results are only written to the log.
    """
    if not instance_uuid:
        info("No instance UUID provided")
        return

    info(f"Monitoring training progress for {duration_minutes} minutes...")
    info(f"Instance: {instance_uuid[:8]}...")

    start_time = time.time()
    end_time = start_time + (duration_minutes * 60)

    while time.time() < end_time:
        info(f"{time.strftime('%H:%M:%S')} - Checking training status...")

        try:
            cmd = ["poetry", "run", "thunder-compute", "logs", instance_uuid]
            result = subprocess.run(
                cmd,
                cwd=PROJECT_ROOT,
                capture_output=True,
                text=True,
                timeout=60
            )

            if result.returncode == 0:
                # Split on real line breaks. The previous split on the
                # two-character sequence backslash + "n" left the output as
                # one piece, so the whole log was printed every time.
                recent_logs = '\n'.join(result.stdout.splitlines()[-10:])
                info("Recent training logs:")
                info(recent_logs)
            else:
                info(f"Error getting logs: {result.stderr}")

        except Exception as e:
            info(f"Error monitoring: {e}")

        # Sleep only if a full interval still fits before the deadline.
        if time.time() + 60 < end_time:
            info("Waiting 60 seconds...")
            time.sleep(60)
        else:
            break

    info(f"Monitoring completed after {duration_minutes} minutes")

def get_training_artifacts(instance_uuid: str):
    """Log directory listings and disk usage for the training output on the instance.

    Each command is run through `thunder-compute exec` with a 60-second
    timeout; a failing command is logged and the remaining ones still run.

    Args:
        instance_uuid: Instance to inspect; a falsy value is reported and the
            function returns immediately.

    Returns:
        None. The results are only written to the log.
    """
    if not instance_uuid:
        info("No instance UUID provided")
        return

    info(f"Checking training artifacts on instance {instance_uuid[:8]}...")

    data_root = "~/deepracer-for-cloud/data"
    artifact_commands = [
        f"ls -la {data_root}/minio/bucket/models/",
        f"ls -la {data_root}/minio/bucket/mp4/",
        f"ls -la {data_root}/logs/training/",
        f"ls -la {data_root}/logs/robomaker/",
        f"df -h {data_root}/",
    ]
    exec_prefix = ["poetry", "run", "thunder-compute", "exec", instance_uuid]

    for remote_cmd in artifact_commands:
        info(f"Running: {remote_cmd}")

        try:
            proc = subprocess.run(
                exec_prefix + [remote_cmd],
                cwd=PROJECT_ROOT,
                capture_output=True,
                text=True,
                timeout=60,
            )

            if proc.returncode != 0:
                info(f"Error: {proc.stderr}")
            else:
                info(proc.stdout)

        except Exception as exc:
            info(f"Error executing command: {exc}")

# Monitor for five minutes, then list the artifacts; without a UUID, print the
# manual commands instead.
if not instance_uuid:
    info("No instance UUID available for monitoring")
    info("You can monitor manually with:")
    info("   poetry run thunder-compute logs <instance_uuid>")
    info("   poetry run thunder-compute ssh <instance_uuid>")
else:
    info("Starting training monitoring...")
    info("You can stop this monitoring at any time with Ctrl+C")

    try:
        monitor_training_progress(instance_uuid, duration_minutes=5)

        info("Checking training artifacts...")
        get_training_artifacts(instance_uuid)

    except KeyboardInterrupt:
        # Interrupting the cell ends monitoring without a traceback.
        info("Monitoring stopped by user")