# YOLO Hyperparameter Tuning Job

This notebook creates a Databricks job to run the YOLO multi-GPU training notebook with hyperparameter optimization using Optuna.


## Parameters

Define job configuration parameters.


In [None]:
%pip install -U databricks-sdk optuna
%restart_python

In [None]:
# --- Job identity and workspace locations ---
job_name = "finetune_yolo_job"
notebook_path = "/Workspace/Users/brian.law@databricks.com/.bundle/image-detection-and-tracking-on-db/dev/files/advanced_examples/finetune_ultralytics_yolo_multi_gpu"
init_script_path = "/Workspace/Users/brian.law@databricks.com/.bundle/image-detection-and-tracking-on-db/dev/files/scripts/init_script_ultralytics.sh"
max_concurrent_runs = 2

# Where the training notebook writes its metrics (must match the notebook's logging_vol_path)
metrics_volume_path = "/Volumes/brian_ml_dev/image_processing/training"

# --- Cluster settings ---
spark_version = "17.3.x-scala2.13"  # MLR 17.3 LTS
node_type = "Standard_NC48ads_A100_v4"  # Azure NC48ads with 2x A100 GPUs

# Echo the effective configuration so it is recorded in the cell output.
print("Job Configuration:")
for label, value in (
    ("Job Name", job_name),
    ("Notebook", notebook_path),
    ("Init Script", init_script_path),
    ("Runtime", spark_version),
    ("Node Type", node_type),
    ("Max Concurrent Runs", max_concurrent_runs),
    ("Metrics Path", metrics_volume_path),
):
    print(f"  {label}: {value}")


## Import Required Libraries

Load the Databricks SDK for job management.


In [None]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import jobs, compute
from databricks.sdk.service.compute import Kind


## Initialize Workspace Client

Create a client to interact with the Databricks workspace.


In [None]:
# SDK client shared by every later cell (job lookup, creation, run triggering, polling).
# NOTE(review): no credentials are passed explicitly - presumably authentication
# comes from the notebook environment; confirm when running outside Databricks.
w = WorkspaceClient()
print("Workspace client initialized")


## Check if Job Already Exists

Search for existing jobs with the same name.


In [None]:
existing_jobs = w.jobs.list(name=job_name)

# First listed job whose name matches exactly, or None when nothing matches.
existing_job = next(
    (candidate for candidate in existing_jobs if candidate.settings.name == job_name),
    None,
)

if existing_job:
    lookup_message = f"Job '{job_name}' already exists with ID: {existing_job.job_id}"
else:
    lookup_message = f"Job '{job_name}' does not exist"
print(lookup_message)


## Create Job if Not Exists

Create the job configuration with notebook task and parallelism settings.


In [None]:
# Create the job only when the lookup cell found no job with this name; otherwise
# leave the existing definition untouched.
if not existing_job:
    # Default parameters matching YOLO multi-GPU notebook
    base_parameters = {
        "epochs": "2",
        "batch_size": "128",
        "img_size": "640",
        "initial_lr": "0.005",
        "final_lr": "0.1",
        "device_config": "[0,1]",
        "run_name": "multi_gpu_run"
    }

    # Cluster configuration for YOLO training
    cluster_config = compute.ClusterSpec(
        spark_version=spark_version,
        node_type_id=node_type,
        num_workers=0,  # Single-node cluster (driver-only)
        driver_node_type_id=node_type,
        # Referenced through the imported `compute` module: the bare name
        # `DataSecurityMode` is never imported in this notebook.
        data_security_mode=compute.DataSecurityMode.SINGLE_USER,
        kind=Kind.CLASSIC_PREVIEW,
        use_ml_runtime=True,
        spark_conf={
            "spark.databricks.cluster.profile": "singleNode",
            "spark.master": "local[*, 4]"
        },
        custom_tags={
            "ResourceClass": "SingleNode"
        },
        init_scripts=[
            compute.InitScriptInfo(
                workspace=compute.WorkspaceStorageInfo(
                    destination=init_script_path
                )
            )
        ]
    )

    # One notebook task on a fresh job cluster; max_concurrent_runs caps how many
    # runs of this job may be active at the same time.
    created_job = w.jobs.create(
        name=job_name,
        tasks=[
            jobs.Task(
                task_key="finetune_task",
                notebook_task=jobs.NotebookTask(
                    notebook_path=notebook_path,
                    source=jobs.Source.WORKSPACE,
                    base_parameters=base_parameters
                ),
                new_cluster=cluster_config
            )
        ],
        max_concurrent_runs=max_concurrent_runs
    )

    print(f"\n✅ Job '{job_name}' created successfully")
    print(f"Job ID: {created_job.job_id}")
    print(f"Max concurrent runs: {max_concurrent_runs}")
    print("\nCluster Configuration:")
    print(f"  Runtime: {spark_version}")
    print(f"  Node Type: {node_type}")
    print("  Workers: 0 (Single Node)")
    print(f"  Init Script: {init_script_path.split('/')[-1]}")
    print("\nDefault parameters:")
    for key, value in base_parameters.items():
        print(f"  {key}: {value}")
else:
    print(f"Job '{job_name}' already exists, skipping creation")


## Display Job Details

Show the job configuration details.


In [None]:
# Always re-fetch the full job definition. `existing_job` comes from jobs.list(),
# which was called without expand_tasks=True, so its settings may carry no task
# details and indexing `settings.tasks[0]` on it would fail.
final_job_id = existing_job.job_id if existing_job else created_job.job_id
final_job = w.jobs.get(final_job_id)

job_tasks = final_job.settings.tasks or []
task = job_tasks[0] if job_tasks else None

print("\nJob Configuration:")
print(f"  Name: {final_job.settings.name}")
print(f"  Job ID: {final_job.job_id}")
print(f"  Max Concurrent Runs: {final_job.settings.max_concurrent_runs}")

if task is None:
    print("  (job has no tasks defined)")
else:
    notebook_task = task.notebook_task
    if notebook_task:
        print(f"  Notebook Path: {notebook_task.notebook_path}")

    # Display cluster configuration
    if task.new_cluster:
        print("\nCluster Configuration:")
        print(f"  Spark Version: {task.new_cluster.spark_version}")
        print(f"  Node Type: {task.new_cluster.node_type_id}")
        print(f"  Workers: {task.new_cluster.num_workers}")
        if task.new_cluster.init_scripts:
            print(f"  Init Scripts: {len(task.new_cluster.init_scripts)} configured")

    if notebook_task and notebook_task.base_parameters:
        print("\nDefault Parameters:")
        for key, value in notebook_task.base_parameters.items():
            print(f"  {key}: {value}")


## Trigger Job Run

Start a new run of the job.


In [None]:
job_id = final_job.job_id

# Parameters for a short test run of the job; values are passed as strings.
test_params = dict(
    epochs="2",
    batch_size="128",
    img_size="640",
    initial_lr="0.005",
    final_lr="0.1",
    device_config="[0,1]",
    run_name="test_run",
)

run = w.jobs.run_now(job_id=job_id, notebook_params=test_params)

print("\nJob run triggered successfully")
print(f"Run ID: {run.run_id}")
print("Parameters:")
print("\n".join(f"  {name}: {setting}" for name, setting in test_params.items()))


## Monitor Run Status

Poll the triggered run until it reaches a terminal state, then print its final result and any notebook output.


In [None]:
import time
import json
from databricks.sdk.service.jobs import RunLifeCycleState

# Life-cycle states this cell treats as "the run is over".
terminal_states = (
    RunLifeCycleState.TERMINATED,
    RunLifeCycleState.SKIPPED,
    RunLifeCycleState.INTERNAL_ERROR,
)

print(f"Monitoring run {run.run_id}...")
print("Status updates:")

# Poll every 10 seconds, echoing each observed state, until a terminal one shows up.
run_status = w.jobs.get_run(run.run_id)
state = run_status.state.life_cycle_state
print(f"  {state}")
while state not in terminal_states:
    time.sleep(10)
    run_status = w.jobs.get_run(run.run_id)
    state = run_status.state.life_cycle_state
    print(f"  {state}")

result_state = run_status.state.result_state
print(f"\nFinal State: {result_state}")

# Best effort: a missing or non-JSON notebook result is reported, not raised.
try:
    task_runs = run_status.tasks
    if task_runs and len(task_runs) > 0:
        task_run_id = task_runs[0].run_id
        print(f"Retrieving output from task run: {task_run_id}")

        run_output = w.jobs.get_run_output(task_run_id)
        notebook_output = run_output.notebook_output
        if notebook_output and notebook_output.result:
            print("\nNotebook Output:")
            metrics = json.loads(notebook_output.result)
            print(json.dumps(metrics, indent=2))
except Exception as e:
    print(f"\nCould not retrieve notebook output: {e}")


# Hyperparameter Optimization with Optuna

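Use Optuna to find the optimal YOLO hyperparameters (batch size, learning rates) by launching each trial as a run of the training job. Trials run one at a time with the default `n_jobs=1`; raise `n_jobs` (up to the job's `max_concurrent_runs`) to run them in parallel.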


## Install and Import Optuna

Import Optuna (installed by the `%pip` cell at the top of the notebook) for hyperparameter optimization.


In [None]:
# Optuna is installed by the %pip cell at the top of this notebook.
import optuna

# Print the version so the cell output records which Optuna release ran the study.
print(f"Optuna version: {optuna.__version__}")


## Define Objective Function

Create a function that runs a training job with different hyperparameters and returns the metric to optimize.


In [None]:
# job_id = 281960318776979

def objective(trial):
    """Train YOLO once with trial-sampled hyperparameters and return validation mAP@50.

    Samples ``batch_size``, ``initial_lr`` and ``final_lr`` from ``trial``, triggers a
    run of job ``job_id`` with those values as notebook parameters, polls every
    30 seconds until the run reaches a terminal life-cycle state, and then reads
    ``{metrics_volume_path}/metrics_<run_id>.json``.

    Relies on the notebook globals ``w``, ``jobs``, ``job_id``, ``optuna`` and
    ``metrics_volume_path``.

    Args:
        trial: The Optuna trial that supplies the hyperparameter suggestions.

    Returns:
        The ``val_mAP50`` entry of the metrics file (``0.0`` when the key is absent).

    Raises:
        optuna.TrialPruned: If the run does not finish with ``SUCCESS`` or the
            metrics file cannot be read; the trial is recorded as pruned rather
            than aborting the whole study.
    """
    import json
    import time
    import traceback

    # Referenced through the `jobs` module so this function does not depend on the
    # monitoring cell (which imports RunLifeCycleState) having been executed.
    terminal_states = (
        jobs.RunLifeCycleState.TERMINATED,
        jobs.RunLifeCycleState.SKIPPED,
        jobs.RunLifeCycleState.INTERNAL_ERROR,
    )

    # Sample YOLO hyperparameters
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    initial_lr = trial.suggest_float("initial_lr", 0.001, 0.01, log=True)
    final_lr = trial.suggest_float("final_lr", 0.01, 0.5)

    # Fixed parameters for faster experimentation
    epochs = 25  # Adjust based on your needs
    img_size = 640
    device_config = "[0,1]"
    run_name = f"trial_{trial.number}"

    print(f"\nTrial {trial.number}:")
    print(f"  batch_size={batch_size}, initial_lr={initial_lr:.5f}, final_lr={final_lr:.3f}")

    # Prepare parameters matching the multi-GPU notebook widgets (all values as strings)
    params = {
        "epochs": str(epochs),
        "batch_size": str(batch_size),
        "img_size": str(img_size),
        "initial_lr": str(initial_lr),
        "final_lr": str(final_lr),
        "device_config": device_config,
        "run_name": run_name
    }

    # Trigger job run
    run = w.jobs.run_now(
        job_id=job_id,
        notebook_params=params
    )

    job_run_id = run.run_id
    metrics_file = f"{metrics_volume_path}/metrics_{job_run_id}.json"
    print(f"  Job Run ID: {job_run_id}")
    print(f"  Metrics file: {metrics_file}")

    # Wait for job completion, checking every 30 seconds (YOLO training takes time)
    while True:
        run_status = w.jobs.get_run(job_run_id)
        if run_status.state.life_cycle_state in terminal_states:
            break
        time.sleep(30)

    result_state = run_status.state.result_state
    print(f"  Run completed: {result_state}")

    if result_state != jobs.RunResultState.SUCCESS:
        print(f"  Trial failed with state: {result_state}")
        raise optuna.TrialPruned()

    # Read metrics from saved JSON file
    try:
        time.sleep(5)  # Brief delay to ensure file is written

        print(f"  Reading metrics from: {metrics_file}")

        with open(metrics_file, 'r') as f:
            metrics = json.load(f)

        val_map = metrics.get("val_mAP50", 0.0)
        print(f"  Validation mAP@50: {val_map:.4f}")

        # Return val_mAP50 for maximization
        return val_map

    except Exception as e:
        # Best effort: an unreadable or malformed metrics file prunes this trial only.
        print(f"  Error reading metrics file: {e}")
        traceback.print_exc()
        raise optuna.TrialPruned()

print("Objective function defined for YOLO hyperparameter optimization")
print(f"Metrics will be read from: {metrics_volume_path}/metrics_<run_id>.json")


## Run Optimization Study

Execute the Optuna study to find the best batch size and learning-rate values.


In [None]:
study = optuna.create_study(
    direction="maximize",  # objective() returns val_mAP50 directly, so higher is better
    study_name="yolo_hyperparameter_optimization"
)

n_trials = 5

# The search space echoed below mirrors the suggest_* calls inside objective().
print(f"Starting YOLO hyperparameter optimization with {n_trials} trials...")
print("Optimizing: Validation mAP@50 (higher is better)")
print("Hyperparameters being tuned:")
print("  - batch_size: [32, 64, 128]")
print("  - initial_lr: [0.001, 0.01] (log scale)")
print("  - final_lr: [0.01, 0.5]")
print()

# n_jobs=1 evaluates trials one after another; each trial blocks until its job run ends.
study.optimize(objective, n_trials=n_trials, n_jobs=1)

print("\nOptimization complete!")


## Display Optimization Results

Show the best parameters and visualize the optimization history.


In [None]:
# objective() prunes a trial whenever its job run fails or its metrics file is
# unreadable, so a study may end with no completed trial at all. In that case
# study.best_trial raises ValueError, so check before touching it.
completed_trials = [
    t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
]

if completed_trials:
    best_trial = study.best_trial
    print("Best Trial:")
    print(f"  Value (Validation mAP@50): {best_trial.value:.4f}")
    print("  Parameters:")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")
else:
    print("Best Trial: none - no trial completed successfully")

print("\nAll Trials:")
trials_df = study.trials_dataframe()
# Copy the value column under the name mAP@50 for clarity (the original column is kept)
trials_df['mAP@50'] = trials_df['value']
display(trials_df[["number", "mAP@50", "params_batch_size", "params_initial_lr", "params_final_lr", "state"]])


## Visualize Optimization History

Plot the optimization progress over trials.


In [None]:
import matplotlib.pyplot as plt

# Only trials that produced a metric can be plotted; trials pruned by objective()
# have no value.
scored_trials = [t for t in study.trials if t.value is not None]

if not scored_trials:
    # Without this guard max() on an empty list and study.best_params would raise.
    print("No trial produced a validation mAP@50 value; nothing to plot.")
else:
    trial_numbers = [t.number for t in scored_trials]
    trial_values = [t.value for t in scored_trials]
    trial_batch_sizes = [t.params["batch_size"] for t in scored_trials]
    trial_initial_lrs = [t.params["initial_lr"] for t in scored_trials]

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))

    # Optimization history
    ax1.plot(trial_numbers, trial_values, marker='o', linewidth=2)
    ax1.set_xlabel('Trial Number')
    ax1.set_ylabel('Validation mAP@50')
    ax1.set_title('Optimization History')
    ax1.grid(True, alpha=0.3)

    # Batch size vs mAP
    ax2.scatter(trial_batch_sizes, trial_values, s=100, alpha=0.6)
    ax2.set_xlabel('Batch Size')
    ax2.set_ylabel('Validation mAP@50')
    ax2.set_title('Batch Size vs mAP@50')
    ax2.grid(True, alpha=0.3)

    # Initial LR vs mAP
    ax3.scatter(trial_initial_lrs, trial_values, s=100, alpha=0.6)
    ax3.set_xlabel('Initial Learning Rate')
    ax3.set_ylabel('Validation mAP@50')
    ax3.set_title('Initial LR vs mAP@50')
    ax3.set_xscale('log')
    ax3.grid(True, alpha=0.3)

    # Mark best trial on all plots (first occurrence wins on ties)
    best_idx = trial_values.index(max(trial_values))
    best_value = trial_values[best_idx]
    for axis, best_x in (
        (ax1, trial_numbers[best_idx]),
        (ax2, trial_batch_sizes[best_idx]),
        (ax3, trial_initial_lrs[best_idx]),
    ):
        axis.scatter([best_x], [best_value],
                     s=200, color='red', marker='*', label='Best', zorder=5)
        axis.legend()

    plt.tight_layout()
    plt.show()

    print("\nRecommendation:")
    print(f"  Batch Size: {study.best_params['batch_size']}")
    print(f"  Initial LR: {study.best_params['initial_lr']:.5f}")
    print(f"  Final LR: {study.best_params['final_lr']:.3f}")
    print(f"  Best mAP@50: {study.best_trial.value:.4f}")
