# DDPG Training for Balancing Robot

This notebook trains a DDPG (Deep Deterministic Policy Gradient) agent on the balancing robot environment, using the YAML configurations in `../configs` (`ddpg_config.yaml` and `env_config.yaml`).

In [1]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch
import yaml
from IPython.display import HTML

# The project imports below resolve relative to the parent directory.
sys.path.append('..')

from src.balancing_robot.models import Actor, Critic, ReplayBuffer
from src.balancing_robot.environment import BalancerEnv
from src.balancing_robot.training import DDPGTrainer
from src.balancing_robot.visualization import plot_training_metrics, create_episode_animation

## Load Configurations

In [2]:
# Load DDPG and environment configurations
with open('../configs/ddpg_config.yaml', 'r') as f:
    ddpg_config = yaml.safe_load(f)

with open('../configs/env_config.yaml', 'r') as f:
    env_config = yaml.safe_load(f)

# Create log directory
log_dir = Path('logs/ddpg_training')
log_dir.mkdir(parents=True, exist_ok=True)

## Initialize Environment and Models

In [3]:
# Set random seeds from config
torch.manual_seed(ddpg_config['training'].get('random_seed', 42))
np.random.seed(ddpg_config['training'].get('random_seed', 42))

# Create environment
env = BalancerEnv(
    config_path='../configs/env_config.yaml',
    render_mode='rgb_array'
)

# Initialize trainer with config
trainer = DDPGTrainer(
    env=env,
    config_path='../configs/ddpg_config.yaml'
)

# Print model summaries
trainer.print_model_info()

  gym.logger.warn(
  gym.logger.warn(


Actor(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=8, bias=True)
    (1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Linear(in_features=8, out_features=8, bias=True)
    (4): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (5): ReLU()
  )
  (output_layer): Linear(in_features=8, out_features=1, bias=True)
)
Critic(
  (l1): Linear(in_features=7, out_features=256, bias=True)
  (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (hidden_layers): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
  )
  (output_layer): Linear(in_features=256, out_features=1, bias=True)
)


## Training

In [4]:
# Extract training parameters from config
train_config = ddpg_config['training']

# Train agent
history = trainer.train(
    num_episodes=train_config['total_episodes'],
    max_steps=train_config['max_steps_per_episode'],
    batch_size=train_config['batch_size'],
    eval_freq=train_config['eval_frequency'],
    save_freq=train_config['save_frequency'],
    log_dir=log_dir
)

states shape: (512, 6)
actions shape: (512,)
rewards shape: (512,)
next_states shape: (512, 6)
dones shape: (512,)
states shape: torch.Size([512, 6])
actions shape: torch.Size([512])
rewards shape: torch.Size([512])
next_states shape: torch.Size([512, 6])
dones shape: torch.Size([512])


RuntimeError: Tensors must have same number of dimensions: got 2 and 1

## Analysis and Visualization

In [None]:
# Plot training metrics
fig = plot_training_metrics(history, save_path=log_dir / 'training_metrics.png')
plt.show()

In [None]:
# Create demonstration video
def collect_demo_episode(max_steps=500):
    state = env.reset()
    states = []
    total_reward = 0

    for _ in range(max_steps):
        action = trainer.select_action(state, training=False)
        next_state, reward, done, info = env.step(action)
        states.append(state)
        total_reward += reward
        
        if done:
            break
            
        state = next_state
    
    return np.array(states), total_reward

# Roll out several demo episodes and keep the one with the highest total reward
num_episodes = 5
best_reward = float('-inf')
best_states = None

for _ in range(num_episodes):
    states, reward = collect_demo_episode()
    # Strict '>' means the earliest episode wins a tie.
    if reward > best_reward:
        best_reward = reward
        best_states = states

print(f"Best episode reward: {best_reward:.2f}")

# Build the animation from the best episode; the video path is handed to the
# helper as save_path.
anim = create_episode_animation(
    states=best_states,
    save_path=log_dir / 'demo.mp4'
)

# HTML comes from the notebook's import cell; the cell's last expression
# renders the animation inline.
HTML(anim.to_jshtml())

## Save Final Model with Metadata

In [None]:
# Bundle the network weights, training history and both configs into one checkpoint
checkpoint = {
    'actor_state_dict': trainer.actor.state_dict(),
    'critic_state_dict': trainer.critic.state_dict(),
    'training_history': history,
    'config': ddpg_config,
    'env_config': env_config,
}

# Metadata read from the environment's spaces, plus a final 10-episode evaluation
checkpoint['metadata'] = {
    'state_dim': env.observation_space.shape[0],
    'action_dim': env.action_space.shape[0],
    'max_action': float(env.action_space.high[0]),
    'final_eval_reward': trainer.evaluate(num_episodes=10),
}

# Write the checkpoint into the log directory
torch.save(checkpoint, log_dir / 'final_model.pt')

print("Training complete! Model saved with metadata.")