In [1]:
import torch
import gym

import numpy as np

from ji_dog_net_v1 import PPO_Clip
from ji_dog_net_v1 import ActorCritic_Clip
from ji_dog_net_v1 import process_state

from ji_dog_net_v2 import PPO_Penalty
from ji_dog_net_v2 import ActorCritic_Penalty

# from ji_dog_net_v3 import PPO_Clip
# from ji_dog_net_v3 import ActorCritic_Clip
# from ji_dog_net_v3 import process_state


## Check device

In [2]:
separator = "============================================================================================"
print(separator)

# Pick the compute device: the first CUDA GPU when PyTorch reports one as
# available, otherwise the CPU. The chosen device is kept in `device`.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    # Clear PyTorch's cached CUDA memory before reporting the GPU name.
    torch.cuda.empty_cache()
    device_label = str(torch.cuda.get_device_name(device))
else:
    device = torch.device('cpu')
    device_label = "cpu"
print("Device set to : " + device_label)

print(separator)


Device set to : NVIDIA GeForce RTX 4060 Laptop GPU


## Test ACNet

In [3]:
# Smoke test for the PPO_Clip agent: build it, sample one action for a random
# state, query the critic for the same state, and print the results.
from torchviz import make_dot  # imported here but not used in this cell

state_dim, action_dim = 20, 4
ppo = PPO_Clip(
    state_dim,
    action_dim,
    lr_actor=0.0003,
    lr_critic=0.001,
    gamma=0.99,
    K_epochs=4,
    eps_clip=0.2,
)

# Random 1-D state of length `state_dim`, moved to the device selected earlier.
state = torch.rand(state_dim).to(device)

# Actor: `act` returns three values; `hidden_actor` is not used further below.
action, action_logprob, hidden_actor = ppo.policy.act(state)
print("Action output:", action)
print("Action log probability:", action_logprob)

# Critic: value estimate for the same state.
state_value = ppo.policy.evaluate_critic(state)
print("State value output:", state_value)

# Append one dummy reward / terminal flag to the agent's buffer lists.
# NOTE(review): presumably pairs with the sampled action above — confirm.
ppo.buffer.rewards.append(1.0)
ppo.buffer.is_terminals.append(False)


Action output: tensor([[ 0.3450, -0.7188, -0.6027,  0.9508]], device='cuda:0')
Action log probability: tensor([-4.0740], device='cuda:0')
State value output: tensor([-0.0086], device='cuda:0', grad_fn=<ViewBackward0>)


In [4]:
# Smoke test for the PPO_Penalty agent: build it, sample one action for a
# random state, query the critic for the same state, and print the results.
from torchviz import make_dot  # imported here but not used in this cell

state_dim, action_dim = 20, 4
ppo = PPO_Penalty(
    state_dim,
    action_dim,
    lr_actor=0.0003,
    lr_critic=0.001,
    gamma=0.99,
    K_epochs=4,
    eps_clip=0.2,
    kl_target=0.01,
    kl_penalty_coef=0.5,
    action_std_init=0.6,
)

# Random 1-D state of length `state_dim`, moved to the device selected earlier.
state = torch.rand(state_dim).to(device)

# Actor: `act` returns three values; `hidden_actor` is not used further below.
action, action_logprob, hidden_actor = ppo.policy.act(state)
print("Action output:", action)
print("Action log probability:", action_logprob)

# Critic: value estimate for the same state.
state_value = ppo.policy.evaluate_critic(state)
print("State value output:", state_value)

# Append one dummy reward / terminal flag to the agent's buffer lists.
# NOTE(review): presumably pairs with the sampled action above — confirm.
ppo.buffer.rewards.append(1.0)
ppo.buffer.is_terminals.append(False)

Action output: tensor([[-0.3113,  0.8865,  0.2939,  0.5635]], device='cuda:0')
Action log probability: tensor([-3.4099], device='cuda:0')
State value output: tensor([0.0464], device='cuda:0', grad_fn=<ViewBackward0>)


## Train PPO

In [None]:

# Environment and simulation setup.
#
# The SimulationApp is created BEFORE the environment module is imported.
# Isaac Sim's documented pattern is to start the app before importing modules
# that depend on omni.*; `ji_dog_env_create` is assumed to be such a module
# (TODO confirm), so these imports intentionally stay in this cell and order.
from isaacsim import SimulationApp
simulation_app = SimulationApp({"headless": True})

from torch.utils.tensorboard import SummaryWriter

from ji_dog_env_create import Ji_Dog_Env
from tqdm import tqdm
import numpy as np
import os

if __name__ == "__main__":
    relative_path = "Model(including video)/ji_dog1.0.usd"
    env = Ji_Dog_Env(relative_path)
    state_dim = 20
    action_dim = 4
    # Despite its name, this is the number of training EPISODES (it bounds the
    # outer loop). The name is kept so other cells that read it keep working.
    max_training_timesteps = 1000
    # Maximum number of environment steps within a single episode.
    max_timesteps = 5000
    ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0003, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)

    # TensorBoard writer; closed in `finally` so the event file is flushed even
    # if training is interrupted or raises.
    writer = SummaryWriter(log_dir="runs/Ji_Dog_Training1")
    try:
        for episode in tqdm(range(max_training_timesteps), desc="Training Episodes"):
            state = np.array(process_state(env.reset()))
            episode_reward = 0

            # Exactly `max_timesteps` steps at most (the previous
            # `range(max_timesteps + 1)` allowed one step too many).
            for t in range(max_timesteps):
                action = ppo.select_action(state)
                state, reward, done, _ = env.step(action[0])
                state = np.array(process_state(state))

                # Rewards / terminal flags are appended to the agent's buffer
                # here and consumed by `ppo.update()` below.
                ppo.buffer.rewards.append(reward)
                ppo.buffer.is_terminals.append(done)
                episode_reward += reward
                if done:
                    break
            # NOTE(review): when the step cap is reached without `done`, the
            # last stored terminal flag is False — confirm `ppo.update()`
            # handles truncated episodes as intended.

            # Update PPO and record metrics
            average_loss, policy_loss, value_loss, policy_entropy = ppo.update()

            # Log metrics to TensorBoard
            writer.add_scalar("Average Loss", average_loss, episode)
            writer.add_scalar("Policy Entropy", policy_entropy, episode)
            writer.add_scalar("Value Loss", value_loss, episode)
            writer.add_scalar("Policy Loss", policy_loss, episode)
            writer.add_scalar("Episode Reward", episode_reward, episode)

            if episode % 10 == 0:
                print(f"Episode {episode} completed with reward: {episode_reward}")
    finally:
        writer.close()

    # Save the final weights. This lives inside the `__main__` guard because it
    # needs `ppo`, which is only defined there.
    save_path = "Model_Checkpoints/Ji_dog_{}_Episode.pth".format('2.0')
    # Create the checkpoint directory up front so that a missing folder cannot
    # make the save fail after a long training run.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    print("save checkpoint path : " + save_path)
    ppo.save(save_path)
    print('Finished!')



Starting kit application with the following args:  ['/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps/omni.isaac.sim.python.kit', '--/app/tokens/exe-path=/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/kit', '--/persistent/app/viewport/displayOptions=3094', '--/rtx/materialDb/syncLoads=True', '--/rtx/hydra/materialSyncLoads=True', '--/omni.kit.plugin/syncUsdLoads=True', '--/app/renderer/resolution/width=1280', '--/app/renderer/resolution/height=720', '--/app/window/width=1440', '--/app/window/height=900', '--/renderer/multiGpu/enabled=True', '--/app/fastShutdown=False', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps', '--/physics/cudaDevice=0', '--portable', '--no-window', '--/app/window/hideUi=1']
Passing the following args to the base kit application:  ['-f', '/home/bai/.local/share/jupyter/run

Training Episodes:   0%|                               | 0/1000 [00:00<?, ?it/s]

In [None]:

# Close the SimulationApp instance created in the training cell
# (`simulation_app = SimulationApp({"headless": True})`). That cell must have
# been run in this kernel first; otherwise `simulation_app` is undefined.
simulation_app.close()
