In [1]:
import torch
import gym

import numpy as np

# from ji_dog_net_v1 import PPO_Clip
# from ji_dog_net_v1 import ActorCritic_Clip
# from ji_dog_net_v1 import process_state

from ji_dog_net_v2 import PPO_Penalty
from ji_dog_net_v2 import ActorCritic_Penalty

from ji_dog_net_v3 import PPO_Clip
from ji_dog_net_v3 import ActorCritic_Clip
from ji_dog_net_v3 import process_state


## Check device

In [2]:
print("============================================================================================")
# Choose the compute device: stay on the CPU when CUDA is unavailable,
# otherwise use the first GPU and release any cached allocator memory.
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("Device set to : cpu")
else:
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
print("============================================================================================")


Device set to : NVIDIA GeForce RTX 4060 Laptop GPU


## Test ACNet

In [3]:
from torchviz import make_dot

# Smoke test for the clipped-PPO agent: build it, push one random state
# through the actor and the critic, and print what comes back.
state_dim, action_dim = 20, 4
ppo = PPO_Clip(
    state_dim,
    action_dim,
    lr_actor=0.0003,
    lr_critic=0.001,
    gamma=0.99,
    K_epochs=4,
    eps_clip=0.2,
)

# One random 1-D state vector of length state_dim, moved to the selected device.
state = torch.rand((state_dim,)).to(device)

# Actor: act() yields the action, its log-probability and a third value
# that is bound to hidden_actor here.
action, action_logprob, hidden_actor = ppo.policy.act(state)
print("Action output:", action)
print("Action log probability:", action_logprob)

# Critic: value estimate for the same state.
state_value = ppo.policy.evaluate_critic(state)
print("State value output:", state_value)

# Record one dummy (reward, terminal-flag) pair in the rollout buffer.
ppo.buffer.rewards.append(1.0)
ppo.buffer.is_terminals.append(False)


Action output: tensor([[-0.0508,  0.6503, -0.2851, -0.2528]], device='cuda:0')
Action log probability: tensor([-2.3887], device='cuda:0')
State value output: tensor([0.0153], device='cuda:0', grad_fn=<ViewBackward0>)


In [4]:
from torchviz import make_dot

# Smoke test for the KL-penalty PPO agent: build it, push one random state
# through the actor and the critic, and print what comes back.
state_dim, action_dim = 20, 4
ppo = PPO_Penalty(
    state_dim,
    action_dim,
    lr_actor=0.0003,
    lr_critic=0.001,
    gamma=0.99,
    K_epochs=4,
    eps_clip=0.2,
    kl_target=0.01,
    kl_penalty_coef=0.5,
    action_std_init=0.6,
)

# One random 1-D state vector of length state_dim, moved to the selected device.
state = torch.rand((state_dim,)).to(device)

# Actor: act() yields the action, its log-probability and a third value
# that is bound to hidden_actor here.
action, action_logprob, hidden_actor = ppo.policy.act(state)
print("Action output:", action)
print("Action log probability:", action_logprob)

# Critic: value estimate for the same state.
state_value = ppo.policy.evaluate_critic(state)
print("State value output:", state_value)

# Record one dummy (reward, terminal-flag) pair in the rollout buffer.
ppo.buffer.rewards.append(1.0)
ppo.buffer.is_terminals.append(False)

Action output: tensor([[ 0.3494,  0.0950,  0.5209, -0.2206]], device='cuda:0')
Action log probability: tensor([-2.3292], device='cuda:0')
State value output: tensor([0.0775], device='cuda:0', grad_fn=<ViewBackward0>)


## Train PPO

In [None]:
# Environment and simulation setup
# The SimulationApp is created before the environment module is imported;
# keep this order (Isaac Sim modules are presumably only importable once the
# app is running -- confirm against the Isaac Sim docs before reordering).
from isaacsim import SimulationApp
simulation_app = SimulationApp({"headless": True})

from torch.utils.tensorboard import SummaryWriter

from ji_dog_env_create import Ji_Dog_Env
from tqdm import tqdm  
import numpy as np
import os

if __name__ == "__main__":
    relative_path = "Model(including video)/ji_dog1.0.usd"
    env = Ji_Dog_Env(relative_path)
    state_dim = 20
    action_dim = 4
    # Despite the name, this is the number of training *episodes*.
    max_training_timesteps = 1000
    # Per-episode step cap; the inner loop runs at most max_timesteps + 1 steps.
    max_timesteps = 5000
    ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0003, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)

    # Resolve the checkpoint location up front and make sure its directory
    # exists, so a long run cannot fail at the very end on a missing folder.
    save_path = "Model_Checkpoints/Ji_dog_{}_Episode.pth".format('2.0')
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Initialize the TensorBoard writer
    writer = SummaryWriter(log_dir="runs/Ji_Dog_Training1")

    try:
        for episode in tqdm(range(max_training_timesteps), desc="Training Episodes"):
            state = env.reset()
            state = process_state(state)
            state = np.array(state)
            episode_reward = 0
            for t in range(max_timesteps + 1):
                action = ppo.select_action(state)
                state, reward, done, _ = env.step(action[0])
                state = process_state(state)
                state = np.array(state)
                ppo.buffer.rewards.append(reward)
                ppo.buffer.is_terminals.append(done)
                episode_reward = episode_reward + reward
                if done:
                    break

            # Update PPO and record metrics
            average_loss, policy_loss, value_loss, policy_entropy = ppo.update()

            # Log metrics to TensorBoard
            writer.add_scalar("Average Loss", average_loss, episode)
            writer.add_scalar("Policy Entropy", policy_entropy, episode)
            writer.add_scalar("Value Loss", value_loss, episode)
            writer.add_scalar("Policy Loss", policy_loss, episode)
            writer.add_scalar("Episode Reward", episode_reward, episode)

            if episode % 10 == 0:
                print(f"Episode {episode} completed with reward: {episode_reward}")
    finally:
        # Runs on normal completion, on an exception, and on a kernel
        # interrupt: flush the TensorBoard log and keep whatever the policy
        # has learned so far instead of losing hours of training.
        writer.close()
        print("save checkpoint path : " + save_path)
        ppo.save(save_path)

    print('Finished!')

Starting kit application with the following args:  ['/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps/omni.isaac.sim.python.kit', '--/app/tokens/exe-path=/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/kit', '--/persistent/app/viewport/displayOptions=3094', '--/rtx/materialDb/syncLoads=True', '--/rtx/hydra/materialSyncLoads=True', '--/omni.kit.plugin/syncUsdLoads=True', '--/app/renderer/resolution/width=1280', '--/app/renderer/resolution/height=720', '--/app/window/width=1440', '--/app/window/height=900', '--/renderer/multiGpu/enabled=True', '--/app/fastShutdown=False', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps', '--/physics/cudaDevice=0', '--portable', '--no-window', '--/app/window/hideUi=1']
Passing the following args to the base kit application:  ['-f', '/home/bai/.local/share/jupyter/run

Training Episodes:   0%|                    | 1/1000 [01:25<23:51:27, 85.97s/it]

Episode 0 completed with reward: 0.10673717461574717


Training Episodes:   1%|▏                  | 11/1000 [15:25<22:50:42, 83.16s/it]

Episode 10 completed with reward: 0.10673708692514494


Training Episodes:   2%|▍                  | 21/1000 [29:03<22:15:36, 81.86s/it]

Episode 20 completed with reward: 0.1067370769381989


Training Episodes:   3%|▌                  | 31/1000 [42:43<22:07:37, 82.21s/it]

Episode 30 completed with reward: 0.10673692716258798


Training Episodes:   4%|▊                  | 41/1000 [56:35<22:18:22, 83.74s/it]

Episode 40 completed with reward: 0.10673701936320346


Training Episodes:   5%|▊                | 51/1000 [1:11:35<24:44:20, 93.85s/it]

Episode 50 completed with reward: 0.10673704983409305


Training Episodes:   6%|█                | 61/1000 [1:27:12<24:39:56, 94.56s/it]

Episode 60 completed with reward: 0.10673714519715749


Training Episodes:   7%|█▏               | 71/1000 [1:43:05<24:36:03, 95.33s/it]

Episode 70 completed with reward: 0.10673699485442423


Training Episodes:   8%|█▍               | 81/1000 [1:58:50<23:56:45, 93.80s/it]

Episode 80 completed with reward: 0.10673705673446099


Training Episodes:   9%|█▌               | 91/1000 [2:14:22<23:31:32, 93.17s/it]

Episode 90 completed with reward: 0.10673698966698875


Training Episodes:  10%|█▌             | 101/1000 [2:31:00<26:24:44, 105.77s/it]

Episode 100 completed with reward: 0.10673696127800236


Training Episodes:  11%|█▋             | 111/1000 [2:49:15<26:45:38, 108.37s/it]

Episode 110 completed with reward: 0.10673710704188477


Training Episodes:  12%|█▉              | 121/1000 [3:04:50<23:02:32, 94.37s/it]

Episode 120 completed with reward: 0.10673711137843833


Training Episodes:  13%|██              | 131/1000 [3:20:38<23:02:24, 95.45s/it]

Episode 130 completed with reward: 0.10673704921500882


Training Episodes:  13%|██              | 132/1000 [3:22:14<23:01:55, 95.52s/it]

In [3]:
# Environment and simulation setup (v3 environment: step() also returns a
# dict of per-component rewards).
# The SimulationApp is created before the environment module is imported;
# keep this order (Isaac Sim modules are presumably only importable once the
# app is running -- confirm against the Isaac Sim docs before reordering).
from isaacsim import SimulationApp
simulation_app = SimulationApp({"headless": True})

from torch.utils.tensorboard import SummaryWriter

from ji_dog_env_create_v3 import Ji_Dog_Env
from tqdm import tqdm  
import numpy as np
import os

if __name__ == "__main__":
    relative_path = "Model(including video)/ji_dog1.0.usd"
    env = Ji_Dog_Env(relative_path)
    state_dim = 20
    action_dim = 4
    # Despite the name, this is the number of training *episodes*.
    max_training_timesteps = 10
    # Per-episode step cap; the inner loop runs at most max_timesteps + 1 steps.
    max_timesteps = 5
    ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0003, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)

    # Resolve the checkpoint location up front and make sure its directory
    # exists, so the run cannot fail at the very end on a missing folder.
    save_path = "Model_Checkpoints/Ji_dog_{}_Episode.pth".format('2.0')
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Initialize the TensorBoard writer
    writer = SummaryWriter(log_dir="runs/Ji_Dog_Training1")

    try:
        for episode in tqdm(range(max_training_timesteps), desc="Training Episodes"):
            # Per-episode sum of every reward component reported by the env.
            total_rewards = {
                "distance_reward": 0,
                "fall_penalty": 0,
                "symmetry_reward": 0,
                "period_penalty": 0,
                "contact_penalty": 0,
                "smoothness_penalty": 0,
                "progress_reward": 0,
                "mass_centre_reward": 0,
                "stability_penalty": 0,
            }

            state = env.reset()
            state = process_state(state)
            state = np.array(state)

            total_episode_reward = 0
            for t in range(max_timesteps + 1):
                action = ppo.select_action(state)
                state, total_reward, rewards, done, _ = env.step(action[0])

                state = process_state(state)
                state = np.array(state)
                ppo.buffer.rewards.append(total_reward)
                ppo.buffer.is_terminals.append(done)
                ppo.buffer.reward_contributions.append(rewards)

                # Accumulate with a default of 0 so a component that the env
                # reports but that is not pre-listed above is still tracked
                # instead of raising KeyError mid-training.
                for key in rewards:
                    total_rewards[key] = total_rewards.get(key, 0) + rewards[key]

                total_episode_reward += total_reward
                if done:
                    break

            # Update PPO and record metrics
            average_loss, policy_loss, value_loss, policy_entropy = ppo.update()

            # Log metrics to TensorBoard
            writer.add_scalar("Average Loss", average_loss, episode)
            writer.add_scalar("Policy Entropy", policy_entropy, episode)
            writer.add_scalar("Value Loss", value_loss, episode)
            writer.add_scalar("Policy Loss", policy_loss, episode)

            for key, value in total_rewards.items():
                writer.add_scalar(f"Rewards/{key}", value, episode)

            writer.add_scalar("Episode Total Reward", total_episode_reward, episode)
            if episode % 10 == 0:
                print(f"Episode {episode} completed with reward: {total_episode_reward}")
    finally:
        # Runs on normal completion, on an exception, and on a kernel
        # interrupt: flush the TensorBoard log and keep whatever the policy
        # has learned so far.
        writer.close()
        print("save checkpoint path : " + save_path)
        ppo.save(save_path)

    print('Finished!')



Starting kit application with the following args:  ['/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps/omni.isaac.sim.python.kit', '--/app/tokens/exe-path=/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/kit', '--/persistent/app/viewport/displayOptions=3094', '--/rtx/materialDb/syncLoads=True', '--/rtx/hydra/materialSyncLoads=True', '--/omni.kit.plugin/syncUsdLoads=True', '--/app/renderer/resolution/width=1280', '--/app/renderer/resolution/height=720', '--/app/window/width=1440', '--/app/window/height=900', '--/renderer/multiGpu/enabled=True', '--/app/fastShutdown=False', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps', '--/physics/cudaDevice=0', '--portable', '--no-window', '--/app/window/hideUi=1']
Passing the following args to the base kit application:  ['-f', '/home/bai/.local/share/jupyter/run

Training Episodes:   0%|                                 | 0/10 [00:00<?, ?it/s]

[4.947s] Simulation App Startup Complete


  return F.mse_loss(input, target, reduction=self.reduction)
Training Episodes:  10%|██▌                      | 1/10 [00:01<00:10,  1.21s/it]

Episode 0 completed with reward: 348.4023400408825


Training Episodes: 100%|████████████████████████| 10/10 [00:03<00:00,  2.64it/s]

save checkpoint path : Model_Checkpoints/Ji_dog_2.0_Episode.pth
Finished!





In [None]:
# Load the saved model and run one rollout + PPO update with it.
# Dimensions are defined first so this cell does not depend on values left
# over in the kernel from earlier cells.
state_dim = 20
action_dim = 4
max_training_timesteps = 100
# Step cap for the rollout; the loop runs at most max_timesteps + 1 steps.
max_timesteps = 500

# Build the agent exactly once and restore its weights. The agent must not be
# re-created after load(), otherwise the restored weights are thrown away.
# NOTE(review): the training cells save to "Ji_dog_2.0_Episode.pth" while this
# loads "Ji_dog_1.0_Episode.pth" -- confirm this is the intended checkpoint.
load_path = "Model_Checkpoints/Ji_dog_1.0_Episode.pth"
ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0003, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)
ppo.load(load_path)

# NOTE(review): absolute, machine-specific path; the training cells use the
# relative "Model(including video)/ji_dog1.0.usd" -- consider unifying.
env = Ji_Dog_Env('/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/Ji-dog 2.0/Model(including video)/ji_dog1.0.usd')

# Initialize the TensorBoard writer
writer = SummaryWriter(log_dir="runs/Ji_Dog_Training")

try:
    state = env.reset()
    state = process_state(state)
    state = np.array(state)

    for t in range(max_timesteps + 1):
        action = ppo.select_action(state)
        # The two environment versions used in this notebook return either
        # (state, reward, done, info) or (state, total_reward, rewards, done, info).
        # Indexing from both ends handles whichever Ji_Dog_Env is in scope.
        step_result = env.step(action[0])
        state, reward, done = step_result[0], step_result[1], step_result[-2]
        state = process_state(state)
        state = np.array(state)
        ppo.buffer.rewards.append(reward)
        ppo.buffer.is_terminals.append(done)

        if done:
            break

    # Update PPO and record metrics
    average_loss, policy_loss, value_loss, policy_entropy = ppo.update()
finally:
    # Release the TensorBoard event file even if the rollout fails.
    writer.close()





In [None]:

# Close the SimulationApp instance created in the training cell. Run this once
# all simulation work is finished (presumably the environment is unusable
# afterwards -- confirm against the Isaac Sim docs).
simulation_app.close()


In [4]:
# Show the per-component reward sums. The training loop rebuilds this dict at
# the start of every episode, so these are the totals of the last episode run.
total_rewards

{'distance_reward': 299.99380533981275,
 'fall_penalty': 0,
 'symmetry_reward': -0.010883409633606789,
 'period_penalty': -6.0,
 'contact_penalty': -6,
 'smoothness_penalty': 0.0,
 'progress_reward': 0.404225423066364,
 'mass_centre_reward': 59.99712606587032,
 'stability_penalty': 0}