In [1]:
import torch
import gym

import numpy as np

# from ji_dog_net_v1 import PPO_Clip
# from ji_dog_net_v1 import ActorCritic_Clip
# from ji_dog_net_v1 import process_state

from ji_dog_net_v2 import PPO_Penalty
from ji_dog_net_v2 import ActorCritic_Penalty

from ji_dog_net_v3 import PPO_Clip
from ji_dog_net_v3 import ActorCritic_Clip
from ji_dog_net_v3 import process_state


## Check device

In [2]:
# Select the compute device: CUDA device 0 when CUDA is available, otherwise the CPU.
banner = "============================================================================================"
print(banner)

use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')

if use_cuda:
    # Release cached GPU memory before anything is allocated on the device.
    torch.cuda.empty_cache()
    device_label = str(torch.cuda.get_device_name(device))
else:
    device_label = "cpu"

print("Device set to : " + device_label)
print(banner)


Device set to : NVIDIA GeForce RTX 4060 Laptop GPU


## Test ACNet

In [3]:
# Smoke test for PPO_Clip: push one random state through the actor and the critic.
from torchviz import make_dot

state_dim, action_dim = 20, 4
ppo = PPO_Clip(
    state_dim,
    action_dim,
    lr_actor=0.0003,
    lr_critic=0.001,
    gamma=0.99,
    K_epochs=4,
    eps_clip=0.2,
)

# Random vector of length state_dim, moved to the device chosen earlier in the notebook.
state = torch.rand((state_dim,)).to(device)

# Actor call: returns the action, its log-probability and a third value kept as hidden_actor.
clip_policy = ppo.policy
action, action_logprob, hidden_actor = clip_policy.act(state)
print("Action output:", action)
print("Action log probability:", action_logprob)

# Critic call on the same state.
state_value = clip_policy.evaluate_critic(state)
print("State value output:", state_value)

# Append one dummy reward / terminal flag pair to the agent's buffer.
ppo.buffer.rewards.append(1.0)
ppo.buffer.is_terminals.append(False)


Action output: tensor([[ 0.9113, -0.1036,  0.4111,  1.4133]], device='cuda:0')
Action log probability: tensor([-5.8025], device='cuda:0')
State value output: tensor([-0.0248], device='cuda:0', grad_fn=<ViewBackward0>)


In [4]:
# Smoke test for PPO_Penalty: push one random state through the actor and the critic.
from torchviz import make_dot

state_dim, action_dim = 20, 4
ppo = PPO_Penalty(
    state_dim,
    action_dim,
    lr_actor=0.0001,
    lr_critic=0.001,
    gamma=0.99,
    K_epochs=4,
    eps_clip=0.2,
    kl_target=0.01,
    kl_penalty_coef=0.5,
    action_std_init=0.6,
)

# Random vector of length state_dim, moved to the device chosen earlier in the notebook.
state = torch.rand((state_dim,)).to(device)

# Actor call: returns the action, its log-probability and a third value kept as hidden_actor.
penalty_policy = ppo.policy
action, action_logprob, hidden_actor = penalty_policy.act(state)
print("Action output:", action)
print("Action log probability:", action_logprob)

# Critic call on the same state.
state_value = penalty_policy.evaluate_critic(state)
print("State value output:", state_value)

# Append one dummy reward / terminal flag pair to the agent's buffer.
ppo.buffer.rewards.append(1.0)
ppo.buffer.is_terminals.append(False)

Action output: tensor([[-0.6287, -0.7201, -0.1701, -0.4470]], device='cuda:0')
Action log probability: tensor([-3.2675], device='cuda:0')
State value output: tensor([0.0498], device='cuda:0', grad_fn=<ViewBackward0>)


## Train PPO

In [5]:
# Train PPO_Clip on Ji_Dog_Env inside a headless simulation, log metrics to
# TensorBoard, and save a checkpoint when training completes.
#
# NOTE(review): SimulationApp is instantiated before Ji_Dog_Env is imported, matching
# the original cell order — presumably required by Isaac Sim; confirm before reordering.
from isaacsim import SimulationApp
simulation_app = SimulationApp({"headless": True})

from torch.utils.tensorboard import SummaryWriter

from ji_dog_env_create_v3 import Ji_Dog_Env
from tqdm import tqdm
import numpy as np
import os

# Everything after app start-up runs under try/finally so the simulation app is
# always shut down, even when training or checkpointing raises.
try:
    if __name__ == "__main__":
        relative_path = "Model(including video)/ji_dog1.0.usd"
        env = Ji_Dog_Env(relative_path)
        state_dim = 20
        action_dim = 4
        max_training_timesteps = 400  # used as the number of training episodes
        max_timesteps = 1000          # per-episode step budget (loop runs max_timesteps + 1 steps)

        ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0001, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)

        writer = SummaryWriter(log_dir="runs/Ji_Dog_Training4")
        try:
            for episode in tqdm(range(max_training_timesteps), desc="Training Episodes"):
                # Per-episode sums of every reward component returned by env.step().
                total_rewards = {
                    "distance_reward": 0,
                    "fall_penalty": 0,
                    "symmetry_reward": 0,
                    "period_penalty": 0,
                    "contact_penalty": 0,
                    "smoothness_penalty": 0,
                    "progress_reward": 0,
                    "mass_centre_reward": 0,
                    "stability_penalty": 0,
                }

                state = np.array(process_state(env.reset()))

                total_episode_reward = 0
                # Roll out one episode, stopping early as soon as the env reports `done`.
                for t in range(max_timesteps + 1):
                    action = ppo.select_action(state)
                    state, total_reward, rewards, done, _ = env.step(action[0])
                    state = np.array(process_state(state))

                    # Store the outcome of this step for the upcoming PPO update.
                    ppo.buffer.rewards.append(total_reward)
                    ppo.buffer.is_terminals.append(done)
                    ppo.buffer.reward_contributions.append(rewards)

                    for key in rewards:
                        total_rewards[key] += rewards[key]

                    total_episode_reward += total_reward
                    if done:
                        break

                # One PPO update per episode; returns the metrics logged below.
                average_loss, policy_loss, value_loss, policy_entropy = ppo.update()

                # Log metrics to TensorBoard
                writer.add_scalar("Average Loss", average_loss, episode)
                writer.add_scalar("Policy Entropy", policy_entropy, episode)
                writer.add_scalar("Value Loss", value_loss, episode)
                writer.add_scalar("Policy Loss", policy_loss, episode)

                for key, value in total_rewards.items():
                    writer.add_scalar(f"Rewards/{key}", value, episode)

                writer.add_scalar("Episode Total Reward", total_episode_reward, episode)
                if episode % 10 == 0:
                    print(f"Episode {episode} completed with reward: {total_episode_reward}")
        finally:
            # Close the event file even if an episode fails part-way through.
            writer.close()

    save_path = "Model_Checkpoints/Ji_dog_{}_Episode.pth".format('5.0')
    # Create the checkpoint folder up front so a finished run is not lost to a missing directory.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    print("save checkpoint path : " + save_path)
    ppo.save(save_path)
    print('Finished!')
finally:
    simulation_app.close()

Starting kit application with the following args:  ['/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps/omni.isaac.sim.python.kit', '--/app/tokens/exe-path=/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/kit', '--/persistent/app/viewport/displayOptions=3094', '--/rtx/materialDb/syncLoads=True', '--/rtx/hydra/materialSyncLoads=True', '--/omni.kit.plugin/syncUsdLoads=True', '--/app/renderer/resolution/width=1280', '--/app/renderer/resolution/height=720', '--/app/window/width=1440', '--/app/window/height=900', '--/renderer/multiGpu/enabled=True', '--/app/fastShutdown=False', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps', '--/physics/cudaDevice=0', '--portable', '--no-window', '--/app/window/hideUi=1']
Passing the following args to the base kit application:  ['-f', '/home/bai/.local/share/jupyter/run

  return F.mse_loss(input, target, reduction=self.reduction)
Training Episodes:   0%|                      | 1/400 [00:18<2:01:09, 18.22s/it]

Episode 0 completed with reward: -1489.934117181719


Training Episodes:   3%|▌                    | 11/400 [03:13<1:56:43, 18.00s/it]

Episode 10 completed with reward: -699.6119692553793


Training Episodes:   5%|█                    | 21/400 [06:28<2:03:10, 19.50s/it]

Episode 20 completed with reward: -1065.4779202999928


Training Episodes:   8%|█▋                   | 31/400 [09:42<1:59:42, 19.46s/it]

Episode 30 completed with reward: -1325.2007310196352


Training Episodes:  10%|██▏                  | 41/400 [12:58<1:58:05, 19.74s/it]

Episode 40 completed with reward: -1421.6162689315574


Training Episodes:  13%|██▋                  | 51/400 [16:13<1:52:21, 19.32s/it]

Episode 50 completed with reward: -1851.8903247876387


Training Episodes:  15%|███▏                 | 61/400 [19:27<1:49:03, 19.30s/it]

Episode 60 completed with reward: -1511.5230072246552


Training Episodes:  18%|███▋                 | 71/400 [22:36<1:43:15, 18.83s/it]

Episode 70 completed with reward: -1649.5912095268236


Training Episodes:  20%|████▎                | 81/400 [25:43<1:39:46, 18.77s/it]

Episode 80 completed with reward: -1595.214042236409


Training Episodes:  23%|████▊                | 91/400 [28:55<1:40:49, 19.58s/it]

Episode 90 completed with reward: -1620.4223051759802


Training Episodes:  25%|█████               | 101/400 [32:12<1:41:27, 20.36s/it]

Episode 100 completed with reward: -1143.963842411203


Training Episodes:  28%|█████▌              | 111/400 [35:27<1:33:01, 19.31s/it]

Episode 110 completed with reward: -796.2408436366663


Training Episodes:  30%|██████              | 121/400 [38:38<1:29:17, 19.20s/it]

Episode 120 completed with reward: -925.2442288765424


Training Episodes:  33%|██████▌             | 131/400 [41:56<1:27:14, 19.46s/it]

Episode 130 completed with reward: -634.2625376599309


Training Episodes:  35%|███████             | 141/400 [45:04<1:21:06, 18.79s/it]

Episode 140 completed with reward: 302.68955297720566


Training Episodes:  38%|███████▌            | 151/400 [48:11<1:17:37, 18.70s/it]

Episode 150 completed with reward: -1093.3528478258095


Training Episodes:  40%|████████            | 161/400 [51:18<1:14:31, 18.71s/it]

Episode 160 completed with reward: -1711.8997903686775


Training Episodes:  43%|████████▌           | 171/400 [54:26<1:11:31, 18.74s/it]

Episode 170 completed with reward: -1131.3004395422142


Training Episodes:  45%|█████████           | 181/400 [57:33<1:08:16, 18.70s/it]

Episode 180 completed with reward: -1689.7022764377325


Training Episodes:  48%|████████▌         | 191/400 [1:00:41<1:05:21, 18.76s/it]

Episode 190 completed with reward: -1632.9139789078602


Training Episodes:  50%|█████████         | 201/400 [1:03:53<1:03:58, 19.29s/it]

Episode 200 completed with reward: -1786.6351358287914


Training Episodes:  53%|█████████▍        | 211/400 [1:07:05<1:01:10, 19.42s/it]

Episode 210 completed with reward: -1181.4975537486816


Training Episodes:  55%|███████████         | 221/400 [1:10:22<59:06, 19.81s/it]

Episode 220 completed with reward: -369.557518536807


Training Episodes:  58%|███████████▌        | 231/400 [1:13:36<55:06, 19.56s/it]

Episode 230 completed with reward: -1157.6236713334363


Training Episodes:  60%|████████████        | 241/400 [1:16:50<50:35, 19.09s/it]

Episode 240 completed with reward: -1663.4105549850326


Training Episodes:  63%|████████████▌       | 251/400 [1:20:06<48:16, 19.44s/it]

Episode 250 completed with reward: -1324.804225546881


Training Episodes:  65%|█████████████       | 261/400 [1:23:19<45:15, 19.54s/it]

Episode 260 completed with reward: -1123.3537067634309


Training Episodes:  68%|█████████████▌      | 271/400 [1:26:32<41:11, 19.16s/it]

Episode 270 completed with reward: -1866.4704414583032


Training Episodes:  70%|██████████████      | 281/400 [1:29:43<37:48, 19.06s/it]

Episode 280 completed with reward: -1430.7966722800804


Training Episodes:  73%|██████████████▌     | 291/400 [1:32:53<34:30, 19.00s/it]

Episode 290 completed with reward: -1734.397335800873


Training Episodes:  75%|███████████████     | 301/400 [1:36:08<32:56, 19.96s/it]

Episode 300 completed with reward: -1158.9638659200757


Training Episodes:  78%|███████████████▌    | 311/400 [1:39:24<28:58, 19.54s/it]

Episode 310 completed with reward: -1390.62470102092


Training Episodes:  80%|████████████████    | 321/400 [1:42:39<25:24, 19.30s/it]

Episode 320 completed with reward: -1489.294454614453


Training Episodes:  83%|████████████████▌   | 331/400 [1:45:53<22:19, 19.41s/it]

Episode 330 completed with reward: -969.9440770089


Training Episodes:  85%|█████████████████   | 341/400 [1:50:16<24:02, 24.46s/it]

Episode 340 completed with reward: -484.79987185705795


Training Episodes:  88%|█████████████████▌  | 351/400 [1:53:34<16:19, 20.00s/it]

Episode 350 completed with reward: -1377.2210230076857


Training Episodes:  90%|██████████████████  | 361/400 [1:56:56<13:05, 20.14s/it]

Episode 360 completed with reward: -253.3749096238221


Training Episodes:  93%|██████████████████▌ | 371/400 [2:00:09<09:11, 19.01s/it]

Episode 370 completed with reward: -1834.267925321462


Training Episodes:  95%|███████████████████ | 381/400 [2:03:22<06:03, 19.11s/it]

Episode 380 completed with reward: -586.4778612669479


Training Episodes:  98%|███████████████████▌| 391/400 [2:06:36<02:53, 19.23s/it]

Episode 390 completed with reward: -957.5662426231369


Training Episodes: 100%|████████████████████| 400/400 [2:09:30<00:00, 19.43s/it]

save checkpoint path : Model_Checkpoints/Ji_dog_5.0_Episode.pth
Finished!





In [5]:
# Evaluate the saved PPO_Clip checkpoint: run a single episode in a windowed
# (non-headless) simulation and accumulate its rewards. No PPO update is performed.
#
# NOTE(review): SimulationApp is instantiated before Ji_Dog_Env is imported, matching
# the original cell order — presumably required by Isaac Sim; confirm before reordering.
from isaacsim import SimulationApp
simulation_app = SimulationApp({"headless": False})

from torch.utils.tensorboard import SummaryWriter

from ji_dog_env_create_v3 import Ji_Dog_Env
from tqdm import tqdm
import numpy as np
import os

# Defined before first use so this cell does not depend on values left behind by
# earlier cells (the agent constructor below needs both dimensions).
state_dim = 20
action_dim = 4
max_training_timesteps = 100  # not used by this cell; kept for later cells that may read it
max_timesteps = 500           # step budget for the episode (loop runs max_timesteps + 1 steps)

# try/finally guarantees the simulation app is shut down even if the rollout raises.
try:
    # Load the saved model
    load_path = "Model_Checkpoints/Ji_dog_5.0_Episode.pth"
    ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0001, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)
    ppo.load(load_path)

    env = Ji_Dog_Env('Model(including video)/ji_dog1.0.usd')

    writer = SummaryWriter(log_dir="runs/Ji_Dog_Training")
    try:
        # Sums of every reward component returned by env.step() over the episode.
        total_rewards = {
            "distance_reward": 0,
            "fall_penalty": 0,
            "symmetry_reward": 0,
            "period_penalty": 0,
            "contact_penalty": 0,
            "smoothness_penalty": 0,
            "progress_reward": 0,
            "mass_centre_reward": 0,
            "stability_penalty": 0,
        }

        state = np.array(process_state(env.reset()))

        total_episode_reward = 0
        for t in range(max_timesteps + 1):
            action = ppo.select_action(state)
            state, total_reward, rewards, done, _ = env.step(action[0])
            state = np.array(process_state(state))

            for key in rewards:
                total_rewards[key] += rewards[key]

            total_episode_reward += total_reward
            if done:
                break

        # Drop buffered rollout data; presumably select_action() filled the buffer
        # during the episode — verify against the PPO_Clip implementation.
        ppo.buffer.clear()

        # Report the outcome so the accumulated totals are not silently discarded.
        print(f"Evaluation episode completed with reward: {total_episode_reward}")
    finally:
        writer.close()
finally:
    simulation_app.close()


Starting kit application with the following args:  ['/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps/omni.isaac.sim.python.kit', '--/app/tokens/exe-path=/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/kit', '--/persistent/app/viewport/displayOptions=3094', '--/rtx/materialDb/syncLoads=True', '--/rtx/hydra/materialSyncLoads=True', '--/omni.kit.plugin/syncUsdLoads=True', '--/app/renderer/resolution/width=1280', '--/app/renderer/resolution/height=720', '--/app/window/width=1440', '--/app/window/height=900', '--/renderer/multiGpu/enabled=True', '--/app/fastShutdown=False', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps', '--/physics/cudaDevice=0', '--portable']
Passing the following args to the base kit application:  ['-f', '/home/bai/.local/share/jupyter/runtime/kernel-a9f79c83-e385-45d6-b98b-c258b

  self.policy_old.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
  self.policy.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))


Robot articulation successfully loaded: <omni.isaac.core.robots.robot.Robot object at 0x7df5aa1f7100>


IndexError: index 3 is out of bounds for axis 1 with size 3

In [None]:

# Shut down the Isaac Sim application held in `simulation_app`.
# Kept as its own cell so the app can be closed by hand when an earlier cell
# stopped before reaching its own close() call.
simulation_app.close()
