In [1]:
import torch
import gym

import numpy as np

# from ji_dog_net_v1 import PPO_Clip
# from ji_dog_net_v1 import ActorCritic_Clip
# from ji_dog_net_v1 import process_state

from ji_dog_net_v2 import PPO_Penalty
from ji_dog_net_v2 import ActorCritic_Penalty

from ji_dog_net_v3 import PPO_Clip
from ji_dog_net_v3 import ActorCritic_Clip
from ji_dog_net_v3 import process_state


## Check device

In [2]:
print("============================================================================================")
# Choose the compute device: stay on the CPU unless a CUDA GPU is visible,
# in which case use the first GPU and release any cached CUDA memory first.
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("Device set to : cpu")
else:
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print(f"Device set to : {torch.cuda.get_device_name(device)}")
print("============================================================================================")


Device set to : NVIDIA GeForce RTX 4060 Laptop GPU


## Test ACNet

In [3]:
# Smoke test for the clipped-PPO agent: build it, push one random state
# through the actor and the critic, and print what comes back.
from torchviz import make_dot

state_dim, action_dim = 20, 4
ppo = PPO_Clip(
    state_dim,
    action_dim,
    lr_actor=3e-4,
    lr_critic=1e-3,
    gamma=0.99,
    K_epochs=4,
    eps_clip=0.2,
)

# One random state vector of length state_dim, moved to the selected device.
state = torch.rand((state_dim,)).to(device)

# Actor pass: sampled action, its log-probability and the actor's hidden output.
action, action_logprob, hidden_actor = ppo.policy.act(state)
print("Action output:", action)
print("Action log probability:", action_logprob)

# Critic pass on the same state.
state_value = ppo.policy.evaluate_critic(state)
print("State value output:", state_value)

# Record one reward / terminal flag in the agent's rollout buffer.
ppo.buffer.rewards.append(1.0)
ppo.buffer.is_terminals.append(False)


Action output: tensor([[-1.0820,  1.0640, -0.1664,  0.0594]], device='cuda:0')
Action log probability: tensor([-4.9043], device='cuda:0')
State value output: tensor([0.0492], device='cuda:0', grad_fn=<ViewBackward0>)


In [4]:
# Smoke test for the KL-penalty PPO agent: build it, push one random state
# through the actor and the critic, and print what comes back.
from torchviz import make_dot

state_dim, action_dim = 20, 4
ppo = PPO_Penalty(
    state_dim,
    action_dim,
    lr_actor=3e-4,
    lr_critic=1e-3,
    gamma=0.99,
    K_epochs=4,
    eps_clip=0.2,
    kl_target=0.01,
    kl_penalty_coef=0.5,
    action_std_init=0.6,
)

# One random state vector of length state_dim, moved to the selected device.
state = torch.rand((state_dim,)).to(device)

# Actor pass: sampled action, its log-probability and the actor's hidden output.
action, action_logprob, hidden_actor = ppo.policy.act(state)
print("Action output:", action)
print("Action log probability:", action_logprob)

# Critic pass on the same state.
state_value = ppo.policy.evaluate_critic(state)
print("State value output:", state_value)

# Record one reward / terminal flag in the agent's rollout buffer.
ppo.buffer.rewards.append(1.0)
ppo.buffer.is_terminals.append(False)

Action output: tensor([[ 0.7156, -1.7127, -0.0953, -0.1270]], device='cuda:0')
Action log probability: tensor([-6.2792], device='cuda:0')
State value output: tensor([-0.0212], device='cuda:0', grad_fn=<ViewBackward0>)


## Test PPO v1

In [5]:
# from isaacsim import SimulationApp
# simulation_app = SimulationApp({"headless": False})

# from torch.utils.tensorboard import SummaryWriter

# from ji_dog_env_create import Ji_Dog_Env
# from tqdm import tqdm  
# import numpy as np
# import os


# # Load the saved model
# load_path = "Model_Checkpoints/Ji_dog_1.0_Episode.pth"
# ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0003, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)
# ppo.load(load_path)


# env = Ji_Dog_Env('Model(including video)/ji_dog1.0.usd')
# state_dim = 20
# action_dim = 4
# max_training_timesteps = 100
# max_timesteps = 500
# ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0003, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)

# # Initialize 
# writer = SummaryWriter(log_dir="runs/Ji_Dog_Training")


# state = env.reset()
# state = process_state(state)
# state = np.array(state)

# for t in range(max_timesteps + 1):
#     action = ppo.select_action(state)
#     state, reward, done, _ = env.step(action[0])
#     state = process_state(state)
#     state = np.array(state)
#     ppo.buffer.rewards.append(reward)
#     ppo.buffer.is_terminals.append(done)

#     if done:
#         break
# ppo.buffer.clear()                    
# # Update PPO and record metrics
# average_loss, policy_loss, value_loss, policy_entropy = ppo.update()




In [None]:

# Shut down the Isaac Sim application if one was started earlier in this kernel.
# The only cell above that would create `simulation_app` is currently commented
# out, so the call is guarded: an unguarded close() raises NameError on
# Restart & Run All.
if "simulation_app" in globals():
    simulation_app.close()


## Test PPO v3

In [None]:
# Evaluate a saved PPO_Clip checkpoint for one episode in the Isaac Sim
# Ji_Dog environment and report the accumulated reward components.
#
# The simulator is started first and the remaining imports are kept below it,
# in the same order as before (Isaac Sim's standalone workflow expects the
# SimulationApp to exist before simulator-dependent modules are imported).
from isaacsim import SimulationApp
simulation_app = SimulationApp({"headless": False})

writer = None
try:
    from torch.utils.tensorboard import SummaryWriter

    from ji_dog_env_create_v3 import Ji_Dog_Env
    from tqdm import tqdm
    import numpy as np
    import os

    # Defined before the agent is built so this cell does not depend on
    # values left in the kernel by earlier cells.
    state_dim = 20
    action_dim = 4
    max_training_timesteps = 100
    max_timesteps = 5000

    # Load the saved model
    load_path = "Model_Checkpoints/Ji_dog_4.0_Episode.pth"
    ppo = PPO_Clip(state_dim, action_dim, lr_actor=0.0003, lr_critic=0.001, gamma=0.99, K_epochs=4, eps_clip=0.2)
    ppo.load(load_path)

    env = Ji_Dog_Env('Model(including video)/ji_dog1.0.usd')

    # NOTE(review): nothing is logged to this writer in the cell; confirm
    # whether evaluation metrics should be written to it or the writer removed.
    writer = SummaryWriter(log_dir="runs/Ji_Dog_Training")

    # Running sum of every reward component returned by env.step().
    total_rewards = {
        "distance_reward": 0,
        "fall_penalty": 0,
        "symmetry_reward": 0,
        "period_penalty": 0,
        "contact_penalty": 0,
        "smoothness_penalty": 0,
        "progress_reward": 0,
        "mass_centre_reward": 0,
        "stability_penalty": 0,
    }

    state = np.array(process_state(env.reset()))

    total_episode_reward = 0
    for t in range(max_timesteps + 1):
        action = ppo.select_action(state)
        # `info` instead of `_`: assigning `_` in a notebook shadows IPython's
        # "last output" variable.
        state, total_reward, rewards, done, info = env.step(action[0])
        state = np.array(process_state(state))

        # Tolerate reward components that are not pre-listed above instead of
        # failing with KeyError.
        for key in rewards:
            total_rewards[key] = total_rewards.get(key, 0) + rewards[key]

        total_episode_reward += total_reward
        if done:
            break

    # No PPO update is performed in this cell, so discard anything
    # select_action may have stored in the rollout buffer.
    ppo.buffer.clear()

    # Show the results before the simulator is shut down.
    print(f"Episode finished after {t + 1} steps, total reward: {total_episode_reward}")
    for key, value in total_rewards.items():
        print(f"  {key}: {value}")
finally:
    # Always release the TensorBoard writer and the simulator, even when the
    # rollout above raises.
    if writer is not None:
        writer.close()
    simulation_app.close()



Starting kit application with the following args:  ['/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps/omni.isaac.sim.python.kit', '--/app/tokens/exe-path=/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/kit', '--/persistent/app/viewport/displayOptions=3094', '--/rtx/materialDb/syncLoads=True', '--/rtx/hydra/materialSyncLoads=True', '--/omni.kit.plugin/syncUsdLoads=True', '--/app/renderer/resolution/width=1280', '--/app/renderer/resolution/height=720', '--/app/window/width=1440', '--/app/window/height=900', '--/renderer/multiGpu/enabled=True', '--/app/fastShutdown=False', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/exts', '--ext-folder', '/home/bai/.local/share/ov/pkg/isaac-sim-4.2.0/apps', '--/physics/cudaDevice=0', '--portable']
Passing the following args to the base kit application:  ['-f', '/home/bai/.local/share/jupyter/runtime/kernel-0ced6b96-6d02-4806-b483-22db3

  self.policy_old.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
  self.policy.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))


Robot articulation successfully loaded: <omni.isaac.core.robots.robot.Robot object at 0x74547fde2fe0>
