In [1]:
import standup_env
import gymnasium as gym
from tqdm import tqdm
from stable_baselines3 import PPO, DQN, A2C, DDPG
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, BaseCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv
import shutil
from typing import Callable

# Run identity and environment settings
name = 'standup_agent'
env_name = "standup_env/StandUp-v1"
max_episode_steps = 200

# Algorithm class, policy identifier and compute device handed to the model
agent = PPO
policy = 'MlpPolicy'
device = 'cpu'

# Every artefact path is derived from the run directory ./<name>
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because it is part of this configuration's interface.
dir = "./" + name
best_model_save_path = dir + "/model/"
best_model_path = best_model_save_path + "best_model.zip"
model_path = best_model_save_path + "model.zip"
tensorboard_log = dir + "/t_logs/"
log_path = dir + "/logs/"

# num_action = 20
# keys = [i for i in range(num_action)]
# values = [(i/num_action)*2 - 1 for i in range(num_action)]
# disc_to_cont = dict(zip(keys, values))
def make_env(render_mode = None):
    """Create the environment registered under the module-level ``env_name``.

    Args:
        render_mode: Forwarded unchanged to ``gym.make``; ``None`` by default.

    Returns:
        The environment object returned by ``gym.make``.
    """
    make_kwargs = dict(
        render_mode=render_mode,
        skip_frame=10,
        max_episode_steps=max_episode_steps,
        debug_mode=False,
    )

    # Optional wrappers, currently disabled:
    #   DiscreteActions(env, disc_to_cont)
    #   TimeAwareObservation(env, normalize_time=True)
    return gym.make(env_name, **make_kwargs)



# Test Env

In [1]:
# Execute Setup: run every cell of 01_standup.ipynb that carries the "setup" tag
import nbformat
from IPython import get_ipython

with open("01_standup.ipynb", "r", encoding="utf-8") as nb_file:
    notebook = nbformat.read(nb_file, as_version=4)

for cell in notebook.cells:
    # Cells without a "tags" entry are treated as untagged
    if "setup" in cell.metadata.get("tags", []):
        # exec at cell top level, so the defined names land in this namespace
        exec(cell.source)

# Roll out one episode with random actions in a rendered window and print
# every transition, as a quick sanity check of the environment.
env = make_env(render_mode='human')

try:
    env.reset()

    i = 0
    while True:
        action = env.action_space.sample()
        obs, rew, terminated, truncated, info = env.step(action)

        # Print the step index, action, observations, reward and info
        print("Step:", i)
        print("Action:", action)
        print("Observations:", obs)
        print("Reward:", rew)
        print("Info", info)

        # The episode is over on either termination or truncation
        if terminated or truncated:
            break
        i += 1
finally:
    # Always release the environment, even if the cell is interrupted
    # (e.g. KeyboardInterrupt while the render window is open).
    env.close()


render human


  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Step: 0
Action: [-0.83568066 -0.05639767  0.9413816   0.05706795 -0.9350656  -0.30451325
  0.7901161  -0.5145137  -0.5306194  -0.260115   -0.81540984 -0.7905143 ]
Observations: [ 0.    0.   -1.    0.    0.   -1.    0.    0.   -1.    0.    0.   -1.
  0.15  0.  ]
Reward: -0.125
Info {'reward:': -0.125, 'angle_reward': 0.5, 'z_position_reward': 0.375}
Step: 1
Action: [ 0.4408463  -0.33787316  0.18298551  0.9845838  -0.04923519  0.2866405
 -0.41073093 -0.1175947   0.22800392 -0.6251257   0.61446273  0.55848175]
Observations: [-0.08356807 -0.00563977 -0.90586184  0.00570679 -0.09350656 -1.
  0.07901161 -0.05145137 -1.         -0.0260115  -0.08154098 -1.
  0.02639597  0.00203186]
Reward: -0.44416937323560024
Info {'reward:': -0.44416937323560024, 'angle_reward': 0.4898406911420201, 'z_position_reward': 0.06598993562237959}
Step: 2
Action: [-0.07589445 -0.00952268 -0.14967984  0.36333537  0.7685577   0.45650908
 -0.4897668  -0.06565165  0.13162467 -0.09120943  0.5308131   0.6147827 ]
Observat

KeyboardInterrupt: 

# Create agent

In [None]:
# Execute Setup: run every cell of 01_standup.ipynb that carries the "setup" tag
import nbformat
from IPython import get_ipython

with open("01_standup.ipynb", "r", encoding="utf-8") as nb_file:
    notebook = nbformat.read(nb_file, as_version=4)

for cell in notebook.cells:
    # Cells without a "tags" entry are treated as untagged
    if "setup" in cell.metadata.get("tags", []):
        # exec at cell top level, so the defined names land in this namespace
        exec(cell.source)

# Linear Schedule
def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """Build a schedule that scales ``initial_value`` linearly.

    Args:
        initial_value: The value returned when the remaining progress is 1.

    Returns:
        A callable mapping ``progress_remaining`` to
        ``progress_remaining * initial_value``.
    """
    def scaled_value(progress_remaining: float) -> float:
        """Return ``initial_value`` scaled by the remaining progress."""
        return progress_remaining * initial_value

    return scaled_value

# Create model

# Constructor arguments shared by every supported algorithm
common_kwargs = dict(
    verbose=0,
    device=device,
    tensorboard_log=tensorboard_log,
)

# Extra constructor arguments per algorithm class
agent_kwargs = {
    DQN: dict(
        exploration_fraction=0.5,
        learning_rate=linear_schedule(0.0001),
    ),
    DDPG: dict(
        learning_rate=linear_schedule(0.0001),
    ),
    A2C: {},
    PPO: {},
}

# Fail loudly on an unsupported `agent` instead of leaving `model` unbound
# (or, in a live kernel, silently saving a stale `model` from an earlier run).
if agent not in agent_kwargs:
    raise ValueError(
        f"Unsupported agent {agent!r}; expected one of DQN, DDPG, A2C, PPO"
    )

env = make_env()
model = agent(policy, env, **common_kwargs, **agent_kwargs[agent])

# Save
# Remove the whole run directory first so the new model starts from a clean
# slate; a missing directory is not an error.
shutil.rmtree(dir, ignore_errors=True)
model.save(model_path)

In [None]:
# Execute Setup: run every cell of 01_standup.ipynb that carries the "setup" tag
import nbformat
from IPython import get_ipython

with open("01_standup.ipynb", "r", encoding="utf-8") as nb_file:
    notebook = nbformat.read(nb_file, as_version=4)

for cell in notebook.cells:
    # Cells without a "tags" entry are treated as untagged
    if "setup" in cell.metadata.get("tags", []):
        # exec at cell top level, so the defined names land in this namespace
        exec(cell.source)

print(f"Start training with {agent}")
# Env and model: separate monitored environments for training and evaluation
train_env = Monitor(make_env())
eval_env = Monitor(make_env())

try:
    # Resume from the model saved at `model_path`
    model = agent.load(model_path, env=train_env, device=device)

    # Callbacks
    # Frequencies are integer step counts, as the callback signatures expect.
    eval_callback = EvalCallback(
        eval_env,
        eval_freq=10_000,
        deterministic=True,
        n_eval_episodes=10,
        best_model_save_path=best_model_save_path,
    )

    checkpoint_callback = CheckpointCallback(
        save_freq=10_000,
        save_path=best_model_save_path,
        name_prefix="checkpoint",
    )

    # Training
    model.learn(
        total_timesteps=1_000_000,
        progress_bar=True,
        # Keep the loaded model's timestep counter instead of restarting at 0
        reset_num_timesteps=False,
        callback=[
            eval_callback,
            checkpoint_callback,
        ],
    )

    # Save only after training finished without an error
    model.save(model_path)
finally:
    # Close both environments even if loading or training fails or is interrupted
    train_env.close()
    eval_env.close()