In [None]:
# References:
# [1] https://www.youtube.com/watch?v=Mut_u40Sqz4
# [2] https://github.com/ARISE-Initiative/robosuite/issues/131

# Required imports

In [1]:
# python
import os
# openai 
import gym
# Stable baseline imports
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
# Robosuite imports
import robosuite as suite
from robosuite.wrappers import GymWrapper
from robosuite.environments.base import register_env
from robosuite import load_controller_config

# Environment

In [None]:
# Build the raw robosuite task first, then expose it through the Gym interface.
robosuite_env = suite.make(
    "TwoArmPegInHole",
    robots=["Panda", "Panda"],      # two Panda arms
    use_camera_obs=False,           # do not use pixel observations
    has_offscreen_renderer=False,   # not needed since pixel observations are off
    has_renderer=False,             # on-screen rendering is disabled
    reward_shaping=True,            # use dense (shaped) rewards
    control_freq=20,                # control should be fast enough for the simulation to look smooth
    horizon=200,                    # episode horizon
)

# Notice how the environment is wrapped by the Gym wrapper.
env = GymWrapper(robosuite_env)

In [None]:
# Close the environment created above.
# NOTE(review): the next cell calls env.reset() on this same object after it has
# been closed — confirm that reuse after close() is supported, or re-run the
# environment-creation cell before continuing.
env.close()

# Environment Specification

In [None]:
# Reset once to get the first observation, then report the wrapped
# environment's action and observation spaces.
initial_observation = env.reset()
print(f"Initial State observations: \n {initial_observation}")
print(f"Action space: \n{env.action_space}")
print(f"Observation space: \n{env.observation_space}")

In [None]:
# Close the environment after inspecting its spaces.
# NOTE(review): the random-action cell below calls env.reset()/env.step() on this
# same object after it has been closed — confirm that reuse after close() is
# supported, or re-run the environment-creation cell before continuing.
env.close()

# Random Action: No Training

In [None]:
# Roll out randomly sampled actions (no trained policy) and report the
# accumulated reward of each episode.
n_episodes = 1
for i_episode in range(n_episodes):
    score = 0
    observation = env.reset()
    while True:
        # env.render()  # left disabled, as in the original cell
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        score += reward
        if done:
            # The environment signalled the end of the episode.
            break
    print(f"Episode: {i_episode} Score: {score}")

In [None]:
# Close the environment once the random-action rollout has finished.
env.close()

# (NOT WORKING AS EXPECTED) Train using Stable-Baselines3 PPO without a vectorized environment

In [None]:
# create directory to store training logs


# tb_log_dir = os.path.join('../../Training', 'tb_log')
# print(tb_log_dir)
# monitor_log_dir = os.path.join('../../Training', 'monitor_log')
# print(monitor_log_dir)

In [None]:
# env = wrap_env(env)
# Note: n_steps = n * horizon


# model = PPO('MlpPolicy', env, n_steps=10, verbose=2, tensorboard_log=tb_log_dir)

In [None]:
# model.learn(total_timesteps=4, tb_log_name="TwoArmPegInHole_PPO_TEST")

In [None]:
# env.close()

In [None]:
# save_model_path = os.path.join('../Training', 'Saved_Models', 'TwoArmPegInHole_PPO_model')
# save_model_path

In [None]:
# model.save(save_model_path)
# del model

# Interfacing Stable-Baselines3 with the basic CartPole environment to figure out the issue

In [None]:
# Create the Gym environment.
# NOTE: this rebinds `env`, which earlier in the notebook referred to the
# wrapped robosuite environment.
env = gym.make("CartPole-v0")

# Create the PPO model.
# Note: n_steps is the number of environment steps collected per policy update
# (the rollout length), not the episode horizon. It is set to 200 here so that
# one rollout is as long as the 200-step horizon of CartPole-v0.
model = PPO("MlpPolicy", env, n_steps=200, verbose=1)
model.learn(total_timesteps=400)
model.save("ppo_cartpole")

del model  # remove to demonstrate saving and loading

model = PPO.load("ppo_cartpole")

# Replay a single episode with the reloaded policy. A plain (non-vectorized)
# Gym environment does not reset itself, so stop when the episode is done
# instead of stepping past its end in an endless loop, which would also keep
# "Restart & Run All" from ever finishing.
obs = env.reset()
done = False
while not done:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()

# Release the viewer and other resources held by the environment.
env.close()

> Note: There is a misalignment in how `n_steps` behaves for robosuite environments. `n_steps` is not equal to a single episode horizon; instead it is `n` times the single-episode horizon in robosuite. So `n_steps = 10` means 10 * 200 steps (i.e. 10 horizons). We need to find an alternative way to collect rollouts once per horizon rather than once per step.

> TODO:
> - [ ] Look into EvalCallback: <https://github.com/ludvikka/temp_oj/blob/4c4bf1dde764c0241a7d2080d484937c4584e185/code/rl_training.py#L15>
> - [ ] Look into the robosuite benchmark repository