- Published model can be found at https://huggingface.co/Battu007/V4_PPO2_LunarLander_v2

In [1]:
#Import required libraries
import gym
import random
import time

from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

from huggingface_sb3 import package_to_hub
from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import notebook_login

In [2]:
# Gym environment id shared by the training, evaluation and publishing cells below
env_name = "LunarLander-v2"

- https://github.com/huggingface/deep-rl-class
- https://huggingface.co/spaces/ThomasSimonini/Lunar-Lander-Leaderboard
- https://huggingface.co/spaces/chrisjay/Deep-Reinforcement-Learning-Leaderboard

In [26]:
# Preview the environment by driving it with random actions for a fixed number of steps.

# Create a new environment (`env` is reused by the cells below)
env = gym.make(env_name)

# Start the first episode
observation = env.reset()

try:
    for t in range(150):
        env.render()                           # Draw the current frame
        time.sleep(0.01)                       # Slow the simulation down so it can be watched
        action = env.action_space.sample()     # Pick a random action

        # Get back the observation, reward, episode-end flag and info
        observation, reward, done, info = env.step(action)

        # The episode may finish before the 150 steps are used up. Stepping an
        # environment whose episode has ended is undefined, so start a new one.
        if done:
            observation = env.reset()
finally:
    # Always release the render window, even if the cell is interrupted
    env.close()


In [12]:
# Report the action and observation spaces exposed by the environment
for label, space in (
    ("Action Space: ", env.action_space),
    ("Observation Space ", env.observation_space),
):
    print(env_name, label, space)

LunarLander-v2 Action Space:  Discrete(4)
LunarLander-v2 Observation Space  Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)


- Action Space
    - There are four discrete actions available: do nothing, fire left
    orientation engine, fire main engine, fire right orientation engine.
- Observation Space
    - The observation has 8 components: the coordinates of the lander in `x` & `y`, its linear
    velocities in `x` & `y`, its angle, its angular velocity, and two booleans
    that represent whether each leg is in contact with the ground or not.

#### Choosing model
- From SB3 implementations, possible models include:
    - ARS, A2C, DQN, PPO, TRPO (PPO, A2C and DQN are the ones tested out below)

#### PPO Implementation (MLP Policy)

In [6]:
# First PPO agent (MLP policy); settings not listed here keep their library defaults
ppo_hyperparams = dict(
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)
model_PPO = PPO('MlpPolicy', env, verbose=1, **ppo_hyperparams)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Train the PPO agent on the environment for 500K timesteps
model_PPO.learn(total_timesteps=500_000)

In [9]:
# Score the trained PPO agent over 10 deterministic episodes on a freshly created environment
eval_env = gym.make(env_name)
mean_reward, std_reward = evaluate_policy(
    model=model_PPO,
    env=eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")



mean_reward=226.69 +/- 25.86449951482193


In [11]:
# Persist the 500K-step PPO agent to disk
model_PPO.save(path="PPO_500K_Lunar_Lander")

In [13]:
# Watch the trained PPO agent play a few rendered episodes
episodes = 5

for episode_number in range(1, episodes + 1):
    print("Current episode: ", episode_number)
    obs = env.reset()
    while True:
        # Deterministic action from the policy for the current observation
        action, _states = model_PPO.predict(obs, deterministic=True)
        obs, reward, episode_over, info = env.step(action)
        env.render()
        time.sleep(0.01)    # slow playback down so it can be followed
        if episode_over:
            break
env.close()

Current episode:  1
Current episode:  2
Current episode:  3
Current episode:  4
Current episode:  5
Current episode:  6
Current episode:  7
Current episode:  8
Current episode:  9
Current episode:  10


#### A2C Implementation

In [20]:
# A2C agent (MLP policy); settings not listed here keep their library defaults
a2c_hyperparams = dict(
    n_steps=5,
    gamma=0.99,
    gae_lambda=0.98,
    ent_coef=0.01,
    learning_rate=0.001,
)
model_A2C = A2C('MlpPolicy', env, verbose=1, **a2c_hyperparams)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Train the A2C agent on the environment for 500K timesteps
model_A2C.learn(total_timesteps=500_000)

In [23]:
# Score the trained A2C agent over 10 deterministic episodes on a freshly created environment
eval_env = gym.make(env_name)
mean_reward, std_reward = evaluate_policy(
    model=model_A2C,
    env=eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")



mean_reward=-12.14 +/- 71.02340042939036


In [24]:
# Watch the trained A2C agent play a few rendered episodes
episodes = 5

for episode_number in range(1, episodes + 1):
    print("Current episode: ", episode_number)
    obs = env.reset()
    while True:
        # Deterministic action from the policy for the current observation
        action, _states = model_A2C.predict(obs, deterministic=True)
        obs, reward, episode_over, info = env.step(action)
        env.render()
        time.sleep(0.01)    # slow playback down so it can be followed
        if episode_over:
            break
env.close()

Current episode:  1
Current episode:  2
Current episode:  3
Current episode:  4
Current episode:  5


In [25]:
# Persist the 500K-step A2C agent to disk
model_A2C.save(path="A2C_500K_Lunar_Lander")

#### DQN Implementation

In [8]:
# DQN agent (MLP policy) using the library's default hyperparameters
model_DQN = DQN('MlpPolicy', env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Train the DQN agent on the environment for 1M timesteps
model_DQN.learn(total_timesteps=1_000_000)

In [10]:
# Score the trained DQN agent over 10 deterministic episodes on a freshly created environment
eval_env = gym.make(env_name)
mean_reward, std_reward = evaluate_policy(
    model=model_DQN,
    env=eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")



mean_reward=-41.80 +/- 71.52575431835412


In [11]:
# Watch the trained DQN agent play a few rendered episodes
episodes = 5

for episode_number in range(1, episodes + 1):
    print("Current episode: ", episode_number)
    obs = env.reset()
    while True:
        # Deterministic action from the policy for the current observation
        action, _states = model_DQN.predict(obs, deterministic=True)
        obs, reward, episode_over, info = env.step(action)
        env.render()
        time.sleep(0.01)    # slow playback down so it can be followed
        if episode_over:
            break
env.close()

Current episode:  1
Current episode:  2
Current episode:  3
Current episode:  4
Current episode:  5


In [33]:
# Persist the DQN agent. The training cell runs for 1,000,000 timesteps, so the
# checkpoint is labelled "1M" (the previous "500K" label did not match the run).
model_DQN.save("DQN_1M_Lunar_Lander")

#### PPO, second run: different hyperparameters (batch size 32, 2M training steps)

In [4]:
# Second PPO agent (MLP policy); same settings as the first run except batch_size=32
ppo2_hyperparams = dict(
    n_steps=1024,
    batch_size=32,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)
model_PPO2 = PPO('MlpPolicy', env, verbose=1, **ppo2_hyperparams)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Train the second PPO agent for 2M timesteps.
# `total_timesteps` is declared as an int by SB3, so pass an integer literal rather than the float 2e6.
model_PPO2.learn(total_timesteps = 2_000_000)

In [39]:
# Score the second PPO agent over 10 deterministic episodes.
# Here the evaluation environment is a single-env DummyVecEnv rather than a bare gym env.
def _make_eval_env():
    """Build one fresh instance of the evaluation environment."""
    return gym.make(env_name)

eval_env = DummyVecEnv([_make_eval_env])

mean_reward, std_reward = evaluate_policy(
    model=model_PPO2,
    env=eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=275.96 +/- 21.646267077384305


In [27]:
# Watch the second PPO agent play a few rendered episodes
episodes = 5

for episode_number in range(1, episodes + 1):
    print("Current episode: ", episode_number)
    obs = env.reset()
    while True:
        # Deterministic action from the policy for the current observation
        action, _states = model_PPO2.predict(obs, deterministic=True)
        obs, reward, episode_over, info = env.step(action)
        env.render()
        time.sleep(0.01)    # slow playback down so it can be followed
        if episode_over:
            break
env.close()

Current episode:  1
Current episode:  2
Current episode:  3
Current episode:  4
Current episode:  5


In [9]:
# Persist the second PPO agent to disk.
# NOTE(review): the file name says "5M" while the training cell runs 2e6 timesteps;
# the name is left unchanged because the load cell below reads the same file.
model_PPO2.save(path="PPO2_5M_Lunar_Lander")

In [9]:
# Restore the previously saved second PPO agent and attach it to the current environment.
# NOTE(review): the file name says "5M" while the training cell runs 2e6 timesteps.
model_PPO2 = PPO.load(path="PPO2_5M_Lunar_Lander", env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


# Publishing to Hugging Face

In [None]:
# Publish the second PPO agent to the Hugging Face Hub with `package_to_hub`.
# Authenticate first (e.g. run `notebook_login()` in its own cell, or `huggingface-cli login`):
# no access token is hard-coded here, so the cached login is used.
env_id = env_name
model_architecture = "PPO"

# {user}/{repo_name} of the repository the model is published to
repo_id = "Battu007/V4_PPO2_LunarLander_v2"

commit_message = "PPO Hyperparemeter tune 2M steps LL-2 agent -- Wrapped in DummyVecEnv"

# Evaluation environment handed to package_to_hub, wrapped in a single-env DummyVecEnv
eval_env = DummyVecEnv([lambda: gym.make(env_name)])

package_to_hub(model=model_PPO2, # Our trained model
               model_name="V4_PPO_LL", # The name of our trained model
               model_architecture=model_architecture, # The model architecture we used: in our case PPO
               env_id=env_id, # Name of the environment
               eval_env=eval_env, # Evaluation Environment
               repo_id=repo_id, # id of the model repository from the Hugging Face Hub (repo_id = {organization}/{repo_name} for instance ThomasSimonini/ppo-LunarLander-v2
               commit_message=commit_message)
