In [None]:
# make sure python version <= 3.8.x
!pip install -q torch torchvision torchaudio
!pip install -q stable-baselines3[extra]
!pip install -q pyglet==1.3.2
# !choco install swig  # MUST install swig. User apt install for linux based kernel
!pip install gym[atari,box2d]==0.17.3
# !pip install ale-py
!pip install tensorboard

# Import Dependencies

In [1]:
import os
import gym 
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Script

In [2]:
def run_env(env, model, episodes=5, render=False):
    """Play full episodes in `env` and print the total reward of each one.

    Args:
        env: Environment exposing `reset()`, `step(action)`, `render()`,
            `close()` and `action_space.sample()`.
        model: Object whose `predict(obs)` returns `(action, state)`, or
            `None` to act with random samples from `env.action_space`.
        episodes: Number of episodes to play.
        render: When true, `env.render()` is called after every step.

    Returns:
        None. One `Episode:<n> Score:<total>` line is printed per episode.

    The environment is closed before returning, including when an episode
    raises, so whatever `env.close()` releases is not left dangling.
    """
    try:
        for episode in range(1, episodes + 1):
            obs = env.reset()
            done = False
            score = 0
            while not done:
                if model is None:
                    # No model: act randomly.
                    action = env.action_space.sample()
                else:
                    action, _state = model.predict(obs)
                obs, reward, done, _info = env.step(action)
                score += reward
                if render:
                    env.render()
            print('Episode:{} Score:{}'.format(episode, score))
    finally:
        # Runs on success and on failure alike.
        env.close()

# CartPole
ref: [link](https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py)

## env

In [4]:
env = gym.make("CartPole-v0")
print(env.action_space)  # 2 actions - left, right
print(env.observation_space)  # 4 observations - cart position, cart velocity, pole angle, pole angular velocity
# Max reward is 200: the episode is terminated after 200 observations (steps).

Discrete(2)
Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [5]:
run_env(env, None, episodes=5, render=True)

Episode:1 Score:35.0
Episode:2 Score:18.0
Episode:3 Score:11.0
Episode:4 Score:13.0
Episode:5 Score:24.0


## modeling

In [21]:
# Wrap a single CartPole env in a DummyVecEnv and build a PPO agent with an MLP policy.
env = gym.make("CartPole-v0")
env = DummyVecEnv([lambda: env])
# Directory handed to PPO as its TensorBoard log location.
log_path = os.path.join('logs', "cartpole")
model = PPO('MlpPolicy', env, verbose = True, tensorboard_log=log_path)

Using cpu device


In [11]:
save_path = os.path.join('pretrained_weights', 'cartpole')
# Evaluate on a dedicated env: running evaluation episodes on the training env
# would reset it in the middle of a rollout and disturb data collection.
eval_env = DummyVecEnv([lambda: gym.make("CartPole-v0")])
# Stops training once the evaluation mean reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
# Evaluates every `eval_freq` steps, saves the best model under `save_path`,
# and triggers `stop_callback` whenever a new best mean reward is found.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [6]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to logs\PPO_2




Eval num_timesteps=1000, episode_reward=193.40 +/- 8.14
Episode length: 193.40 +/- 8.14
---------------------------------
| eval/              |          |
|    mean_ep_length  | 193      |
|    mean_reward     | 193      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=2000, episode_reward=169.80 +/- 17.52
Episode length: 169.80 +/- 17.52
---------------------------------
| eval/              |          |
|    mean_ep_length  | 170      |
|    mean_reward     | 170      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 273  |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 2048 |
-----------------------------
Eval num_timesteps=3000, episode_reward=146.20 +/- 44.76
Episode length: 146.20 +/- 44.76
------------------------

<stable_baselines3.ppo.ppo.PPO at 0x14e684be610>

In [15]:
model.learn(total_timesteps=20000)

Logging to logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 1114 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 971         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011044263 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | 0.000885    |
|    learning_rate        | 0.0003      |
|    loss                 | 4.79        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0173     |
|    value_loss           | 41.3        |
-----------------------------------------
------------

<stable_baselines3.ppo.ppo.PPO at 0x1e5ff908610>

## export & import

In [8]:
checkpoint = os.path.join('pretrained_weights', 'cartpole', 'cartpole_ppo_20k')

In [26]:
model.save(checkpoint)

In [9]:
# del model
model = PPO.load(checkpoint, env=env)

## eval

In [20]:
# With return_episode_rewards=True, evaluate_policy returns the per-episode rewards
# and lengths. They are printed explicitly because a bare expression followed by
# another statement is not displayed by the notebook.
episode_rewards, episode_lengths = evaluate_policy(model, env, n_eval_episodes=1, render=True, warn=False, return_episode_rewards=True)
print('rewards:', episode_rewards, 'lengths:', episode_lengths)
env.close()

## test

In [16]:
run_env(env, model, episodes=2, render=True)
# GOOD JOB!

Episode:1 Score:[200.]
Episode:2 Score:[200.]


In [22]:
training_log_path = os.path.join(log_path, 'PPO_1')

In [30]:
# Serve the training logs with TensorBoard. `!` commands run in the foreground,
# so this cell keeps running until the process is interrupted.
!python -m tensorboard.main --logdir={training_log_path}  --port 6006
# http://localhost:6006/

# Metrics to look at: average reward, episode length, ...

^C


# Breakout

link: [ref](https://gym.openai.com/envs/Breakout-v0/); [rom](http://www.atarimania.com/roms/Roms.rar)

Download `Roms.rar` and extract `ROMS` and `HC ROMS` from it.

## env

In [None]:
# Import the extracted ROMs into atari_py.
!python -m atari_py.import_roms .\rom\ROMS
# If this fails on Windows 10, follow https://github.com/openai/gym/issues/1726#issuecomment-550580367

In [14]:
env = gym.make("Breakout-v0")
print(env.action_space)  # 4 actions
print(env.observation_space)  # min value, max value, (height, width, 3 channels)

Discrete(4)
Box(0, 255, (210, 160, 3), uint8)


In [15]:
run_env(env, None, episodes=5, render=True)

Episode:1 Score:1.0
Episode:2 Score:2.0
Episode:3 Score:1.0
Episode:4 Score:1.0
Episode:5 Score:4.0


## modeling

In [4]:
# Build 4 Breakout envs (seed 0) and stack 4 frames per observation,
# then create an A2C agent with a CNN policy on top of them.
env = make_atari_env('Breakout-v0', n_envs=4, seed=0)
env = VecFrameStack(env, n_stack=4)
# Directory handed to A2C as its TensorBoard log location.
log_path = os.path.join("logs", "breakout")
model = A2C("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [5]:
model.learn(total_timesteps=10000)

Logging to logs\breakout\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 281      |
|    ep_rew_mean        | 1.56     |
| time/                 |          |
|    fps                | 56       |
|    iterations         | 100      |
|    time_elapsed       | 35       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.34    |
|    explained_variance | 0.0706   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.0549   |
|    value_loss         | 0.162    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 281      |
|    ep_rew_mean        | 1.53     |
| time/                 |          |
|    fps                | 58       |
|    iterations         | 200      |
|    time_elapsed       | 68       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x1893eea1a00>

In [None]:
model.learn(total_timesteps=2000000)

## export & import

In [6]:
checkpoint = os.path.join('pretrained_weights', 'breakout', 'breakout_a2c_10k')

In [7]:
model.save(checkpoint)



In [None]:
# del model
model = A2C.load(checkpoint, env=env)

## eval

In [None]:
# With return_episode_rewards=True, evaluate_policy returns the per-episode rewards
# and lengths. They are printed explicitly because a bare expression followed by
# another statement is not displayed by the notebook.
episode_rewards, episode_lengths = evaluate_policy(model, env, n_eval_episodes=3, render=True, warn=False, return_episode_rewards=True)
print('rewards:', episode_rewards, 'lengths:', episode_lengths)
env.close()

## test

In [5]:
env = make_atari_env('Breakout-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)

In [6]:
checkpoint = os.path.join('pretrained_weights', 'breakout', 'breakout_a2c_2m')
model = A2C.load(checkpoint, env=env)

Wrapping the env in a VecTransposeImage.




In [11]:
run_env(env, model, episodes=5, render=True)
# GOOD JOB!

Episode:1 Score:[0.]
Episode:2 Score:[0.]
Episode:3 Score:[7.]
Episode:4 Score:[0.]
Episode:5 Score:[4.]


# CarRacing
link: [ref](https://gym.openai.com/envs/CarRacing-v0/)

## env

In [3]:
env = gym.make("CarRacing-v0")
print(env.action_space)  # 3 continuous actions, each from -1 to 1
print(env.observation_space)  # min value, max value, (height, width, 3 channels)

Box(-1.0, 1.0, (3,), float32)
Box(0, 255, (96, 96, 3), uint8)




In [6]:
run_env(env, None, episodes=1, render=True)

Track generation: 1188..1489 -> 301-tiles track
Episode:1 Score:-33.33333333333387


## modeling

In [4]:
# Wrap a single CarRacing env in a DummyVecEnv and build a PPO agent with a CNN policy.
env = gym.make("CarRacing-v0")
env = DummyVecEnv([lambda: env])
# Directory handed to PPO as its TensorBoard log location.
log_path = os.path.join('logs', 'carracing')
model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [5]:
model.learn(total_timesteps=10000)

Track generation: 1067..1338 -> 271-tiles track
Logging to logs\carracing\PPO_1
Track generation: 1292..1619 -> 327-tiles track
Track generation: 1139..1428 -> 289-tiles track
-----------------------------
| time/              |      |
|    fps             | 90   |
|    iterations      | 1    |
|    time_elapsed    | 22   |
|    total_timesteps | 2048 |
-----------------------------
Track generation: 1163..1458 -> 295-tiles track
Track generation: 988..1239 -> 251-tiles track
-----------------------------------------
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 2           |
|    time_elapsed         | 70          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008518707 |
|    clip_fraction        | 0.0797      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.22       |
|    explained_variance   | -0.00457    |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x24b7f04fc10>

## export & import

In [6]:
checkpoint = os.path.join('pretrained_weights', 'carracing', 'carracing_ppo_10k')

In [7]:
model.save(checkpoint)



In [None]:
# del model
# The checkpoint was written by a PPO model, so it has to be restored with
# PPO.load; A2C.load would rebuild it as the wrong algorithm.
model = PPO.load(checkpoint, env=env)

## eval

In [8]:
# With return_episode_rewards=True, evaluate_policy returns the per-episode rewards
# and lengths. They are printed explicitly because a bare expression followed by
# another statement is not displayed by the notebook.
episode_rewards, episode_lengths = evaluate_policy(model, env, n_eval_episodes=1, render=True, warn=False, return_episode_rewards=True)
print('rewards:', episode_rewards, 'lengths:', episode_lengths)
env.close()

Track generation: 1169..1465 -> 296-tiles track
Track generation: 1183..1483 -> 300-tiles track


## test

In [9]:
checkpoint = os.path.join('pretrained_weights', 'carracing', 'carracing_ppo_2m')
# The file is named as a PPO checkpoint, so restore it with PPO.load rather than A2C.load.
model = PPO.load(checkpoint, env=env)



Wrapping the env in a VecTransposeImage.


In [10]:
run_env(env, model, episodes=3, render=True)
# GOOD JOB!

Track generation: 1285..1610 -> 325-tiles track
Track generation: 1155..1448 -> 293-tiles track
Episode:1 Score:[736.426]
Track generation: 1063..1333 -> 270-tiles track
Track generation: 1126..1416 -> 290-tiles track
Episode:2 Score:[821.9406]
Track generation: 1117..1400 -> 283-tiles track
Track generation: 1251..1568 -> 317-tiles track
Episode:3 Score:[811.3566]


# Custom - GuessNum
The answer the agent has to guess is 42.

In [3]:
import os
import random
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

In [4]:
# Quick tour of the gym space types: print a Discrete space, then one random sample from each of the others.
print(Discrete(3))
print(Box(0,1,shape=(3,3)).sample())  # 3x3 array with bounds 0 and 1
print(Box(0,255,shape=(3,3), dtype=int).sample())  # 3x3 integer array with bounds 0 and 255
print(Tuple((Discrete(2), Box(0,100, shape=(1,)))).sample())  # one sample per sub-space, as a tuple
print(Dict({'height':Discrete(2), "speed":Box(0,100, shape=(1,))}).sample())  # one sample per sub-space, keyed by name
print(MultiBinary(4).sample())  # 4 binary values
print(MultiDiscrete([5,2,2]).sample())  # each component ranges from 0 to X-1

Discrete(3)
[[0.05858376 0.7883015  0.8729452 ]
 [0.9673796  0.35261858 0.72823906]
 [0.9608749  0.43137282 0.29857707]]
[[ 31 172 216]
 [ 82  86  13]
 [164 142 229]]
(1, array([56.542503], dtype=float32))
OrderedDict([('height', 0), ('speed', array([61.57991], dtype=float32))])
[0 1 1 0]
[1 0 0]


## env

In [5]:
class GuessNumEnv(Env):
    """Toy environment: steer an integer guess toward a fixed answer (42).

    Observation: the current guess, an integer array of shape (1,) that is
        kept inside the declared `Box(0, 100)` bounds.
    Actions: `Discrete(3)` - 0 lowers the guess by 3, 1 keeps it, 2 raises it
        by 3. After the reward is computed, noise drawn from {-1, 0, 1} is
        added to the guess.
    Reward: +1 when the guess is within `_thresh` of the answer, otherwise -1.
    Episode: ends after 60 steps.

    The starting guess is drawn once per instance (answer + 10, +/- 5) and is
    reused by every `reset()`.
    """

    def __init__(self):
        self._ans = 42
        self._thresh = 3
        self._max_steps = 60
        self._low, self._high = 0, 100
        # Stored as a plain int so stepping can never modify it through array aliasing.
        self._init_state = self._ans + 10 + random.randint(-5, 5)

        self.action_space = Discrete(3)  # decrease, stay, increase
        self.observation_space = Box(self._low, self._high, shape=(1, ), dtype=int)
        # Same shape and dtype as after reset(), so step() is valid before the first reset().
        self.state = self._initial_observation()
        self.guess_length = self._max_steps

    def _initial_observation(self):
        # Build a fresh (1,)-shaped array holding the starting guess.
        return np.array([self._init_state], dtype=self.observation_space.dtype)

    def _shift(self, delta):
        # Move the guess by `delta`. The state is rebound to a new array (never
        # mutated in place) so previously returned observations keep their
        # values, and it is clamped to the bounds declared by `observation_space`.
        moved = np.clip(self.state + delta, self._low, self._high)
        self.state = moved.astype(self.observation_space.dtype)

    def step(self, action):
        """Apply `action`, then return `(observation, reward, done, info)`."""
        self._shift((action - 1) * 3)  # apply action to state
        self.guess_length -= 1  # track length

        # Calculate reward (before the noise is applied)
        guess = int(self.state[0])
        reward = 1 if abs(guess - self._ans) <= self._thresh else -1

        # Check if guess is done
        done = self.guess_length <= 0

        # Apply noise
        self._shift(random.randint(-1, 1))

        # placeholder for info
        info = {}

        # Return a copy so callers cannot alter the internal state
        return self.state.copy(), reward, done, info

    def render(self, mode='human'):
        # placeholder for visualization; `mode` is accepted because vectorised
        # wrappers call render(mode=...)
        pass

    def reset(self):
        """Restore the starting guess and the step budget; return the first observation."""
        # Reset guess number
        self.state = self._initial_observation()
        # Reset guess time
        self.guess_length = self._max_steps
        return self.state.copy()

In [11]:
env=GuessNumEnv()
check_env(env, warn=True)
print(env.reset())

[51]


In [17]:
run_env(env, None, episodes=5, render=False)

Episode:1 Score:-60
Episode:2 Score:-56
Episode:3 Score:-60
Episode:4 Score:-54
Episode:5 Score:-48


## modeling

In [18]:
log_path = os.path.join('logs', 'guessnum')
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [19]:
save_path = os.path.join('pretrained_weights', 'guessnum')
# stop_callback = StopTrainingOnRewardThreshold(reward_threshold=50, verbose=1)
# Evaluate on a dedicated env instance: `env` is the object the model trains on,
# and running evaluation episodes on it would reset it in the middle of a rollout.
eval_env = GuessNumEnv()
# Evaluates every `eval_freq` steps and saves the best model under `save_path`.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=None,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [20]:
model.learn(total_timesteps=100000) # , callback=eval_callback)

Logging to logs\guessnum\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -43.6    |
| time/              |          |
|    fps             | 318      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 60           |
|    ep_rew_mean          | -41.3        |
| time/                   |              |
|    fps                  | 470          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0136798145 |
|    clip_fraction        | 0.187        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   |

<stable_baselines3.ppo.ppo.PPO at 0x25d4cddda00>

## export & import

In [21]:
checkpoint = os.path.join('pretrained_weights', 'guessnum', 'guessnum_ppo_100k')

In [22]:
model.save(checkpoint)

In [None]:
# del model
model = PPO.load(checkpoint, env=env)

## eval

In [23]:
# With return_episode_rewards=True, evaluate_policy returns the per-episode rewards
# and lengths. They are printed explicitly because a bare expression followed by
# another statement is not displayed by the notebook.
episode_rewards, episode_lengths = evaluate_policy(model, env, n_eval_episodes=1, render=False, warn=False, return_episode_rewards=True)
print('rewards:', episode_rewards, 'lengths:', episode_lengths)
env.close()

## test

In [24]:
checkpoint = os.path.join('pretrained_weights', 'guessnum', 'best_model')
model = PPO.load(checkpoint, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [37]:
run_env(env, model, episodes=3, render=True)
# Mehhh not so good at guessing... very inconsistent too.

Episode:1 Score:-16
Episode:2 Score:20
Episode:3 Score:-6
