## Main Course

  Get the code for this tutorial: https://github.com/nicknochnack/ReinforcementLearningCourse.git

  Watch the video: https://www.youtube.com/watch?v=Mut_u40Sqz4&list=WL&index=8&t=141s

  Note: The code isn't exact the same as the video or repo because the code needs to update and debug. And if you run the code in Colab, it requires extra actions. 

### 1. Import dependencies
  Documentation: https://stable-baselines3.readthedocs.io/en/master/


In [3]:
!pip install stable-baselines3



In [4]:
import os # to search the path
#import gym # for openAI gym --> old version
import gymnasium as gym
from stable_baselines3 import PPO # one of algorithms (see the document above)
from stable_baselines3.common.vec_env import DummyVecEnv # train models in multiple environment at same time; speed up training
from stable_baselines3.common.evaluation import evaluate_policy # test model's performance

### 2. Load Environment (Cartpole as example here)

The main environment functions are:
1. env.reset( ) - reset the environment and obtain initial oberservations
2. env.render( ) - visualise the environment
3. env.step( ) - apply an action to the environment
4. env.close( ) - close down the render frame

In [6]:
# create environment
environment_name = "CartPole-v1"
env = gym.make(environment_name)

In [13]:
# solution: 
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset() # Observations for the environment not just for the pole.
    # we will deliver these observations to reinforcement learning agent to learn the optimal value.
    done = False
    truncated = False
    score = 0

    while not (done or truncated):
        env.render()
        #action = random.choice([0,1])
        action = env.action_space.sample() # random action
        n_state, reward, done, truncated, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()


Episode:1 Score:24.0
Episode:2 Score:10.0
Episode:3 Score:25.0
Episode:4 Score:17.0
Episode:5 Score:21.0


In [10]:
# test environment: episodes is like one full game within the environment. Same environments
# have a fixed episode length e.g. cartpole wich is 200 frames. Others are continuous,
# e.g. Breakout, play until you run out of lives.
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset() # Observations for the environment not just for the pole.
    # we will deliver these observations to reinforcement learning agent to learn the optimal value.
    done = False
    score = 0

    while not done:
        env.render() # it doesn't work with colab
        # env.render(mode='rgb_array')
        action = env.action_space.sample() # random action
        n_state, reward, done, info = env.step(action) # pass random action
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()


ValueError: too many values to unpack (expected 4)

In [14]:
env.action_space # the action you can take in the environment.

Discrete(2)

Understanding The Environment

In [15]:
# 0-push cart to left, 1-push cart to the right
env.action_space.sample()

0

In [16]:
# [cart position, cart velocity, pole angle, pole angular velocity]
env.observation_space.sample()

array([ 4.629841  ,  0.42401126, -0.30284554, -0.8044943 ], dtype=float32)

In [17]:
# (4,) is from env.reset
env.observation_space

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

### 3. Train an RL Model

In [27]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose = 1)

Using cpu device


In [28]:
model.learn(total_timesteps=20000)

-----------------------------
| time/              |      |
|    fps             | 2549 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1704        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008285886 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00544    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.71        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0172     |
|    value_loss           | 57.7        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1b7f5f0a3d0>

### 4. Save and Reload Model

In [29]:
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')

In [30]:
print(PPO_path)

Training\Saved Models\PPO_model


In [31]:
model.save(PPO_path)

In [23]:
del model

In [33]:
load_model = PPO.load('Training\Saved Models\PPO_model', env=env)

### 5. Evaluation

In [34]:
from stable_baselines3.common.evaluation import evaluate_policy

In [35]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(500.0, 0.0)

In [36]:
env.close()

### 6. Test Model

In [37]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done:
        print('info', info)
        break

info [{'TimeLimit.truncated': True, 'terminal_observation': array([-0.2466684 , -0.05472602,  0.04111669,  0.3380593 ], dtype=float32)}]


In [38]:
env.close()

### 7. Viewing Logs in Tensorboard

In [39]:
training_log_path = os.path.join(log_path, 'PPO_3')

NameError: name 'log_path' is not defined

In [40]:
!tensorboard --logdir={training_log_path}

^C



### 8. Adding a callback to the training Stage

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

In [None]:
save_path = os.path.join('Training', 'Saved Models')
log_path = os.path.join('Training', 'Logs')

In [None]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])

In [None]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1)
eval_callback = EvalCallback(env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [None]:
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
model.learn(total_timesteps=20000, callback=eval_callback)

In [None]:
model_path = os.path.join('Training', 'Saved Models', 'best_model')
model = PPO.load(model_path, env=env)

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
env.close()

### 9. Changing Policies

In [None]:
net_arch=[dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]

In [None]:
model = PPO('MlpPolicy', env, verbose = 1, policy_kwargs={'net_arch': net_arch})

In [None]:
model.learn(total_timesteps=20000, callback=eval_callback)

### 10. Using an Alternate Algorithm

In [None]:
from stable_baselines3 import DQN

In [None]:
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
model.learn(total_timesteps=20000, callback=eval_callback)

In [None]:
dqn_path = os.path.join('Training', 'Saved Models', 'DQN_model')

In [None]:
model.save(dqn_path)

In [None]:
model = DQN.load(dqn_path, env=env)

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
env.close()

## Project 1 - Breakout

### 1. Import Dependencies

In [None]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

### 2. Test Environment

In [None]:
environment_name = "Breakout-v0"

In [None]:
env = gym.make(environment_name)

In [None]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
env.action_space.sample()

In [None]:
env.observation_space.sample()

### 3. Vectorise Environment and Train Model

In [None]:
env = make_atari_env('Breakout-v0', n_envs=4, seed=0)

In [None]:
env = VecFrameStack(env, n_stack=4)

In [None]:
log_path = os.path.join('Training', 'Logs')

In [None]:
model = A2C("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

In [None]:
model.learn(total_timesteps=400000)

### 4. Save and Reload Model

In [None]:
a2c_path = os.path.join('Training', 'Saved Models', 'A2C_model')

In [None]:
model.save(a2c_path)

In [None]:
del model

In [None]:
env = make_atari_env('Breakout-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)

In [None]:
model = A2C.load(a2c_path, env)

### 5. Evaluate and Test

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
env.close()

## Project 2 - Self Driving

### 1. Import Dependencies

In [None]:
#Install SWIG https://sourceforge.net/projects/swig/files/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip/download?use_mirror=ixpeering

In [None]:
!pip install gym[box2d] pyglet==1.3.2

In [None]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
import os

### 2. Test Environment


In [None]:
environment_name = "CarRacing-v0"

In [None]:
env = gym.make(environment_name)

In [None]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
env.close()

In [None]:
env.action_space.sample()

In [None]:
env.observation_space.sample()

### 3. Train Model

In [None]:
log_path = os.path.join('Training', 'Logs')

In [None]:
model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

In [None]:
model.learn(total_timesteps=40000)

### 4. Save Model

In [None]:
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_Driving_model')

In [None]:
model.save(ppo_path)

### 5. Evaluate and Test

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
env.close()

In [None]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
env.close()

## Project 3 - Custom Environment

https://sourceforge.net/projects/swig/files/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip/download?use_mirror=ixpeering


### 1. Import Dependencies

In [None]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

### 2. Types of Spaces

In [None]:
Discrete(3)

In [None]:
Box(0,1,shape=(3,3)).sample()

In [None]:
Box(0,255,shape=(3,3), dtype=int).sample()

In [None]:
Tuple((Discrete(2), Box(0,100, shape=(1,)))).sample()

In [None]:
Dict({'height':Discrete(2), "speed":Box(0,100, shape=(1,))}).sample()

In [None]:
MultiBinary(4).sample()

In [None]:
MultiDiscrete([5,2,2]).sample()

### 3. Building an Environment

In [None]:
class ShowerEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Temperature array
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # Set start temp
        self.state = 38 + random.randint(-3,3)
        # Set shower length
        self.shower_length = 60

    def step(self, action):
        # Apply action
        # 0 -1 = -1 temperature
        # 1 -1 = 0
        # 2 -1 = 1 temperature
        self.state += action -1
        # Reduce shower length by 1 second
        self.shower_length -= 1

        # Calculate reward
        if self.state >=37 and self.state <=39:
            reward =1
        else:
            reward = -1

        # Check if shower is done
        if self.shower_length <= 0:
            done = True
        else:
            done = False

        # Apply temperature noise
        #self.state += random.randint(-1,1)
        # Set placeholder for info
        info = {}

        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass

    def reset(self):
        # Reset shower temperature
        self.state = np.array([38 + random.randint(-3,3)]).astype(float)
        # Reset shower time
        self.shower_length = 60
        return self.state

In [None]:
env=ShowerEnv()

In [None]:
env.observation_space.sample()

In [None]:
env.reset()

In [None]:
from stable_baselines3.common.env_checker import check_env

In [None]:
check_env(env, warn=True)

### 4. Test Environment

In [None]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
env.close()

### 5. Train Model

In [None]:
log_path = os.path.join('Training', 'Logs')

In [None]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

In [None]:
model.learn(total_timesteps=400000)

### 6. Save Model

In [None]:
model.save('PPO')

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)