Install Dependencies

In [None]:
%pip install stable-baselines3[extra]

In [1]:
import os
import numpy
import gymnasium as gym
import stable_baselines3
from stable_baselines3 import PPO #PPo is the algorithm we are gonna use here
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import pygame



Load Env

In [2]:
# Id of the environment to create; later cells reuse it to build fresh envs.
environment_name='CartPole-v1'
# Build the environment, passing render_mode="human" so it can be watched
# while the random-action episodes below are played.
env=gym.make(environment_name,render_mode="human")

In [3]:

# Baseline: play a few episodes with uniformly random actions and print the
# total reward collected in each one.
episodes=5
for episode in range(1,episodes+1):
    # Gymnasium's reset() returns an (observation, info) pair.
    state, info = env.reset()
    done=False
    score=0
    while not done:
        env.render()
        action=env.action_space.sample()
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        n_state, reward, terminated, truncated, info = env.step(action)
        # The episode is over on either signal: `terminated` (the task ended)
        # or `truncated` (the episode was cut off, e.g. by a time limit).
        # Looking only at `terminated` would keep stepping a truncated
        # episode, and unpacking into `any` would clobber the builtin.
        done = terminated or truncated
        score += reward
    print('Episode:{} Score{}'.format(episode,score))
env.close()

Episode:1 Score21.0
Episode:2 Score29.0
Episode:3 Score11.0
Episode:4 Score13.0
Episode:5 Score28.0


Understand the env
https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [4]:
# Inspect the set of actions the agent can choose from.
env.action_space

Discrete(2)

In [5]:
# Draw one random action from the action space.
env.action_space.sample()

1

In [6]:
# Inspect the observation space (its bounds, shape and dtype are shown below).
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [7]:
# Draw one random observation from the observation space.
env.observation_space.sample()

array([6.4249378e-01, 2.4406969e+38, 7.7392913e-02, 2.9447467e+38],
      dtype=float32)

Train the RL model

In [8]:
# Directory handed to the models below as `tensorboard_log`.
log_path=os.path.join('Training','Logs')
# Create it up front (a no-op when it already exists) so the notebook does not
# depend on a manual setup step.
os.makedirs(log_path, exist_ok=True)

In [9]:
# Display the log directory path built in the previous cell.
log_path

'Training\\Logs'

In [None]:
# Training setup: a single CartPole instance wrapped in a DummyVecEnv.
# The factory builds the environment itself, so it does not close over the
# `env` name that is being assigned on the same line.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# PPO agent with the 'MlpPolicy' policy; TensorBoard logs go under log_path.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

You might need to install TensorBoard first, for example with:
 conda install -c conda-forge tensorboard

In [None]:
# Train the PPO model for a total of 20000 timesteps.
model.learn(total_timesteps=20000)


Save and reload the model

In [10]:
# Saved models live under Training/Saved Models; this is the path of the
# PPO CartPole model inside that folder.
_models_dir = os.path.join('Training', 'Saved Models')
PPo_Path = os.path.join(_models_dir, 'PPO_Model_CartPole')

In [None]:
# Write the trained model to disk at PPo_Path.
model.save(PPo_Path)

In [None]:
# Drop the in-memory model so that the following cells reload it from disk.
del model

In [11]:
# Display the path the model was saved to.
PPo_Path

'Training\\Saved Models\\PPO_Model_CartPole'

In [12]:
# Restore the saved PPO model from PPo_Path and attach `env` to it.
# NOTE(review): the recorded output shows "code() argument 13 must be str,
# not int" messages - presumably the file was saved under a different Python
# version than the one loading it; confirm.
model=PPO.load(PPo_Path,env=env)

Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Continue training the reloaded model for another 1000 timesteps.
model.learn(total_timesteps=1000)
  

Evaluation

In [None]:
# Run 10 evaluation episodes of `model` on `env`; the cell displays whatever
# evaluate_policy returns.
evaluate_policy(model=model, env=env,n_eval_episodes=10)

In [None]:
# Release the environment once evaluation is done.
env.close()

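A mean reward of 200 or higher is treated as "solved" in this notebook. Note that CartPole-v1 episodes can run for up to 500 steps, and Gymnasium registers a reward threshold of 475 for it.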

Test

In [None]:
# Display the current `env` object.
env

In [13]:
# Fresh single-environment DummyVecEnv for testing the trained model. The
# factory creates the environment itself instead of closing over `env`.
env = DummyVecEnv([lambda: gym.make(environment_name)])

In [14]:
# Play a few episodes with the model choosing the actions and print each
# episode's accumulated reward. `env` is vectorized here, so `reward` and
# `done` come back as one-element arrays (hence scores printed like [500.]).
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    score = 0
    while True:
        env.render()
        # The model picks the action from the latest observation.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print('Episode:{} Score{}'.format(episode, score))
env.close()



Episode:1 Score[500.]
Episode:2 Score[500.]
Episode:3 Score[500.]
Episode:4 Score[500.]
Episode:5 Score[500.]


In [None]:
# Close the test environment (the rollout cell above already calls close()).
env.close()

In [15]:
# Start a new episode and show the initial observation. The vectorized env
# returns a batch, so the result has one row per environment.
# NOTE(review): env.close() was called just above; resetting afterwards
# evidently works here, but confirm it is intended.
obs=env.reset()
obs

array([[-0.02880716,  0.00031636,  0.04785826, -0.03926323]],
      dtype=float32)

model.predict(obs)
returns the action chosen by the trained model for the current observation, rather than a random sample from the action space.

In [16]:
# Ask the model for an action for the current observation; the second
# returned value is discarded.
action, _=model.predict(obs)
action

array([1], dtype=int64)

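Stepping the environment with that action returns the next observation, the reward, the done flag and the info dict: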

In [17]:
# Advance the vectorized env by one step; the displayed tuple is
# (observations, rewards, dones, infos).
env.step(action)

(array([[-0.02880083,  0.19472052,  0.04707299, -0.31647062]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{'TimeLimit.truncated': False}])

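The reward `array([1.])` is the +1 that CartPole gives for every step taken; together with `array([False])` for done, it means the episode is still running and the pole has not fallen.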

Viewing Logs in TensorBoard

In [18]:
# Folder of one specific training run inside the log directory.
# NOTE(review): 'PPO_2' is hard-coded; the run folder name presumably changes
# with each training run (later output logs to PPO_4 and PPO_5) - confirm it
# points at the run you want to view.
training_log_path=os.path.join(log_path,'PPO_2')

In [19]:
# Display the run folder that TensorBoard will be pointed at.
training_log_path

'Training\\Logs\\PPO_2'

In [None]:
!tensorboard --logdir={training_log_path} # if you wanna run from the notebook

First run `conda install -c conda-forge werkzeug`.
Then open a command prompt in the log directory you want to inspect, make sure the conda environment is activated, run `tensorboard --logdir=.`, and open the link it prints to view the logs.

Applying CallBacks (reward threshold)

In [25]:
from stable_baselines3.common.callbacks import EvalCallback,StopTrainingOnRewardThreshold

In [26]:
# Directory passed to EvalCallback below as best_model_save_path.
save_path=os.path.join('Training','Saved Models')

In [27]:
# Fresh PPO model with the 'MlpPolicy' policy for the callback experiment;
# TensorBoard logs go under log_path.
model=PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cpu device


In [29]:
# Stops training once an evaluation's mean reward is above the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate on a dedicated environment rather than the training `env`, so the
# resets and steps performed during evaluation do not disturb the rollout the
# model is collecting for training.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Evaluation is configured with eval_freq=10000; the best model is written
# under save_path, and stop_callback is invoked whenever a new best mean
# reward is found.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)

In [30]:
# Train for up to 20000 timesteps; eval_callback may end the run earlier, as
# the recorded output shows ("Stopping training because the mean reward ...").
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 1582 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 951         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008919975 |
|    clip_fraction        | 0.109       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -1.79e-05   |
|    learning_rate        | 0.0003      |
|    loss                 | 6.15        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0169     |
|    value_loss           | 52.3        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=355.80 +/- 130.07
Episode length: 355.80 +/- 130.07
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 356         |
|    mean_reward          | 356         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.010408685 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.597      |
|    explained_variance   | 0.278       |
|    learning_rate        | 0.0003      |
|    loss                 | 25.6        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0213     |
|    value_loss           | 64.3        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 355.80  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x1a270b48210>

Change Policy

In [33]:
# Custom network layout: four hidden layers of 128 units each, listed
# separately under the 'pi' and 'vf' keys.
_hidden_layers = [128] * 4
net_arch = {'pi': list(_hidden_layers), 'vf': list(_hidden_layers)}

In [34]:
# PPO model whose policy receives the custom net_arch through policy_kwargs.
model=PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path,policy_kwargs={'net_arch':net_arch})

Using cpu device


In [35]:
# Train the custom-architecture model with the same early-stopping callback.
# NOTE(review): this reuses the eval_callback instance that already ran with
# the previous model; presumably its best-reward tracking carries over between
# runs - confirm, or build a fresh callback for each model.
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 804  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 521         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015644675 |
|    clip_fraction        | 0.236       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.0146      |
|    learning_rate        | 0.0003      |
|    loss                 | 1.79        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0251     |
|    value_loss           | 17.9        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=377.00 +/- 125.81
Episode length: 377.00 +/- 125.81
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 377         |
|    mean_reward          | 377         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.010646982 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.562      |
|    explained_variance   | 0.446       |
|    learning_rate        | 0.0003      |
|    loss                 | 16.8        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0166     |
|    value_loss           | 43.9        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 377.00  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x1a270b16d50>

Using alternative Algorithm

In [36]:
from stable_baselines3 import DQN

In [37]:
# Swap PPO for DQN; the constructor arguments match the first PPO model.
model=DQN('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cpu device


In [38]:
# Train the DQN model for 20000 timesteps (no callback is passed this time).
model.learn(total_timesteps=20000)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.952    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8338     |
|    time_elapsed     | 0        |
|    total_timesteps  | 100      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.92     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8905     |
|    time_elapsed     | 0        |
|    total_timesteps  | 169      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.884    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 9435     |
|    time_elapsed     | 0        |
|    total_timesteps  | 245      |
----------------------------------
------------------------

<stable_baselines3.dqn.dqn.DQN at 0x1a270bc33d0>

In [39]:
# Displays the DQN.load method without calling it.
# NOTE(review): presumably meant as a pointer to reloading a saved DQN model
# the same way PPO.load was used earlier - confirm or remove.
DQN.load

<bound method BaseAlgorithm.load of <class 'stable_baselines3.dqn.dqn.DQN'>>