# Project by jack12

## 1. Import Dependencies

In [1]:
import os
import gym
from stable_baselines3 import PPO #algorithm
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## 2. Load Environment

In [2]:
env_name = 'CartPole-v1'
env = gym.make(env_name)

### Understanding the Environement

In [19]:
episodes = 3
for epi in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample() #random action among the action
        n_state, reward, done, info = env.step(action)
        score +=reward
#         print('score:{} done:{}'.format(score, done))
    print('Episode:{} Score:{}'.format(epi, score))
env.close()

TypeError: 'int' object is not subscriptable

In [4]:
env.close()

In [5]:
env.reset()

# output:>
# array([ 0.02893838, -0.00880777,  0.02856559, -0.04805397], dtype=float32)

array([-0.00322546, -0.03586276,  0.02441939,  0.04619027], dtype=float32)

In [6]:
env.observation_space
#  Box(4)
#  cart position
#  cart velocity
#  pole angle
#  pole angular velocity

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [7]:
env.step(1)
# (array([ 0.02876222,  0.18589318,  0.02760451, -0.3315891 ], dtype=float32),
#  1.0,
#  False, 
#  {}) 

(array([-0.00394271,  0.15890068,  0.02534319, -0.2386892 ], dtype=float32),
 1.0,
 False,
 {})

In [8]:
env.action_space
# output :>
#     Discrete(2)

Discrete(2)

## 3. Train the model

In [7]:
# save our tensorboard losgs 
log_path = os.path.join('Training', 'Logs')

In [10]:
log_path

'Training/Logs'

In [8]:
env = gym.make(env_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [12]:
# PPO??

model.learn(total_timesteps=10000)

Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 2266 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1547        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008348612 |
|    clip_fraction        | 0.0995      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00148    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.1         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0167     |
|    value_loss           | 53.1        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x7f764ae98bb0>

## Save and reload Model

In [13]:
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')

In [14]:
model.save(PPO_Path)

In [18]:
del model

In [20]:
model = PPO.load(PPO_Path, env=env)

In [21]:
model.learn(total_timesteps=10000)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 2402 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1553         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0056019505 |
|    clip_fraction        | 0.0404       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.583       |
|    explained_variance   | 0.615        |
|    learning_rate        | 0.0003       |
|    loss                 | 10.1         |
|    n_updates            | 60           |
|    policy_gradient_loss | -0.00838     |
|    value_loss           | 47.1         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f75bdd4df10>

## Evalutaion 

In [22]:
evaluate_policy(model, env, n_eval_episodes= 10 , render=True)



(500.0, 0.0)

In [23]:
env.close()

## Testing Model

In [23]:
episodes = 5 
for epi in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action, _ = model.predict(obs) #action from trained model
        obs, reward, done, info = env.step(action)
        score +=reward
#         print('score:{} done:{}'.format(score, done))
    print('Episode:{} Score:{}'.format(epi, score))
env.close()

Episode:1 Score:[500.]
Episode:2 Score:[500.]
Episode:3 Score:[500.]
Episode:4 Score:[500.]
Episode:5 Score:[500.]


In [27]:
model.predict??

## Veiwing Logs in tensorboard

In [None]:
# tensorboard  --logdir=tensorboard  --logdir=.

## Adding a Calbacks 

In [3]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [5]:
save_path = os.path.join('Training', 'Saved Models')

In [21]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)
eval_callback = EvalCallback(env, 
                            callback_on_new_best=stop_callback, 
                            eval_freq=10000,
                            best_model_save_path=save_path,
                            verbose=1)

In [9]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [11]:
model.learn(total_timesteps=10000, callback=eval_callback)

Logging to Training/Logs/PPO_6
-----------------------------
| time/              |      |
|    fps             | 2388 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1596        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009448366 |
|    clip_fraction        | 0.0575      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.668      |
|    explained_variance   | 0.0995      |
|    learning_rate        | 0.0003      |
|    loss                 | 13.3        |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 32.5        |
-----------------------------------------
---



Eval num_timesteps=7952, episode_reward=356.60 +/- 138.08
Episode length: 356.60 +/- 138.08
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 357         |
|    mean_reward          | 357         |
| time/                   |             |
|    total_timesteps      | 7952        |
| train/                  |             |
|    approx_kl            | 0.008099372 |
|    clip_fraction        | 0.072       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.603      |
|    explained_variance   | 0.343       |
|    learning_rate        | 0.0003      |
|    loss                 | 26          |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0161     |
|    value_loss           | 60.6        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 356.60  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x7fb0b2d9cf70>

## changing policies

In [14]:
net_arch = [dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]

In [15]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, b={'net_arch':net_arch})

Using cpu device


In [22]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_9
-----------------------------
| time/              |      |
|    fps             | 1989 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1316         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0060782903 |
|    clip_fraction        | 0.0863       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.492       |
|    explained_variance   | 0.982        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.322        |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.00305     |
|    value_loss           | 2.88         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7fb0b0122ca0>

## ALternate ALgorithm

In [24]:
from stable_baselines3 import DQN

In [26]:
dqn_model = DQN('MlpPolicy', env, verbose=1 , tensorboard_log=log_path)

Using cpu device


In [27]:
dqn_model.learn(total_timesteps=10000)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.949    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2322     |
|    time_elapsed     | 0        |
|    total_timesteps  | 54       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.852    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3791     |
|    time_elapsed     | 0        |
|    total_timesteps  | 156      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.77     |
| time/               |          |
|    episodes         | 12       |
|    fps              | 4681     |
|    time_elapsed     | 0        |
|    total_timesteps  | 242      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 8892     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2433     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 8944     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2525     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 8989     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2605     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 9555     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4940     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 9575     |
|    time_elapsed     | 0        |
|    total_timesteps  | 5044     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 9602     |
|    time_elapsed     | 0        |
|    total_timesteps  | 5207     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 9611     |
|    time_elapsed     | 0        |
|    total_timesteps  | 7287     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 9609     |
|    time_elapsed     | 0        |
|    total_timesteps  | 7374     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 9585     |
|    time_elapsed     | 0        |
|    total_timesteps  | 7474     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 9374     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9657     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 9352     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9728     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 9356     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9857     |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x7fb07c39a2e0>

In [None]:
# DQN.load()
