## Import Dependencies

In [5]:
import os
import gym
import random
import numpy as np

from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common. evaluation import evaluate_policy

## Types of Spaces

In [7]:
# all the different spaces inside OpenAi

In [9]:
Discrete(3)

Discrete(3)

In [11]:
Discrete(3).sample()

2

In [12]:
Box(0,1,shape=(3,3)).sample()

array([[0.4917503 , 0.29081815, 0.91662395],
       [0.24472067, 0.17147069, 0.13561754],
       [0.48898572, 0.19548678, 0.3137968 ]], dtype=float32)

In [15]:
Box(0,255,shape=(3,3),dtype=int).sample()

array([[163, 127,  53],
       [233,  93, 235],
       [234,  88,  77]])

In [19]:
Tuple((Discrete(2),Box(0,100,shape=(1,1)))).sample()

(1, array([[0.1608658]], dtype=float32))

In [21]:
Dict({"height":Discrete(2),"speed":Box(0,100,shape=(1,))}).sample()



OrderedDict([('height', 1), ('speed', array([17.900782], dtype=float32))])

In [24]:
MultiBinary(4).sample()

array([1, 1, 0, 1], dtype=int8)

In [25]:
MultiDiscrete([5,2,2]).sample()

array([3, 1, 1])

## Building an Environment

In [26]:
"""class showerEnv(Env):
    def __init__(self):
        pass
    def step(self, action):
        pass
    def render(self):
        pass
    def reset(self):
        pass"""

'class showerEnv(Env):\n    def __init__(self):\n        pass\n    def step(self, action):\n        pass\n    def render(self):\n        pass\n    def reset(self):\n        pass'

In [30]:
class showerEnv(Env):
    
    def __init__(self):
        self.action_space=Discrete(3)
        self.observation_space=Box(low=0,high=100, shape=(1,))
        self.state=38+random.randint(-3,3)
        self.shower_length=60
        
    def step(self, action):
        self.state+=action-1
        self.shower_length-=1
        
        if self.state>=37 and self.state<=39:
            reward=1
        else:
            reward=-1
        
        if self.shower_length<=0:
            done=True
        else:
            done=False
            
        info={}
        
        return self.state, reward, done, info
    
    def render(self):
        pass
    
    def reset(self):
        self.state=np.array([38+random.randint(-3,3)]).astype(float)
        self.shower_length=60
        return self.state

In [31]:
env=showerEnv()

In [34]:
env.action_space.sample()

0

In [35]:
env.observation_space.sample()

array([14.922016], dtype=float32)

In [42]:
env.reset()

array([39.])

In [43]:
from stable_baselines3.common.env_checker import check_env

In [44]:
check_env(env, warn=True)

AssertionError: The observation returned by the `reset()` method does not match the given observation space

## Test Environment

In [45]:
episodes=5

for episode in range(1, episodes+1):
    state=env.reset()
    done=False
    score=0
    
    while not done:
        env.render()
        action=env.action_space.sample()
        n_state, reward, done, info=env.step(action)
        score+=reward
    print(f"Episode :{episode} Score : {score}")

env.close()

Episode :1 Score : -28
Episode :2 Score : -60
Episode :3 Score : -8
Episode :4 Score : 22
Episode :5 Score : -50


## Train Model

In [49]:
log_path=os.path.join("Training","Logs","CustomEnv")

In [47]:
model=PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [48]:
model.learn(total_timesteps=10000)

Logging to Training/Logs/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -29.6    |
| time/              |          |
|    fps             | 190      |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 2048     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 60        |
|    ep_rew_mean          | -30.9     |
| time/                   |           |
|    fps                  | 330       |
|    iterations           | 2         |
|    time_elapsed         | 12        |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0106878 |
|    clip_fraction        | 0.0816    |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.09     |
|    explained_variance   | -0.00026  |
|    learning_rate        | 0

<stable_baselines3.ppo.ppo.PPO at 0x12b04fa00>

## Save Model

In [50]:
model.save(log_path)

In [52]:
evaluate_policy(model, env, n_eval_episodes=10,render=False)

(-57.2, 2.2271057451320084)

In [53]:
# del model
# model= PPO.load(log_path,env)