In [23]:
### import dependencies
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

# import helpers
import numpy as np
import random
import os

# import stable baselines stuff
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [13]:
## types of spaces
# Discrete(n) holds the integers 0 .. n-1; draw one of them at random.
Discrete(n=3).sample()

2

In [16]:
# Build the space once and sample from it. A bare, non-final expression in a
# cell is evaluated and thrown away without being displayed, so constructing
# the Box a second time on its own line added nothing.
unit_box = Box(low=0, high=1, shape=(3, 3))
unit_box.sample()

array([[0.24552457, 0.21218556, 0.68271774],
       [0.1819015 , 0.16484764, 0.15580866],
       [0.53887105, 0.5947308 , 0.24188179]], dtype=float32)

In [17]:
# A Tuple space groups several sub-spaces into one composite space.
Tuple(spaces=(Discrete(2), Box(low=0, high=1, shape=(3,))))

Tuple(Discrete(2), Box(0.0, 1.0, (3,), float32))

In [20]:
# Build the Dict space once and sample from it; the earlier stand-alone
# construction was a discarded expression that produced no output.
height_speed_space = Dict({'height': Discrete(2), 'speed': Box(0, 100, shape=(3,))})
height_speed_space.sample()

{'height': 1, 'speed': array([90.51504, 27.16145, 65.57342], dtype=float32)}

In [22]:
# Build the space once and sample it; the stand-alone `MultiBinary(4)` line
# was evaluated and discarded without being displayed.
four_flags = MultiBinary(4)
four_flags.sample()

array([1, 1, 1, 1], dtype=int8)

In [41]:
# One independent discrete choice per entry of nvec (sizes 5, 2 and 1).
MultiDiscrete(nvec=[5, 2, 1]).sample()

array([2, 1, 0])

In [42]:
### Building an environment
# build an agent to give us the best shower possible
# The starting temperature is random
# Optimal temp: between 37 and 39 deg (inclusive)


In [None]:
class ShowerEnv(Env):
    """Toy environment in which the agent regulates a shower's temperature.

    Actions (``Discrete(3)``):
        0 lowers the temperature by one degree, 1 leaves it unchanged and
        2 raises it by one degree.

    Observation (``Box`` of shape ``(1,)``, float32):
        The current temperature, kept inside the declared [0, 100] bounds.

    Reward:
        +1 when the temperature after the action is between 37 and 39
        (inclusive), otherwise -1.

    Episode end:
        The episode ends after ``EPISODE_LENGTH`` steps.
    """

    EPISODE_LENGTH = 60          # number of steps in one shower
    BASE_TEMPERATURE = 38        # centre of the random starting temperature
    START_JITTER = 3             # start is BASE_TEMPERATURE +/- START_JITTER
    COMFORT_LOW, COMFORT_HIGH = 37, 39  # inclusive rewarded band

    def __init__(self):
        """Declare the action/observation spaces and set an initial state."""
        self.action_space = Discrete(3)
        # Explicit float32 bounds/shape so observations match the space dtype.
        self.observation_space = Box(low=0.0, high=100.0, shape=(1,), dtype=np.float32)
        self.state = self._sample_start_temperature()
        self.shower_length = self.EPISODE_LENGTH

    def _sample_start_temperature(self):
        """Return a fresh float32 start observation of shape (1,)."""
        # Use the environment's own RNG so that reset(seed=...) is honoured.
        offset = self.np_random.integers(-self.START_JITTER, self.START_JITTER + 1)
        return np.array([self.BASE_TEMPERATURE + offset], dtype=np.float32)

    def step(self, action):
        """Apply ``action`` and advance the shower by one time step.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always ``False``; the end of the shower is
            reported through ``terminated``.
        """
        # Build a NEW array instead of using ``+=``: in-place mutation would
        # silently change observations that were already returned to callers.
        shifted = self.state + (action - 1)
        # Keep the observation inside the bounds declared by the space.
        self.state = np.clip(
            shifted, self.observation_space.low, self.observation_space.high
        ).astype(np.float32)

        # decrease the shower time
        self.shower_length -= 1

        temperature = float(self.state[0])
        reward = 1 if self.COMFORT_LOW <= temperature <= self.COMFORT_HIGH else -1

        terminated = self.shower_length <= 0

        info = {}
        return self.state, reward, terminated, False, info

    def render(self):
        """Rendering is not implemented for this environment."""
        pass

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed forwarded to the base class so the start
                temperature is reproducible.
            options: Accepted for API compatibility; unused.
        """
        # Seeds ``self.np_random`` when a seed is supplied.
        super().reset(seed=seed)
        self.state = self._sample_start_temperature()
        self.shower_length = self.EPISODE_LENGTH
        info = {}
        return self.state, info

In [None]:
# Create one instance of the custom shower environment.
env = ShowerEnv()

In [None]:
# Display the observation space declared in ShowerEnv.__init__.
env.observation_space

In [None]:
# Display the action space declared in ShowerEnv.__init__.
env.action_space

In [None]:
# testing the environment out with a purely random policy
episodes = 5

for episode in range(1, episodes + 1):

    # reset() returns an (observation, info) pair, so unpack it; otherwise
    # `obs` would hold the whole tuple.
    obs, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        # An episode is over when it is either terminated or truncated.
        done = terminated or truncated
        score += reward
    print(f"Episode: {episode} Score: {score}")
env.close()

In [93]:
### training

# PPO agent with an MLP policy, trained directly on `env`; verbose=1 prints progress.
model = PPO(policy='MlpPolicy', env=env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [95]:
# Train for 40k environment steps; learn() returns the model itself.
model.learn(total_timesteps=40_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -25.4    |
| time/              |          |
|    fps             | 1228     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 60         |
|    ep_rew_mean          | -27.4      |
| time/                   |            |
|    fps                  | 810        |
|    iterations           | 2          |
|    time_elapsed         | 5          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01124642 |
|    clip_fraction        | 0.025      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.1       |
|    explained_variance   | 4.12e-05   |
|    learning_rate        | 0.0003     |
|   

<stable_baselines3.ppo.ppo.PPO at 0x70e353b01040>

In [96]:
# evaluate the model
# Returns (mean episode reward, std of episode reward) over 10 episodes.
evaluate_policy(model=model, env=env, n_eval_episodes=10, render=False)



(59.2, 0.9797958971132712)