## Import Dependencies ##

In [83]:
#Import GYM dependencies
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

#Import Helpers
import numpy as np
import random
import os

#Import stable baselines 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_checker import check_env

## Types of Spaces ##

In [11]:
# Draw one random element from a Discrete space of size 3 (a single integer).
Discrete(3).sample()

0

In [18]:
# Draw one random sample from a Box with bounds 0..1 and shape (3,).
Box(0,1, shape=(3,)).sample()

array([0.61207813, 0.73225003, 0.0434901 ], dtype=float32)

In [20]:
# A Tuple space combines sub-spaces; its sample holds one sample per sub-space, in order.
Tuple((Discrete(3), Box(0,1, shape=(3,)))).sample()

(1, array([0.91728324, 0.47266385, 0.53501207], dtype=float32))

In [21]:
# Build a Dict space keyed by name. Unlike the neighbouring cells this one does not
# call .sample(), so the cell displays the space itself rather than a drawn value.
Dict({'height':Discrete(2), "speed":Box(0,100,shape=(1,))})

Dict('height': Discrete(2), 'speed': Box(0.0, 100.0, (1,), float32))

In [27]:
# Draw one random sample from a MultiBinary space of length 4.
MultiBinary(4).sample()

array([0, 0, 0, 0], dtype=int8)

In [40]:
# Draw one random sample from a MultiDiscrete space with per-position sizes 5, 2 and 2.
MultiDiscrete([5,2,2]).sample()

array([4, 0, 1], dtype=int64)

## Building an Environment ##
- Build an agent to give us the best shower possible
- The starting temperature is randomised (38 ± 3 degrees)
- The ideal temperature is between 37 and 39 degrees

In [96]:
class ShowerEnv(Env):
    """Toy environment in which an agent regulates a shower's temperature.

    Action space: ``Discrete(3)``; actions 0, 1 and 2 change the temperature
    by -1, 0 and +1 respectively.
    Observation space: a single float32 temperature in ``[MIN_TEMP, MAX_TEMP]``.
    Reward: +1 for every step whose resulting temperature lies in
    ``[COMFORT_LOW, COMFORT_HIGH]``, otherwise -1.
    Episode length: ``SHOWER_LENGTH`` steps; the end of the shower is reported
    through ``terminated``.
    """

    # Bounds of the declared observation space.
    MIN_TEMP = 0
    MAX_TEMP = 100
    # Inclusive temperature band that earns a positive reward.
    COMFORT_LOW = 37
    COMFORT_HIGH = 39
    # Episodes start at START_TEMP plus an integer offset in [-START_JITTER, START_JITTER].
    START_TEMP = 38
    START_JITTER = 3
    # Number of steps in one episode.
    SHOWER_LENGTH = 60

    def __init__(self):
        self.action_space = Discrete(3)
        self.observation_space = Box(
            low=self.MIN_TEMP, high=self.MAX_TEMP, shape=(1,), dtype=np.float32
        )
        self.state = self._initial_temperature()
        self.shower_length = self.SHOWER_LENGTH

    def _initial_temperature(self):
        """Return a random integer start temperature around ``START_TEMP``.

        Uses the environment's own ``np_random`` generator (the one seeded by
        ``reset(seed=...)``) rather than the global ``random`` module, so that
        seeding the environment makes episodes reproducible.
        """
        # Generator.integers excludes the upper bound, hence the +1.
        offset = self.np_random.integers(-self.START_JITTER, self.START_JITTER + 1)
        return self.START_TEMP + int(offset)

    def _observation(self):
        """Return the current temperature as a float32 array of shape (1,)."""
        return np.array([self.state], dtype=np.float32)

    def step(self, action):
        """Apply one temperature adjustment and advance the shower by one step.

        Args:
            action: 0, 1 or 2; shifts the temperature by ``action - 1``.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        # Apply temperature adjustment; clip so that the observation never
        # leaves the bounds declared in observation_space.
        self.state = int(
            np.clip(self.state + int(action) - 1, self.MIN_TEMP, self.MAX_TEMP)
        )

        # Decrease shower time
        self.shower_length -= 1

        # Calculate reward
        if self.COMFORT_LOW <= self.state <= self.COMFORT_HIGH:
            reward = 1
        else:
            reward = -1

        # Check if the episode is done
        terminated = self.shower_length <= 0
        truncated = False  # No separate truncation condition in this simple environment

        info = {}

        return self._observation(), reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: optional seed forwarded to ``Env.reset``; it seeds the
                generator used to draw the start temperature.
            options: accepted for API compatibility; unused.
        """
        super().reset(seed=seed)
        self.state = self._initial_temperature()
        self.shower_length = self.SHOWER_LENGTH
        return self._observation(), {}

    def render(self):
        """No visualisation is implemented for this environment."""
        pass
        

In [113]:
# Create the shower environment and wrap it in SB3's Monitor in a single step.
env = Monitor(ShowerEnv())

In [74]:
# Random draw from the declared observation space (not an actual environment state).
env.observation_space.sample()

array([54.22644], dtype=float32)

In [75]:
# Random draw from the environment's action space.
env.action_space.sample()

2

In [100]:
# Start a fresh episode; the cell displays the (observation, info) pair returned by reset().
env.reset()

(array([37.], dtype=float32), {})

In [101]:
# Run Stable-Baselines3's environment checker on the custom environment.
# Note: `env` is the Monitor-wrapped ShowerEnv created above, not the bare class instance.
check_env(env)

## Test Environment ##

In [102]:
# Smoke-test the environment: play a few episodes with randomly sampled
# actions and print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    obs, info = env.reset()
    score = 0
    done = False

    while True:
        env.render()
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        score += reward

        # An episode ends on either signal.
        done = terminated or truncated
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-52
Episode:2 Score:-58
Episode:3 Score:2
Episode:4 Score:-60
Episode:5 Score:-46


## Train Model ##

In [103]:
# TensorBoard logs are written under Training/Logs, relative to the working directory.
log_path = os.path.join('Training', 'Logs')
model = PPO(
    policy='MlpPolicy',
    env=env,
    tensorboard_log=log_path,
    verbose=1,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [107]:
# Train the PPO agent for 40,000 environment steps; the cell displays the model
# object that learn() returns.
model.learn(total_timesteps=40000)

Logging to Training\Logs\PPO_37


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | 0.235    |
| time/              |          |
|    fps             | 1368     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 60           |
|    ep_rew_mean          | -0.176       |
| time/                   |              |
|    fps                  | 711          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0015372569 |
|    clip_fraction        | 0.0125       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.456       |
|    explained_variance   | 0.0192       |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x26f593f6110>

## Save Model ##

In [108]:
# Location (relative to the working directory) used below to save and reload the model.
shower_path = os.path.join('Training', 'Saved Models', 'PPO_Shower_Model')


In [109]:
# Persist the trained model to shower_path.
model.save(shower_path)

In [110]:
# Drop the in-memory model so the next cell demonstrates loading it back from disk.
del model

In [116]:
# Reload the saved model and attach it to the current environment.
model = PPO.load(shower_path, env)

Wrapping the env in a DummyVecEnv.


In [120]:
# Evaluate the reloaded policy over 10 episodes without rendering.
# NOTE(review): per the SB3 docs the displayed pair is (mean reward, std of reward) — confirm.
evaluate_policy(model, env, n_eval_episodes=10, render=False)

(0.0, 60.0)