# Training RL agents with ErNESTO-gym

This notebook provides a simple template to train and save models of well-known RL agents.<br>
Refer to the `testing.ipynb` notebook to test these models and compare them with deterministic strategies.

In [1]:
# Enable the autoreload extension (mode 2) so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline
# Move two directories up so the relative "ernestogym/..." paths used later resolve.
# NOTE(review): this relative change is not idempotent -- re-running this cell climbs
# two more levels. Run it once per kernel session.
%cd ../..

C:\Users\samue\PycharmProjects\ErNESTO-gym


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from tqdm import tqdm
from gymnasium.utils.env_checker import check_env

from ernestogym.envs.single_agent.env_different_actions import MicroGridEnvDifferentAction
from ernestogym.envs.single_agent.utils import parameter_generator

# Global seaborn styling for every figure produced in this notebook.
sns.set_style('darkgrid')
# Palette captured at this point, i.e. before the sns.set() call below.
# It is not referenced by any other cell visible in this notebook.
plot_colors = sns.color_palette()
# Enlarge fonts for readability.
# NOTE(review): sns.set() may re-apply seaborn's default theme on top of the
# set_style() call above -- confirm the statement order is intentional.
sns.set(font_scale=1.5)

In [3]:
# Reward terms drawn by the plotting helpers: (key inside res[reward_type], y-axis label).
_REWARD_TERMS = (('r_trad', 'R_trad'), ('r_op', 'R_op'), ('r_clip', 'R_clip'))


def _plot_reward_terms(timestamps, terms: dict, sampling_rate: int, transform=None):
    """Draw one stacked subplot per reward term, down-sampled by `sampling_rate`.

    Args:
        timestamps: Sliceable sequence of x-axis values.
        terms: Mapping holding the sliceable series 'r_trad', 'r_op' and 'r_clip'.
        sampling_rate: Slice step; every `sampling_rate`-th point is plotted.
        transform: Optional callable applied to each series before plotting
            (e.g. `np.cumsum`); `None` plots the raw series.

    Raises:
        KeyError: If one of the expected reward terms is missing from `terms`.
    """
    # Resolve every series before creating the figure, so that a missing key
    # does not leave an empty figure behind.
    series = []
    for key, label in _REWARD_TERMS:
        values = terms[key]
        series.append((label, values if transform is None else transform(values)))

    _fig, axes = plt.subplots(len(series), 1, figsize=(12, 9), tight_layout=True)
    for ax, (label, values) in zip(axes, series):
        ax.plot(timestamps[::sampling_rate], values[::sampling_rate])
        ax.set(xlabel='Samples', ylabel=label)


def plot_rewards(timestamps: list, res: dict, reward_type='weighted_reward', sampling_rate=10):
    """Plot the per-step reward terms stored in `res[reward_type]`.

    Three vertically stacked subplots are drawn (R_trad, R_op, R_clip), each
    showing every `sampling_rate`-th sample against `timestamps`.

    Args:
        timestamps: Sliceable sequence of x-axis values.
        res: Mapping from reward type to a mapping with the series
            'r_trad', 'r_op' and 'r_clip'.
        reward_type: Key of `res` to plot.
        sampling_rate: Slice step used to thin out the plotted points.
    """
    _plot_reward_terms(timestamps, res[reward_type], sampling_rate)


def plot_cum_rewards(timestamps: list, res: dict, reward_type='weighted_reward', sampling_rate=10):
    """Plot the cumulative sum of the reward terms stored in `res[reward_type]`.

    Same layout as `plot_rewards`, but each series is passed through
    `np.cumsum` before being down-sampled and drawn.

    Args:
        timestamps: Sliceable sequence of x-axis values.
        res: Mapping from reward type to a mapping with the series
            'r_trad', 'r_op' and 'r_clip'.
        reward_type: Key of `res` to plot.
        sampling_rate: Slice step used to thin out the plotted points.
    """
    _plot_reward_terms(timestamps, res[reward_type], sampling_rate, transform=np.cumsum)

## Environment Information

In [4]:
# Battery pack description plus its thermal and aging sub-models (relative paths).
pack_options = "ernestogym/ernesto/data/battery/pack.yaml"
r2c = "ernestogym/ernesto/data/battery/models/thermal/r2c_thermal_pack.yaml"
bolun = "ernestogym/ernesto/data/battery/models/aging/bolun_pack.yaml"

# Electrical model: the "fading" file is active; the other one is kept as an alternative.
# ecm = "ernestogym/ernesto/data/battery/models/electrical/thevenin_pack.yaml"
ecm = "ernestogym/ernesto/data/battery/models/electrical/thevenin_fading_pack.yaml"

# World options: the "fading" file is active; the "deg" one is kept as an alternative.
# world = "ernestogym/envs/single_agent/world_deg.yaml"
world = "ernestogym/envs/single_agent/world_fading.yaml"

# Bundle all configuration files into the settings object consumed by the environment.
params = parameter_generator(
    world_options=world,
    battery_options=pack_options,
    electrical_model=ecm,
    thermal_model=r2c,
    aging_model=bolun,
    use_reward_normalization=True,
)

In [5]:
# Build the micro-grid environment from the generated settings
env = MicroGridEnvDifferentAction(settings=params)

# Report the observation/action spaces: (label, value) pairs, printed in order
space_report = [
    ('Size of State Space: ', env.observation_space.shape),
    ('Observation Space: ', env.spaces.keys()),
    ('Size of Action Space: ', env.action_space.shape),
    ('Min action: ', env.action_space.low),
    ('Max action: ', env.action_space.high),
    ('Sample State: ', env.observation_space.sample()),
    ('Sample Action: ', env.action_space.sample()),
]
for label, value in space_report:
    print(label, value)

Size of State Space:  (10,)
Observation Space:  odict_keys(['temperature', 'soc', 'demand', 'generation', 'ask', 'bid', 'sin_day_of_year', 'cos_day_of_year', 'sin_seconds_of_day', 'cos_seconds_of_day'])
Size of Action Space:  (1,)
Min action:  [-2160.]
Max action:  [2160.]
Sample State:  [ 3.6537601e+02  5.0020391e-01  1.1295903e+00  6.2193835e-01
  2.3328312e-02  1.2579015e+00 -8.9241490e-02  4.4242114e-01
 -4.0517825e-01 -6.2309557e-01]
Sample Action:  [-314.18103]


## Experiment settings

In [6]:
# Step budget used below as both the training length and the checkpoint frequency.
# Alternative kept for reference: num_steps = len(env.demand)
num_steps = 100_000

## PPO Agent

In [7]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.callbacks import CheckpointCallback

In [8]:
# Checkpointing for the PPO run. save_freq is set to num_steps (the whole training
# budget), not 1000, so a single checkpoint is expected at the end of the run
# (see the SB3 CheckpointCallback docs for the exact counting semantics).
checkpoint_callback = CheckpointCallback(
    name_prefix="ppo_different_action",
    save_path="./examples/single_agent/models/",
    save_freq=num_steps,
)

In [9]:
# Fresh PPO agent with an MLP policy on the micro-grid environment.
# NOTE(review): no `seed` is passed here, so runs are not reproducible -- consider fixing one.
model = PPO(
    MlpPolicy,
    env,
    gamma=0.9,
    verbose=False,
)

# Train for the configured step budget, with the checkpoint callback attached.
model.learn(
    total_timesteps=num_steps,
    callback=[checkpoint_callback],
    reset_num_timesteps=False,
    progress_bar=True,
)

profile:  52


<stable_baselines3.ppo.ppo.PPO at 0x1935ce0cef0>

## A2C Agent

In [10]:
from stable_baselines3 import A2C
# Import the policy alias from the A2C package (not from `ppo`) so the A2C
# section is self-consistent with the algorithm it trains.
from stable_baselines3.a2c import MlpPolicy
from stable_baselines3.common.callbacks import CheckpointCallback

In [11]:
# Checkpointing for the A2C run. save_freq is set to num_steps (the whole training
# budget), not 1000, so a single checkpoint is expected at the end of the run
# (see the SB3 CheckpointCallback docs for the exact counting semantics).
checkpoint_callback = CheckpointCallback(
    name_prefix="a2c_different_action",
    save_path="./examples/single_agent/models/",
    save_freq=num_steps,
)

In [12]:
# Fresh A2C agent with an MLP policy on the micro-grid environment.
# This rebinds `model`, replacing the agent created in the PPO section.
# NOTE(review): no `seed` is passed here, so runs are not reproducible -- consider fixing one.
model = A2C(
    MlpPolicy,
    env,
    gamma=0.9,
    verbose=False,
)

# Train for the configured step budget, with the checkpoint callback attached.
model.learn(
    total_timesteps=num_steps,
    callback=[checkpoint_callback],
    reset_num_timesteps=False,
    progress_bar=True,
)

profile:  5


<stable_baselines3.a2c.a2c.A2C at 0x19394eed130>