In [1]:
import numpy as np
import time
import os

import FleetRL
from FleetRL.fleet_env.fleet_environment import FleetEnv

from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import VecNormalize, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, ProgressBarCallback

from pink import PinkNoiseDist, PinkActionNoise

In [None]:
# Free-text tag for this experiment; the next cell embeds it in the names of
# the output directories it creates.
run_name: str = "td3_full_vecnorm_clip5"

In [None]:
# Whole-second Unix timestamp, used to keep the artefacts of separate runs apart.
time_now = int(time.time())
trained_agents_dir = f"./trained/vec_TD3-{time_now}-{run_name}"
logs_dir = f"./logs/vec_TD3-{time_now}-{run_name}"

# exist_ok=True creates the directory (and any missing parents) in one call and
# is a no-op when it already exists. This avoids the check-then-create race of
# `if not os.path.exists(...)` and makes the cell safe to re-run.
os.makedirs(trained_agents_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

In [2]:
# Number of training environments; SubprocVecEnv runs each one in its own process.
n_cpu = 4

# Vectorised training environment. The keyword arguments are forwarded to the
# FleetEnv constructor unchanged (their exact meaning is defined by FleetRL).
train_vec_env = make_vec_env(
    FleetEnv,
    n_envs=n_cpu,
    vec_env_cls=SubprocVecEnv,
    env_kwargs=dict(
        schedule_name="lmd_sched_single.csv",
        building_name="load_lmd.csv",
        include_building=True,
        include_pv=True,
        static_time_picker=False,
        deg_emp=False,
        include_price=True,
        ignore_price_reward=False,
        ignore_invalid_penalty=False,
        ignore_overcharging_penalty=False,
        ignore_overloading_penalty=False,
        episode_length=48,
        normalize_in_env=False,  # normalisation is done by the VecNormalize wrapper instead
        verbose=0,
    ),
)

In [3]:
# Wrap the training envs so that observations and rewards are normalised with
# running statistics (training=True keeps those statistics updating);
# normalised rewards are clipped at an absolute value of 5.
train_norm_vec_env = VecNormalize(
    venv=train_vec_env,
    training=True,
    norm_obs=True,
    norm_reward=True,
    clip_reward=5,
)

In [4]:
# Single-environment vectorised env used for evaluation. The FleetEnv settings
# are the same as in the training-env cell, except that verbose is 1 here.
eval_vec_env = make_vec_env(
    FleetEnv,
    n_envs=1,
    vec_env_cls=SubprocVecEnv,
    env_kwargs=dict(
        schedule_name="lmd_sched_single.csv",
        building_name="load_lmd.csv",
        include_building=True,
        include_pv=True,
        static_time_picker=False,
        deg_emp=False,
        include_price=True,
        ignore_price_reward=False,
        ignore_invalid_penalty=False,
        ignore_overcharging_penalty=False,
        ignore_overloading_penalty=False,
        episode_length=48,
        normalize_in_env=False,
        verbose=1,
    ),
)

In [5]:
# Normalisation wrapper for the evaluation env.
#
# training=False: evaluation rollouts must not update the running mean/variance;
#   EvalCallback copies the current statistics over from the training env before
#   each evaluation, so the eval wrapper only needs to apply them.
# norm_reward=False: evaluation should see the environment's true rewards rather
#   than normalised and clipped ones. clip_reward is kept only for symmetry with
#   the training wrapper; it has no effect while norm_reward is False.
eval_norm_vec_env = VecNormalize(venv=eval_vec_env,
                                 norm_obs=True,
                                 norm_reward=False,
                                 training=False,
                                 clip_reward=5)

In [11]:
# Periodic evaluation during training.
#
# eval_freq is counted in calls to the callback, i.e. once per vectorised step,
# so the target of 5000 environment steps is divided by the number of parallel
# training envs; max(..., 1) guards against a zero frequency.
#
# best_model_save_path / log_path put the run-specific directories created
# earlier in the notebook to use: the best-scoring model is written to
# trained_agents_dir and the evaluation history (evaluations.npz) to logs_dir,
# instead of both being discarded.
eval_callback = EvalCallback(eval_env=eval_norm_vec_env,
                             warn=True,
                             verbose=1,
                             deterministic=True,
                             eval_freq=max(5000 // n_cpu, 1),
                             n_eval_episodes=5,
                             render=False,
                             best_model_save_path=trained_agents_dir,
                             log_path=logs_dir,
                             )

In [12]:
# Exploration noise for the off-policy agent: pink (temporally correlated)
# action noise, one dimension per action component.
param_noise = None  # not used in this cell; kept for any later cell that reads it

noise_scale = 0.3
seq_len = 48 * 4  # presumably episode_length (48) x 4 steps per unit - TODO confirm
n_actions = train_norm_vec_env.action_space.shape[-1]

action_noise = PinkActionNoise(noise_scale, seq_len, n_actions)

In [13]:
# TD3 agent trained on the normalised, vectorised training environment.
# The policy is updated after every environment step (train_freq) once
# learning_starts transitions have been collected.
model = TD3(
    policy="MlpPolicy",
    env=train_norm_vec_env,
    # optimisation
    learning_rate=0.005,
    batch_size=128,
    gamma=0.99,
    # replay / schedule
    buffer_size=500000,
    learning_starts=20000,
    train_freq=(1, "step"),
    # exploration
    action_noise=action_noise,
    # logging
    tensorboard_log="./tb_log",
    verbose=0,
)


In [None]:
# Run training with periodic evaluation through eval_callback.
# NOTE(review): the model is configured with learning_starts=20000, so a budget
# of 100 timesteps only collects transitions and never performs a gradient
# update - presumably a smoke test; raise total_timesteps for a real run.
model.learn(total_timesteps=100, callback=eval_callback)

Invalid action, penalty given.
Timestep: 2020-02-08 10:15:00
Price: 0.02763 €/kWh
SOC: [0.], Time left: [0.] hours
Action taken: [0.1966399]
Actual charging energy: 0 kWh
Charging cost/revenue: 0.0 €
SoH: [1.]
--------------------------
Grid connection has been overloaded: 16.191453600000006 kW.
Reward signal: -0.9214378187311678
---------


Invalid action, penalty given.
Timestep: 2020-02-08 10:30:00
Price: 0.02763 €/kWh
SOC: [0.0], Time left: [0.] hours
Action taken: [0.19396913]
Actual charging energy: 0 kWh
Charging cost/revenue: 0.0 €
SoH: [1.]
--------------------------
Grid connection has been overloaded: 16.191453600000006 kW.
Reward signal: -0.9213334962426513
---------


Invalid action, penalty given.
Timestep: 2020-02-08 10:45:00
Price: 0.02763 €/kWh
SOC: [0.0], Time left: [0.] hours
Action taken: [0.19138634]
Actual charging energy: 0 kWh
Charging cost/revenue: 0.0 €
SoH: [1.]
--------------------------
Grid connection has been overloaded: 16.191453600000006 kW.
Reward sign