### Simple script with two goals
1.) Basic exploration of City Learn env

2.) Create a wrapper so that the City Learn env works with rllib: https://docs.ray.io/en/latest/index.html


#### Basic exploration of City Learn env

In [3]:
# copied from old version of local_evaluate.py
    # https://gitlab.aicrowd.com/aicrowd/challenges/citylearn-challenge-2022/citylearn-2022-starter-kit/-/blob/master/local_evaluation.py
# just seeing the env basics

import numpy as np
import time

"""
Please do not make changes to this file. 
This is only a reference script provided to allow you 
to do local evaluation. The evaluator **DOES NOT** 
use this script for orchestrating the evaluations. 
"""

from agents.orderenforcingwrapper import OrderEnforcingAgent
from citylearn.citylearn import CityLearnEnv

class Constants:
    """Settings for the local evaluation run in this cell."""
    # Number of completed episodes after which the evaluation loop stops.
    episodes = 3
    # Schema file passed to CityLearnEnv when the environment is created.
    schema_path = './data/citylearn_challenge_2022_phase_1/schema.json'

def action_space_to_dict(aspace):
    """Summarise a space as a plain dictionary (Box spaces only).

    The space must expose ``high``, ``low``, ``shape`` and ``dtype``.

    Returns:
        A dict with the keys "high", "low", "shape" and "dtype", where the
        dtype is stored in its string form and the other values are passed
        through unchanged.
    """
    summary = dict(high=aspace.high, low=aspace.low, shape=aspace.shape)
    summary["dtype"] = str(aspace.dtype)
    return summary

def env_reset(env):
    """Reset ``env`` and bundle what the agent receives at the start of an episode.

    Returns:
        A dict with the keys
        "action_space": one summary dict per entry of ``env.action_space``,
        "observation_space": one summary dict per entry of ``env.observation_space``,
        "building_info": the values of ``env.get_building_information()`` as a list,
        "observation": the value returned by ``env.reset()``.
    """
    initial_observations = env.reset()
    per_building_action_spaces = env.action_space
    per_building_obs_spaces = env.observation_space
    building_info = [*env.get_building_information().values()]
    return {
        "action_space": list(map(action_space_to_dict, per_building_action_spaces)),
        "observation_space": list(map(action_space_to_dict, per_building_obs_spaces)),
        "building_info": building_info,
        "observation": initial_observations,
    }

# def evaluate():
print("Starting local evaluation")

# Build the environment from the configured schema and create the agent under test.
env = CityLearnEnv(schema=Constants.schema_path)
agent = OrderEnforcingAgent()

# Reset the env and collect the spaces, building info and first observation for the agent.
obs_dict = env_reset(env)

# Accumulates wall-clock time spent inside agent calls only (env.step is not timed).
agent_time_elapsed = 0

step_start = time.perf_counter()
actions = agent.register_reset(obs_dict)
agent_time_elapsed += time.perf_counter()- step_start

episodes_completed = 0
num_steps = 0
interrupted = False
# One {"price_cost", "emmision_cost"} dict per completed episode.
episode_metrics = []

try:
    while True:

        ### This is only a reference script provided to allow you
        ### to do local evaluation. The evaluator **DOES NOT**
        ### use this script for orchestrating the evaluations.

        #observations, _, done, _ = env.step(actions)
        # edit: seeing what reward and info does, and adding a break
        observations, reward, done, info = env.step(actions)
        # NOTE: this unconditional break ends the loop after a single env step, so
        # everything below it inside the loop is currently unreachable. It is kept
        # on purpose: the next cell inspects observations/reward/done/info from
        # this one step. Remove the break to run the full evaluation.
        break
        if done:
            # Episode finished: record its metrics, then reset env and agent.
            episodes_completed += 1
            metrics_t = env.evaluate()
            metrics = {"price_cost": metrics_t[0], "emmision_cost": metrics_t[1]}
            if np.any(np.isnan(metrics_t)):
                raise ValueError("Episode metrics are nan, please contant organizers")
            episode_metrics.append(metrics)
            print(f"Episode complete: {episodes_completed} | Latest episode metrics: {metrics}", )

            obs_dict = env_reset(env)

            step_start = time.perf_counter()
            actions = agent.register_reset(obs_dict)
            agent_time_elapsed += time.perf_counter()- step_start
        else:
            # Episode still running: ask the agent for the next actions (timed).
            step_start = time.perf_counter()
            actions = agent.compute_action(observations)
            agent_time_elapsed += time.perf_counter()- step_start

        num_steps += 1
        # Progress report every 1000 steps.
        if num_steps % 1000 == 0:
            print(f"Num Steps: {num_steps}, Num episodes: {episodes_completed}")

        if episodes_completed >= Constants.episodes:
            break
except KeyboardInterrupt:
    # Allow stopping a long run by hand while still printing the summary below.
    print("========================= Stopping Evaluation =========================")
    interrupted = True

if not interrupted:
    print("=========================Completed=========================")

# Averages are only reported when at least one episode finished.
if len(episode_metrics) > 0:
    print("Average Price Cost:", np.mean([e['price_cost'] for e in episode_metrics]))
    print("Average Emmision Cost:", np.mean([e['emmision_cost'] for e in episode_metrics]))
print(f"Total time taken by agent: {agent_time_elapsed}s")


# evaluate()


Starting local evaluation
Total time taken by agent: 0.00045527098700404167s


In [6]:
# env basics: dump the reset dict and the values left behind by the single
# env.step(actions) call made in the evaluation cell above.
print("-----reset is ")
print(obs_dict)
for _label, _value in (
    ("-----actions look like ", actions),
    ("-----reward look like ", reward),
    ("-----done look like ", done),
    ("-----info look like ", info),
    ("observations look like ", observations),
):
    print(_label, _value)

-----reset is 
{'action_space': [{'high': array([1.], dtype=float32), 'low': array([-1.], dtype=float32), 'shape': (1,), 'dtype': 'float32'}, {'high': array([1.], dtype=float32), 'low': array([-1.], dtype=float32), 'shape': (1,), 'dtype': 'float32'}, {'high': array([1.], dtype=float32), 'low': array([-1.], dtype=float32), 'shape': (1,), 'dtype': 'float32'}, {'high': array([1.], dtype=float32), 'low': array([-1.], dtype=float32), 'shape': (1,), 'dtype': 'float32'}, {'high': array([1.], dtype=float32), 'low': array([-1.], dtype=float32), 'shape': (1,), 'dtype': 'float32'}], 'observation_space': [{'high': array([  13.       ,    8.       ,   25.       ,   33.2      ,
         33.2      ,   33.2      ,   33.2      ,  101.       ,
        101.       ,  101.       ,  101.       , 1018.       ,
       1018.       , 1018.       , 1018.       ,  954.       ,
        954.       ,  954.       ,  954.       ,    1.2817962,
          8.987483 ,  977.25     ,    2.       ,  966.23175  ,
          1.

##### Notes
* Reward is a list with one reward per building, whereas an RL env typically returns a single scalar reward.
* Done and info are as expected.
* Actions is a list of arrays, one for each building. For a single-agent approach these need to be combined into one expected action.
* It is unclear how best to use the information returned in the obs_dict on a reset call.
* Observation is a list of observations, with a similar observation for each building.
    * To do: scale the obs, understand the obs, find out if the obs space shifts between buildings, see what is actually different in the obs for each building, and see which parts of the obs space are shared between buildings.

#### Create rllib wrapper for City Learn

In [9]:
# ## Confirm that rllib installed correctly
# ## this works but commenting out because output is huge
import ray
from ray import tune
# from ray.rllib.algorithms import ppo

# tune.run(
#     "PPO",
#     stop={"timesteps_total": 1000},
#     config={
#         "env": "CartPole-v1",
#         "framework": "torch",
#         "num_gpus": 1,
#         "num_workers": 0,
#         "lr": 0.001,
#         #"output": "/tmp/out", 
#         "batch_mode": "complete_episodes"
#     },
# )

In [10]:
import gym

# Show the installed gym version (the value is rendered as the cell output).
gym.__version__

# CityLearn installs gym 0.24 but Ray requires gym < 0.22. Need to see if compatibility is an issue.

'0.21.0'

In [11]:
# Create a wrapper for City Learn that works with rllib, and test the wrapper

import gym
import ray
import numpy as np
from ray import tune
from ray.tune.registry import register_env
from citylearn.citylearn import CityLearnEnv


class SingleAgentCityLearnEnv(gym.Env):
    """Single-agent view of a multi-building CityLearn environment.

    The wrapped env exposes one Box observation space, one Box action space and
    one reward per building. This wrapper stacks the per-building observations
    into one 2-D Box, gathers the per-building actions into one 1-D Box and sums
    the per-building rewards into a single value.
    """

    def __init__(self, env):
        """Derive the combined observation and action spaces from ``env``.

        Assumes every building has the same observation length as the first
        building, and that each building contributes exactly one action
        dimension (only element 0 of each action bound is read).
        """
        self.env = env

        # env.observation_space is a list of Box spaces, one for each building.
        stacked_shape = (len(env.observation_space), env.observation_space[0].shape[0])
        obs_low = np.zeros(stacked_shape, dtype='float32')
        obs_high = np.zeros(stacked_shape, dtype='float32')
        for row, building_space in enumerate(env.observation_space):
            obs_low[row] = building_space.low
            obs_high[row] = building_space.high
        self.observation_space = gym.spaces.Box(
            low=obs_low,
            high=obs_high,
            shape=stacked_shape,
            dtype=np.float32,
        )

        # To do: it is unclear what the size of the action space will be in
        # other phases. env.action_space is a list of Box spaces; the bounds
        # start at [-1, 1] and are then overwritten per building.
        flat_shape = (len(env.action_space),)
        act_low = -np.ones(flat_shape, dtype='float32')
        act_high = np.ones(flat_shape, dtype='float32')
        for slot, building_space in enumerate(env.action_space):
            act_low[slot] = building_space.low[0]
            act_high[slot] = building_space.high[0]
        # https://github.com/openai/gym/blob/master/gym/spaces/box.py
        self.action_space = gym.spaces.Box(
            low=act_low,
            high=act_high,
            shape=flat_shape,
            dtype=np.float32,
        )

    def reset(self):
        """Reset the wrapped env and return the stacked observation.

        The wrapped env returns a list with one observation per building.
        """
        return self._shape_obs(self.env.reset())

    def step(self, action):
        """Apply the combined ``action`` and return ``(obs, reward, done, info)``.

        The action is split into the per-building list the wrapped env expects;
        ``done`` and ``info`` are passed through from the wrapped env unchanged.
        """
        per_building_actions = self._shape_action(action)
        obs_list, reward_list, done, info = self.env.step(per_building_actions)

        # The wrapped env returns one reward per building. Summing them yields a
        # single value but loses some useful per-building information.
        reward = self._shape_reward(reward_list)

        return self._shape_obs(obs_list), reward, done, info

    def _shape_obs(self, obs_list):
        """Combine the list of per-building observations into one float32 array."""
        # To do: scale the observation based on self.observation_space.
        return np.array(obs_list, dtype=np.float32)

    def _shape_action(self, action):
        """Split the combined action into the per-building list the wrapped env expects.

        The wrapped env takes a list with one single-element action per
        building, e.g. for five buildings something like
        ``[[0.091], [0.091], [0.091], [0.091], [0.091]]``.
        """
        n_actions = self.action_space.shape[0]
        return [[action[slot]] for slot in range(n_actions)]

    def _shape_reward(self, reward_list):
        """Collapse the per-building rewards into their sum."""
        return np.sum(reward_list)
        
    
# Build a CityLearn env from the phase 1 challenge schema and wrap it for single-agent use.
cl_env = CityLearnEnv(schema='citylearn_challenge_2022_phase_1')
sa_env = SingleAgentCityLearnEnv(cl_env)
# Run RLlib's environment check on the wrapped env.
ray.rllib.utils.check_env(sa_env)



In [None]:
from ray import tune
from ray.tune.registry import register_env
from citylearn.citylearn import CityLearnEnv


def env_creator_sa(args):
    """Create a single-agent CityLearn env; used as the factory given to ``register_env``.

    Args:
        args: Env config mapping supplied by the caller. An optional "schema"
            entry selects the CityLearn schema. When the entry is missing, or
            ``args`` is empty or None, the phase 1 challenge schema is used,
            which matches the previous hard-coded behaviour.

    Returns:
        A SingleAgentCityLearnEnv wrapping a newly created CityLearnEnv.
    """
    default_schema = 'citylearn_challenge_2022_phase_1'
    # `args or {}` keeps the call safe when no config is passed at all.
    schema = (args or {}).get("schema", default_schema)
    return SingleAgentCityLearnEnv(CityLearnEnv(schema=schema))

# Call the factory once directly with an empty config. Note that this rebinds
# the notebook-level name `env` created in the first code cell.
env = env_creator_sa({})
# Register the factory under the name "citylearn", which the config below refers to.
register_env("citylearn", env_creator_sa)

# PPO run on the registered env with a stop criterion of 1000 total timesteps.
tune.run(
    "PPO",
    stop={"timesteps_total": 1000},
    config={
        "env": "citylearn",#"CartPole-v1",
        "framework": "torch",
        "num_gpus": 0, #debug why this does not work when set to 1
        "num_workers": 0,
        "lr": 0.001,
        #"output": "/tmp/out", 
        "batch_mode": "complete_episodes"
    },
)

[2m[36m(PPOTrainer pid=9364)[0m 2022-07-31 04:07:49,820	INFO ppo.py:414 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=9364)[0m 2022-07-31 04:07:49,820	INFO trainer.py:903 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364




Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364


Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364


Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364


Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364


Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364


Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364




Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364


Trial name,status,loc
PPO_citylearn_dcdee_00000,RUNNING,10.0.0.63:9364
