In [1]:

# import random
# random.seed(42)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
from tqdm import tqdm
import time
import tensorflow as tf
import gym
from stable_baselines3 import PPO

with open('/Users/faymajidelhassan/Downloads/ASAC_2023/observations.pickle', 'rb') as handle:
    obs = pickle.load(handle)
with open('/Users/faymajidelhassan/Downloads/ASAC_2023/actions.pickle', 'rb') as handle:
    actions = pickle.load(handle)

In [2]:


class ASACRLEnv(gym.Env):
    def __init__(self, observations, actions, action_parameter, action_space):
        """initialize action space, observation space & load data"""
        self.action_parameter = action_parameter
        self.action_space = gym.spaces.Discrete(len(action_space))  # Use Discrete action space
        self.observations = observations
        self.actions = actions
        self.index = 0
        self.teamindex = 0
        
        # Calculate observation space bounds
        obs_min = np.min(self.observations[self.teamindex], axis=0)
        obs_max = np.max(self.observations[self.teamindex], axis=0)

        # Ensure that obs_min and obs_max are numpy arrays with the correct data type
        obs_min = np.array(obs_min, dtype=np.float32)
        obs_max = np.array(obs_max, dtype=np.float32)

        self.observation_space = gym.spaces.Box(
            low=obs_min,
            high=obs_max,
            dtype=np.float32
        )

        self.curr_obs = self.observations[self.teamindex].iloc[self.index]
        self.next_obs = self.curr_obs = self.observations[self.teamindex].iloc[self.index + 1]
        self.interval = action_space[1] - action_space[0]

    def step(self, action):
        reward = self.rewardfunc(action)
        self.index += 1
        cur_obs = self.curr_obs
        next_obs = self.next_obs
        done = False  # Set done to False by default
        if self.index >= len(self.observations[self.teamindex]) - 2:
            self.reset()
            done = True  # Set done to True when the episode is finished
        self.curr_obs = self.observations[self.teamindex].iloc[self.index]
        self.next_obs = self.curr_obs = self.observations[self.teamindex].iloc[self.index + 1]
        return self.curr_obs, reward, done, {}


    def reset(self):
        """
        set index to 0 and increment team index by 1 if greater than 4 go back to 0
        """
        self.teamindex += 1
        if self.teamindex >= 5:
            self.teamindex = 0
        self.index = 0
        self.curr_obs = self.observations[self.teamindex].iloc[self.index]
        self.next_obs = self.curr_obs = self.observations[self.teamindex].iloc[self.index + 1]
        return self.curr_obs

    def rewardfunc(self, action):
        
        # estimated_action = self.estimate_closest_as(self.actions[self.teamindex][self.action_parameter][self.index])
        if action == self.estimate_closest_as(self.actions[self.teamindex][self.action_parameter][self.index]):
            return 1000
        else:
            return -1 * abs(action - self.estimate_closest_as(self.actions[self.teamindex][self.action_parameter][self.index]))


    def estimate_closest_as(self, value):
        try:
            closest_action_index = int(value / self.interval)
            closest_action_index = min(closest_action_index, self.action_space.n - 1)
        except:
            closest_action_index = self.action_space.n - 1
        return closest_action_index


    def resetinit(self):
        self.teamindex = 0
        self.index = 0
        self.curr_obs = self.observations[self.teamindex].iloc[self.index]
        self.next_obs = self.curr_obs = self.observations[self.teamindex].iloc[self.index + 1]
        return self.curr_obs


In [3]:
import gym
import numpy as np
import torch
from tqdm import tqdm
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from tensorboardX import SummaryWriter  # Import the SummaryWriter for TensorBoard
water_rl_actionspace=np.linspace(0,2000,9)
discrete=list(range(len(water_rl_actionspace)))
env=ASACRLEnv(obs,actions,"water_sup_intervals_sp_min",discrete)
env = DummyVecEnv([lambda: env])

# Environment settings
EPISODES = 1  # Number of episodes
# Exploration settings
epsilon = 0.2 # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.15

# Stats settings
AGGREGATE_STATS_EVERY = 500  # steps
SAVE_MODEL_EVERY = 5000
MODEL_NAME = "AGCRL_WATER_BIN"
MIN_REWARD = -5000


# Create a PPO agent
model = PPO("MlpPolicy", env, 
            batch_size=1024,
            clip_range=0.15,
            verbose=1)

# Create a TensorBoard summary writer
writer = SummaryWriter(log_dir='PPO_WATERRUN')


# Continuous training loop
ep_rewards = []
episode = 0
while True:
    episode += 1
    episode_reward = 0
    obs = env.reset()
    done = False

    while not done:
        if np.random.random() > epsilon:
            action, _ = model.predict(obs)
        else:
            action = env.action_space.sample()

        # Assuming action is a scalar value
        new_obs, reward, done, _ = env.step([action])

        episode_reward += reward
        obs = new_obs

        # Log information for TensorBoard
        writer.add_scalar('Chosen Action', action, episode)
        writer.add_scalar('Observed State', np.mean(obs), episode)  # You can customize how you log the state
        writer.add_scalar('Received Reward', reward, episode)

        if done:
            ep_rewards.append(episode_reward)

            if epsilon > MIN_EPSILON:
                epsilon *= EPSILON_DECAY
                epsilon = max(MIN_EPSILON, epsilon)

            if episode_reward >= MIN_REWARD and episode % SAVE_MODEL_EVERY == 100 :
                model.save(f'models/{MODEL_NAME}__ep_{episode}__reward_{float(episode_reward):.2f}.model')

            model.learn(total_timesteps=1)

            # Log metrics for TensorBoard
            writer.add_scalar('Episode Reward', episode_reward, episode)
            writer.add_scalar('Epsilon', epsilon, episode)

# Close the TensorBoard writer (this line will never be reached in infinite loop)
writer.close()

# Plot results (same as before)
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(ep_rewards)
plt.xlabel('Episode')
plt.ylabel('Episode Reward')
plt.title('Episode Rewards')

plt.subplot(1, 2, 2)
plt.plot(acc_regret)
plt.xlabel('Episode')
plt.ylabel('Accumulated Regret')
plt.title('Accumulated Regret')

plt.tight_layout()
plt.show()




Using cpu device
-----------------------------
| time/              |      |
|    fps             | 3375 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------
| time/              |      |
|    fps             | 3619 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------
| time/              |      |
|    fps             | 3494 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------
| time/              |      |
|    fps             | 3607 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------
| time/              |      |
|    fps             | 3625 |
|    iterations      | 1    |
|    time_elapsed    | 

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/Users/faymajidelhassan/anaconda3/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/Users/faymajidelhassan/anaconda3/lib/python3.10/site-packages/tensorboardX/event_file_writer.py", line 208, in run
    self._record_writer.write_event(data)
  File "/Users/faymajidelhassan/anaconda3/lib/python3.10/site-packages/tensorboardX/event_file_writer.py", line 58, in write_event
    return self._write_serialized_event(event.SerializeToString())
  File "/Users/faymajidelhassan/anaconda3/lib/python3.10/site-packages/tensorboardX/event_file_writer.py", line 63, in _write_serialized_event
    self._py_recordio_writer.write(event_str)
  File "/Users/faymajidelhassan/anaconda3/lib/python3.10/site-packages/tensorboardX/record_writer.py", line 189, in write
    w(data)
OSError: [Errno 28] No space left on device


KeyboardInterrupt: 