In [7]:
import gym
import numpy as np
import pickle
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from tensorboardX import SummaryWriter
from collections import deque
import numpy as np
# np.random.seed(42)  # You can choose any number for the seed


In [8]:
import gym
import numpy as np

class ASACRLEnv(gym.Env):
    def __init__(self, observations, actions, action_parameter, action_space, num_intervals):
        """initialize action space, observation space & load data"""
        self.action_parameter = action_parameter
        self.num_intervals = num_intervals
        self.action_space = gym.spaces.Box(low=action_space[0], high=action_space[1], dtype=np.float32)
        self.observations = observations
        self.actions = actions
        self.index = 0
        self.teamindex = 0
        obs_min = np.min(np.concatenate(self.observations), axis=0)
        obs_max = np.max(np.concatenate(self.observations), axis=0)

        self.observation_space = gym.spaces.Box(
            low=obs_min,
            high=obs_max,
            dtype=np.float32
        )

        self.curr_obs = self.observations[self.teamindex].iloc[self.index]
        self.next_obs = self.curr_obs = self.observations[self.teamindex].iloc[self.index + 1]
        self.curr_reward = 0
        self.ep_reward = 0
        self.interval = (self.action_space.high[0] - self.action_space.low[0]) / self.num_intervals

    def step(self, action):
        chosen_action = self.map_continuous_to_interval(action)
        self.reward = self.rewardfunc(chosen_action)
        self.ep_reward += self.reward
        self.index += 1
        done = False  # Set done to False by default

        if self.index >= len(self.observations[self.teamindex]) - 2:
            self.reset()
            done = True  # Set done to True when the episode is finished
            return self.curr_obs, self.reward, done, {}  # Return the additional info as an empty dictionary

        self.curr_obs = self.observations[self.teamindex].iloc[self.index]
        self.next_obs = self.observations[self.teamindex].iloc[self.index + 1]
        return self.curr_obs, self.reward, done, {}

    def map_continuous_to_interval(self, action):
        interval_width = (self.action_space.high - self.action_space.low) / self.num_intervals
        discrete_action = int((action - self.action_space.low) / interval_width)
        return discrete_action * interval_width + self.action_space.low

    def rewardfunc(self, action):
        if action == self.estimate_closest_as(self.actions[self.teamindex][self.action_parameter][self.index]):
            return 1000
        else:
            return -1 * abs(action - self.estimate_closest_as(self.actions[self.teamindex][self.action_parameter][self.index]))


    def estimate_closest_as(self, value):
        try:
            closest_action_index = int(value / self.interval)
            max_index = int((self.action_space.high[0] - self.action_space.low[0]) / self.interval)
            closest_action_index = min(closest_action_index, max_index)
        except ValueError:
            closest_action_index = max_index
        return closest_action_index
    # def estimate_closest_as(self, value):
    #     try:
    #         closest_action_index = int(value / self.interval)
    #         closest_action_index = min(closest_action_index, self.action_space.n - 1)
    #     except:
    #         closest_action_index = self.action_space.n - 1
    #     return closest_action_index


    def reset(self):
        self.teamindex += 1
        if self.teamindex >= 5:
            self.teamindex = 0
        self.index = 0
        self.curr_obs = self.observations[self.teamindex].iloc[self.index]
        self.next_obs = self.observations[self.teamindex].iloc[self.index + 1]
        self.curr_reward = 0
        self.ep_reward = 0
        return self.curr_obs


In [9]:
with open('/Users/faymajidelhassan/Downloads/ASAC_2023/observations.pickle', 'rb') as handle:
    obs = pickle.load(handle)
with open('/Users/faymajidelhassan/Downloads/ASAC_2023/actions.pickle', 'rb') as handle:
    actions = pickle.load(handle)

In [11]:
!pip install shimmy
!pip install tensorboard
# !pip install stable_baselines3
!pip install gymnasium



In [16]:
import gym
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import VecNormalize
from tensorboardX import SummaryWriter
assim_rl_actionspace = np.linspace(0, 100, 21)


# Create an instance of your custom environment
env = ASACRLEnv(observations=obs, actions=actions, action_parameter="assim_sp", action_space=assim_rl_actionspace, num_intervals=len(assim_rl_actionspace) - 1)

# Wrap the environment with DummyVecEnv and VecNormalize
env = DummyVecEnv([lambda: env])
env = VecNormalize(env, norm_obs=True, norm_reward=True)


# Exploration settings
epsilon = 1
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

# Stats settings
AGGREGATE_STATS_EVERY = 500
SAVE_MODEL_EVERY = 100
MODEL_NAME = "AGCRL_ASSIM_CONT_SAC"
MIN_REWARD = -5000

# Create a SAC agent
model = SAC("MlpPolicy", env, verbose=1, gradient_steps=-1, learning_rate=0.003, ent_coef=0)


# Create a TensorBoard summary writer
writer = SummaryWriter(log_dir='SAC_bin')

# Continuous training loop
ep_rewards = deque(maxlen=100)  # Stores the last 100 episode rewards
episode = 0

while True:
    episode += 1
    episode_reward = 0
    obs = env.reset()
    done = False

    while not done:
        if np.random.random() > epsilon:
            action, _ = model.predict(obs)
        else:
            action = env.action_space.sample()

        new_obs, reward, done, _ = env.step(action)

        episode_reward += reward
        obs = new_obs

        # Log information for TensorBoard (every few steps for efficiency)
        if episode % AGGREGATE_STATS_EVERY == 500:
            writer.add_scalar('Chosen Action', action, episode)
            writer.add_scalar('Observed State', np.mean(obs), episode)  # You can customize how you log the state
            writer.add_scalar('Received Reward', reward, episode)

    ep_rewards.append(episode_reward)

    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)

    if episode_reward >= MIN_REWARD and episode % SAVE_MODEL_EVERY == 100:
        model.save(f'{MODEL_NAME}__ep_{episode}__reward_{float(episode_reward):.2f}.model')

    # Update the model based on a larger number of timesteps
    model.learn(total_timesteps=1)  # Adjust the total_timesteps as needed

    # Log metrics for TensorBoard
    writer.add_scalar('Episode Reward', episode_reward, episode)
    writer.add_scalar('Epsilon', epsilon, episode)
    writer.add_scalar('Average Reward (last 100 eps)', np.mean(ep_rewards), episode)

    # # Termination condition (you can adjust this)
    # if len(ep_rewards) == 100 and np.mean(ep_rewards) > YOUR_DESIRED_REWARD_THRESHOLD:
    #     break

# Close the TensorBoard writer
writer.close()





Using cpu device


OSError: [Errno 28] No space left on device: '/var/folders/vx/1m1l_7hj50l31_jjvmr4z0hw0000gn/T/SB3-2023-08-24-20-27-52-565068'