In [None]:
from deephive.optimizers import deephive as dh 
from deephive.environment.utils import * 
from datetime import datetime
import os 
import neptune 
from deephive.environment.deephive_utils import *

In [None]:
# Input locations, written relative to the notebook's working directory.
config_path = "../config/exp_config.json"  # JSON experiment configuration read by parse_config()
model_path = "../models/pbest_unfreeze.pth"  # checkpoint handed to initialize() when the env/policy are built
model_path_2 = "../models/gbest.pth"  # alternative checkpoint; no visible cell reads it

def initialize_logger(api_token, tags, config, mode="train"):
    """Open a Neptune run for the DMO-LAB/DeepHive-V2 project and return it.

    Args:
        api_token: Neptune API token forwarded to ``neptune.init_run``.
        tags: User-supplied tag; it becomes the first entry of the run's tag list.
        config: Experiment configuration; ``config["objective_function"]`` and
            ``str(config["layer_size"])`` are appended to the tag list.
        mode: Tag describing the kind of run (defaults to ``"train"``).

    Returns:
        The object returned by ``neptune.init_run``.
    """
    # Tag order: user tag, mode, objective function name, layer size as text.
    run_tags = [tags, mode, config["objective_function"], str(config["layer_size"])]
    # Paths whose contents Neptune is asked to snapshot alongside the run.
    tracked_sources = ["environment", "policies", "deephive.py", "config"]
    return neptune.init_run(
        project="DMO-LAB/DeepHive-V2",
        source_files=tracked_sources,
        api_token=api_token,
        tags=run_tags,
    )

def get_action(observation_info, agent_policy, env, observation_std=None, random=False):
    """Return one action per (agent, dimension) pair, from the policy or at random.

    Args:
        observation_info: Observation as an ``np.ndarray`` or anything
            ``np.array`` can convert. It must hold a multiple of
            ``env.n_agents * env.n_dim`` values so it can be reshaped.
        agent_policy: Object exposing ``select_action(observations, stds)``.
        env: Environment providing ``n_agents``, ``n_dim`` and ``action_space``.
        observation_std: Optional per-observation standard deviations. When
            omitted, zeros with the same shape as the observation are used.
        random: When true, bypass the policy and sample ``env.action_space``.

    Returns:
        ``env.action_space.sample()`` when ``random`` is true; otherwise the
        policy output reshaped to ``(env.n_agents, env.n_dim)``.

    Raises:
        AssertionError: If a non-ndarray observation, once converted, does not
            have ``env.n_dim`` entries along its first axis.
    """
    if random:
        print("Random action")
        return env.action_space.sample()

    if isinstance(observation_info, np.ndarray):
        obs_array = observation_info
    else:
        obs_array = np.array(observation_info)
        # NOTE(review): this shape check only runs for inputs that had to be
        # converted; ndarray inputs are accepted unchecked. Confirm whether the
        # check was meant to apply to every call.
        assert obs_array.shape[0] == env.n_dim, "Observation must have the same number of dimensions as the environment"

    if observation_std is None:
        std_array = np.zeros_like(obs_array)
    else:
        std_array = np.array(observation_std)

    # The policy receives a 2-D observation matrix with one row per
    # (agent, dimension) pair, and the standard deviations as a 1-D vector.
    n_rows = env.n_agents * env.n_dim
    policy_output = agent_policy.select_action(
        obs_array.reshape(n_rows, -1),
        std_array.reshape(-1),
    )

    # Fold the policy output back into an (n_agents, n_dim) action table.
    return policy_output.reshape(env.n_agents, env.n_dim)



In [None]:
# Load the JSON experiment configuration, then override it for this run.
config = parse_config(config_path)

# Policy-update schedule and problem dimensionality.
config.update({
    "update_timestep": 25,
    "decay_rate": 0.95,
    "n_dim": 2,
    "include_gbest": False,
})
# The observation size is only overridden when the global best is included.
if config["include_gbest"]:
    config["obs_dim"] = 11

# Swarm size, episode length, logging/decay/save cadence, exploration noise,
# reward scheme and objective function.
config.update({
    "n_agents": 10,
    "ep_length": 25,
    "log_interval": 500,
    "decay_interval": 1000,
    "save_interval": 25000,
    "min_action_std": 0.02,
    "n_episodes": 5000,
    "update_start": 0,
    "action_std": 0.3,
    "variable_std": False,
    "reward_scheme": "FullRewardScheme",
    "use_gbest": False,
    "objective_function": "CosineMixtureFunction",
    "function_id": 2,
    "use_optimal_value": True,
    "log_scale": False,
})

# Run-level switches; `log` and `tags` are not read by any visible cell while
# neptune_logger stays None.
mode = "train"
log = False
tags = "new reward training"

# Build the environment and the policy from the final configuration.
env, agent_policy = initialize(config, mode=mode, model_path=model_path)
neptune_logger = None

In [None]:
# Reset the environment. train_agent unpacks the value returned by env.reset()
# as (obs, obs_std), so `obs` here holds that whole return value.
obs = env.reset()
# NOTE(review): opt_params is not read by any visible cell; presumably a known
# optimum location kept for reference — confirm or remove.
opt_params=np.array([[-55.2763985 , -70.42955972]])

In [None]:
# Render the environment using render()'s default arguments.
env.render()

In [None]:
# Scale the last column of `a_state` with the environment's scaler helper,
# passing env.worst_obj_value and -10000 as the remaining arguments.
# NOTE(review): `a_state` is not defined in any visible cell, so this raises
# NameError on a fresh kernel; presumably a_state = env._get_actual_state() — confirm.
env.scaler_helper.scale(a_state[:,-1], env.worst_obj_value, -10000)

In [None]:
# Display the policy's action_std attribute.
agent_policy.action_std

In [None]:

def train_agent(env, agent_policy, title, config, neptune_logger, n_episodes=None, update_timestep=None, decay_rate=None, log_interval=None, decay_interval=None, save_interval=None, min_action_std=None):
    """Collect rollouts episode by episode and periodically update the policy.

    Relies on ``get_action`` and ``print_items`` being available in the
    notebook namespace.

    Args:
        env: Environment exposing ``reset()``, ``step(actions)``, ``render``,
            ``n_agents``, ``n_dim`` and ``ep_length``. ``reset``/``step`` must
            return the observation as an ``(obs, obs_std)`` pair.
        agent_policy: Policy exposing ``buffer.rewards``, ``buffer.is_terminals``,
            ``update()``, ``decay_action_std(...)`` and ``save(...)``.
        title: Prefix of the run folder created under ``training_runs/``.
        config: Mapping supplying a default for every schedule argument below.
        neptune_logger: Neptune run used for logging, or a falsy value to
            disable all Neptune logging.
        n_episodes: Last episode index; episodes ``0..n_episodes`` inclusive run.
        update_timestep: Environment steps between policy updates.
        decay_rate: Rate passed to ``agent_policy.decay_action_std``.
        log_interval: Episodes between console logs / rendered GIFs.
        decay_interval: Environment steps between action-std decays.
        save_interval: Environment steps between checkpoint attempts.
        min_action_std: Lower bound passed to ``agent_policy.decay_action_std``.

    Returns:
        None. Side effects: creates ``training_runs/<title>_<timestamp>/``,
        fills the policy buffer, updates/decays/saves the policy, and may
        write GIFs and upload files to Neptune.
    """
    # if parameters are not provided, use the ones from the config file
    if update_timestep is None:
        update_timestep = config["update_timestep"]
    if decay_rate is None:
        decay_rate = config["decay_rate"]
    if log_interval is None:
        log_interval = config["log_interval"]
    if decay_interval is None:
        decay_interval = config["decay_interval"]
    if save_interval is None:
        save_interval = config["save_interval"]
    if min_action_std is None:
        min_action_std = config["min_action_std"]
    if n_episodes is None:
        n_episodes = config["n_episodes"]

    average_returns = []
    running_average_rewards = None
    training_run_title = title + "_" + datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = f"training_runs/{training_run_title}/"
    os.makedirs(save_path, exist_ok=True)
    timesteps = 0
    for episode in range(0, n_episodes+1):
        steps_before_episode = timesteps
        is_log_episode = episode % log_interval == 0

        obs, obs_std = env.reset()
        episode_return = np.zeros(env.n_agents)
        for _ in range(env.ep_length):
            actions = get_action(obs, agent_policy, env, obs_std, random=False)
            observation_info, reward, done, info = env.step(actions)
            obs, obs_std = observation_info
            # add reward to agent buffer: each agent's reward/terminal flag is
            # repeated n_dim times, one entry per dimension of that agent
            for ag in range(env.n_agents):
                agent_policy.buffer.rewards += [reward[ag]] * env.n_dim
                agent_policy.buffer.is_terminals += [done[ag]] * env.n_dim

            episode_return += reward
            if neptune_logger and is_log_episode:
                # log global best agent value
                neptune_logger[f"train/global_best_value/episode{episode}"].log(float(info["gbest"][-1]))
            timesteps += 1

        # End-of-episode bookkeeping (skipped for an episode with zero steps).
        if timesteps > steps_before_episode:
            average_returns.append(np.mean(episode_return))
            running_average_rewards = np.mean(average_returns)
            if neptune_logger:
                neptune_logger["train/average_return"].log(average_returns[-1])
                neptune_logger["train/percentage_high_std"].log(env.surrogate.percent_high_std)

        # The periodic events below are evaluated once per episode, so they
        # fire when the step counter has *passed* a multiple of the interval
        # during this episode. A plain `timesteps % interval == 0` test would
        # only match when the interval is a multiple of the episode length.
        if _interval_crossed(steps_before_episode, timesteps, update_timestep):
            agent_policy.update()

        if timesteps > 0 and is_log_episode:
            print_items(
                    episode = episode,
                    average_returns = average_returns[-1],
                    timesteps = timesteps,
                    )
            if env.n_dim <= 2:
                gif_path = f"{save_path}{episode}.gif"
                env.render(file_path=gif_path, type="history")
                if neptune_logger:
                    neptune_logger[f"train/gifs/{episode}.gif"].upload(gif_path)

        if _interval_crossed(steps_before_episode, timesteps, decay_interval):
            print(f"Decaying action std at timestep {timesteps}")
            agent_policy.decay_action_std(decay_rate, min_action_std=min_action_std, debug=True)

        if _interval_crossed(steps_before_episode, timesteps, save_interval):
            # Only checkpoint when the latest episode beat the running average.
            if average_returns[-1] > running_average_rewards:
                print(f"Average return: {average_returns[-1]}, running average: {running_average_rewards}")
                agent_policy.save(save_path, episode=timesteps)
                if neptune_logger:
                    # assumes agent_policy.save writes "policy-<timesteps>.pth"
                    # inside save_path — TODO confirm against the policy class
                    checkpoint_path = os.path.join(save_path, f"policy-{timesteps}.pth")
                    neptune_logger[f"train/checkpoints/timesteps-{timesteps}"].upload(checkpoint_path)


def _interval_crossed(previous, current, interval):
    """Return True if a multiple of ``interval`` lies in ``(previous, current]``."""
    return current // interval > previous // interval
                    

In [None]:
# Launch training with every schedule value (n_episodes, update_timestep, ...)
# taken from `config`. neptune_logger is None in the setup cell, so train_agent
# skips all Neptune logging.
train_agent(env, agent_policy, "cosine_mixture", config, neptune_logger)

In [8]:
# Inspect the environment state; the saved output is a 10 x 3 array with values
# between 0 and 1 (presumably 2 scaled coordinates plus a scaled objective
# value per agent — confirm).
env.state

array([[0.447 , 0.4622, 0.8691],
       [0.7086, 0.2081, 0.5039],
       [0.482 , 0.5702, 0.8642],
       [0.839 , 0.7584, 0.3159],
       [0.475 , 0.52  , 0.9636],
       [0.01  , 0.481 , 0.1858],
       [0.2742, 0.845 , 0.4267],
       [0.8721, 0.6645, 0.4772],
       [0.4423, 0.1608, 0.4935],
       [0.7877, 0.2076, 0.263 ]])

In [9]:
# Display env.opt_value (presumably the known optimum of the objective — confirm).
env.opt_value

0.2

In [10]:
# Show the result of the private _get_actual_state() helper rounded to 2
# decimals (presumably the unscaled counterpart of env.state — confirm).
np.round(env._get_actual_state(), 2)

array([[-0.11, -0.08,  0.01],
       [ 0.42, -0.58, -0.52],
       [-0.04,  0.14,  0.  ],
       [ 0.68,  0.52, -0.79],
       [-0.05,  0.04,  0.15],
       [-0.98, -0.04, -0.97],
       [-0.45,  0.69, -0.63],
       [ 0.74,  0.33, -0.55],
       [-0.12, -0.68, -0.53],
       [ 0.58, -0.58, -0.86]])

In [None]:
# Build an observation directly from the observation scheme, using a copy of
# the environment's pbest and its use_gbest / split_ratio / include_gbest settings.
observation = env.observation_schemes.generate_observation(pbest=env.pbest.copy(), use_gbest=env.use_gbest, ratio=env.split_ratio, include_gbest=env.include_gbest)

In [None]:
# Keep the first element of the generated value (presumably the observation
# half of an (obs, obs_std) pair, as unpacked in train_agent — confirm).
obs = observation[0]

In [None]:
# Peek at the first entry of the first element of `obs`.
obs[0][0]

In [None]:
# Ask the policy for actions; no observation_std is given, so get_action
# substitutes zeros shaped like the observation.
action = get_action(obs, agent_policy, env, random=False)

In [None]:
# Display the actions; get_action reshapes them to (env.n_agents, env.n_dim).
action

In [None]:
# Slice the recorded state history: index 1 on the first axis, everything on
# the second, and all but the last two entries on the third. Axis meanings are
# not visible here (presumably step, agent, feature — confirm).
env.state_history[1, :, :-2]

In [None]:
# Number of buffered rewards; train_agent appends n_dim entries per agent per step.
len(agent_policy.buffer.rewards)

In [None]:
# Number of buffered states, for comparison with the reward count above
# (presumably filled inside agent_policy.select_action — confirm).
len(agent_policy.buffer.states)

## DEEPHIVE TRAINER

In [None]:
import argparse

In [None]:
# Input locations for the packaged trainer (same values as at the top of the notebook).
config_path = "../config/exp_config.json"
model_path = "../models/pbest_unfreeze.pth"
model_path_2 = "../models/gbest.pth"

# Start from the JSON configuration and override the training schedule,
# exploration noise and reward settings for this run.
config = parse_config(config_path)
config.update({
    "update_timestep": 1000,
    "decay_rate": 0.95,
    "log_interval": 500,
    "decay_interval": 2000,
    "save_interval": 2000,
    "min_action_std": 0.02,
    "n_episodes": 5000,
    "update_start": 0,
    "action_std": 0.2,
    "variable_std": False,
    "reward_scheme": "FullRewardScheme",
    "use_gbest": False,
})

# Build the trainer in training mode from the final configuration.
deephive_trainer = dh.OptimizationTrainer(config, mode="train")

In [None]:
# Start training through the packaged trainer; `title` presumably labels the
# run's output folder, as it does in the notebook-level train_agent — confirm.
deephive_trainer.train_agent(title="old_reward_pbest")