In [None]:
import sys 
sys.path.append('../')
import os
from deephive.environment.deephive_utils import *
from deephive.environment.utils import *
import numpy as np 
from deephive.exploration.gp_surrogate import GPSurrogateModule
from deephive.environment.utils import filter_points
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.gaussian_process.kernels import Matern
from dotenv import load_dotenv
load_dotenv()
import matplotlib.pyplot as plt
import json
from datetime import datetime
import neptune
import torch 

# from deephive.environment.utils import num_function_evaluation, plot_individual_function_evaluation, plot_num_function_evaluation

In [None]:
# Experiment 1: variance-driven explorers and exploiters.
# All paths are relative to the notebook's working directory.
config_path = "../config/train_config.json"
# Checkpoint paths. NOTE(review): neither is consumed in the visible cells before
# `model_path` is reassigned in the testing section -- confirm they are still needed.
model_path = "../models/pbest_unfreeze.pth"
model_path_2 = "../models/gbest.pth"

# `parse_config` comes from one of the deephive star-imports; presumably it returns
# a dict-like object (later cells index it, `.copy()` it and unpack it with `**`).
config = parse_config(config_path)


In [None]:
def initialize_logger(api_token, config, tag=""):
    """Open a Neptune run for the DeepHive-V2 project and return it.

    Args:
        api_token: Neptune API token forwarded to ``neptune.init_run``.
        config: Mapping that must contain the keys ``"mode"``,
            ``"objective_function"`` and ``"layer_size"``; their values are
            used as run tags (``layer_size`` is stringified).
        tag: Extra free-form tag appended after the config-derived tags.

    Returns:
        The run object returned by ``neptune.init_run``.
    """
    run_tags = [
        config["mode"],
        config["objective_function"],
        str(config["layer_size"]),
        tag,
    ]
    return neptune.init_run(
        project="DMO-LAB/DeepHive-V2",
        source_files=["environment", "policies", "deephive.py", "config"],
        api_token=api_token,
        tags=run_tags,
    )


def get_action(observation_info, agent_policy, env, observation_std=None, debug=False):
    """Query the policy once per search dimension and assemble the action matrix.

    Args:
        observation_info: Indexable collection with one observation per
            dimension (``observation_info[dim]`` is passed to the policy as-is).
        agent_policy: Object exposing ``select_action(observation, observation_std)``
            whose result can be assigned to a column of length ``env.n_agents``.
        env: Object exposing integer attributes ``n_agents`` and ``n_dim``.
        observation_std: Optional per-dimension std observations. It is copied
            into a numpy array; when omitted, zeros shaped like
            ``observation_info`` are used instead.
        debug: Accepted for call-site compatibility; not used here.

    Returns:
        ``np.ndarray`` of shape ``(env.n_agents, env.n_dim)`` where column
        ``dim`` holds the policy's action for that dimension.
    """
    observation = observation_info
    # Always work on a numpy array so that indexing by dimension is uniform.
    if observation_std is None:
        observation_std = np.zeros_like(observation)
    else:
        observation_std = np.array(observation_std)

    actions = np.zeros((env.n_agents, env.n_dim))
    for dim in range(env.n_dim):
        actions[:, dim] = agent_policy.select_action(observation[dim], observation_std[dim])
    return actions

def get_std_obs(env):
    """Build the 7-feature observation fed to the std network, per dimension.

    Reads ``env.state``, ``env.prev_state``, ``env.gbest``, ``env.min_pos``,
    ``env.max_pos``, ``env.n_dim`` and ``env.scaler_helper.scale``.

    For every dimension ``d`` and every row of ``env.state`` the features are
    the absolute values of:

    0. variance of column ``d`` over all rows,
    1. row[d] minus the column-``d`` mean,
    2. row[n_dim] minus the column-``n_dim`` mean,
    3. row[d] minus the scaled global best in dimension ``d``,
    4. row[n_dim] minus 1,
    5. row[d] minus the previous state's value in dimension ``d``,
    6. row[n_dim] minus the previous state's column ``n_dim``.

    Column ``n_dim`` presumably holds the (normalised) objective value of each
    agent -- confirm against the environment.

    Returns:
        list of length ``env.n_dim``; element ``d`` is an array with one
        7-feature row per row of ``env.state``.
    """
    n_dim = env.n_dim
    swarm = env.state
    gbest_scaled = env.scaler_helper.scale(env.gbest[:n_dim], env.min_pos, env.max_pos)
    col_mean = np.mean(swarm, axis=0)
    col_var = np.var(swarm, axis=0)

    def _features(idx, row, dim):
        # One feature vector for agent `idx` in search dimension `dim`.
        previous = env.prev_state[idx]
        raw = [
            col_var[dim],
            row[dim] - col_mean[dim],
            row[n_dim] - col_mean[n_dim],
            row[dim] - gbest_scaled[dim],
            row[n_dim] - 1,
            row[dim] - previous[dim],
            row[n_dim] - previous[n_dim],
        ]
        return np.abs(np.array(raw))

    return [
        np.array([_features(idx, row, dim) for idx, row in enumerate(swarm)])
        for dim in range(n_dim)
    ]

def train_agent(env, agent, neptune_logger=None, save_dir=None, **kwargs):
    """Run the training loop and return the per-episode average returns.

    Args:
        env: Environment exposing ``reset()``, ``step(actions)``, ``render(...)``,
            ``n_agents``, ``n_dim`` and ``ep_length``. ``reset()`` must return
            ``(observation, role)`` and ``step()`` must return
            ``((observation, role), reward, done, info)``.
        agent: Policy wrapper exposing ``policy``, ``buffer``, ``update()``,
            ``decay_action_std(...)``, ``save(...)`` and ``select_action(...)``.
        neptune_logger: Optional Neptune run. When given, metrics, GIFs and
            checkpoints are logged to it and it is stopped before returning.
        save_dir: Directory for GIFs and checkpoints. When falsy, a timestamped
            directory under ``training_runs/`` is used. The directory is created
            if missing.
        **kwargs: Optional settings read with defaults: ``n_episodes`` (2000),
            ``update_timestep`` (20), ``decay_rate`` (0.95), ``log_interval`` (500),
            ``decay_interval`` (1000), ``save_interval`` (5000), ``std_min`` (0.02),
            ``debug`` (False). Unknown keys are ignored.

    Returns:
        Tuple ``(average_returns, agent, env)``.
    """
    n_episodes = kwargs.get('n_episodes', 2000)
    update_timestep = kwargs.get('update_timestep', 20)
    decay_rate = kwargs.get('decay_rate', 0.95)
    log_interval = kwargs.get('log_interval', 500)
    decay_interval = kwargs.get('decay_interval', 1000)
    save_interval = kwargs.get('save_interval', 5000)
    min_action_std = kwargs.get('std_min', 0.02)
    debug = kwargs.get('debug', False)

    if not save_dir:
        training_run_title = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        save_path = f"training_runs/{training_run_title}/"
    else:
        save_path = save_dir
    # Create the output directory in both cases so render/save cannot fail on a
    # missing folder when the caller supplies `save_dir`.
    os.makedirs(save_path, exist_ok=True)

    average_returns = []
    timesteps = 0
    # `n_episodes + 1` iterations so that episode == n_episodes is also logged.
    for episode in range(n_episodes + 1):
        observation_info, _ = env.reset()
        agent.policy.reset_min_std()
        obs_std = get_std_obs(env)
        episode_return = np.zeros(env.n_agents)

        for step in range(env.ep_length):
            actions = get_action(observation_info, agent, env, observation_std=obs_std, debug=debug)
            observation_info, reward, done, info = env.step(actions)
            observation_info, _ = observation_info
            obs_std = get_std_obs(env)

            # The buffer holds one entry per (agent, dimension), so each agent's
            # reward and terminal flag are repeated n_dim times.
            for ag in range(env.n_agents):
                agent.buffer.rewards += [reward[ag]] * env.n_dim
                agent.buffer.is_terminals += [done[ag]] * env.n_dim

            if neptune_logger:
                neptune_logger["train/mean_predicted_action_std"].log(agent.policy.mean_predicted_action_std)
                neptune_logger["train/mean_min_action_std"].log(agent.policy.mean_min_action_std)
                neptune_logger["train/mean_applied_action_var"].log(agent.policy.mean_applied_action_vars)

            episode_return += reward
            if neptune_logger and episode % log_interval == 0:
                # Trace the global best value step by step for logged episodes.
                neptune_logger[f"train/global_best_value/episode{episode}"].log(float(info["gbest"][-1]))

            timesteps += 1

        # Episode-end bookkeeping (kept outside the step loop so the running
        # average is always defined before it is used below).
        average_returns.append(np.mean(episode_return))
        running_average_rewards = np.mean(average_returns)
        if neptune_logger:
            neptune_logger["train/average_return"].log(average_returns[-1])

        # All interval checks run once per episode on the cumulative step count,
        # so they only fire when that count lands exactly on a multiple.
        if timesteps % update_timestep == 0:
            agent.update()

        if episode % log_interval == 0 and timesteps > 0:
            print_items(
                episode=episode,
                average_returns=average_returns[-1],
                timesteps=timesteps,
            )
            gif_path = os.path.join(save_path, f"{episode}.gif")
            env.render(file_path=gif_path, type="history")
            if neptune_logger:
                neptune_logger[f"train/gifs/{episode}.gif"].upload(gif_path)

        if timesteps % decay_interval == 0:
            # NOTE(review): debug is hard-wired to True here, independent of the
            # `debug` kwarg -- confirm this is intentional.
            agent.decay_action_std(decay_rate, min_action_std=min_action_std, debug=True)

        if timesteps % save_interval == 0 and timesteps > 0:
            # Only checkpoint when the latest episode beats the running average.
            if average_returns[-1] > running_average_rewards:
                print(f"Average return: {average_returns[-1]}, running average: {running_average_rewards}")
                agent.save(save_path, episode=timesteps)
                if neptune_logger:
                    checkpoint_path = os.path.join(save_path, f"policy-{timesteps}.pth")
                    neptune_logger[f"train/checkpoints/timesteps-{timesteps}"].upload(checkpoint_path)

    # The logger is optional; only stop it when one was supplied.
    if neptune_logger:
        neptune_logger.stop()
    return average_returns, agent, env

In [None]:
def test_agent(env, agent, iters, save_dir=None, **kwargs):
    """Evaluate a trained agent for ``iters`` episodes and plot the results.

    Args:
        env: Environment exposing ``reset()``, ``step(actions)``, ``render(...)``,
            ``n_agents``, ``ep_length`` and ``opt_value``.
        agent: Policy wrapper exposing ``set_action_std(...)``,
            ``decay_action_std(...)`` and ``select_action(...)``.
        iters: Number of evaluation episodes.
        save_dir: Output directory. When falsy, a timestamped directory under
            ``testing_runs/`` is used. The directory is created if missing.
        **kwargs: Optional settings read with defaults: ``log_interval`` (200),
            ``decay_interval`` (1000), ``test_std`` (0.02), ``decay_rate`` (0.9),
            ``std_min`` (0.02). Unknown keys are ignored.

    Returns:
        Tuple ``(all_gbests, save_path)`` where ``all_gbests`` is an array with
        one row per episode and one column per step, holding
        ``info["gbest"][-1]`` after each step.
    """
    log_interval = kwargs.get('log_interval', 200)
    decay_interval = kwargs.get('decay_interval', 1000)
    test_std = kwargs.get('test_std', 0.02)
    decay_rate = kwargs.get('decay_rate', 0.9)
    min_action_std = kwargs.get('std_min', 0.02)

    if not save_dir:
        training_run_title = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        save_path = f"testing_runs/{training_run_title}/"
    else:
        save_path = save_dir
    os.makedirs(save_path, exist_ok=True)

    timesteps = 0
    all_gbests = []
    for episode in range(iters):
        observation_info, role = env.reset()
        episode_gbests = []
        # Every episode starts again from the configured test std.
        agent.set_action_std(test_std)
        for step in range(env.ep_length):
            # NOTE(review): the second element of the observation tuple is passed
            # as the std observation here -- confirm this matches what the policy
            # expects in test mode.
            actions = get_action(observation_info, agent, env, observation_std=role)
            observation_info, reward, done, info = env.step(actions)
            observation_info, role = observation_info
            episode_gbests.append(info["gbest"][-1])

            # Count completed steps so the decay really happens once every
            # `decay_interval` steps (the counter previously never advanced,
            # which made the std decay on every single step).
            timesteps += 1
            if decay_interval and timesteps % decay_interval == 0:
                agent.decay_action_std(decay_rate, min_action_std=min_action_std)

        if log_interval and episode % log_interval == 0:
            env.render(file_path=os.path.join(save_path, f"{episode}.gif"), type="history")

        all_gbests.append(episode_gbests)

    all_gbests = np.array(all_gbests)
    np.save(os.path.join(save_path, "gbests.npy"), all_gbests)

    title = "Test run, std"
    opt_value = env.opt_value
    num_function_evaluation(
        fopt=all_gbests,
        n_agents=env.n_agents,
        save_dir=os.path.join(save_path, "num_function_evaluations.png"),
        opt_value=opt_value,
        log_scale=False,
        plot_error_bounds=True,
        title=title,
    )
    plot_individual_function_evaluation(
        fopt=all_gbests,
        n_agents=env.n_agents,
        save_dir=os.path.join(save_path, "num_function_evaluations2.png"),
        opt_value=opt_value,
        log_scale=False,
        title=title,
    )

    return all_gbests, save_path

# TRAINING

In [None]:
# Experiment bookkeeping: results for this run go to a numbered folder.
exp_num = 62
exp_name = f"exp_{exp_num}"
result_path = f"experiments/training/results/{exp_name}/"
os.makedirs(result_path, exist_ok=True)
api_token = os.environ.get("NEPTUNE_API_TOKEN")
mode = "train"

# Overrides applied on top of the parsed config, in the order listed.
training_overrides = {
    'freeze': True,
    'use_gbest': True,
    'action_std': 0.5,
    'decay_rate': 0.97,
    'log_interval': 500,
    'std_min': 0.02,
    'use_optimal_value': True,
    'tol': .99,
    'ep_length': 20,
    'n_agents': 10,
    'n_dim': 2,
    'objective_function': "CosineMixtureFunction",
    'reward_scheme': "FullRewardScheme",
    "save_interval": 2500,
    "n_episodes": 5000,
    "lr": 0.0001,
    "layer_size": [16, 16],
    "variable_std": True,
    'mode': mode,
    "use_std_model": True,
    "std_model_layers": [16, 16],
    "decay_interval": 1000,
    "std_obs_dim": 7,  # matches the 7 features built by get_std_obs
    "negative": True,
    "function_id": 0,
    'update_timestep': 1000,
    "use_grid": False,
}
for key, value in training_overrides.items():
    config[key] = value

env, agent_policy = initialize(config, mode=mode)

In [None]:

# agent = 0
# observation_info = env.reset()
# observation_info, role = observation_info
# obs_std = get_std_obs(env)
# agent = agent_policy
# actions = get_action(observation_info, agent, env, observation_std=obs_std, debug=True)
# observation_info, reward, done, info = env.step(actions)

# import torch.distributions as tdist
# obs_std = np.array(obs_std)

# action_mean = agent_policy.policy.actor(torch.tensor(obs[0]).float())
# action_std = 0.2*agent_policy.policy.std_actor(torch.tensor(obs_std[0]).float())

# dist = tdist.Normal(action_mean, action_std[2])
# action = dist.sample()
# dist.log_prob(action).sum(dim=1)
# obs = np.array(observation_info)

In [None]:
# Tag the Neptune run with the experiment name; the remaining tags are taken from
# `config` inside initialize_logger (mode, objective_function, layer_size).
experiment = "learning_std"
neptune_logger = initialize_logger(api_token, config, tag=experiment)

In [None]:
# The whole config is forwarded as keyword arguments; train_agent picks out the
# keys it knows with kwargs.get(...) and ignores the rest.
# NOTE(review): this call raises TypeError if `config` ever contains a "debug" key,
# because `debug` is also passed explicitly.
train_agent(env, agent_policy, neptune_logger=neptune_logger, save_dir=result_path, **config, debug=False)

In [9]:
agent_policy.action_std

0.307

In [10]:
neptune_logger.stop()

Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/DMO-LAB/DeepHive-V2/e/DEEP-118/metadata


In [None]:
import torch

In [11]:
# Reset the environment (the returned observation is discarded) and display the
# resulting state array.
_ = env.reset()
env.state

array([[0.105 , 0.405 , 0.4656],
       [0.27  , 0.755 , 0.6095],
       [0.685 , 0.775 , 0.6142],
       [0.085 , 0.95  , 0.    ],
       [0.21  , 0.405 , 0.5239],
       [0.01  , 0.35  , 0.1642],
       [0.1   , 0.39  , 0.4512],
       [0.255 , 0.38  , 0.6503],
       [0.285 , 0.08  , 0.4282],
       [0.3   , 0.6   , 0.7515]])

In [12]:
get_std_obs(env)

[array([[3.195225e-02, 1.255000e-01, 2.600000e-04, 1.950000e-01,
         5.344000e-01, 0.000000e+00, 0.000000e+00],
        [3.195225e-02, 3.950000e-02, 1.436400e-01, 3.000000e-02,
         3.905000e-01, 0.000000e+00, 0.000000e+00],
        [3.195225e-02, 4.545000e-01, 1.483400e-01, 3.850000e-01,
         3.858000e-01, 0.000000e+00, 0.000000e+00],
        [3.195225e-02, 1.455000e-01, 4.658600e-01, 2.150000e-01,
         1.000000e+00, 0.000000e+00, 0.000000e+00],
        [3.195225e-02, 2.050000e-02, 5.804000e-02, 9.000000e-02,
         4.761000e-01, 0.000000e+00, 0.000000e+00],
        [3.195225e-02, 2.205000e-01, 3.016600e-01, 2.900000e-01,
         8.358000e-01, 0.000000e+00, 0.000000e+00],
        [3.195225e-02, 1.305000e-01, 1.466000e-02, 2.000000e-01,
         5.488000e-01, 0.000000e+00, 0.000000e+00],
        [3.195225e-02, 2.450000e-02, 1.844400e-01, 4.500000e-02,
         3.497000e-01, 0.000000e+00, 0.000000e+00],
        [3.195225e-02, 5.450000e-02, 3.766000e-02, 1.500000e-02,

In [13]:

# Feed the std-observation features of dimension 0 through the std network to see
# what it predicts for each agent on a freshly reset environment.
obs_std = get_std_obs(env)
agent_policy.policy.std_actor(torch.tensor(obs_std[0]).float())

tensor([[0.6740],
        [0.6732],
        [0.6739],
        [0.6653],
        [0.6731],
        [0.6690],
        [0.6738],
        [0.6720],
        [0.6753],
        [0.6720]], grad_fn=<SigmoidBackward0>)

In [14]:
# Sanity check: compare the std network's output on all-ones and all-zeros inputs
# (same shape as the dimension-1 features) to see how strongly it reacts to its input.
print(agent_policy.policy.std_actor(torch.ones_like(torch.tensor(obs_std[1]).float())))
print(agent_policy.policy.std_actor(torch.zeros_like(torch.tensor(obs_std[1]).float())))

tensor([[0.7291],
        [0.7291],
        [0.7291],
        [0.7291],
        [0.7291],
        [0.7291],
        [0.7291],
        [0.7291],
        [0.7291],
        [0.7291]], grad_fn=<SigmoidBackward0>)
tensor([[0.6732],
        [0.6732],
        [0.6732],
        [0.6732],
        [0.6732],
        [0.6732],
        [0.6732],
        [0.6732],
        [0.6732],
        [0.6732]], grad_fn=<SigmoidBackward0>)


In [None]:
agent_policy.policy.std_actor.state_dict()

In [None]:
agent_policy.policy.std_actor.state_dict()

# TESTING

In [None]:
# NOTE(review): `testing_config` is a snapshot of `config` taken *before* the test
# overrides; the overrides below are written to `config` itself, and `config` is
# what the following cells pass on. Confirm whether the copy was meant to be the
# one that gets modified.
testing_config = config.copy()
mode = "test"

# Overrides applied on top of the current config, in the order listed.
testing_overrides = {
    'freeze': False,
    'use_gbest': False,
    'variable_std': False,
    'test_std': 0.2,
    'decay_interval': 2,
    'log_interval': 10,
    'decay_rate': 0.8,
    'std_min': 0.00001,
    'use_optimal_value': False,
    'ep_length': 20,
    'n_agents': 10,
    'n_dim': 2,
    'objective_function': "GaussianPeakFunction",
    "iters": 100,
    "layer_size": [32, 32],
    'mode': mode,
}
for key, value in testing_overrides.items():
    config[key] = value

# Checkpoint to evaluate: identified by training experiment number and timestep.
exp_num = 27
model_timestep = 40000
model_path = f"experiments/training/results/exp_{exp_num}/policy-{model_timestep}.pth"
env, agent_policy = initialize(config, mode=mode, model_path=model_path)

In [None]:

# `iters` (required by test_agent) is supplied through config["iters"]; the other
# config keys are read with kwargs.get(...) or ignored. save_dir=None makes
# test_agent create a timestamped folder under testing_runs/.
all_gbests, save_dir = test_agent(env, agent_policy, **config, save_dir=None)


In [None]:
agent_policy.policy.state_dict()

In [None]:
import numpy as np

def rosenbrock_function_modified(params: np.ndarray, x_opt: np.ndarray = None, f_opt: float = 0) -> np.ndarray:
    """Negated, shifted Rosenbrock function for maximisation.

    Follows the BBOB f8 construction ``z = max(1, sqrt(D)/8) * (x - x_opt) + 1``
    followed by the classic Rosenbrock sum over consecutive coordinate pairs.

    Args:
        params: Array of shape ``(n_points, D)``; one candidate per row.
        x_opt: Location of the optimum, broadcastable against ``params``.
            Defaults to all ones.
        f_opt: Value added to the (un-negated) function before negation.

    Returns:
        Array of shape ``(n_points,)`` containing ``-(rosenbrock(z) + f_opt)``,
        so the optimum is a maximum with value ``-f_opt``.
    """
    D = params.shape[1]
    if x_opt is None:
        x_opt = np.ones(D)

    # BBOB scales by sqrt(D)/8 (not sqrt(D/8)); the factor only exceeds 1 for D > 64.
    scaling_factor = max(1.0, np.sqrt(D) / 8)

    # Shift so that x_opt maps onto z = 1, the Rosenbrock optimum.
    z = scaling_factor * (params - x_opt) + 1

    head, tail = z[:, :-1], z[:, 1:]
    result = np.sum(100.0 * (head**2 - tail)**2 + (head - 1)**2, axis=1)

    # Negate because the surrounding code maximises.
    return -(result + f_opt)

# Example usage: evaluate at the default optimum (all ones) and at the origin.
D = 3 # Dimensionality of the problem (informational; the function reads it from params.shape[1])
params = np.array([[1, 1, 1], [0, 0, 0]])  # Example parameters, one candidate per row

rosenbrock_values = rosenbrock_function_modified(params)
print(rosenbrock_values)


In [None]:
def bent_cigar_function(params: np.ndarray, f_opt: float = 0) -> np.ndarray:
    """Negated Bent Cigar function (no shift, rotation or asymmetry applied).

    Computes ``x_0**2 + 10**6 * sum(x_i**2 for i >= 1) + f_opt`` per row and
    returns its negative.

    Args:
        params: Array of shape ``(n_points, D)``; one candidate per row.
        f_opt: Value added to the function before negation.

    Returns:
        Array of shape ``(n_points,)`` with the negated function values.
    """
    squared = params ** 2
    leading = squared[:, 0]
    # Every coordinate after the first is penalised a million times harder.
    trailing = 10**6 * squared[:, 1:].sum(axis=1)
    return -(leading + trailing + f_opt)

# Example usage
D = 2  # Dimensionality of the problem
rng = np.random.default_rng(0)  # fixed seed so the printed values are reproducible
params = rng.standard_normal((10, D))  # Example parameters: 10 sets of D-dimensional inputs
print(params)
bent_cigar_values = bent_cigar_function(params)
print(bent_cigar_values)


In [None]:
def schaffers_f7_function(params: np.ndarray, f_opt: float = 0) -> np.ndarray:
    """Schaffer's F7 function (no shift, rotation, asymmetry or penalty applied).

    With ``s_i = sqrt(x_i**2 + x_{i+1}**2)`` for consecutive coordinate pairs,
    the value per row is::

        (mean_i( sqrt(s_i) + sqrt(s_i) * sin(50 * s_i**0.2)**2 ))**2 + f_opt

    i.e. the mean over the ``D - 1`` pairs is squared, as in the BBOB f17
    definition.

    Args:
        params: Array of shape ``(n_points, D)`` with ``D >= 2``; one candidate
            per row. With ``D == 1`` there are no pairs and the result is NaN.
        f_opt: Value added to the result (may include a penalty term).

    Returns:
        Array of shape ``(n_points,)`` with the function values.
        NOTE(review): unlike the Rosenbrock and Bent Cigar helpers in this
        notebook, the value is not negated -- confirm which sign the caller
        expects.
    """
    z = params  # no transformation applied

    # Norm of each pair of neighbouring coordinates.
    s_i = np.sqrt(z[:, :-1]**2 + z[:, 1:]**2)

    root_s = np.sqrt(s_i)
    pair_terms = root_s + root_s * np.sin(50 * s_i**0.2)**2
    # Square the average of the pair terms (not the average of the squares).
    result = np.mean(pair_terms, axis=1)**2 + f_opt

    return result

# Example usage
D = 5  # Dimensionality of the problem
rng = np.random.default_rng(0)  # fixed seed so the printed values are reproducible
params = rng.standard_normal((10, D))  # Example parameters: 10 sets of D-dimensional inputs

schaffers_f7_values = schaffers_f7_function(params)
print(schaffers_f7_values)
