# FINAL MAIN WITH AGENTS FROM 1 TO 6

## IMPORTS

In [None]:
import gymnasium as gym
import torch
from utils import *
%load_ext autoreload
%autoreload 2

## COMMON PARAMETERS

In [None]:
# HYPERPARAMETERS
# Values shared by every agent trained in this notebook.

# environment hyperparams
n_steps = 500000  # total training budget; each agent derives n_updates from it
n_eval_runs = 10  # how many evaluation runs to do at each evaluation (every 20k steps)

# Aggregate parameters
agents_seeds = [10, 42, 81]  # arbitrary seeds for the agents
# Derived from the list so the seed count can never drift from the seeds
# that are actually passed to trainAgent / plotAggregated.
n_seeds = len(agents_seeds)

# agent hyperparams
gamma = 0.99  # discount factor
# NOTE(review): ent_coef is not passed to trainAgent in any call visible in
# this notebook -- confirm that the entropy bonus is actually applied.
ent_coef = 0.01  # coefficient for the entropy bonus (to encourage exploration)
# Note: the actor has a slower learning rate so that the value targets become
# more stationary and are therefore easier to estimate for the critic.
actor_lr = 1e-5
critic_lr = 1e-3
# Passed to trainAgent together with each agent's stochasticity_bool flag.
stochastic_reward_probability = 0.9

# DEVICE
device = torch.device("cpu")


## DISCRETE CASE 1-4

In [None]:
# Agents 1-4 work with a discrete action space: CartPole.
bool_discrete = True
environment = "CartPole-v1"

# Separate environment instance handed to trainAgent as the evaluation env;
# it is also the source of the observation / action sizes.
env_eval = gym.make(environment)
action_space_dims = env_eval.action_space.n
obs_shape = env_eval.observation_space.shape[0]

### AGENT 1.A (K = 1 | n = 1) DETERMINISTIC

#### HYPERPARAMETERS SETUP

In [None]:
# Configuration for agent 1A (heading: K = 1 | n = 1, deterministic).
id_agent = "1A"  # label of this agent in the report
stochasticity_bool = False
n_envs = 1  # K in the section heading
n_steps_per_update = 1  # n in the section heading

# One training environment per worker.
envs = [gym.make(environment) for _ in range(n_envs)]

# The step budget and the 20k-step evaluation period are both divided by the
# number of environment steps per update (n_envs * n_steps_per_update).
n_updates = n_steps // (n_envs * n_steps_per_update)
evaluation_interval = 20000 // (n_envs * n_steps_per_update)
n_evaluations = n_updates // evaluation_interval

#### TRAIN AGENT

In [None]:
# Run trainAgent with the configuration prepared in the setup cell and unpack
# the quantities it returns.
(
    values,
    critic_losses,
    actor_losses,
    entropies,
    evaluation_returns_seeds,
) = trainAgent(
    agents_seeds, n_seeds, envs, env_eval, n_updates,
    bool_discrete, obs_shape, action_space_dims, device,
    critic_lr, actor_lr, n_envs, n_steps_per_update,
    evaluation_interval, n_eval_runs,
    stochasticity_bool, stochastic_reward_probability, gamma,
)

# Per-agent snapshots: the generic names above are reassigned by the next
# agent's training cell.
values_agent_1A = values.copy()
critic_losses_agent_1A = critic_losses.copy()
actor_losses_agent_1A = actor_losses.copy()
entropies_agent_1A = entropies.copy()
evaluation_returns_seeds_agent_1A = evaluation_returns_seeds.copy()

#### PLOTTING

In [None]:
# Plot the statistics logged for the agent identified by id_agent.
plotAggregated(
    values, n_seeds, agents_seeds,
    critic_losses, actor_losses, entropies,
    id_agent, n_steps_per_update, n_envs, n_steps,
    stochasticity_bool, evaluation_returns_seeds,
)

### AGENT 1.B (K = 1 | n = 1) STOCHASTIC

#### HYPERPARAMETERS SETUP

In [None]:
# Configuration for agent 1B (heading: K = 1 | n = 1, stochastic).
id_agent = "1B"  # label of this agent in the report
stochasticity_bool = True
n_envs = 1  # K in the section heading
n_steps_per_update = 1  # n in the section heading

# One training environment per worker.
envs = [gym.make(environment) for _ in range(n_envs)]

# The step budget and the 20k-step evaluation period are both divided by the
# number of environment steps per update (n_envs * n_steps_per_update).
n_updates = n_steps // (n_envs * n_steps_per_update)
evaluation_interval = 20000 // (n_envs * n_steps_per_update)
n_evaluations = n_updates // evaluation_interval

#### TRAIN AGENT

In [None]:
# Run trainAgent with agent 1B's configuration and unpack the quantities it
# returns.
(
    values,
    critic_losses,
    actor_losses,
    entropies,
    evaluation_returns_seeds,
) = trainAgent(
    agents_seeds, n_seeds, envs, env_eval, n_updates,
    bool_discrete, obs_shape, action_space_dims, device,
    critic_lr, actor_lr, n_envs, n_steps_per_update,
    evaluation_interval, n_eval_runs,
    stochasticity_bool, stochastic_reward_probability, gamma,
)

# Per-agent snapshots: the generic names above are reassigned by the next
# agent's training cell.
values_agent_1B = values.copy()
critic_losses_agent_1B = critic_losses.copy()
actor_losses_agent_1B = actor_losses.copy()
entropies_agent_1B = entropies.copy()
# Stored under the 1B name: the previous code assigned to the *_agent_1A
# variable, overwriting agent 1A's evaluation returns and never defining the
# 1B variable.
evaluation_returns_seeds_agent_1B = evaluation_returns_seeds.copy()

#### PLOTTING

In [None]:
# Plot the statistics logged for the agent identified by id_agent.
plotAggregated(
    values, n_seeds, agents_seeds,
    critic_losses, actor_losses, entropies,
    id_agent, n_steps_per_update, n_envs, n_steps,
    stochasticity_bool, evaluation_returns_seeds,
)

### AGENT 2 (K = 6 | n = 1)

#### HYPERPARAMETERS SETUP

In [None]:
# Configuration for agent 2 (heading: K = 6 | n = 1).
id_agent = "2"  # label of this agent in the report
stochasticity_bool = True
n_envs = 6  # K in the section heading
n_steps_per_update = 1  # n in the section heading

# One training environment per worker.
envs = [gym.make(environment) for _ in range(n_envs)]

# The step budget and the 20k-step evaluation period are both divided by the
# number of environment steps per update (n_envs * n_steps_per_update).
n_updates = n_steps // (n_envs * n_steps_per_update)
evaluation_interval = 20000 // (n_envs * n_steps_per_update)
n_evaluations = n_updates // evaluation_interval

#### TRAIN AGENT

In [None]:
# Run trainAgent with the configuration prepared in the setup cell and unpack
# the quantities it returns.
(
    values,
    critic_losses,
    actor_losses,
    entropies,
    evaluation_returns_seeds,
) = trainAgent(
    agents_seeds, n_seeds, envs, env_eval, n_updates,
    bool_discrete, obs_shape, action_space_dims, device,
    critic_lr, actor_lr, n_envs, n_steps_per_update,
    evaluation_interval, n_eval_runs,
    stochasticity_bool, stochastic_reward_probability, gamma,
)

# Per-agent snapshots: the generic names above are reassigned by the next
# agent's training cell.
values_agent_2 = values.copy()
critic_losses_agent_2 = critic_losses.copy()
actor_losses_agent_2 = actor_losses.copy()
entropies_agent_2 = entropies.copy()
evaluation_returns_seeds_agent_2 = evaluation_returns_seeds.copy()

#### PLOTTING

In [None]:
# Plot the statistics logged for the agent identified by id_agent.
plotAggregated(
    values, n_seeds, agents_seeds,
    critic_losses, actor_losses, entropies,
    id_agent, n_steps_per_update, n_envs, n_steps,
    stochasticity_bool, evaluation_returns_seeds,
)

### AGENT 3 (K = 1 | n = 6)

#### HYPERPARAMETERS SETUP

In [None]:
# Configuration for agent 3 (heading: K = 1 | n = 6).
id_agent = "3"  # label of this agent in the report
stochasticity_bool = True
n_envs = 1  # K in the section heading
n_steps_per_update = 6  # n in the section heading

# One training environment per worker.
envs = [gym.make(environment) for _ in range(n_envs)]

# The step budget and the 20k-step evaluation period are both divided by the
# number of environment steps per update (n_envs * n_steps_per_update).
n_updates = n_steps // (n_envs * n_steps_per_update)
evaluation_interval = 20000 // (n_envs * n_steps_per_update)
n_evaluations = n_updates // evaluation_interval

#### TRAIN AGENT

In [None]:
# Run trainAgent with the configuration prepared in the setup cell and unpack
# the quantities it returns.
(
    values,
    critic_losses,
    actor_losses,
    entropies,
    evaluation_returns_seeds,
) = trainAgent(
    agents_seeds, n_seeds, envs, env_eval, n_updates,
    bool_discrete, obs_shape, action_space_dims, device,
    critic_lr, actor_lr, n_envs, n_steps_per_update,
    evaluation_interval, n_eval_runs,
    stochasticity_bool, stochastic_reward_probability, gamma,
)

# Per-agent snapshots: the generic names above are reassigned by the next
# agent's training cell.
values_agent_3 = values.copy()
critic_losses_agent_3 = critic_losses.copy()
actor_losses_agent_3 = actor_losses.copy()
entropies_agent_3 = entropies.copy()
evaluation_returns_seeds_agent_3 = evaluation_returns_seeds.copy()

#### PLOTTING

In [None]:
# Plot the statistics logged for the agent identified by id_agent.
plotAggregated(
    values, n_seeds, agents_seeds,
    critic_losses, actor_losses, entropies,
    id_agent, n_steps_per_update, n_envs, n_steps,
    stochasticity_bool, evaluation_returns_seeds,
)

### AGENT 4 (K = 6 | n = 6)

#### HYPERPARAMETERS SETUP

In [None]:
# Configuration for agent 4 (heading: K = 6 | n = 6).
id_agent = "4"  # label of this agent in the report
stochasticity_bool = True
n_envs = 6  # K in the section heading
n_steps_per_update = 6  # n in the section heading

# One training environment per worker.
envs = [gym.make(environment) for _ in range(n_envs)]

# The step budget and the 20k-step evaluation period are both divided by the
# number of environment steps per update (n_envs * n_steps_per_update).
n_updates = n_steps // (n_envs * n_steps_per_update)
evaluation_interval = 20000 // (n_envs * n_steps_per_update)
n_evaluations = n_updates // evaluation_interval

#### TRAIN AGENT

In [None]:
# Run trainAgent with the configuration prepared in the setup cell and unpack
# the quantities it returns.
(
    values,
    critic_losses,
    actor_losses,
    entropies,
    evaluation_returns_seeds,
) = trainAgent(
    agents_seeds, n_seeds, envs, env_eval, n_updates,
    bool_discrete, obs_shape, action_space_dims, device,
    critic_lr, actor_lr, n_envs, n_steps_per_update,
    evaluation_interval, n_eval_runs,
    stochasticity_bool, stochastic_reward_probability, gamma,
)

# Per-agent snapshots: the generic names above are reassigned by the next
# agent's training cell.
values_agent_4 = values.copy()
critic_losses_agent_4 = critic_losses.copy()
actor_losses_agent_4 = actor_losses.copy()
entropies_agent_4 = entropies.copy()
evaluation_returns_seeds_agent_4 = evaluation_returns_seeds.copy()

#### PLOTTING

In [None]:
# Plot the statistics logged for the agent identified by id_agent.
plotAggregated(
    values, n_seeds, agents_seeds,
    critic_losses, actor_losses, entropies,
    id_agent, n_steps_per_update, n_envs, n_steps,
    stochasticity_bool, evaluation_returns_seeds,
)

## CONTINUOUS CASE 5-6

In [None]:
# use InvertedPendulum for the continuous case (agents 5-6)
# This cell rebinds the shared environment globals, so every setup/training
# cell executed after it works on the continuous task.
environment = "InvertedPendulum-v4"
env_eval = gym.make(environment)
obs_shape = env_eval.observation_space.shape[0]
action_space_dims = 1 ## Continuous case ==> 1 dimension of action space: continuous force between [-3, 3] Newton
bool_discrete = False

### AGENT 5 (K = 1 | n = 1)

#### HYPERPARAMETERS SETUP

In [None]:
# Configuration for agent 5 (heading: K = 1 | n = 1).
id_agent = "5"  # label of this agent in the report
stochasticity_bool = True
n_envs = 1  # K in the section heading
n_steps_per_update = 1  # n in the section heading

# One training environment per worker.
envs = [gym.make(environment) for _ in range(n_envs)]

# The step budget and the 20k-step evaluation period are both divided by the
# number of environment steps per update (n_envs * n_steps_per_update).
n_updates = n_steps // (n_envs * n_steps_per_update)
evaluation_interval = 20000 // (n_envs * n_steps_per_update)
n_evaluations = n_updates // evaluation_interval

#### TRAIN AGENT

In [None]:
# Run trainAgent with the configuration prepared in the setup cell and unpack
# the quantities it returns.
(
    values,
    critic_losses,
    actor_losses,
    entropies,
    evaluation_returns_seeds,
) = trainAgent(
    agents_seeds, n_seeds, envs, env_eval, n_updates,
    bool_discrete, obs_shape, action_space_dims, device,
    critic_lr, actor_lr, n_envs, n_steps_per_update,
    evaluation_interval, n_eval_runs,
    stochasticity_bool, stochastic_reward_probability, gamma,
)

# Per-agent snapshots: the generic names above are reassigned by the next
# agent's training cell.
values_agent_5 = values.copy()
critic_losses_agent_5 = critic_losses.copy()
actor_losses_agent_5 = actor_losses.copy()
entropies_agent_5 = entropies.copy()
evaluation_returns_seeds_agent_5 = evaluation_returns_seeds.copy()

#### PLOTTING

In [None]:
# Plot the statistics logged for the agent identified by id_agent.
plotAggregated(
    values, n_seeds, agents_seeds,
    critic_losses, actor_losses, entropies,
    id_agent, n_steps_per_update, n_envs, n_steps,
    stochasticity_bool, evaluation_returns_seeds,
)

### AGENT 6 (K = 6 | n = 6)

#### HYPERPARAMETERS SETUP

In [None]:
# Configuration for agent 6 (heading: K = 6 | n = 6).
id_agent = "6"  # label of this agent in the report
stochasticity_bool = True
n_envs = 6  # K in the section heading
n_steps_per_update = 6  # n in the section heading

# One training environment per worker.
envs = [gym.make(environment) for _ in range(n_envs)]

# The step budget and the 20k-step evaluation period are both divided by the
# number of environment steps per update (n_envs * n_steps_per_update).
n_updates = n_steps // (n_envs * n_steps_per_update)
evaluation_interval = 20000 // (n_envs * n_steps_per_update)
n_evaluations = n_updates // evaluation_interval

#### TRAIN AGENT

In [None]:
# Run trainAgent with the configuration prepared in the setup cell and unpack
# the quantities it returns.
(
    values,
    critic_losses,
    actor_losses,
    entropies,
    evaluation_returns_seeds,
) = trainAgent(
    agents_seeds, n_seeds, envs, env_eval, n_updates,
    bool_discrete, obs_shape, action_space_dims, device,
    critic_lr, actor_lr, n_envs, n_steps_per_update,
    evaluation_interval, n_eval_runs,
    stochasticity_bool, stochastic_reward_probability, gamma,
)

# Per-agent snapshots of the generic result names, kept under names specific
# to agent 6.
values_agent_6 = values.copy()
critic_losses_agent_6 = critic_losses.copy()
actor_losses_agent_6 = actor_losses.copy()
entropies_agent_6 = entropies.copy()
evaluation_returns_seeds_agent_6 = evaluation_returns_seeds.copy()

#### PLOTTING

In [None]:
# Plot the statistics logged for the agent identified by id_agent.
plotAggregated(
    values, n_seeds, agents_seeds,
    critic_losses, actor_losses, entropies,
    id_agent, n_steps_per_update, n_envs, n_steps,
    stochasticity_bool, evaluation_returns_seeds,
)