In [1]:
pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque
import optuna

In [3]:
#discount factor for future utilities
DISCOUNT_FACTOR = 0.99

#number of episodes to run
NUM_EPISODES = 500

#max steps per episode
MAX_STEPS = 500

#score agent needs for environment to be solved
SOLVED_SCORE = 195

#device to run model on
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  and should_run_async(code)


In [4]:
#Using a neural network to learn our policy parameters
class PolicyNetwork(nn.Module):

    #Takes in observations and outputs actions
    def __init__(self, observation_space, action_space,fp ,seed):
        super(PolicyNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_layer = nn.Linear(observation_space, fp)
        self.output_layer = nn.Linear(fp, action_space)

    #forward pass
    def forward(self, x):
        #input states
        x = self.input_layer(x)

        #relu activation
        x = F.relu(x)

        #actions
        actions = self.output_layer(x)

        #get softmax for a probability distribution
        action_probs = F.softmax(actions, dim=1)

        return action_probs

In [5]:
#Using a neural network to learn state value
class StateValueNetwork(nn.Module):

    #Takes in state
    def __init__(self, observation_space, fs, seed):
        super(StateValueNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)

        self.input_layer = nn.Linear(observation_space, fs)
        self.output_layer = nn.Linear(fs, 1)

    def forward(self, x):
        #input layer
        x = self.input_layer(x)

        #activiation relu
        x = F.relu(x)

        #get state value
        state_value = self.output_layer(x)

        return state_value

In [6]:
def select_action(network, state):
    ''' Selects an action given current state
    Args:
    - network (Torch NN): network to process state
    - state (Array): Array of action space in an environment

    Return:
    - (int): action that is selected
    - (float): log probability of selecting that action given state and network
    '''

    #convert state to float tensor, add 1 dimension, allocate tensor on device
    state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)

    #use network to predict action probabilities
    action_probs = network(state)
    state = state.detach()

    #sample an action using the probability distribution
    m = Categorical(action_probs)
    action = m.sample()

    #return action
    return action.item(), m.log_prob(action)

In [7]:
def return_calc(rewards):
    ''' Converts our rewards history into cumulative discounted rewards
    Args:
    - rewards (Array): array of rewards

    Returns:
    - G (Array): array of cumulative discounted rewards
    '''
    #Calculate Gt (cumulative discounted rewards)
    G = []

    #track cumulative reward
    total_r = 0

    #iterate rewards from Gt to G0
    for r in reversed(rewards):

        #Base case: G(T) = r(T)
        #Recursive: G(t) = r(t) + G(t+1)^DISCOUNT
        total_r = r + total_r * DISCOUNT_FACTOR

        #add to front of G
        G.insert(0, total_r)

    #whitening rewards
    G = torch.tensor(G).to(DEVICE)
    G = (G - G.mean())/G.std()

    return G

In [8]:
def train_policy(deltas, log_probs, optimizer):
    ''' Update policy parameters
    Args:
    - deltas (Array): difference between predicted stateval and actual stateval (Gt)
    - log_probs (Array): memory of log probabilities of action taken
    - optimizer (Pytorch optimizer): optimizer to update policy network parameters
    '''

    #store updates
    policy_loss = []

    #calculate loss to be backpropagated
    for d, lp in zip(deltas, log_probs):
        #add negative sign since we are performing gradient ascent
        policy_loss.append(-d * lp)

    #Backpropagation
    optimizer.zero_grad()
    sum(policy_loss).backward()
    optimizer.step()


In [9]:
def val_update(G, state_vals, optimizer):
    ''' Update state-value network parameters
    Args:
    - G (Array): memory of cumulative discounted rewards
    - state_vals (Array): memory of predicted state-value at each step
    - optimizer (Pytorch optimizer): optimizer to update state-value network parameters
    '''

    #calculate MSE loss
    val_loss = F.mse_loss(state_vals, G)

    #Backpropagate
    optimizer.zero_grad()
    val_loss.backward()
    optimizer.step()

In [11]:
def objective(trial):
  #Make environment

  policylr = trial.suggest_categorical('pollr', [0.001, 0.002, 0.0005, 0.0009])
  valuelr = trial.suggest_categorical('valuelr', [0.001, 0.002, 0.0005,0.0009 ])
  fp = trial.suggest_categorical('fp', [16,32,128])
  fs = trial.suggest_categorical('fs', [16,32,128])

  env = gym.make('Acrobot-v1')




  seed_avg= []
  seed_score=[]
  for j in range(1):
    seed= np.random.randint(0,100)
    np.random.seed(10)
    policy_network = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, fp, seed).to(DEVICE)
    stateval_network = StateValueNetwork(env.observation_space.shape[0], fs, seed).to(DEVICE)


    #Init optimizer
    policy_optimizer = optim.Adam(policy_network.parameters(), lr=policylr)
    stateval_optimizer = optim.Adam(stateval_network.parameters(), lr=valuelr)


    #track scores
    scores = []
    avg_score= []
    avg_score_history= []
    score_history = []

    #recent 100 scores
    recent_scores = deque(maxlen=100)

    #iterate through episodes
    for episode in tqdm_notebook(range(NUM_EPISODES)):

      #reset environment, initiable variables
      state = env.reset()
      memory = []
      score = 0


        #generate episode
      for step in range(MAX_STEPS):
          #env.render()

          #select action
          action, lp = select_action(policy_network, state)

          #execute action
          new_state, reward, done, _ = env.step(action)

          #track episode score
          score += reward

          #store into memory
          memory.append([state, action, reward, lp])

          #end episode
          if done:
              break

          #move into new state
          state = new_state

      #append score
      scores.append(score)
      avg_score = np.mean(scores[-100:])
      avg_score_history.append(avg_score)

      #get items from memory
      states = [step[0] for step in memory]
      actions = [step[1] for step in memory]
      rewards = [step[2] for step in memory]
      lps = [step[3] for step in memory]

      #get discounted rewards
      G = return_calc(rewards)
      #G = torch.tensor(G).to(DEVICE)

      #calculate state values and train statevalue network
      state_vals = []
      for state in states:
          state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
          state_vals.append(stateval_network(state))

      state_vals = torch.stack(state_vals).squeeze()
      val_update(G, state_vals, stateval_optimizer)


      deltas = [gt - val for gt, val in zip(G, state_vals)]
      deltas = torch.tensor(deltas).to(DEVICE)

      train_policy(deltas, lps, policy_optimizer)
 

    seed_avg.append(np.array(avg_score_history))
    seed_score.append(np.array(scores))
    env.close()

  mean_seed_avg= np.mean(seed_avg, axis=0)
  mean_seed_score= np.mean(seed_score, axis=0)

  score= np.mean(mean_seed_score[-100:])

  return -100 -score

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Output best hyperparameters and performance
print("Best hyperparameters:", study.best_params)
print("Best score:", study.best_value)


[I 2024-04-07 13:30:26,196] A new study created in memory with name: no-name-64e5fb9c-a85b-4684-9d71-733dc31f61cf
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for episode in tqdm_notebook(range(NUM_EPISODES)):


  0%|          | 0/500 [00:00<?, ?it/s]

  if not isinstance(terminated, (bool, np.bool8)):
[I 2024-04-07 13:33:43,246] Trial 0 finished with value: 21.560000000000002 and parameters: {'pollr': 0.0009, 'valuelr': 0.002}. Best is trial 0 with value: 21.560000000000002.
  deprecation(
  deprecation(


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 13:39:27,083] Trial 1 finished with value: 400.0 and parameters: {'pollr': 0.002, 'valuelr': 0.0005}. Best is trial 0 with value: 21.560000000000002.


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 13:45:13,861] Trial 2 finished with value: 400.0 and parameters: {'pollr': 0.0005, 'valuelr': 0.0005}. Best is trial 0 with value: 21.560000000000002.


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 13:50:58,531] Trial 3 finished with value: 400.0 and parameters: {'pollr': 0.0009, 'valuelr': 0.0005}. Best is trial 0 with value: 21.560000000000002.


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 13:56:43,525] Trial 4 finished with value: 400.0 and parameters: {'pollr': 0.0009, 'valuelr': 0.001}. Best is trial 0 with value: 21.560000000000002.


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 14:02:33,895] Trial 5 finished with value: 400.0 and parameters: {'pollr': 0.002, 'valuelr': 0.0005}. Best is trial 0 with value: 21.560000000000002.


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 14:08:16,879] Trial 6 finished with value: 400.0 and parameters: {'pollr': 0.002, 'valuelr': 0.001}. Best is trial 0 with value: 21.560000000000002.


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 14:14:00,215] Trial 7 finished with value: 400.0 and parameters: {'pollr': 0.001, 'valuelr': 0.001}. Best is trial 0 with value: 21.560000000000002.


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 14:19:45,359] Trial 8 finished with value: 400.0 and parameters: {'pollr': 0.0009, 'valuelr': 0.0005}. Best is trial 0 with value: 21.560000000000002.


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-04-07 14:25:29,999] Trial 9 finished with value: 400.0 and parameters: {'pollr': 0.0005, 'valuelr': 0.0009}. Best is trial 0 with value: 21.560000000000002.


Best hyperparameters: {'pollr': 0.0009, 'valuelr': 0.002}
Best score: 21.560000000000002
