In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque

In [None]:
# Discount factor applied to future rewards when computing returns.
DISCOUNT_FACTOR = 0.99
# Lower-case alias read by the training loop; derived from DISCOUNT_FACTOR so the
# two names can never drift apart.
gamma = DISCOUNT_FACTOR
# Number of training episodes per seed.
NUM_EPISODES = 5000

# Maximum steps per episode.
# NOTE(review): not referenced by the cells visible in this notebook.
MAX_STEPS = 10000

# Score the agent needs for the environment to count as solved.
# NOTE(review): not referenced by the visible cells, and the logged Acrobot
# rewards are all negative, so a positive target looks left over from another
# task -- confirm before relying on it.
SOLVED_SCORE = 475

# Device to run the model on.
# NOTE(review): no visible cell moves a model or tensor to DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  and should_run_async(code)


In [None]:
class PolicyNetwork(nn.Module):
    """Single-hidden-layer policy: observation -> probability per action.

    Args:
        observation_space: number of input features.
        action_space: number of discrete actions (size of the output).
        seed: value handed to ``torch.manual_seed`` before the layers are
            created; the returned generator is kept on ``self.seed``.
        layer_size: width of the hidden layer.
    """

    def __init__(self, observation_space, action_space, seed, layer_size):
        super().__init__()
        # Seeding happens before layer construction so the initial weights
        # are a function of `seed`.
        self.seed = torch.manual_seed(seed)
        self.input_layer = nn.Linear(observation_space, layer_size)
        self.output_layer = nn.Linear(layer_size, action_space)

    def forward(self, x):
        """Return softmax action probabilities over the last dimension of ``x``."""
        hidden = F.relu(self.input_layer(x))
        logits = self.output_layer(hidden)
        return F.softmax(logits, dim=-1)

In [None]:

class StateValueNetwork(nn.Module):
    """Single-hidden-layer critic: observation -> one scalar state value.

    Args:
        observation_space: number of input features.
        seed: value handed to ``torch.manual_seed`` before the layers are
            created; the returned generator is kept on ``self.seed``.
        layer_size: width of the hidden layer.
    """

    def __init__(self, observation_space, seed, layer_size):
        super().__init__()
        # Seed first so the layer initialisation below depends on `seed`.
        self.seed = torch.manual_seed(seed)
        self.input_layer = nn.Linear(observation_space, layer_size)
        self.output_layer = nn.Linear(layer_size, 1)

    def forward(self, x):
        """Return the value estimate; the last dimension of the result has size 1."""
        hidden = F.relu(self.input_layer(x))
        return self.output_layer(hidden)

In [None]:
def process_rewards(rewards, gamma):
    """Compute the discounted return for every step of one episode.

    For step ``t`` the result holds ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...``.

    Args:
        rewards: sequence of per-step rewards in chronological order.
        gamma: discount factor applied to each later step.

    Returns:
        A 1-D ``torch.Tensor`` with one return per input reward (empty when
        ``rewards`` is empty). The returns are not normalised.
    """
    returns = []
    running = 0
    # Walk backwards so each return reuses the one after it. Appending and
    # reversing once avoids the quadratic cost of inserting at the front.
    for r in reversed(rewards):
        running = r + running * gamma
        returns.append(running)
    returns.reverse()
    return torch.tensor(returns)

In [None]:
def train_value(G, state_vals, optimizer):
    """Take one optimiser step on the MSE between value estimates and returns.

    Args:
        G: target returns.
        state_vals: predicted state values; must still be attached to the
            graph of the parameters held by ``optimizer``.
        optimizer: optimiser that owns the value-network parameters.
    """
    # Drop any stale gradients, then back-propagate the fresh loss and step.
    optimizer.zero_grad()
    F.mse_loss(state_vals, G).backward()
    optimizer.step()

In [None]:
# Create the environment; the module-level `env` is read by objective().
env = gym.make('Acrobot-v1')
# Alternative task kept for reference:
#env = gym.make('CartPole-v1')
# Disabled network construction, kept as a string literal. The networks are
# built per seed inside objective(), and these old calls lack the seed and
# layer-size arguments the current constructors require.
'''
policy_network = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
stateval_network = StateValueNetwork(env.observation_space.shape[0])'''


#Init optimizer


  deprecation(
  deprecation(


'\npolicy_network = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)\nstateval_network = StateValueNetwork(env.observation_space.shape[0])'

In [None]:
pip install scikit-optimize


Collecting scikit-optimize
  Downloading scikit_optimize-0.10.1-py2.py3-none-any.whl (107 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/107.7 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.12.0-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.12.0 scikit-optimize-0.10.1


In [None]:
# Seeds for the repeated training runs: each hyper-parameter configuration is
# trained once per seed and the per-seed results are summed by objective().
seed_list = [1,42,30,25,17]

In [None]:
def reset_weights(model):
    """Re-initialise each direct child of ``model`` that has ``reset_parameters``.

    Only ``model.children()`` is visited, so children without that method are
    skipped and nested sub-modules are not descended into.
    """
    resettable = (child for child in model.children() if hasattr(child, 'reset_parameters'))
    for child in resettable:
        child.reset_parameters()

In [None]:
from skopt import gp_minimize

# Define the objective function
def _run_episode(env, policy_network, action_space):
    """Roll out one episode, sampling every action from the current policy.

    Returns:
        ``(states, actions, rewards)``: the visited observations as float
        tensors, the sampled action indices, and the per-step rewards, all
        in chronological order.
    """
    reset_out = env.reset()
    # Newer gym releases return (observation, info) from reset(); older ones
    # return the observation alone.
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out

    states, actions, rewards = [], [], []
    done = False
    while not done:
        state_t = torch.from_numpy(state).float()
        # Sampling needs no gradient; log-probabilities are recomputed in the update step.
        with torch.no_grad():
            action_probability = policy_network(state_t).numpy()
        action = np.random.choice(action_space, p=action_probability)

        step_out = env.step(action)
        if len(step_out) == 5:
            # Newer API: (obs, reward, terminated, truncated, info).
            next_state, r, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            # Legacy API: (obs, reward, done, info).
            next_state, r, done, _ = step_out

        states.append(state_t)
        rewards.append(r)
        actions.append(action)
        state = next_state
    return states, actions, rewards


def _reinforce_update(states, actions, returns, policy_network, policy_optimizer):
    """Apply one REINFORCE step: minimise ``-sum_t G_t * log pi(a_t | s_t)``."""
    state_batch = torch.stack(states)
    action_idx = torch.as_tensor(np.asarray(actions), dtype=torch.int64)
    # One batched forward pass, then pick the probability of each taken action.
    chosen_probs = policy_network(state_batch).gather(1, action_idx.unsqueeze(1)).squeeze(1)
    policy_loss = -(returns * torch.log(chosen_probs)).sum()

    policy_optimizer.zero_grad()
    policy_loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_network.parameters(), 2)
    policy_optimizer.step()


def objective(params):
    """Train the agent once per seed and return the summed regret.

    For every seed in ``seed_list`` a fresh policy network and value network
    are built and trained for ``NUM_EPISODES`` episodes on the module-level
    ``env``. The value network is fitted to the discounted returns, but the
    policy gradient is weighted by the raw returns; no baseline is subtracted.

    Args:
        params: mapping supporting ``params["LR"]`` (Adam learning rate used
            for both optimisers) and ``params["network_size"]`` (hidden-layer
            width of both networks).

    Returns:
        The sum over seeds of the per-seed regret. A seed's regret is the
        negated running mean reward of the last (up to) 100 episodes, added
        up after every episode. Despite the name ``regret_avg``, the total is
        not divided by the number of seeds.
    """
    regret_avg = 0
    print(params,"Its just getting started ")

    n_observations = env.observation_space.shape[0]
    n_actions = env.action_space.n
    action_space = np.arange(n_actions)

    for seed in seed_list:
        # The networks seed torch themselves; actions are drawn with NumPy, so
        # seed it too, otherwise the run seed would not govern action sampling.
        # The environment's own RNG is not seeded here.
        np.random.seed(seed)

        policy_network = PolicyNetwork(n_observations, n_actions, seed, params["network_size"])
        stateval_network = StateValueNetwork(n_observations, seed, params["network_size"])
        reset_weights(policy_network)
        reset_weights(stateval_network)
        policy_optimizer = optim.Adam(policy_network.parameters(), params["LR"])
        stateval_optimizer = optim.Adam(stateval_network.parameters(), params["LR"])

        total_rewards = []
        regret = 0
        for ep in range(1, NUM_EPISODES + 1):
            states, actions, rewards = _run_episode(env, policy_network, action_space)
            total_rewards.append(sum(rewards))

            # Cast to float32 so the returns match the dtype of the network outputs.
            G = process_rewards(rewards, gamma).float()

            # Fit the value network to the returns. squeeze(-1) keeps a 1-D
            # tensor even for a one-step episode.
            state_vals = stateval_network(torch.stack(states)).squeeze(-1)
            train_value(G, state_vals, stateval_optimizer)

            _reinforce_update(states, actions, G, policy_network, policy_optimizer)

            avg_rewards = np.mean(total_rewards[-100:])
            if ep % 400 == 0:
                print("Ep:",ep,"last 100 episodes reward is  :",avg_rewards, end="\n")

            regret -= avg_rewards
        print(params,regret)
        regret_avg += regret
    print("regret_avg:",regret_avg,"for",params)
    return regret_avg








In [None]:
# Disabled scikit-optimize search space, kept as a string literal so the cell
# only displays the text.
# NOTE(review): objective() reads params["LR"] and params["network_size"], so
# this space (dimensions named 'lr' / 'layer_size') would not plug into it as written.
'''
from skopt import gp_minimize
from skopt.space import Real, Categorical

# Define the search space
space = [Real(0.0001, 0.01, name='lr'),  # Continuous parameter from 0.01 to 0.0001
         Categorical(categories=[16, 32, 64, 128, 256], name='layer_size')]   #
         '''

  and should_run_async(code)


"\nfrom skopt import gp_minimize\nfrom skopt.space import Real, Categorical\n\n# Define the search space\nspace = [Real(0.0001, 0.01, name='lr'),  # Continuous parameter from 0.01 to 0.0001\n         Categorical(categories=[16, 32, 64, 128, 256], name='layer_size')]   #\n         "

In [None]:
# Earlier learning-rate-only bounds, kept for reference:
#bounds = [(1e-4, 1e-2),]
# Disabled scikit-optimize run, kept as a string literal so nothing executes.
# It refers to `space`, which is itself only defined inside a disabled string.
'''
# Run the Bayesian optimization
res = gp_minimize(objective, space, n_calls=10, random_state=0)
print(res)
# Print the best parameters
print(f"Best parameters: learning rate = {res.x[0]}")
'''

'\n# Run the Bayesian optimization\nres = gp_minimize(objective, space, n_calls=10, random_state=0)\nprint(res)\n# Print the best parameters\nprint(f"Best parameters: learning rate = {res.x[0]}")\n'

In [None]:
!pip install wandb


Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.2/2.2 MB[0m [31m42.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.44.1-py2.py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m26.3 MB/s[0

In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import wandb

def main():
    """Run one sweep trial.

    Starts a wandb run, evaluates ``objective`` with the hyper-parameters in
    ``wandb.config``, and logs the result under the key ``"score"``.
    """
    wandb.init(project="RLA2ACROBOT-wobase")
    wandb.log({"score": objective(wandb.config)})

# 2: Define the search space
# Bayesian search that minimises the "score" value logged by main(), i.e. the
# summed regret returned by objective().
sweep_configuration = {
    "method": "bayes",
    "metric": {"goal": "minimize", "name": "score"},
    "parameters": {
        # Continuous range for the Adam learning rate (read as params["LR"]).
        "LR": {"max": 1e-3, "min": 1e-6},
        # Candidate hidden-layer widths (read as params["network_size"]).
        "network_size": {"values": [64,256,128]},
    },
}

# 3: Start the sweep
# Register the sweep, then run main() once per trial.
# NOTE(review): count=10 presumably caps this agent at 10 trials -- confirm
# against the wandb docs.
sweep_id = wandb.sweep(sweep=sweep_configuration, project="RLA2ACROBOT-wobase")

wandb.agent(sweep_id, function=main, count=10)

  return LooseVersion(v) >= LooseVersion(check)


Create sweep with ID: 797nd2sd
Sweep URL: https://wandb.ai/deep_learning_duri/RLA2ACROBOT-wobase/sweeps/797nd2sd


[34m[1mwandb[0m: Agent Starting Run: uly7ncrj with config:
[34m[1mwandb[0m: 	LR: 0.0004413271766135506
[34m[1mwandb[0m: 	network_size: 256
[34m[1mwandb[0m: Currently logged in as: [33mdhurilkun[0m ([33mdeep_learning_duri[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'LR': 0.0004413271766135506, 'network_size': 256} Its just getting started 


  if not isinstance(terminated, (bool, np.bool8)):


Ep: 400 last 100 episodes reward is  : -214.78
Ep: 800 last 100 episodes reward is  : -244.99
Ep: 1200 last 100 episodes reward is  : -431.84
Ep: 1600 last 100 episodes reward is  : -239.48
Ep: 2000 last 100 episodes reward is  : -204.11
Ep: 2400 last 100 episodes reward is  : -140.58
Ep: 2800 last 100 episodes reward is  : -317.14
Ep: 3200 last 100 episodes reward is  : -500.0
Ep: 3600 last 100 episodes reward is  : -500.0
Ep: 4000 last 100 episodes reward is  : -462.96
Ep: 4400 last 100 episodes reward is  : -338.41
Ep: 4800 last 100 episodes reward is  : -405.49
{'LR': 0.0004413271766135506, 'network_size': 256} 1617011.5384013979
Ep: 400 last 100 episodes reward is  : -323.85
Ep: 800 last 100 episodes reward is  : -371.22
Ep: 1200 last 100 episodes reward is  : -208.94
Ep: 1600 last 100 episodes reward is  : -143.25
Ep: 2000 last 100 episodes reward is  : -172.59
Ep: 2400 last 100 episodes reward is  : -140.81
Ep: 2800 last 100 episodes reward is  : -135.99
Ep: 3200 last 100 episod

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,6585699.84951


[34m[1mwandb[0m: Agent Starting Run: zu91uzvc with config:
[34m[1mwandb[0m: 	LR: 0.00023326353983478152
[34m[1mwandb[0m: 	network_size: 128


{'LR': 0.00023326353983478152, 'network_size': 128} Its just getting started 
Ep: 400 last 100 episodes reward is  : -264.84
Ep: 800 last 100 episodes reward is  : -253.58
Ep: 1200 last 100 episodes reward is  : -178.47
Ep: 1600 last 100 episodes reward is  : -163.1
Ep: 2000 last 100 episodes reward is  : -158.98
Ep: 2400 last 100 episodes reward is  : -181.43
Ep: 2800 last 100 episodes reward is  : -198.65
Ep: 3200 last 100 episodes reward is  : -209.42
Ep: 3600 last 100 episodes reward is  : -194.36
Ep: 4000 last 100 episodes reward is  : -189.7
Ep: 4400 last 100 episodes reward is  : -228.5
Ep: 4800 last 100 episodes reward is  : -271.26
{'LR': 0.00023326353983478152, 'network_size': 128} 1082669.3125482886
Ep: 400 last 100 episodes reward is  : -498.68
Ep: 800 last 100 episodes reward is  : -351.08
Ep: 1200 last 100 episodes reward is  : -250.25
Ep: 1600 last 100 episodes reward is  : -259.03
Ep: 2000 last 100 episodes reward is  : -158.27
Ep: 2400 last 100 episodes reward is  : -1

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,5202277.04197


[34m[1mwandb[0m: Agent Starting Run: avq38fyq with config:
[34m[1mwandb[0m: 	LR: 0.00094724233935401
[34m[1mwandb[0m: 	network_size: 128


{'LR': 0.00094724233935401, 'network_size': 128} Its just getting started 
Ep: 400 last 100 episodes reward is  : -181.09
Ep: 800 last 100 episodes reward is  : -247.57
Ep: 1200 last 100 episodes reward is  : -201.48
Ep: 1600 last 100 episodes reward is  : -117.8
Ep: 2000 last 100 episodes reward is  : -154.07
Ep: 2400 last 100 episodes reward is  : -179.51
Ep: 2800 last 100 episodes reward is  : -273.83
Ep: 3200 last 100 episodes reward is  : -249.28
Ep: 3600 last 100 episodes reward is  : -163.92
Ep: 4000 last 100 episodes reward is  : -169.85
Ep: 4400 last 100 episodes reward is  : -169.35
Ep: 4800 last 100 episodes reward is  : -155.52
{'LR': 0.00094724233935401, 'network_size': 128} 1015783.5391180273
Ep: 400 last 100 episodes reward is  : -192.99
Ep: 800 last 100 episodes reward is  : -164.28
Ep: 1200 last 100 episodes reward is  : -214.38
Ep: 1600 last 100 episodes reward is  : -208.26
Ep: 2000 last 100 episodes reward is  : -393.89
Ep: 2400 last 100 episodes reward is  : -351.2

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,5859864.69844


[34m[1mwandb[0m: Agent Starting Run: 0s3jbjb5 with config:
[34m[1mwandb[0m: 	LR: 0.0006829462141650387
[34m[1mwandb[0m: 	network_size: 128


{'LR': 0.0006829462141650387, 'network_size': 128} Its just getting started 
Ep: 400 last 100 episodes reward is  : -253.55
Ep: 800 last 100 episodes reward is  : -169.42
Ep: 1200 last 100 episodes reward is  : -192.27
Ep: 1600 last 100 episodes reward is  : -191.24
Ep: 2000 last 100 episodes reward is  : -165.07
Ep: 2400 last 100 episodes reward is  : -165.73
Ep: 2800 last 100 episodes reward is  : -132.73
Ep: 3200 last 100 episodes reward is  : -143.71
Ep: 3600 last 100 episodes reward is  : -138.47
Ep: 4000 last 100 episodes reward is  : -174.07
Ep: 4400 last 100 episodes reward is  : -176.85
Ep: 4800 last 100 episodes reward is  : -157.4
{'LR': 0.0006829462141650387, 'network_size': 128} 956918.9199562166
Ep: 400 last 100 episodes reward is  : -306.02
Ep: 800 last 100 episodes reward is  : -267.66
Ep: 1200 last 100 episodes reward is  : -210.0
Ep: 1600 last 100 episodes reward is  : -154.05
Ep: 2000 last 100 episodes reward is  : -144.79
Ep: 2400 last 100 episodes reward is  : -143

VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,5659038.29486


[34m[1mwandb[0m: Agent Starting Run: zdylpf27 with config:
[34m[1mwandb[0m: 	LR: 0.00039232825543089166
[34m[1mwandb[0m: 	network_size: 256


{'LR': 0.00039232825543089166, 'network_size': 256} Its just getting started 
Ep: 400 last 100 episodes reward is  : -237.2
Ep: 800 last 100 episodes reward is  : -151.62
Ep: 1200 last 100 episodes reward is  : -136.48
Ep: 1600 last 100 episodes reward is  : -135.25
Ep: 2000 last 100 episodes reward is  : -171.28
Ep: 2400 last 100 episodes reward is  : -159.61
Ep: 2800 last 100 episodes reward is  : -141.82
Ep: 3200 last 100 episodes reward is  : -138.42
Ep: 3600 last 100 episodes reward is  : -189.9
Ep: 4000 last 100 episodes reward is  : -206.19
Ep: 4400 last 100 episodes reward is  : -159.1
Ep: 4800 last 100 episodes reward is  : -155.87
{'LR': 0.00039232825543089166, 'network_size': 256} 867591.0754952182
Ep: 400 last 100 episodes reward is  : -339.32
Ep: 800 last 100 episodes reward is  : -160.11
Ep: 1200 last 100 episodes reward is  : -133.49
Ep: 1600 last 100 episodes reward is  : -143.81
Ep: 2000 last 100 episodes reward is  : -170.32
Ep: 2400 last 100 episodes reward is  : -18

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Disabled reward plot, kept as a string literal so nothing is drawn.
# NOTE(review): `total_rewards` is a local variable inside objective(), so this
# code would raise NameError if re-enabled without exposing that list.
'''
plt.plot(total_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')

plt.show()'''
