In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque

In [None]:
# Discount factor applied to future rewards when computing returns.
DISCOUNT_FACTOR = 0.99
# Alias used by the training loop when it calls process_rewards; derived from
# DISCOUNT_FACTOR so the two names cannot drift apart.
gamma = DISCOUNT_FACTOR
# Number of training episodes per run.
NUM_EPISODES = 5000

# Upper bound on steps per episode.
# NOTE(review): not referenced by the visible training loop - confirm it is still needed.
MAX_STEPS = 10000

# Device to run the model on: CUDA when available, otherwise CPU.
# NOTE(review): the visible cells never move tensors or modules to DEVICE - confirm intent.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  and should_run_async(code)


In [None]:
class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps an observation to a probability per action.

    Args:
        observation_space: number of input features.
        action_space: number of discrete actions (size of the output).
        seed: value passed to ``torch.manual_seed``; this reseeds torch's
            global RNG before the layers are created.
        layer_size: width of the hidden layer.
    """

    def __init__(self, observation_space, action_space, seed, layer_size):
        super().__init__()
        # torch.manual_seed returns the global generator; it is kept on the instance.
        self.seed = torch.manual_seed(seed)
        self.input_layer = nn.Linear(observation_space, layer_size)
        self.output_layer = nn.Linear(layer_size, action_space)

    def forward(self, x):
        """Return softmax action probabilities computed over the last dimension of ``x``."""
        hidden = F.relu(self.input_layer(x))
        logits = self.output_layer(hidden)
        return F.softmax(logits, dim=-1)

In [None]:

class StateValueNetwork(nn.Module):
    """Two-layer MLP that estimates a scalar state value for an observation.

    Args:
        observation_space: number of input features.
        seed: accepted so the constructor mirrors ``PolicyNetwork``; it is not
            used here and the global RNG is not reseeded by this class.
        layer_size: width of the hidden layer. Previously this argument was
            ignored and the width was fixed at 128, so a swept network size
            never reached the value network.
    """

    def __init__(self, observation_space, seed, layer_size):
        super(StateValueNetwork, self).__init__()
        self.input_layer = nn.Linear(observation_space, layer_size)
        self.output_layer = nn.Linear(layer_size, 1)

    def forward(self, x):
        """Return the value estimate; the last dimension of the output has size 1."""
        x = self.input_layer(x)
        x = F.relu(x)
        state_value = self.output_layer(x)
        return state_value

In [None]:
def process_rewards(rewards, gamma):
    """Compute the discounted return-to-go for every step of one episode.

    Args:
        rewards: sequence of per-step rewards in time order.
        gamma: discount factor applied to future rewards. Previously this
            argument was ignored in favour of the global ``DISCOUNT_FACTOR``.

    Returns:
        A 1-D ``torch.float32`` tensor ``G`` with ``len(rewards)`` entries where
        ``G[t] = rewards[t] + gamma * G[t + 1]``. The returns are not normalised.
    """
    returns = []
    running = 0
    # Walk backwards so each return reuses the one that follows it; appending and
    # reversing once avoids the quadratic cost of inserting at the front.
    for r in reversed(rewards):
        running = r + running * gamma
        returns.append(running)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32)

In [None]:
def train_value(G, state_vals, optimizer):
    """Take one optimiser step on the MSE between value estimates and returns.

    Args:
        G: target returns.
        state_vals: value estimates that are still attached to the autograd graph.
        optimizer: optimiser holding the parameters that produced ``state_vals``.
    """
    # Clear gradients left over from earlier steps before back-propagating.
    optimizer.zero_grad()
    mse = F.mse_loss(state_vals, G)
    mse.backward()
    optimizer.step()

In [None]:
# Seeds for the independent training runs; avg_over_5_runs reads one entry per run.
seed_list = [1,42,30,25,17]

In [None]:
def reset_weights(model):
    """Re-initialise every direct child of ``model`` that exposes ``reset_parameters``.

    Only ``model.children()`` is visited, so modules nested more deeply are left
    untouched unless their parent's own ``reset_parameters`` handles them.
    """
    resettable = (child for child in model.children() if hasattr(child, 'reset_parameters'))
    for child in resettable:
        child.reset_parameters()

In [None]:
# Shared Gym environment; avg_over_5_runs reads its observation/action spaces and steps it.
env = gym.make('Acrobot-v1')


  deprecation(
  deprecation(


In [None]:
def avg_over_5_runs(params):
    """Train REINFORCE with a state-value baseline for five seeds and score the result.

    For every seed a fresh policy and value network are trained for
    ``NUM_EPISODES`` episodes on the module-level ``env``. After each episode
    the mean total reward of the last (up to) 100 episodes is subtracted from
    the run's regret, so a lower score means better learning.

    Args:
        params: mapping with the keys ``"network_size"`` (hidden width handed to
            both network constructors) and ``"LR"`` (learning rate for both
            Adam optimisers).

    Returns:
        The regret averaged over the five runs. If the policy starts emitting
        NaN probabilities, training stops early and the average over the runs
        completed so far is returned, or ``float("inf")`` when no run completed,
        so a diverged configuration can never look like the best one to a
        minimising search.

    Notes:
        * Actions are sampled with ``np.random.choice``; NumPy's global RNG is
          not seeded in this function, so runs are not fully reproducible.
        * The environment is used through the classic Gym API (``reset()``
          returns the observation, ``step()`` returns a 4-tuple).
    """
    num_runs = 5
    # Lower bound on pi(a|s) before the log so neither the log nor its 1/p
    # gradient can become infinite.
    prob_floor = 1e-8
    obs_dim = env.observation_space.shape[0]
    action_space = np.arange(env.action_space.n)
    avg_regret = 0

    # The run index has its own name: the original reused `i` in an inner loop,
    # which corrupted the divisor used by the NaN early exit.
    for run_idx in range(num_runs):
        regret = 0
        seed = seed_list[run_idx]

        # Init networks
        policy_network = PolicyNetwork(obs_dim, env.action_space.n, seed, params["network_size"])
        stateval_network = StateValueNetwork(obs_dim, seed, params["network_size"])
        reset_weights(policy_network)
        reset_weights(stateval_network)

        # Init optimizers
        policy_optimizer = optim.Adam(policy_network.parameters(), lr=params["LR"])
        stateval_optimizer = optim.Adam(stateval_network.parameters(), lr=params["LR"])

        total_rewards1 = []
        for ep in range(1, NUM_EPISODES + 1):
            # ---- roll out one episode with the current policy ----
            state = env.reset()
            states, rewards, actions = [], [], []
            done = False
            while not done:
                state = torch.from_numpy(state).float()
                with torch.no_grad():
                    action_probability = policy_network(state).numpy()
                if np.isnan(action_probability).any():
                    # Training diverged: report the mean over completed runs only.
                    if run_idx > 0:
                        partial = avg_regret / run_idx
                    else:
                        partial = float("inf")
                    print("oh", partial)
                    return partial

                action = np.random.choice(action_space, p=action_probability)
                next_state, r, done, _ = env.step(action)
                states.append(state)
                rewards.append(r)
                actions.append(action)
                state = next_state
            total_rewards1.append(sum(rewards))

            # ---- value-network update ----
            returns = process_rewards(rewards, gamma).float()
            state_batch = torch.stack(states)
            # squeeze(-1) keeps a length-1 vector for one-step episodes.
            state_vals = stateval_network(state_batch).squeeze(-1)
            # Advantages use the pre-update value estimates and carry no gradient.
            deltas = (returns - state_vals).detach()
            train_value(returns, state_vals, stateval_optimizer)

            # ---- policy-gradient update ----
            action_idx = torch.as_tensor(np.asarray(actions), dtype=torch.long)
            chosen_probs = policy_network(state_batch).gather(1, action_idx.unsqueeze(1)).squeeze(1)
            log_probs = torch.log(chosen_probs.clamp_min(prob_floor))
            policy_loss = -(deltas * log_probs).sum()

            policy_optimizer.zero_grad()
            policy_loss.backward()
            torch.nn.utils.clip_grad_norm_(policy_network.parameters(), 5)
            policy_optimizer.step()

            # ---- bookkeeping ----
            avg_rewards = np.mean(total_rewards1[-100:])
            regret = regret - avg_rewards
            if ep % 400 == 0:
                print("Ep:", ep, "last 100 episodes reward is  :", avg_rewards, end="\n")

        avg_regret = avg_regret + regret
        print("problem solved at episode", NUM_EPISODES)

    return avg_regret / num_runs



In [None]:
'''
bounds = [(1e-4, 1e-2)]

# Run the Bayesian optimization
res = gp_minimize(avg_over_5_runs, bounds, n_calls=10, random_state=0)
print(res)
# Print the best parameters
print(f"Best parameters: learning rate = {res.x[0]}")
'''

'\nbounds = [(1e-4, 1e-2)]\n\n# Run the Bayesian optimization\nres = gp_minimize(avg_over_5_runs, bounds, n_calls=10, random_state=0)\nprint(res)\n# Print the best parameters\nprint(f"Best parameters: learning rate = {res.x[0]}")\n'

In [None]:

import matplotlib.pyplot as plt


In [None]:
!pip install wandb


Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.44.1-py2.py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import wandb

def main():
    """Sweep entry point: train with the sampled config and log the result as "score"."""
    wandb.init(project="RLA2acrobot-withbase")
    # wandb.config holds the hyper-parameters chosen by the sweep for this run.
    wandb.log({"score": avg_over_5_runs(wandb.config)})

# 2: Define the search space
# Bayesian search that minimises the "score" value logged by main().
sweep_configuration = {
    "method": "bayes",

    "metric": {"goal": "minimize", "name": "score"},
    "parameters": {
        # Learning rate range; read by the training function as params["LR"].
        # NOTE(review): the range spans three orders of magnitude and no distribution
        # is given - consider a log-uniform distribution; confirm against W&B docs.
        "LR": {"max": 1e-3, "min": 1e-6},
        # Hidden-layer width; passed to both network constructors as params["network_size"].
        "network_size": {"values": [64,256,128]},
    },
}

# 3: Start the sweep
# Registers the sweep under the same project name that main() passes to wandb.init.
sweep_id = wandb.sweep(sweep=sweep_configuration, project="RLA2acrobot-withbase")

# Runs main() for the sweep in this process; count=10 limits the number of trials.
wandb.agent(sweep_id, function=main, count=10)

  return LooseVersion(v) >= LooseVersion(check)


Create sweep with ID: 46vsm7aa
Sweep URL: https://wandb.ai/deep_learning_duri/RLA2acrobot-withbase/sweeps/46vsm7aa


[34m[1mwandb[0m: Agent Starting Run: enzmehgm with config:
[34m[1mwandb[0m: 	LR: 0.0002534149211734855
[34m[1mwandb[0m: 	network_size: 64
[34m[1mwandb[0m: Currently logged in as: [33mdhurilkun[0m ([33mdeep_learning_duri[0m). Use [1m`wandb login --relogin`[0m to force relogin


  if not isinstance(terminated, (bool, np.bool8)):


Ep: 400 last 100 episodes reward is  : -477.89
Ep: 800 last 100 episodes reward is  : -286.55
Ep: 1200 last 100 episodes reward is  : -147.29
Ep: 1600 last 100 episodes reward is  : -122.49
Ep: 2000 last 100 episodes reward is  : -109.08
Ep: 2400 last 100 episodes reward is  : -103.09
Ep: 2800 last 100 episodes reward is  : -103.87
Ep: 3200 last 100 episodes reward is  : -95.36
Ep: 3600 last 100 episodes reward is  : -96.63
Ep: 4000 last 100 episodes reward is  : -92.11
Ep: 4400 last 100 episodes reward is  : -89.58
Ep: 4800 last 100 episodes reward is  : -87.24
problem solved at episode 5000
Ep: 400 last 100 episodes reward is  : -454.1
Ep: 800 last 100 episodes reward is  : -243.73
Ep: 1200 last 100 episodes reward is  : -153.51
Ep: 1600 last 100 episodes reward is  : -115.89
Ep: 2000 last 100 episodes reward is  : -109.84
Ep: 2400 last 100 episodes reward is  : -103.15
Ep: 2800 last 100 episodes reward is  : -95.16
Ep: 3200 last 100 episodes reward is  : -95.55
Ep: 3600 last 100 epi

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,788169.40773


[34m[1mwandb[0m: Agent Starting Run: ttnogjjq with config:
[34m[1mwandb[0m: 	LR: 3.873820370164585e-05
[34m[1mwandb[0m: 	network_size: 64


Ep: 400 last 100 episodes reward is  : -500.0
Ep: 800 last 100 episodes reward is  : -497.45
Ep: 1200 last 100 episodes reward is  : -497.64
Ep: 1600 last 100 episodes reward is  : -487.99
Ep: 2000 last 100 episodes reward is  : -459.02
Ep: 2400 last 100 episodes reward is  : -433.83
Ep: 2800 last 100 episodes reward is  : -401.38
Ep: 3200 last 100 episodes reward is  : -373.01
Ep: 3600 last 100 episodes reward is  : -333.89
Ep: 4000 last 100 episodes reward is  : -297.46
Ep: 4400 last 100 episodes reward is  : -269.27
Ep: 4800 last 100 episodes reward is  : -256.77
problem solved at episode 5000
Ep: 400 last 100 episodes reward is  : -497.48
Ep: 800 last 100 episodes reward is  : -497.5
Ep: 1200 last 100 episodes reward is  : -483.12
Ep: 1600 last 100 episodes reward is  : -492.8
Ep: 2000 last 100 episodes reward is  : -487.47
Ep: 2400 last 100 episodes reward is  : -467.0
Ep: 2800 last 100 episodes reward is  : -454.53
Ep: 3200 last 100 episodes reward is  : -458.67
Ep: 3600 last 100

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,1979737.17329


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: n5damx5e with config:
[34m[1mwandb[0m: 	LR: 0.00025942125670434384
[34m[1mwandb[0m: 	network_size: 256


Ep: 400 last 100 episodes reward is  : -413.07
Ep: 800 last 100 episodes reward is  : -309.13
Ep: 1200 last 100 episodes reward is  : -175.29
Ep: 1600 last 100 episodes reward is  : -113.85
Ep: 2000 last 100 episodes reward is  : -95.54
Ep: 2400 last 100 episodes reward is  : -93.68
Ep: 2800 last 100 episodes reward is  : -84.63
Ep: 3200 last 100 episodes reward is  : -92.21
Ep: 3600 last 100 episodes reward is  : -82.24
Ep: 4000 last 100 episodes reward is  : -94.02
Ep: 4400 last 100 episodes reward is  : -83.89
Ep: 4800 last 100 episodes reward is  : -85.79
problem solved at episode 5000
Ep: 400 last 100 episodes reward is  : -312.24
Ep: 800 last 100 episodes reward is  : -196.78
Ep: 1200 last 100 episodes reward is  : -129.7
Ep: 1600 last 100 episodes reward is  : -101.7
Ep: 2000 last 100 episodes reward is  : -99.04
Ep: 2400 last 100 episodes reward is  : -89.42
Ep: 2800 last 100 episodes reward is  : -87.92
Ep: 3200 last 100 episodes reward is  : -90.75
Ep: 3600 last 100 episodes 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,5054.87354


[34m[1mwandb[0m: Agent Starting Run: 0q50tfd9 with config:
[34m[1mwandb[0m: 	LR: 0.0005895755467480911
[34m[1mwandb[0m: 	network_size: 128


Ep: 400 last 100 episodes reward is  : -342.57
Ep: 800 last 100 episodes reward is  : -98.57
Ep: 1200 last 100 episodes reward is  : -89.62
Ep: 1600 last 100 episodes reward is  : -89.24
Ep: 2000 last 100 episodes reward is  : -81.19
Ep: 2400 last 100 episodes reward is  : -88.99
Ep: 2800 last 100 episodes reward is  : -85.81
Ep: 3200 last 100 episodes reward is  : -84.63
Ep: 3600 last 100 episodes reward is  : -84.74
oh 0.0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,0.0


[34m[1mwandb[0m: Agent Starting Run: 3ietz00j with config:
[34m[1mwandb[0m: 	LR: 0.000530962048518704
[34m[1mwandb[0m: 	network_size: 128


Ep: 400 last 100 episodes reward is  : -181.7
Ep: 800 last 100 episodes reward is  : -111.76
Ep: 1200 last 100 episodes reward is  : -95.21
Ep: 1600 last 100 episodes reward is  : -88.31
Ep: 2000 last 100 episodes reward is  : -85.0
Ep: 2400 last 100 episodes reward is  : -82.13
oh 0.0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,0.0


[34m[1mwandb[0m: Agent Starting Run: b4bn8vmj with config:
[34m[1mwandb[0m: 	LR: 0.0004527844568855271
[34m[1mwandb[0m: 	network_size: 128


Ep: 400 last 100 episodes reward is  : -472.14
Ep: 800 last 100 episodes reward is  : -234.03
Ep: 1200 last 100 episodes reward is  : -188.39
Ep: 1600 last 100 episodes reward is  : -174.96
Ep: 2000 last 100 episodes reward is  : -154.81
Ep: 2400 last 100 episodes reward is  : -140.44
Ep: 2800 last 100 episodes reward is  : -94.22
Ep: 3200 last 100 episodes reward is  : -89.07
oh 0.0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,0.0


[34m[1mwandb[0m: Agent Starting Run: al3887ci with config:
[34m[1mwandb[0m: 	LR: 0.00048424522065276983
[34m[1mwandb[0m: 	network_size: 128


Ep: 400 last 100 episodes reward is  : -186.96
Ep: 800 last 100 episodes reward is  : -115.85
Ep: 1200 last 100 episodes reward is  : -105.58
Ep: 1600 last 100 episodes reward is  : -86.77
Ep: 2000 last 100 episodes reward is  : -84.52
Ep: 2400 last 100 episodes reward is  : -85.0
Ep: 2800 last 100 episodes reward is  : -82.62
Ep: 3200 last 100 episodes reward is  : -81.67
Ep: 3600 last 100 episodes reward is  : -88.01
Ep: 4000 last 100 episodes reward is  : -85.56
oh 0.0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,0.0


[34m[1mwandb[0m: Agent Starting Run: 93q8r6vo with config:
[34m[1mwandb[0m: 	LR: 0.0005028690121840254
[34m[1mwandb[0m: 	network_size: 128


Ep: 400 last 100 episodes reward is  : -307.41
Ep: 800 last 100 episodes reward is  : -128.15
Ep: 1200 last 100 episodes reward is  : -97.32
Ep: 1600 last 100 episodes reward is  : -91.33
Ep: 2000 last 100 episodes reward is  : -86.16
Ep: 2400 last 100 episodes reward is  : -91.03
Ep: 2800 last 100 episodes reward is  : -81.39
oh 0.0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,0.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: bfw2t61y with config:
[34m[1mwandb[0m: 	LR: 0.0005041102288079219
[34m[1mwandb[0m: 	network_size: 128


Ep: 400 last 100 episodes reward is  : -227.11
Ep: 800 last 100 episodes reward is  : -112.36
Ep: 1200 last 100 episodes reward is  : -99.85
Ep: 1600 last 100 episodes reward is  : -84.31
Ep: 2000 last 100 episodes reward is  : -81.27
Ep: 2400 last 100 episodes reward is  : -85.33
Ep: 2800 last 100 episodes reward is  : -87.99
Ep: 3200 last 100 episodes reward is  : -87.31
Ep: 3600 last 100 episodes reward is  : -84.08
Ep: 4000 last 100 episodes reward is  : -83.91
Ep: 4400 last 100 episodes reward is  : -84.96
Ep: 4800 last 100 episodes reward is  : -82.89
problem solved at episode 5000
Ep: 400 last 100 episodes reward is  : -450.52
Ep: 800 last 100 episodes reward is  : -138.91
Ep: 1200 last 100 episodes reward is  : -94.64
Ep: 1600 last 100 episodes reward is  : -89.28
Ep: 2000 last 100 episodes reward is  : -86.27
Ep: 2400 last 100 episodes reward is  : -81.78
Ep: 2800 last 100 episodes reward is  : -84.5
Ep: 3200 last 100 episodes reward is  : -84.91
oh 3434.6409328847712


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,3434.64093


[34m[1mwandb[0m: Agent Starting Run: g2hestqw with config:
[34m[1mwandb[0m: 	LR: 0.0004245149843280246
[34m[1mwandb[0m: 	network_size: 128


Ep: 400 last 100 episodes reward is  : -301.67
Ep: 800 last 100 episodes reward is  : -127.32
Ep: 1200 last 100 episodes reward is  : -99.07
Ep: 1600 last 100 episodes reward is  : -94.65
Ep: 2000 last 100 episodes reward is  : -88.88
Ep: 2400 last 100 episodes reward is  : -84.23
Ep: 2800 last 100 episodes reward is  : -84.96
Ep: 3200 last 100 episodes reward is  : -82.9
Ep: 3600 last 100 episodes reward is  : -83.52
Ep: 4000 last 100 episodes reward is  : -82.14
oh 0.0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,0.0


In [None]:
'''
plt.plot(total_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show() '''

  and should_run_async(code)


"\nplt.plot(total_rewards)\nplt.plot()\nplt.xlabel('Episode')\nplt.ylabel('Reward')\nplt.show() "