In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque

In [None]:
# Discount applied to future rewards when computing returns.
DISCOUNT_FACTOR = 0.99
# Lower-case alias of the same value; the training loop passes it to process_rewards.
gamma = DISCOUNT_FACTOR

# Upper bound on the number of training episodes per run.
NUM_EPISODES = 10_000

# Upper bound on the number of steps in a single episode.
MAX_STEPS = 10_000

# Average score the agent must exceed for the environment to count as solved.
SOLVED_SCORE = 475

# Torch device string: GPU when one is available, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

  and should_run_async(code)


In [None]:
class PolicyNetwork(nn.Module):
    """Single-hidden-layer policy mapping an observation to action probabilities."""

    def __init__(self, observation_space, action_space, seed, layer_size):
        """Build the two linear layers.

        Args:
            observation_space: number of input features.
            action_space: number of discrete actions (size of the output).
            seed: value passed to ``torch.manual_seed`` before the layers are created.
            layer_size: width of the hidden layer.
        """
        super().__init__()
        # Seed first so the layer initialisation below is repeatable for a given seed.
        self.seed = torch.manual_seed(seed)
        self.input_layer = nn.Linear(observation_space, layer_size)
        self.output_layer = nn.Linear(layer_size, action_space)

    def forward(self, x):
        """Return a softmax distribution over actions along the last dimension."""
        hidden = F.relu(self.input_layer(x))
        logits = self.output_layer(hidden)
        return F.softmax(logits, dim=-1)

In [None]:

class StateValueNetwork(nn.Module):
    """Single-hidden-layer critic mapping an observation to one scalar value."""

    def __init__(self, observation_space, seed, layer_size):
        """Build the two linear layers.

        Args:
            observation_space: number of input features.
            seed: value passed to ``torch.manual_seed`` before the layers are created.
            layer_size: width of the hidden layer.
        """
        super().__init__()
        # Seed first so the layer initialisation below is repeatable for a given seed.
        self.seed = torch.manual_seed(seed)
        self.input_layer = nn.Linear(observation_space, layer_size)
        self.output_layer = nn.Linear(layer_size, 1)

    def forward(self, x):
        """Return the raw (unactivated) value estimate; the last dimension has size 1."""
        hidden = F.relu(self.input_layer(x))
        return self.output_layer(hidden)

In [None]:
def process_rewards(rewards, gamma):
    """Compute the discounted return for every step of one episode.

    Args:
        rewards: per-step rewards in chronological order.
        gamma: discount factor applied to future rewards. The previous version
            ignored this argument and always used the global DISCOUNT_FACTOR.

    Returns:
        A 1-D tensor whose element ``t`` is ``sum_k gamma**k * rewards[t + k]``.
        The returns are not normalised. An empty ``rewards`` gives an empty tensor.
    """
    returns = []
    running_return = 0
    # Walk backwards so each return reuses the one after it:
    # G_t = r_t + gamma * G_{t+1}.
    for reward in reversed(rewards):
        running_return = reward + running_return * gamma
        returns.append(running_return)
    # Appending and reversing once replaces repeated insert(0, ...), which was quadratic.
    returns.reverse()
    return torch.tensor(returns)

In [None]:
def train_value(G, state_vals, optimizer):
    """Take one optimiser step on the MSE between predicted values and returns.

    Args:
        G: target returns.
        state_vals: predicted state values, still attached to the autograd graph.
        optimizer: optimiser holding the parameters that produced ``state_vals``.
    """
    # Clear stale gradients before backpropagating the new loss.
    optimizer.zero_grad()
    critic_loss = F.mse_loss(input=state_vals, target=G)
    critic_loss.backward()
    optimizer.step()

In [None]:
# Create the environment.
#env = gym.make('Acrobot-v1')
# `env` is a module-level global; objective() reads it directly rather than taking it as an argument.
env = gym.make('CartPole-v1')
# Network initialisation (disabled).
# NOTE(review): the triple-quoted block below is a bare string, so it is not executed.
# Its constructor calls omit the seed / layer_size arguments that PolicyNetwork and
# StateValueNetwork require, so it would fail if re-enabled as written.
'''
policy_network = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
stateval_network = StateValueNetwork(env.observation_space.shape[0])'''


#Init optimizer


  deprecation(
  deprecation(


'\npolicy_network = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)\nstateval_network = StateValueNetwork(env.observation_space.shape[0])'

In [None]:
# %pip installs into the running kernel's environment; pinned to the version this notebook was last run with.
%pip install -q scikit-optimize==0.10.1


Collecting scikit-optimize
  Downloading scikit_optimize-0.10.1-py2.py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.12.0-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.12.0 scikit-optimize-0.10.1


In [None]:
# One seed per independent training run performed by objective().
seed_list = [1, 42, 30, 25, 17]

In [None]:
def reset_weights(model):
    """Re-initialise every direct child of ``model`` that exposes ``reset_parameters``.

    Only direct children are visited; children without that method are left untouched.
    """
    resettable = (child for child in model.children() if hasattr(child, 'reset_parameters'))
    for child in resettable:
        child.reset_parameters()

In [None]:
from skopt import gp_minimize

# Define the objective function
def objective(params):
    """Train a REINFORCE policy once per seed and return the accumulated regret.

    For every seed in ``seed_list`` a fresh ``PolicyNetwork`` is trained on the
    global ``env`` for at most ``NUM_EPISODES`` episodes. After each episode the
    gap between ``SOLVED_SCORE`` and the mean reward of the last (up to) 100
    episodes is added to that run's regret. A run stops early once that mean
    exceeds ``SOLVED_SCORE``.

    Args:
        params: mapping with keys ``"LR"`` (Adam learning rate) and
            ``"network_size"`` (hidden layer width).

    Returns:
        The sum of the per-seed regrets. It is reported as ``regret_avg`` but is
        not divided by the number of seeds. Lower is better. If the policy ever
        emits NaN probabilities, a finite worst-case penalty is returned instead.
    """
    # The score is minimised, so divergence must rank worse than any real run.
    # The previous version returned -inf here, which a minimiser reads as the
    # best possible result. The bound is the largest regret the runs could
    # accumulate, assuming episode rewards are non-negative (true for CartPole).
    # It is kept finite so a model-based optimiser is not handed a non-finite score.
    diverged_penalty = len(seed_list) * NUM_EPISODES * SOLVED_SCORE

    print(params,"Its just getting started ")
    observation_size = env.observation_space.shape[0]
    action_space = np.arange(env.action_space.n)

    regret_avg = 0
    for seed in seed_list:
        policy_network = PolicyNetwork(observation_size, env.action_space.n, seed, params["network_size"])
        reset_weights(policy_network)
        policy_optimizer = optim.Adam(policy_network.parameters(), params["LR"])
        # Per-run generator so action sampling is governed by the run's seed
        # instead of the unseeded global NumPy state.
        # NOTE(review): environment resets are still unseeded here.
        rng = np.random.default_rng(seed)

        total_rewards = []
        regret = 0
        for ep in range(1, NUM_EPISODES + 1):
            episode = _sample_episode(policy_network, action_space, rng)
            if episode is None:
                return diverged_penalty
            states, actions, rewards = episode
            total_rewards.append(sum(rewards))

            _reinforce_update(policy_network, policy_optimizer, states, actions, rewards)

            avg_rewards = np.mean(total_rewards[-100:])
            if ep % 400 == 0:
                print("Ep:",ep,"last 100 episodes reward is  :",avg_rewards, end="\n")
            if avg_rewards > SOLVED_SCORE:
                print("problem solved at episode",ep)
                break
            regret += SOLVED_SCORE - avg_rewards
        print(params,regret)
        regret_avg += regret
    print("regret_avg:",regret_avg,"for",params)
    return regret_avg


def _sample_episode(policy_network, action_space, rng):
    """Roll out one episode in the global ``env`` by sampling from the policy.

    Returns ``(states, actions, rewards)`` as lists with one entry per step, or
    ``None`` as soon as the policy outputs a NaN probability.
    """
    states, actions, rewards = [], [], []
    observation = env.reset()
    done = False
    while not done:
        state = torch.from_numpy(observation).float()
        # Sampling needs no gradients; the update step redoes the forward pass.
        with torch.no_grad():
            action_probability = policy_network(state).numpy()
        if np.isnan(action_probability).any():
            return None
        action = int(rng.choice(action_space, p=action_probability))
        observation, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
    return states, actions, rewards


def _reinforce_update(policy_network, policy_optimizer, states, actions, rewards):
    """Apply one REINFORCE step: minimise ``-sum_t G_t * log pi(a_t | s_t)``."""
    returns = process_rewards(rewards, gamma).float()
    # One batched forward pass over the whole episode instead of one per step.
    action_probs = policy_network(torch.stack(states))
    step_index = torch.arange(len(actions))
    # Index before taking the log, so an unselected action whose probability
    # underflowed to zero cannot put 0/0 into the gradient.
    chosen_log_probs = torch.log(action_probs[step_index, torch.as_tensor(actions)])
    policy_loss = -(returns * chosen_log_probs).sum()

    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()








In [None]:
# Disabled scikit-optimize search space, kept as a bare string so it is not executed.
# NOTE(review): the dimension names used here ('lr', 'layer_size') differ from the
# "LR" / "network_size" keys that objective() reads - reconcile before re-enabling.
'''
from skopt import gp_minimize
from skopt.space import Real, Categorical

# Define the search space
space = [Real(0.0001, 0.01, name='lr'),  # Continuous parameter from 0.01 to 0.0001
         Categorical(categories=[16, 32, 64, 128, 256], name='layer_size')]   #
         '''

  and should_run_async(code)


"\nfrom skopt import gp_minimize\nfrom skopt.space import Real, Categorical\n\n# Define the search space\nspace = [Real(0.0001, 0.01, name='lr'),  # Continuous parameter from 0.01 to 0.0001\n         Categorical(categories=[16, 32, 64, 128, 256], name='layer_size')]   #\n         "

In [None]:
#bounds = [(1e-4, 1e-2),]
# Disabled gp_minimize run, kept as a bare string so it is not executed.
# NOTE(review): `space` is only defined inside another disabled string in the visible
# cells, so that definition has to be restored before this can be re-enabled.
'''
# Run the Bayesian optimization
res = gp_minimize(objective, space, n_calls=10, random_state=0)
print(res)
# Print the best parameters
print(f"Best parameters: learning rate = {res.x[0]}")
'''

'\n# Run the Bayesian optimization\nres = gp_minimize(objective, space, n_calls=10, random_state=0)\nprint(res)\n# Print the best parameters\nprint(f"Best parameters: learning rate = {res.x[0]}")\n'

In [None]:
# %pip installs into the running kernel's environment; pinned to the version this notebook was last run with.
%pip install -q wandb==0.16.6


Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.44.1-py2.py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wa

In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import wandb

def main():
    """Run one sweep trial: evaluate objective() on the run's config and log the score."""
    wandb.init(project="RLA2cartepole-wobase")
    # wandb.config is handed to objective() as its hyperparameter mapping.
    trial_result = {"score": objective(wandb.config)}
    wandb.log(trial_result)

# 2: Describe the sweep - Bayesian search that minimises the logged "score".
sweep_configuration = dict(
    method="bayes",
    metric=dict(goal="minimize", name="score"),
    parameters=dict(
        # Learning-rate range, given as min/max bounds.
        LR=dict(max=1e-2, min=1e-5),
        # Candidate hidden-layer widths.
        network_size=dict(values=[64, 256, 128]),
    ),
)

# 3: Register the sweep, then run ten trials of main() under it.
sweep_id = wandb.sweep(sweep=sweep_configuration, project="RLA2cartepole-wobase")

wandb.agent(sweep_id, function=main, count=10)

  return LooseVersion(v) >= LooseVersion(check)


Create sweep with ID: hmh76vmf
Sweep URL: https://wandb.ai/deep_learning_duri/RLA2cartepole-wobase/sweeps/hmh76vmf


[34m[1mwandb[0m: Agent Starting Run: gpxzb2l9 with config:
[34m[1mwandb[0m: 	LR: 0.004645806369298588
[34m[1mwandb[0m: 	network_size: 128
[34m[1mwandb[0m: Currently logged in as: [33mdhurilkun[0m ([33mdeep_learning_duri[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'LR': 0.004645806369298588, 'network_size': 128} Its just getting started 


  if not isinstance(terminated, (bool, np.bool8)):


Ep: 400 last 100 episodes reward is  : 256.77
problem solved at episode 490
{'LR': 0.004645806369298588, 'network_size': 128} 141680.91117138852
Ep: 400 last 100 episodes reward is  : 274.94
Ep: 800 last 100 episodes reward is  : 241.59
problem solved at episode 886
{'LR': 0.004645806369298588, 'network_size': 128} 259334.08986103907
Ep: 400 last 100 episodes reward is  : 203.57
problem solved at episode 736
{'LR': 0.004645806369298588, 'network_size': 128} 230915.85514226148
Ep: 400 last 100 episodes reward is  : 168.1
Ep: 800 last 100 episodes reward is  : 138.34
Ep: 1200 last 100 episodes reward is  : 450.3
Ep: 1600 last 100 episodes reward is  : 341.7
problem solved at episode 1716
{'LR': 0.004645806369298588, 'network_size': 128} 451552.6458379373
Ep: 400 last 100 episodes reward is  : 110.74
problem solved at episode 614
{'LR': 0.004645806369298588, 'network_size': 128} 194783.56433373253
regret_avg: 1278267.0663463587 for {'LR': 0.004645806369298588, 'network_size': 128}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,1278267.06635


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: is72v6gw with config:
[34m[1mwandb[0m: 	LR: 0.0038212385249032583
[34m[1mwandb[0m: 	network_size: 256


{'LR': 0.0038212385249032583, 'network_size': 256} Its just getting started 
Ep: 400 last 100 episodes reward is  : 11.92
Ep: 800 last 100 episodes reward is  : 83.17
Ep: 1200 last 100 episodes reward is  : 204.27
Ep: 1600 last 100 episodes reward is  : 221.02
Ep: 2000 last 100 episodes reward is  : 137.64
Ep: 2400 last 100 episodes reward is  : 115.26
problem solved at episode 2720
{'LR': 0.0038212385249032583, 'network_size': 256} 958452.8748216162
Ep: 400 last 100 episodes reward is  : 87.98
Ep: 800 last 100 episodes reward is  : 102.89
Ep: 1200 last 100 episodes reward is  : 101.95
Ep: 1600 last 100 episodes reward is  : 110.87
problem solved at episode 1727
{'LR': 0.0038212385249032583, 'network_size': 256} 604892.7964239748
Ep: 400 last 100 episodes reward is  : 9.43
Ep: 800 last 100 episodes reward is  : 9.55
Ep: 1200 last 100 episodes reward is  : 9.55
Ep: 1600 last 100 episodes reward is  : 9.33
Ep: 2000 last 100 episodes reward is  : 9.47
Ep: 2400 last 100 episodes reward is 

VBox(children=(Label(value='0.013 MB of 0.013 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,9125423.94375


[34m[1mwandb[0m: Agent Starting Run: j6iyjx50 with config:
[34m[1mwandb[0m: 	LR: 0.006853077550144784
[34m[1mwandb[0m: 	network_size: 256


{'LR': 0.006853077550144784, 'network_size': 256} Its just getting started 
Ep: 400 last 100 episodes reward is  : 87.39
Ep: 800 last 100 episodes reward is  : 9.35
Ep: 1200 last 100 episodes reward is  : 9.44
Ep: 1600 last 100 episodes reward is  : 9.39
Ep: 2000 last 100 episodes reward is  : 9.33
Ep: 2400 last 100 episodes reward is  : 9.38
Ep: 2800 last 100 episodes reward is  : 9.35
Ep: 3200 last 100 episodes reward is  : 9.59
Ep: 3600 last 100 episodes reward is  : 9.3
Ep: 4000 last 100 episodes reward is  : 9.37
Ep: 4400 last 100 episodes reward is  : 9.36
Ep: 4800 last 100 episodes reward is  : 9.41
Ep: 5200 last 100 episodes reward is  : 9.31
Ep: 5600 last 100 episodes reward is  : 9.38
Ep: 6000 last 100 episodes reward is  : 9.4
Ep: 6400 last 100 episodes reward is  : 9.3
Ep: 6800 last 100 episodes reward is  : 9.47
Ep: 7200 last 100 episodes reward is  : 9.31
Ep: 7600 last 100 episodes reward is  : 9.36
Ep: 8000 last 100 episodes reward is  : 9.5
Ep: 8400 last 100 episodes re

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
score,▁

0,1
score,14995928.10722


[34m[1mwandb[0m: Agent Starting Run: hfsintoc with config:
[34m[1mwandb[0m: 	LR: 0.004698292646577228
[34m[1mwandb[0m: 	network_size: 128


{'LR': 0.004698292646577228, 'network_size': 128} Its just getting started 
Ep: 400 last 100 episodes reward is  : 403.01
problem solved at episode 424
{'LR': 0.004698292646577228, 'network_size': 128} 92589.81004913758
Ep: 400 last 100 episodes reward is  : 152.97
Ep: 800 last 100 episodes reward is  : 446.21
Ep: 1200 last 100 episodes reward is  : 94.1
problem solved at episode 1456
{'LR': 0.004698292646577228, 'network_size': 128} 397661.93824103306
Ep: 400 last 100 episodes reward is  : 81.48
Ep: 800 last 100 episodes reward is  : 88.35
problem solved at episode 1103
{'LR': 0.004698292646577228, 'network_size': 128} 394229.5553144145
problem solved at episode 394
{'LR': 0.004698292646577228, 'network_size': 128} 143361.9515457032
Ep: 400 last 100 episodes reward is  : 140.3
Ep: 800 last 100 episodes reward is  : 142.57
Ep: 1200 last 100 episodes reward is  : 226.06
Ep: 1600 last 100 episodes reward is  : 103.25
Ep: 2000 last 100 episodes reward is  : 110.19
Ep: 2400 last 100 episod

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Disabled reward plot, kept as a bare string so it is not executed.
# NOTE(review): in the visible cells `total_rewards` is assigned only inside objective(),
# so the rewards would have to be exposed at module level before re-enabling this.
'''
plt.plot(total_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')

plt.show()'''
