In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque

In [None]:
# Discount factor applied to future rewards.
DISCOUNT_FACTOR = 0.99
# Lower-case alias carrying the same discount value.
gamma = DISCOUNT_FACTOR

# Number of episodes to run.
NUM_EPISODES = 10_000

# Maximum number of steps per episode.
MAX_STEPS = 10_000

# Device string to run the model on: CUDA when torch reports it, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
class PolicyNetwork(nn.Module):
    """One-hidden-layer network that maps an observation to action probabilities.

    Args:
        observation_space: Number of input features.
        action_space: Number of discrete actions (size of the output).
        seed: Value passed to ``torch.manual_seed`` before the layers are built.
        layer_size: Width of the hidden layer.
    """

    def __init__(self, observation_space, action_space, seed, layer_size):
        super().__init__()
        # torch.manual_seed seeds the global RNG; its return value is kept on the module.
        self.seed = torch.manual_seed(seed)
        self.input_layer = nn.Linear(observation_space, layer_size)
        self.output_layer = nn.Linear(layer_size, action_space)

    def forward(self, x):
        """Return a softmax distribution over actions along the last dimension of ``x``."""
        hidden = F.relu(self.input_layer(x))
        logits = self.output_layer(hidden)
        return F.softmax(logits, dim=-1)

In [None]:

class StateValueNetwork(nn.Module):
    """One-hidden-layer network that maps an observation to a scalar state value.

    Args:
        observation_space: Number of input features.
        seed: Accepted for signature parity with ``PolicyNetwork``; not used here.
        layer_size: Width of the hidden layer. Previously this argument was
            ignored and the width was fixed at 128.
    """

    def __init__(self, observation_space, seed, layer_size):
        super(StateValueNetwork, self).__init__()
        # Honour the requested hidden width instead of a hard-coded 128.
        self.input_layer = nn.Linear(observation_space, layer_size)
        self.output_layer = nn.Linear(layer_size, 1)

    def forward(self, x):
        """Return the value estimate; the last dimension of the output has size 1."""
        x = self.input_layer(x)
        x = F.relu(x)
        state_value = self.output_layer(x)
        return state_value

In [None]:
def process_rewards(rewards, gamma):
    """Compute the discounted return for every step of one episode.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied to future rewards. This argument is
            now actually used; earlier the function read a module-level
            constant and ignored it.

    Returns:
        A 1-D ``torch.float32`` tensor where element ``t`` equals
        ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.
        An empty ``rewards`` yields an empty tensor.
    """
    returns = []
    running = 0.0
    # Walk backwards so each return reuses the one after it; append + reverse
    # avoids the quadratic cost of inserting at the front of a list.
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32)

In [None]:
def train_value(G, state_vals, optimizer):
    """Take one optimizer step on the mean-squared error between values and returns.

    Args:
        G: Target tensor (the returns).
        state_vals: Predicted values; must be attached to the graph of the
            parameters that ``optimizer`` updates.
        optimizer: Optimizer whose gradients are cleared, then stepped.
    """
    # Clear stale gradients first so only this loss contributes to the step.
    optimizer.zero_grad()
    loss = F.mse_loss(state_vals, G)
    loss.backward()
    optimizer.step()

In [None]:
# One seed per training run.
seed_list = [1, 42, 30, 25, 17]

In [None]:
def reset_weights(model):
    """Re-initialise every layer of ``model`` that exposes ``reset_parameters``.

    Walks ``model.modules()`` (the model itself plus all descendants) rather
    than only its direct children, so layers nested inside containers such as
    ``nn.Sequential`` are reset as well. Modules without a
    ``reset_parameters`` method are left untouched.

    Args:
        model: The ``nn.Module`` whose parameters should be re-initialised.
    """
    for module in model.modules():
        reset = getattr(module, "reset_parameters", None)
        if callable(reset):
            reset()

In [None]:
# Trailing-average reward at which a run counts as solved (also the regret
# reference level), and the number of recent episodes in that average.
_SOLVED_REWARD = 475
_REWARD_WINDOW = 100


def _reset_env(env, seed=None):
    """Reset ``env`` and return only the observation, across gym API versions."""
    if seed is None:
        out = env.reset()
    else:
        try:
            out = env.reset(seed=seed)
        except TypeError:
            # Older gym: reset() takes no seed argument; seed via env.seed().
            env.seed(seed)
            out = env.reset()
    # gym >= 0.26 returns (observation, info); older versions return the
    # observation alone.
    if isinstance(out, tuple) and len(out) == 2:
        return out[0]
    return out


def _step_env(env, action):
    """Step ``env`` and return ``(next_state, reward, done)`` for 4- or 5-tuple APIs."""
    out = env.step(action)
    if len(out) == 5:
        next_state, reward, terminated, truncated, _ = out
        return next_state, reward, bool(terminated or truncated)
    next_state, reward, done, _ = out
    return next_state, reward, bool(done)


def _action_probabilities(policy_network, state):
    """Return the policy's action probabilities for ``state`` as a NumPy array."""
    with torch.no_grad():
        return policy_network(state).numpy()


def _run_episode(env, policy_network, action_space, rng, seed=None):
    """Roll out one episode, returning the visited states, actions and rewards."""
    state = torch.tensor(_reset_env(env, seed), dtype=torch.float32)
    states, actions, rewards = [], [], []
    for _ in range(MAX_STEPS):
        probs = _action_probabilities(policy_network, state)
        if np.isnan(probs).any():
            # Best effort: discard the partial episode and retry once from a
            # fresh start state before giving up.
            states, actions, rewards = [], [], []
            state = torch.tensor(_reset_env(env), dtype=torch.float32)
            probs = _action_probabilities(policy_network, state)
            if np.isnan(probs).any():
                raise ValueError(
                    "policy network produced NaN action probabilities; "
                    "training has diverged (try a smaller learning rate)"
                )
        action = rng.choice(action_space, p=probs)
        next_state, reward, done = _step_env(env, action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        if done:
            break
        state = torch.tensor(next_state, dtype=torch.float32)
    return states, actions, rewards


def _update_networks(states, actions, rewards, policy_network, stateval_network,
                     policy_optimizer, stateval_optimizer):
    """Apply one value-network step and one policy-gradient step for an episode."""
    returns = process_rewards(rewards, gamma).float()
    state_batch = torch.stack(states)
    # squeeze(-1) keeps a 1-D tensor even for a single-step episode.
    state_vals = stateval_network(state_batch).squeeze(-1)
    # Advantages use the value estimates from before the value update and
    # must not carry gradients into the policy loss.
    advantages = (returns - state_vals).detach()
    train_value(returns, state_vals, stateval_optimizer)

    log_probs = torch.log(policy_network(state_batch))
    action_idx = torch.as_tensor(np.asarray(actions), dtype=torch.long)
    chosen_log_probs = log_probs[torch.arange(len(actions)), action_idx]
    policy_loss = -(advantages * chosen_log_probs).sum()
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()


def _train_single_run(env, seed, params):
    """Train both networks on ``env`` for one seed and return the accumulated regret."""
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n

    policy_network = PolicyNetwork(obs_dim, n_actions, seed, params["network_size"])
    stateval_network = StateValueNetwork(obs_dim, seed, params["network_size"])
    reset_weights(policy_network)
    reset_weights(stateval_network)

    policy_optimizer = optim.Adam(policy_network.parameters(), lr=params["LR"])
    stateval_optimizer = optim.Adam(stateval_network.parameters(), lr=params["LR"])

    # Dedicated generator so action sampling is reproducible per seed without
    # touching NumPy's global random state.
    rng = np.random.default_rng(seed)
    action_space = np.arange(n_actions)

    episode_rewards = []
    regret = 0
    solved = False
    ep = 0
    while ep < NUM_EPISODES:
        # Seed the environment on the first reset only; later resets continue
        # from the environment's own random stream.
        states, actions, rewards = _run_episode(
            env, policy_network, action_space, rng, seed if ep == 0 else None
        )
        episode_rewards.append(sum(rewards))
        _update_networks(states, actions, rewards, policy_network, stateval_network,
                         policy_optimizer, stateval_optimizer)

        avg_rewards = np.mean(episode_rewards[-_REWARD_WINDOW:])
        regret = regret + _SOLVED_REWARD - avg_rewards

        ep += 1
        if ep % 400 == 0:
            print("Ep:", ep, "last 100 episodes reward is  :", avg_rewards, end="\n")
        if avg_rewards > _SOLVED_REWARD:
            solved = True
            break

    if solved:
        print("problem solved at episode", ep)
    else:
        print("problem not solved after", ep, "episodes")
    return regret


def avg_over_5_runs(params):
    """Train on CartPole-v1 once per entry of ``seed_list`` and average the regret.

    Each episode adds ``475 - (mean reward of the last 100 episodes)`` to the
    run's regret. A run stops early once that trailing mean exceeds 475, or
    after ``NUM_EPISODES`` episodes.

    Args:
        params: Mapping with keys ``"LR"`` (learning rate used for both Adam
            optimizers) and ``"network_size"`` (hidden width passed to both
            networks).

    Returns:
        The regret averaged over all seeds in ``seed_list``.

    Raises:
        ValueError: If the policy network keeps producing NaN probabilities
            after one episode restart.
    """
    total_regret = 0
    for seed in seed_list:
        env = gym.make('CartPole-v1')
        try:
            total_regret = total_regret + _train_single_run(env, seed, params)
        finally:
            # Release the environment even if training raises.
            env.close()
    return total_regret / len(seed_list)



In [None]:
!pip install scikit-optimize
from skopt import gp_minimize

In [None]:
# Disabled experiment: the code below sits inside a bare string literal, so it
# is never executed.
# NOTE(review): avg_over_5_runs indexes its argument with "LR" and
# "network_size", while this call hands gp_minimize a single-dimension bounds
# list -- presumably an adapter is needed before re-enabling; confirm.
'''
bounds = [(1e-4, 1e-2)]

# Run the Bayesian optimization
res = gp_minimize(avg_over_5_runs, bounds, n_calls=10, random_state=0)
print(res)
# Print the best parameters
print(f"Best parameters: learning rate = {res.x[0]}")
'''

In [None]:

import matplotlib.pyplot as plt


In [None]:
!pip install wandb


In [None]:
!wandb login

In [None]:
import wandb

def main():
    """Run one sweep trial: start a wandb run, score its config, and log the score."""
    wandb.init(project="RLA2cartepole-withbase")
    # wandb.config is passed straight through as the hyperparameter mapping.
    wandb.log({"score": avg_over_5_runs(wandb.config)})

# 2: Define the search space
# Bayesian search that minimizes the logged "score".
sweep_configuration = {"method": "bayes"}
sweep_configuration["metric"] = {"goal": "minimize", "name": "score"}
sweep_configuration["parameters"] = {
    # Learning-rate range.
    "LR": {"max": 1e-2, "min": 1e-5},
    # Candidate hidden-layer widths.
    "network_size": {"values": [64, 256, 128]},
}

# 3: Start the sweep
# Register the sweep under the project, then have an agent call main() for 10 trials.
sweep_id = wandb.sweep(
    sweep=sweep_configuration,
    project="RLA2cartepole-withbase",
)

wandb.agent(
    sweep_id,
    function=main,
    count=10,
)

In [None]:
# Disabled plotting snippet: kept inside a bare string literal, so it is never
# executed.
# NOTE(review): no module-level `total_rewards` is assigned in the visible
# cells; the training function would need to expose its per-episode rewards
# before this can be re-enabled -- confirm.
'''
plt.plot(total_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show() '''