In [15]:
%load_ext autoreload
%autoreload 2

from helpers import NormalizedEnv
from helpers import RandomAgent
import gym as gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import torch.nn as nn

from copy import deepcopy
from tqdm import tqdm
import importlib
from helpers import NormalizedEnv

from heuristicpolicy import HeuristicPendulumAgent
from qnetwork import QNetwork
from replaybuffer import ReplayBuffer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
#Creating pendulum object
pendulum = gym.make('Pendulum-v1', g=9.81)
#Wrapping pendulum to map output space to [-1,1]
pendulum = NormalizedEnv(pendulum)
#Create random agent
Random_Agent = RandomAgent(pendulum)

In [17]:
#set GPU for faster training
cuda = torch.cuda.is_available() #check for CUDA
device   = torch.device("cuda" if cuda else "cpu")
print("Job will run on {}".format(device))

Job will run on cpu


In [18]:
MAX_IT = 200
BUFFER_SIZE = 1e4

BATCH_SIZE = 128
LEARNING_RATE = 1e-4
NUM_EPISODES = 1000
GAMMA = 0.99

In [19]:
torch.manual_seed(-1)

heuristic_agent = HeuristicPendulumAgent(pendulum)
network  = QNetwork().to(device) # critic
memory = ReplayBuffer(max_size=BUFFER_SIZE)
q_optimizer  = optim.Adam(network.parameters(),  lr=LEARNING_RATE)

MSE = nn.MSELoss()

In [20]:
def onestepTD(state_batch, action_batch, reward_batch, next_state_batch, trunc_batch, agent, gamma, network, iter):

    state = torch.FloatTensor(state_batch).to(device)
    action = torch.FloatTensor(action_batch).unsqueeze(1).to(device)
    reward = torch.FloatTensor(np.array(reward_batch)).unsqueeze(1).to(device) #.unsqueeze(1)
    trunc = torch.Tensor(np.float32(trunc_batch)).unsqueeze(1).to(device)
    next_state= torch.FloatTensor(next_state_batch).to(device)
    
    action_next_state = agent.compute_action(state = next_state.T) 

    # To compute the targets in each step, do not use the on-trajectory action, 
    # but compute a new action according to the policy
    
    with torch.no_grad():
        network_input_state = torch.cat((state, action),1)
        network_input_next_state = torch.cat((next_state, torch.Tensor(action_next_state).unsqueeze(1)),1)
        q_next = network(network_input_next_state) # should not be differentiated 

    if iter == MAX_IT:
        q_next = 0
    target_state = reward + gamma * q_next 

    q = network(network_input_state)

    q_optimizer.zero_grad()
    q_loss = MSE(target_state, q)
    q_loss.backward()
    q_optimizer.step()

    return q_loss
    

In [21]:
plot_reward = []
plot_policy = []
plot_q = []
plot_steps = []


best_reward = -np.inf
saved_reward = -np.inf
saved_ep = 0
average_reward = 0
global_step = 0
nr_of_samples = 128


In [22]:
for episode in tqdm(range(NUM_EPISODES)):
    current_state = deepcopy(pendulum.reset()[0])

    ep_reward = 0.
    ep_q_value = 0.
    step = 0

    # collect experience
    for i in range(MAX_IT):
        # how many iterations??
        action = heuristic_agent.compute_action(state = current_state)
        transformed_action = pendulum.action(action)
        next_state, reward, term, trunc, info = pendulum.step(transformed_action)

        memory.add_transition(state = current_state, action = action, reward = reward, next_state = next_state, trunc = trunc)

        if memory.count() > nr_of_samples:
            # sample a batch of transitions from the replay buffer
            state_batch, action_batch, reward_batch, next_state_batch, trunc_batch = memory.sample_transition(nr_of_samples)

            # 1-step TD-learning rule
            q_loss = onestepTD(state_batch, action_batch, reward_batch, next_state_batch, trunc_batch, heuristic_agent, GAMMA, network, i)
                       
        
        current_state = deepcopy(next_state)
        ep_reward += reward
    try:
        plot_reward.append([ep_reward, episode+1])
        plot_q.append([q_loss.data, episode+1])

        if (episode % 100 == 0):
            print(q_loss.item())
    except:
        continue


  0%|          | 1/1000 [00:01<24:35,  1.48s/it]

21.740352630615234


 10%|█         | 101/1000 [06:48<1:17:26,  5.17s/it]

21.805213928222656


 20%|██        | 201/1000 [13:20<48:23,  3.63s/it]  

10.628854751586914


 30%|███       | 301/1000 [19:44<52:55,  4.54s/it]

22.099409103393555


 40%|████      | 401/1000 [26:31<40:09,  4.02s/it]

14.112615585327148


 50%|█████     | 501/1000 [33:16<32:19,  3.89s/it]

17.060672760009766


 60%|██████    | 601/1000 [40:15<29:45,  4.48s/it]

14.915605545043945


 70%|███████   | 701/1000 [46:56<18:54,  3.79s/it]

14.09168529510498


 80%|███████▉  | 798/1000 [53:17<13:20,  3.96s/it]

In [None]:
q = list(zip(*plot_q))
plt.plot(list(q[1]), list(q[0]), 'g') #row=0, col=1
plt.title('The Mean Squared error of the Q value of the critic network')
plt.xlabel('epoch')
plt.ylabel('MSE')