In [314]:
import gym
import numpy
import torch
from torch import nn #needed for building neural networks
import torch.nn.functional as F #needed for activation functions
import torch.optim as opt #needed for optimisation


In [315]:
# implement with a sum tree backed by an array so priority experience replay can be added.

# TODO: add priority experience replay 

from collections import deque
import random
import numpy as np


class SumTree:
    write = 0
    
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = numpy.zeros( 2*capacity - 1 )
        self.data = numpy.zeros( capacity, dtype=object )

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2

        self.tree[parent] += change

        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left = 2 * idx + 1
        right = left + 1

        if left >= len(self.tree):
            return idx

        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            return self._retrieve(right, s-self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, p, data):
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])
    
    # returns the remaining capacity of the sumtree
    def size(self):
        return self.write

In [316]:
class replayBuffer:
    e = 0.01
    a2 = .6
    
    def __init__(self, capacity):        
        self.tree = SumTree(capacity)
        self.capacity = capacity
        
    def _get_priority(self, error):
        return (error + self.e) ** self.a2

    def add(self, error, s, a, r, t, s2):
        experience=(s, a, r, t, s2)
        p = self._get_priority(error)
        self.tree.add(p, experience)

    def sample(self, batch_size):
        batch = []
        segment = self.tree.total()/batch_size
        
        for i in range(batch_size):
            a = segment * i
            b = segment * (i + 1)
            
            s = random.uniform(a,b)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx,data))
        
        loc, batch = map(np.stack, zip(*batch))
        s, a, r, t, s2  = map(np.stack, zip(*batch))
        
        return s, a, r, t, s2
    
    def update(self, idx, error):
        p = self._getPriority(error)
        self.tree.update(idx, p)   
        
    def size(self):
        size = self.tree.size()
        return size

In [317]:
# 3 fully connected layers with hyperbolic tanget as output activation function
# Takes state and action as input and outputs the Q-value

layer_1=400   #neurons of 1st layers
layer_2=300   #neurons of 2nd layers

def fanin_(size):
    fan_in = size[0]
    weight = 1./np.sqrt(fan_in)
    return torch.Tensor(size).uniform_(-weight, weight)

class Critic(nn.Module):
    
    def __init__(self, state_dim, action_dim, h1=layer_1, h2=layer_2, init_w=3e-3):
        '''create the fully connected layers
           nn.Linear(# input nodes, # next layer nodes)
        '''
        super(Critic, self).__init__()
                
        self.linear1 = nn.Linear(state_dim, h1)
        self.linear1.weight.data = fanin_(self.linear1.weight.data.size())
        
        #self.bn1 = nn.BatchNorm1d(h1)
        
        self.linear2 = nn.Linear(h1+action_dim, h2)
        self.linear2.weight.data = fanin_(self.linear2.weight.data.size())
                
        self.linear3 = nn.Linear(h2, 1)
        self.linear3.weight.data.uniform_(-init_w, init_w)

        self.relu = nn.ReLU()
        
    def forward(self, state, action):
        '''the input data is defined as x at each step
            x is replaced at each stage, feeding it to the next level
        '''
        print ('action 1 ' + str(action.size()))
        print ('state 1  ' + str(state.size()))
        x = self.linear1(state)
        x = self.relu(x)
        x = self.linear2(torch.cat([x,action],dim=1)) 
        x = self.relu(x)
        x = self.linear3(x)
        
        return x


class Actor(nn.Module): 
    def __init__(self, state_dim, action_dim, h1=layer_1, h2=layer_2, init_w=0.003):
        super(Actor, self).__init__()
        
        #self.bn0 = nn.BatchNorm1d(state_dim)
        
        self.linear1 = nn.Linear(state_dim, h1)
        self.linear1.weight.data = fanin_(self.linear1.weight.data.size())
        
        #self.bn1 = nn.BatchNorm1d(h1)
        
        self.linear2 = nn.Linear(h1, h2)
        self.linear2.weight.data = fanin_(self.linear2.weight.data.size())
        
        #self.bn2 = nn.BatchNorm1d(h2)
        
        self.linear3 = nn.Linear(h2, action_dim)
        self.linear3.weight.data.uniform_(-init_w, init_w)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        
    def forward(self, state):
        #state = self.bn0(state)
        x = self.linear1(state)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.tanh(x)
        return x
    
    def get_action(self, state):
        state  = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = self.forward(state)
        return action.detach().cpu().numpy()[0]

In [318]:
class NormalizedEnv(gym.ActionWrapper):
    """ Wrap action """

    def _action(self, action):
        act_k = (self.action_space.high - self.action_space.low)/ 2.
        act_b = (self.action_space.high + self.action_space.low)/ 2.
        return act_k * action + act_b

    def _reverse_action(self, action):
        act_k_inv = 2./(self.action_space.high - self.action_space.low)
        act_b = (self.action_space.high + self.action_space.low)/ 2.
        return act_k_inv * (action - act_b)

In [320]:
### NEED TO IMPLEMENT THE ERROR###

### Set Variables ###
environment = 'CartPole-v0'
error = 0  
buffer_size = 1000 # replace with 1000000 after testing
batch_size = 64
gamma = 0.99
tau = 0.001
lra = 0.0001
lrc = 0.001



simulations = 50 # replace with 50000 after testing
steps = 200

# create the memory replay
memory = replayBuffer(buffer_size)

# set up mean square error
MSE = nn.MSELoss()

# set up the gym space
env = gym.make(environment)

                    
# get the shape of the environment
state_dim = env.observation_space.shape[0]

# for discrete action spaces
action_dim = env.action_space.n

# for continuous action spaces
# action_dim = env.action_space.shape[0]

# set up training (actor and critic)
critic  = Critic(state_dim, action_dim).to(device)
actor = Actor(state_dim, action_dim).to(device)

target_critic  = Critic(state_dim, action_dim).to(device)
target_actor = Actor(state_dim, action_dim).to(device)

q_optimizer  = opt.Adam(critic.parameters(),  lr=lrc)#, weight_decay=0.01)
policy_optimizer = opt.Adam(actor.parameters(), lr=lra)

for target_param, param in zip(target_critic.parameters(), critic.parameters()):
    target_param.data.copy_(param.data)

for target_param, param in zip(target_actor.parameters(), actor.parameters()):
    target_param.data.copy_(param.data)

print("State dim: {}, Action dim: {}".format(state_dim, action_dim))

for i in range(simulations): # number of times to run the simulation
    
    # get the initial observation at simulation start
    initial_observation = env.reset()
    
    # set reward
    ep_reward = 0.
    
    for t in range (steps): # number of time steps in each simulation 
        
        # uncomment to see the simulation
        # env.render()  
        
        # select action (can implement a policy here)
        action = env.action_space.sample() 
        
        # execute action and observe reward, and next state
        new_observation, reward, done, info = env.step(action)
        
        #TODO: add noise to action
        
        
        
        #TODO: add initial error
        
        
        
        # store current step, and next step 
        memory.add(error, initial_observation, action, reward, done, new_observation)  
        
       
        # sample mini-batch of transitions if the memory is bigger than a batch
        if (memory.size() >= batch_size):
            
            #TODO: implement priority sampling.
            
            
            
            
            state_1, action, reward, status, state_2 = memory.sample(batch_size)
            
            # copy data into tensors
            state_1 = torch.FloatTensor(state_1).to(device)
            action = torch.FloatTensor(action).to(device)
            reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
            status = torch.FloatTensor(np.float32(status)).unsqueeze(1).to(device)
            state_2 = torch.FloatTensor(state_2).to(device)         
            print('if loop action')
            print(action.size())
            
            # compute the loss for the critic
            action_2 = target_actor(state_2)
            target_q = target_critic(state_2, action_2)
            y = reward + (1.0 - status) * gamma * target_q.detach()
            q = critic(state_1, action)
            
            q_optimizer.zero_grad()
            q_loss = MSE(q, y) #detach to avoid updating target
            q_loss.backward()
            q_optimizer.step()            
            
            #compute loss for actor
            policy_optimizer.zero_grad()
            policy_loss = -critic(state_1, actor(state_1))
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            policy_optimizer.step()
            
            #soft update of the frozen target networks
            for target_param, param in zip(target_critic.parameters(), critic.parameters()):
                target_param.data.copy_(
                    target_param.data * (1.0 - tau) + param.data * tau
                )

            for target_param, param in zip(target_actor.parameters(), actor.parameters()):
                target_param.data.copy_(
                    target_param.data * (1.0 - tau) + param.data * tau
                )
        # make next step the current step
        initial_observation = new_observation
        ep_reward += reward
        
        if done:
            print('Episode Finishd afer {} timesteps.'.format(t+1))
            break

env.close()

State dim: 4, Action dim: 2
Episode Finishd afer 18 timesteps.
Episode Finishd afer 15 timesteps.
Episode Finishd afer 14 timesteps.
Episode Finishd afer 16 timesteps.
if loop action
torch.Size([64])
action 1 torch.Size([64, 2])
state 1  torch.Size([64, 4])
action 1 torch.Size([64])
state 1  torch.Size([64, 4])


RuntimeError: invalid argument 0: Tensors must have same number of dimensions: got 2 and 1 at /pytorch/aten/src/TH/generic/THTensor.cpp:702

In [None]:
# run on cpu, not set up for GPU as of yet...
cuda = torch.cuda.is_available() #check for CUDA
device   = torch.device("cuda" if cuda else "cpu")
print("Job will run on {}".format(device))

In [None]:
env.close()