In [12]:
import gym
import numpy as np
from collections import deque

In [13]:
# Create the LunarLander-v2 environment that the rest of the notebook inspects,
# trains on and replays.
env = gym.make("LunarLander-v2")

In [14]:
#Actions (numbering as given in the Gym LunarLander documentation):
#         0 - do nothing
#         1 - Fire left orientation engine
#         2 - Fire main engine
#         3 - Fire right orientation engine
# Print the action and observation spaces so the network input/output sizes
# used later can be checked against them.
print("Action space {}".format(env.action_space))
print("State space {}".format(env.observation_space))

Action space Discrete(4)
State space Box(-inf, inf, (8,), float32)


In [16]:
def relu(mat):
    """Rectified linear unit applied element-wise.

    Entries of ``mat`` that are strictly positive are kept; every other entry
    is multiplied by ``False`` (i.e. zeroed).
    """
    positive_mask = mat > 0
    return np.multiply(mat, positive_mask)

In [17]:
def relu_derivative(mat):
    """Element-wise derivative of ``relu``: 1 where ``mat > 0``, otherwise 0."""
    is_positive = mat > 0
    # Multiplying the boolean mask by 1 turns True/False into integer 1/0.
    return is_positive * 1

In [35]:
class NNLayer:
    """A single fully connected layer trained with plain gradient descent.

    The weight matrix has shape ``(input_size, output_size)``. ``forward``
    appends a constant 1 to its input before multiplying, so ``input_size``
    must already include that extra bias slot.
    """

    def __init__(self, input_size, output_size, activation=None, lr=0.001,
                 activation_derivative=None):
        """Create the layer with weights drawn uniformly from [-0.5, 0.5).

        Args:
            input_size: Number of inputs *including* the bias slot.
            output_size: Number of outputs.
            activation: Optional callable applied to the layer's linear output.
            lr: Step size used by ``update_weights``.
            activation_derivative: Optional callable giving the derivative of
                ``activation`` evaluated at the pre-activation values. When an
                activation is set and this is left as None, ``relu_derivative``
                is used, which matches the layer's previous behaviour.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.weights = np.random.uniform(low=-0.5, high=0.5, size=(input_size, output_size))
        self.activation_function = activation
        self.activation_derivative = activation_derivative
        self.lr = lr
        #Filled in by forward(..., remember_for_backprop=True); read by backward()
        self.backward_store_in = None
        self.backward_store_out = None

    def update_weights(self, gradient):
        """Take one gradient-descent step: ``weights -= lr * gradient``."""
        self.weights = self.weights - self.lr*gradient

    #Backpropagate this layer
    def backward(self, gradient_from_above):
        """Update this layer's weights and return the gradient for the layer below.

        Args:
            gradient_from_above: Gradient of the loss with respect to this
                layer's output (presumably 1-D with ``output_size`` entries,
                matching what ``forward`` returned -- confirm against caller).

        Returns:
            Gradient with respect to this layer's input, with the entry that
            belongs to the appended bias term removed.

        Raises:
            RuntimeError: If no forward pass has stored its values yet.
        """
        if self.backward_store_in is None:
            raise RuntimeError(
                "backward() called before forward(..., remember_for_backprop=True)"
            )

        adjusted_mul = gradient_from_above

        #this is pointwise
        if self.activation_function is not None:
            derivative = self.activation_derivative
            if derivative is None:
                #Default kept for backward compatibility with relu layers
                derivative = relu_derivative
            adjusted_mul = np.multiply(derivative(self.backward_store_out), gradient_from_above)

        #Derivative of the loss function with respect to weights:
        #outer product of the stored (bias-augmented) input and the adjusted gradient
        D_i = np.outer(self.backward_store_in, adjusted_mul)

        #Calculated error for the layer below; it must use the weights from
        #before the update, and [:-1] drops the bias slot
        delta_i = np.dot(adjusted_mul, np.transpose(self.weights))[:-1]

        self.update_weights(D_i)
        return delta_i

    # Compute the forward pass for this layer
    def forward(self, inputs, remember_for_backprop=True):
        """Compute the layer output for ``inputs``.

        Args:
            inputs: Input values; ``np.append`` flattens them and adds a
                trailing 1 as the bias input.
            remember_for_backprop: When True, keep the bias-augmented input and
                the pre-activation output for a later ``backward`` call.

        Returns:
            The activated output, or the linear output if no activation is set.
        """
        #Append a bias term to the input
        input_with_bias = np.append(inputs, 1)

        #Product of the input and the weight matrix at this layer
        unactivated = np.dot(input_with_bias, self.weights)

        #Send output through an activation function (if defined)
        output = unactivated
        if self.activation_function is not None:
            output = self.activation_function(output)

        if remember_for_backprop:
            #store variables for backward pass
            self.backward_store_in = input_with_bias
            self.backward_store_out = np.copy(unactivated)

        return output

In [48]:
#TEMP
class RLAgent:
    """Epsilon-greedy Q-learning agent built from ``NNLayer`` objects.

    NOTE: a class with this same name is defined again in a later cell of this
    notebook; when the cells run top to bottom, that later definition replaces
    this one.

    The replay target does not use the environment's reward: it is -1 for a
    transition that ended the episode and ``1 + gamma * max(Q(next))`` otherwise.
    """
    env = None

    def __init__(self, env):
        """Size the network from ``env``'s spaces and create an empty replay memory."""
        self.env = env
        self.hidden_size = 24
        self.input_size = env.observation_space.shape[0]
        self.output_size = env.action_space.n
        self.num_hidden_layers = 2
        self.epsilon = 1.0
        self.memory = deque([], 1000000)
        self.gamma = 0.95

        # Every layer is given one extra input for the bias term NNLayer appends.
        hidden_inputs = self.hidden_size + 1
        self.layers = [NNLayer(self.input_size + 1, self.hidden_size, activation=relu)]
        self.layers.extend(
            NNLayer(hidden_inputs, self.hidden_size, activation=relu)
            for _ in range(self.num_hidden_layers - 1)
        )
        self.layers.append(NNLayer(hidden_inputs, self.output_size))

    def select_action(self, observation):
        """Return the greedy action with probability ``1 - epsilon``, else a random one."""
        q_values = self.forward(np.asmatrix(observation))
        if np.random.random() > self.epsilon:
            return np.argmax(q_values)
        return np.random.randint(self.env.action_space.n)

    def forward(self, observation, remember_for_backprop=True):
        """Push a copy of ``observation`` through every layer and return the result."""
        activations = np.copy(observation)
        for layer in self.layers:
            activations = layer.forward(activations, remember_for_backprop)
        return activations

    def remember(self, done, action, observation, prev_obs):
        """Append one transition to the replay memory."""
        transition = [done, action, observation, prev_obs]
        self.memory.append(transition)

    def experience_replay(self, update_size=20):
        """Train on ``update_size`` stored transitions, then decay epsilon and lr.

        Does nothing at all (no decay either) while the memory holds fewer than
        ``update_size`` transitions. Indices are drawn with ``np.random.choice``.
        """
        stored = len(self.memory)
        if stored < update_size:
            return

        for idx in np.random.choice(stored, update_size):
            done, action_selected, new_obs, prev_obs = self.memory[idx]
            # Only the pass over prev_obs is remembered, so backward() sees it.
            predicted = self.forward(prev_obs, remember_for_backprop=True)
            lookahead = self.forward(new_obs, remember_for_backprop=False)
            target = np.copy(predicted)
            target[action_selected] = -1 if done else 1 + self.gamma*np.max(lookahead)
            self.backward(predicted, target)

        # Exploration rate and learning rates stop shrinking below their floors.
        if not self.epsilon < 0.01:
            self.epsilon = self.epsilon*0.997
        for layer in self.layers:
            if not layer.lr < 0.0001:
                layer.lr = layer.lr*0.99

    def backward(self, calculated_values, experimental_values):
        """Backpropagate ``calculated - experimental`` from the last layer to the first."""
        error = calculated_values - experimental_values
        for layer in self.layers[::-1]:
            error = layer.backward(error)

In [55]:
class RLAgent:
    """Epsilon-greedy Q-learning agent whose Q-function is a stack of ``NNLayer``s.

    Transitions are stored with ``remember`` and replayed in random batches by
    ``experience_replay``, which also decays ``epsilon`` and the layers'
    learning rates.
    """
    env = None  # class-level default; every instance sets its own in __init__

    def __init__(self, env):
        """Size the network from ``env``'s spaces and create an empty replay memory.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``.
        """
        self.env = env
        self.hidden_size = 24
        self.input_size = env.observation_space.shape[0]
        self.output_size = env.action_space.n
        self.num_hidden_layers = 2
        self.epsilon = 1.0

        #Each layer gets one extra input for the bias term NNLayer.forward appends
        self.layers = [NNLayer(self.input_size + 1, self.hidden_size, activation=relu)]
        for i in range(self.num_hidden_layers-1):
            self.layers.append(NNLayer(self.hidden_size+1, self.hidden_size, activation=relu))
        self.layers.append(NNLayer(self.hidden_size+1, self.output_size))

        self.memory = deque([],1000000)
        self.gamma = 0.95

    def remember(self, done, action, observation, prev_obs, reward=None):
        """Store one transition in the replay memory.

        Args:
            done: Whether the transition ended the episode.
            action: Index of the action that was taken.
            observation: Observation received after the action.
            prev_obs: Observation the action was chosen from.
            reward: Optional reward received for the transition. When omitted,
                the stored entry has the original four fields and replay falls
                back to the fixed +1 / -1 targets.
        """
        transition = [done, action, observation, prev_obs]
        if reward is not None:
            transition.append(reward)
        self.memory.append(transition)

    def experience_replay(self, update_size=20):
        """Train on ``update_size`` randomly chosen transitions, then decay rates.

        Returns immediately (no training, no decay) while fewer than
        ``update_size`` transitions are stored.
        """
        if len(self.memory) < update_size:
            return

        #Randomly sample from all stored memories
        batch_indices = np.random.choice(len(self.memory), update_size)
        for index in batch_indices:
            #Entries written without a reward have four fields, otherwise five
            done, action_selected, new_obs, prev_obs, *extra = self.memory[index]
            reward = extra[0] if extra else None

            #Only the pass over prev_obs is remembered, so backward() uses it
            action_values = self.forward(prev_obs, remember_for_backprop=True)
            next_action_values = self.forward(new_obs, remember_for_backprop=False)

            #Only the entry for the action taken differs from the prediction,
            #so the error is zero for every other action
            experimental_values = np.copy(action_values)
            experimental_values[action_selected] = self._td_target(done, reward, next_action_values)

            #Backpropagate
            self.backward(action_values, experimental_values)

        self.epsilon = self.epsilon if self.epsilon < 0.01 else self.epsilon*0.996

        for layer in self.layers:
            layer.lr = layer.lr if layer.lr < 0.001 else layer.lr*0.995

    def _td_target(self, done, reward, next_action_values):
        """Return the learning target for the action taken in one transition."""
        if reward is None:
            #Legacy behaviour: constant +1 per step, -1 when the episode ends
            return -1 if done else 1 + self.gamma*np.max(next_action_values)
        if done:
            #No future value after a terminal transition
            return reward
        return reward + self.gamma*np.max(next_action_values)

    #Backpropagate
    def backward(self, calculated_values, experimental_values):
        """Backpropagate ``calculated - experimental`` from the last layer to the first."""
        delta = (calculated_values - experimental_values)
        for layer in reversed(self.layers):
            delta = layer.backward(delta)

    #Feeds forward the input observations through the network to get values for each action
    def forward(self, observation, remember_for_backprop=True):
        """Return the network output for a copy of ``observation``.

        ``remember_for_backprop`` is handed to every layer's ``forward``.
        """
        vals = np.copy(observation)
        for layer in self.layers:
            vals = layer.forward(vals, remember_for_backprop)
        return vals

    #Select an action based on the observation
    def select_action(self, observation):
        """Pick an action epsilon-greedily.

        The network is evaluated only when the greedy branch is taken, and that
        pass is not remembered for backprop because no update follows it.
        """
        if np.random.random() > self.epsilon:
            #Best action
            values = self.forward(np.asmatrix(observation), remember_for_backprop=False)
            return np.argmax(values)
        #Random action
        return np.random.randint(self.env.action_space.n)

In [56]:
NUM_EPISODES = 10000
MAX_TIMESTEPS = 5000
model = RLAgent(env)

#Main loop
# try/finally makes sure the environment is closed even when the run is stopped
# early (for example by interrupting the kernel).
try:
    for i_episode in range(NUM_EPISODES):
        observation = env.reset()

        #Iterating through time steps within an episode
        for t in range(MAX_TIMESTEPS):
            env.render()
            action = model.select_action(observation)
            prev_obs = observation
            observation, reward, done, info = env.step(action)

            #Keep a store of the agent's experiences
            # NOTE(review): the reward returned by env.step is not forwarded to
            # model.remember here -- confirm that is intended.
            model.remember(done, action, observation, prev_obs)
            model.experience_replay(64)

            #epsilon decay
            # NOTE(review): experience_replay() also decays epsilon once it starts
            # training, so epsilon shrinks twice per step -- confirm that is intended.
            model.epsilon = model.epsilon if model.epsilon < 0.01 else model.epsilon*0.995

            if done:
                print("Episode {} ended after {} timesteps".format(i_episode, t+1))
                #print("Episode {} ended after {} timesteps, current exploration is {}".format(i_episode, t+1, model.epsilon))
                break
finally:
    env.close()

Episode 0 ended after 157 timesteps
Episode 1 ended after 100 timesteps
Episode 2 ended after 78 timesteps
Episode 3 ended after 60 timesteps
Episode 4 ended after 71 timesteps
Episode 5 ended after 116 timesteps
Episode 6 ended after 76 timesteps
Episode 7 ended after 72 timesteps
Episode 8 ended after 133 timesteps
Episode 9 ended after 148 timesteps
Episode 10 ended after 147 timesteps
Episode 11 ended after 173 timesteps
Episode 12 ended after 142 timesteps
Episode 13 ended after 82 timesteps
Episode 14 ended after 103 timesteps
Episode 15 ended after 128 timesteps
Episode 16 ended after 107 timesteps
Episode 17 ended after 112 timesteps
Episode 18 ended after 85 timesteps
Episode 19 ended after 101 timesteps
Episode 20 ended after 108 timesteps
Episode 21 ended after 111 timesteps
Episode 22 ended after 219 timesteps
Episode 23 ended after 86 timesteps
Episode 24 ended after 137 timesteps
Episode 25 ended after 235 timesteps
Episode 26 ended after 84 timesteps
Episode 27 ended aft

KeyboardInterrupt: 

In [59]:
#Play
# Start from the state the environment was actually reset to; without this
# assignment the first action would be chosen from whatever `observation` was
# left over from the training cell.
observation = env.reset()

# try/finally so the environment is closed even if the cell is interrupted.
try:
    for t in range(1000):
        env.render()
        action = model.select_action(observation)
        observation, reward, done, info = env.step(action)

        if done:
            break
finally:
    env.close()