In [140]:
import gym
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.nn.functional import mse_loss

from torch import optim
import copy
from collections import namedtuple

from itertools import count
import math

In [141]:
# Create the Gym LunarLander-v2 environment; `env` is reused by the cells below.
env = gym.make('LunarLander-v2')

In [142]:
# Draw one random action from the action space to see what an action looks like.
print('Example action {}'.format(env.action_space.sample()))
# NOTE: despite the label, this prints the initial observation returned by
# env.reset(), not the observation space object itself.
print('Example observation space {}'.format(env.reset()))

Example action 2
Example observation space [-0.00471716  1.4207451  -0.47780877  0.43665728  0.00547274  0.10823067
  0.          0.        ]


# Understand the Environment

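The action space has 4 discrete actions: 0 = do nothing, 1 = fire the left orientation engine, 2 = fire the main engine, 3 = fire the right orientation engine.

(In the continuous variant of the environment the action is instead a 2-value vector: the first value controls the main thruster and the second value controls the left/right thrusters.)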

The observation space is continuous: each observation is a vector of 8 floating-point values.

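What do the 8 values represent? According to the Gym documentation they are the lander's x and y position, its x and y velocity, its angle, its angular velocity, and two flags saying whether the left and right legs are touching the ground. The learner does not need to be told this, though: the network simply learns from the raw vector.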

This is not a problem where you need to take an image and use it to work out where you are. It is simpler: the observation vector directly tells you the lander's position and motion.


In [143]:
# Start a new episode; as the last expression of the cell, the initial
# observation returned by reset() is displayed below.
env.reset()

array([-0.00221558,  1.4143819 , -0.2244257 ,  0.15385094,  0.00257403,
        0.05083577,  0.        ,  0.        ], dtype=float32)

In [144]:
# Random rollout: render the environment while taking 100 uniformly sampled
# actions, just to see what an untrained agent looks like.
for _ in range(100):
    env.render()
    state, reward, done, _ = env.step(env.action_space.sample())
    if done:
        # Gym leaves the result of step() undefined once an episode has
        # finished, so start a fresh episode before stepping again.
        env.reset()
    

In [145]:
# Close the environment now that the rendered random rollout above is done.
env.close()

# Build a Deep Q Learner From Scratch

In [146]:
# replay memory 

# Q network

# Target network

# Training cycle. 

# Replay Memory Buffer
A Deep Q learner is off-policy: it can learn from experience that was collected under a policy that is not the current best policy.

Using a replay buffer prevents it from forgetting valuable experience from earlier episodes and from over-adjusting to what it has just seen. 

In [147]:
# One stored experience. Field order matters: add_to_memory fills the fields
# positionally, so callers must pass (state, action, next_state, reward).
Transition = namedtuple('Transition',
                        ['state', 'action', 'next_state', 'reward'])


class replay_memory():
    '''
    Fixed-capacity ring buffer of past transitions.

    While fewer than `size` transitions are stored the buffer grows; after
    that every new transition overwrites the oldest one.

    inputs
        - size - maximum number of transitions kept
    '''

    def __init__(self, size=1000):
        '''
        Start with an empty buffer and the write cursor on slot 0.
        '''
        self.position = 0
        self.size = size
        self.storage = []

    def __len__(self):
        # Number of transitions currently held (at most `size`).
        return len(self.storage)

    def add_to_memory(self, *fields):
        '''
        Store one transition at the write cursor and advance the cursor.

        inputs - fields - the Transition values, in the order
                 (state, action, next_state, reward)
        '''
        has_room = len(self.storage) < self.size
        if has_room:
            # The list is still growing: add a placeholder so that the slot at
            # `position` exists before it is assigned below.
            self.storage.append(None)
        self.storage[self.position] = Transition(*fields)
        # Wrap around so the cursor always stays inside the buffer.
        self.position = (self.position + 1) % self.size

    def sample(self, batch_size):
        '''
        Return `batch_size` stored transitions drawn at random without
        replacement, to be used as a training batch.
        '''
        return random.sample(self.storage, batch_size)

In [148]:
class deepq_network(nn.Module):
    
    '''
    Fully connected Q-network: maps an 8-value state vector to one Q-value
    per action (4 outputs).

    The output layer is linear on purpose. Q-values estimate returns, which
    can be negative or far larger than 1, so they must not be squashed into
    a probability distribution with a softmax.
    '''
    def __init__(self):
        super(deepq_network, self).__init__()
        self.main = nn.Sequential(
          nn.Linear(8,128, bias=False),
          nn.ReLU(True),
          nn.Linear(128,256, bias=False),
          nn.ReLU(True),
          nn.Linear(256,128, bias=False),
          nn.ReLU(True),
          nn.Linear(128,64, bias=False),
          nn.ReLU(True),
          nn.Linear(64,32, bias=False),
          nn.ReLU(True),
          # Output layer: one raw (unbounded) Q-value for each of the four actions.
          nn.Linear(32, 4, bias=False),
        )
    
    def forward(self, input):
        '''
        Standard nn.Module entry point, so that `net(x)` works.

        input  - float tensor whose last dimension is 8
        output - tensor of Q-values whose last dimension is 4
        '''
        return self.main(input)

    def Forward(self, input):
        '''
        Backward-compatible alias of `forward`, kept for existing callers.
        '''
        return self.forward(input)
    

In [149]:
# Action selection - epsilon-greedy: some actions are random, the others are
# the action with the largest predicted Q-value.
n_actions = env.action_space.n

def action_selection(state):
    '''
    Pick an action for `state` with an epsilon-greedy policy.

    The exploration probability decays exponentially from eps_start towards
    eps_end as the global step counter `steps_done` grows (time constant
    eps_decay). Every call increments `steps_done`.

    inputs
        - state - the current observation (sequence of 8 numbers)
    output
        - 1x1 long tensor holding the chosen action index
    '''
    global steps_done
    # Draw the random number first so the use of the RNG is unchanged.
    sample = random.random()
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-1 * (steps_done / eps_decay))
    steps_done += 1
    # Compare against the decayed threshold; comparing against a constant
    # would make the decay schedule above dead code.
    if sample < eps_threshold:
        # explore: uniformly random action
        return torch.tensor([[random.randrange(n_actions)]], dtype=torch.long)
    # exploit: no gradients are needed just to pick an action
    with torch.no_grad():
        state_tensor = torch.tensor([state]).to(dtype=torch.float)
        # max(1)[1] is the index of the largest Q-value in the single row
        return Qnet.Forward(state_tensor).max(1)[1].view(1, 1)
    
    

In [150]:
# --- Hyperparameters -------------------------------------------------------
batch_size = 64        # transitions per optimisation step
GAMMA = 0.999          # discount factor applied to next-state values
copy_frequency = 10    # episodes between target-network refreshes
epsilon = 0.05         # constant exploration rate (same value as eps_end)
eps_start = 0.9        # start, floor and time constant of the
eps_end = 0.05         # exponential exploration schedule computed in
eps_decay = 200        # action_selection

# --- Bookkeeping -----------------------------------------------------------
steps_done = 0             # global count of action_selection calls
episode_durations = []     # number of steps of every finished episode

# --- Replay memory and networks --------------------------------------------
memory_storage = replay_memory()
Qnet = deepq_network()
Tnet = deepq_network()
# Make sure both networks hold float32 parameters.
Qnet = Qnet.float()
Tnet = Tnet.float()
# The target network starts as an exact copy of the Q-network and is only
# ever evaluated, never trained directly.
Tnet.load_state_dict(Qnet.state_dict())
Tnet.eval()

# Only the Q-network's parameters are handed to the optimiser.
optimiser = optim.Adam(
    Qnet.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
)

In [151]:
def optimise():
    '''
    Run one optimisation step of the Q-network on a random replay batch.

    Does nothing (returns None) until the replay memory holds at least
    `batch_size` transitions. Otherwise returns the loss tensor of the step.
    '''
    # The memory does not have to be full, it only needs enough transitions
    # to draw one batch.
    if len(memory_storage) < batch_size:
        return

    transitions = memory_storage.sample(batch_size)
    # Turn a list of Transitions into one Transition of tuples (one per field).
    batch = Transition(*zip(*transitions))

    # True where the transition did not end the episode (next_state was stored).
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  dtype=torch.bool)

    state_batch = torch.cat(batch.state).to(dtype=torch.float)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward).to(dtype=torch.float)

    # Predicted Q-value of the action that was actually taken in each state.
    q_value = Qnet.Forward(state_batch).gather(1, action_batch)

    # Target: reward + GAMMA * max_a Q_target(next_state, a); terminal
    # transitions keep a next-state value of 0.
    next_state_values = torch.zeros(batch_size)
    # torch.cat raises on an empty list, so skip the target-network pass when
    # every sampled transition happens to be terminal.
    if non_final_mask.any():
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None]).to(dtype=torch.float)
        # The target network is never trained through, so no graph is needed.
        with torch.no_grad():
            next_state_values[non_final_mask] = Tnet.Forward(non_final_next_states).max(1)[0]
    target_q = (next_state_values * GAMMA) + reward_batch

    # q_value is (batch, 1), so give the target the same shape.
    loss = mse_loss(q_value, target_q.unsqueeze(1))

    # Clear old gradients, backpropagate this loss, clip every gradient
    # element to [-1, 1], then let the optimiser update the Q-network.
    optimiser.zero_grad()
    loss.backward()
    for param in Qnet.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    optimiser.step()
    return loss


In [153]:
# Training cycle: play `episodes` episodes, storing every transition in the
# replay memory and running one optimisation step after each environment step.
episodes = 10
for episode in range(episodes):

    state = copy.deepcopy(env.reset())
    for t in count():

        # Epsilon-greedy action, then advance the environment by one step.
        action = action_selection(state)
        next_state, reward, done, _ = env.step(action.item())

        # A terminal transition is stored with next_state None so that
        # optimise() can mask it out of the bootstrap target.
        if done:
            next_state = None
            next_state_save = None
        else:
            next_state = list(next_state)
            next_state_save = torch.tensor([next_state[:]])

        reward = torch.tensor([reward]).to(dtype= torch.float)
        state_save = torch.tensor([state[:]])

        # Field order must match Transition: state, action, next_state, reward.
        memory_storage.add_to_memory(state_save, action, next_state_save, reward)

        # One optimisation step of the Q-network (None until the memory
        # holds a full batch).
        loss = optimise()

        if done:
            episode_durations.append(t + 1)
            break
        # Not finished: the observed next state becomes the current state.
        state = next_state[:]

    # Periodically refresh the target network from the Q-network.
    if episode % copy_frequency == 0:
        Tnet.load_state_dict(Qnet.state_dict())
    # Progress report; with the modulus at 1000 only episode 0 is printed
    # unless `episodes` exceeds 1000.
    if episode % 1000 == 0 :
        print('loss {} at epiode {}'.format(loss,episode))

print('Finished')
# Close the environment.
env.close()

loss 168.3443603515625 at epiode 0
Finished


In [158]:
# Evaluation: watch the greedy policy (no exploration) for a few rendered
# episodes, each capped at `max_steps` steps.
max_steps = 300

try:
    for episode in range(5):
        state = env.reset()
        done = False
        print("****************************************************")
        print("EPISODE ", episode)

        for step in range(max_steps):
            # Greedy action: index of the largest predicted Q-value. No
            # gradients are needed for inference.
            with torch.no_grad():
                state_tensor = torch.tensor([state]).to(dtype= torch.float)
                action = Qnet.Forward(state_tensor).max(1)[1].view(1, 1)
            new_state, reward, done, info = env.step(action.item())
            env.render()

            if done:
                # Reward of the final step of the episode.
                print(reward)

                # `step` is a 0-based index, so the number of steps taken is
                # step + 1 (matches the t + 1 recorded during training).
                print("Number of steps", step + 1)
                break
            state = new_state
finally:
    # Always close the environment, even if an episode raised.
    env.close()

****************************************************
EPISODE  0
-100
Number of steps 96
****************************************************
EPISODE  1
-100
Number of steps 77
****************************************************
EPISODE  2
-100
Number of steps 74
****************************************************
EPISODE  3
-100
Number of steps 102
****************************************************
EPISODE  4
-100
Number of steps 76
