# Policy Gradient Lunar Lander (Discrete Actions)

In [3]:
import gym
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.utils as nn_utils

import warnings
warnings.filterwarnings('ignore')

In [2]:
GAMMA = 0.99
LEARNING_RATE = 0.01
BATCH_SIZE = 10
ENTROPY_BETA = 0.01
CLIP_GRAD = 1.0

class Policy(nn.Module):
    def __init__(self, input_shape, action_shape):
        super(Policy,self).__init__()
        
        self.logits = nn.Sequential(
            nn.Linear(input_shape, 128),
            nn.ReLU(),
            nn.Linear(128,action_shape)
        )
    
    def forward(self,x):
        return self.logits(x)

class Policy_A2C(nn.Module):
    def __init__(self, input_shape, action_shape):
        super(Policy_A2C,self).__init__()
        
        self.value = nn.Sequential(
            nn.Linear(input_shape, 128),
            nn.ReLU(),
            nn.Linear(128,1)
        )
        
        self.logits = nn.Sequential(
            nn.Linear(input_shape, 128),
            nn.ReLU(),
            nn.Linear(128,action_shape)
        )
    
    def forward(self,x):
        return self.logits(x), self.value(x)

In [93]:
def compute_discounted_rewards(epi_rewards):
    batch_R = []
    R = 0.0
    for r in reversed(epi_rewards):
        R = r + GAMMA*R
        batch_R.append(R)
    q = list(reversed(batch_R))
    return q
    #q_mean = np.mean(q)
    #return [q_i - q_mean for q_i in q]

In [94]:
env = gym.make('LunarLander-v2')
#env = gym.make('CartPole-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [101]:
# actions: do nothing, fire left orientation engine, fire main engine, fire right orientation engine
pi_logits = Policy_A2C(env.observation_space.shape[0], env.action_space.n).to(device)
optimizer = optim.Adam(pi_logits.parameters(), lr = LEARNING_RATE)

In [102]:
#state = [x, y, v_x, v_y, theta, theta_dot, left_leg_contact, right_leg_contact]
batch_states, batch_actions, batch_R = [], [], []
batch_rewards = []

total_rewards = collections.deque(maxlen=100)
batch_episodes = 0
done_episodes = 0
step_idx = 0
episode_length = 0

state = env.reset()

prev_mod_r = None

while True:
    step_idx += 1
    episode_length += 1
    
    # select an action based on the current policy
    state_v = torch.FloatTensor([state]).to(device)

    logits_v, _ = pi_logits(state_v)
    prob_a = F.softmax(logits_v, dim=1)
    prob_a = prob_a.data.cpu().numpy()

    a = np.random.choice(env.action_space.n, p=prob_a[0])

    new_state, r, done, _ = env.step(a)
    
    if not done and np.sqrt(state[1]*state[1]+ state[0]*state[0]) < 0.1:
        done = True
        
    
    # store the experience
    batch_states.append(state)
    batch_actions.append(int(a))
    batch_rewards.append(r)

    if done:        
        discounted_rewards = compute_discounted_rewards(batch_rewards)
        batch_R.extend(discounted_rewards)
        
        new_reward = np.sum(batch_rewards)
        total_rewards.append(new_reward)
        
        batch_rewards.clear()
        batch_episodes += 1
        done_episodes += 1
        
        state = env.reset()
        
        mean_rewards = np.mean(total_rewards)
        if done_episodes % 100 == 0:
            print("%d: reward: %6.2f, mean_100: %6.2f, episodes: %d, epi_length: %d" % (
                        step_idx, new_reward, mean_rewards, done_episodes, episode_length))
        
        if mean_rewards > 80:
            print('Solved in %d steps and in %d eps'%(step_idx, done_episodes))
            break
        
        episode_length = 0
    else:
        state = new_state


    if batch_episodes < BATCH_SIZE:
        continue
    
    # done acquiring events, time to train
    batch_episodes = 0
    
    optimizer.zero_grad()
    batch_state_v = torch.FloatTensor(batch_states).to(device)
    batch_actions_v = torch.LongTensor(batch_actions).to(device)
    batch_R_v = torch.FloatTensor(batch_R).to(device)
    
    policy_logits_v, values_v = pi_logits(batch_state_v)
    
    loss_value_v = F.mse_loss(values_v.squeeze(-1), batch_R_v)
    
    log_prob_v = F.log_softmax(policy_logits_v, dim=1)
    entropy_loss_v = -ENTROPY_BETA*((F.softmax(policy_logits_v, dim=1)*log_prob_v).sum(dim=1)).mean()
    
    log_prob_v = (batch_R_v-values_v.detach())*log_prob_v[range(len(batch_state_v)), batch_actions_v]
    loss_v = -(log_prob_v.mean())
    
    loss_v = loss_v-entropy_loss_v + loss_value_v #push entropy towards max uncertianty for all states as a threshold
    loss_v.backward()
    nn_utils.clip_grad_norm_(pi_logits.parameters(), CLIP_GRAD)
    optimizer.step()
    
    batch_states.clear()
    batch_actions.clear()
    batch_R.clear()

8962: reward: -566.26, mean_100: -157.48, episodes: 100, epi_length: 85
18693: reward: -56.27, mean_100: -188.67, episodes: 200, epi_length: 66
30268: reward: -113.19, mean_100: -147.16, episodes: 300, epi_length: 84
40066: reward: -96.22, mean_100: -107.86, episodes: 400, epi_length: 76
53227: reward: -100.81, mean_100: -72.34, episodes: 500, epi_length: 130
68507: reward: -228.60, mean_100: -80.84, episodes: 600, epi_length: 122


KeyboardInterrupt: 

In [107]:
env = gym.make('LunarLander-v2')
state = env.reset()
for _ in range(800):
    state_v = torch.FloatTensor([state])

    logits_v,_ = pi_logits(state_v)
    prob_a = F.softmax(logits_v, dim=1)
    prob_a = prob_a.data.cpu().numpy()

    a = np.random.choice(env.action_space.n, p=prob_a[0])
    env.render()
    state,r,d,_ = env.step(a)
    
    if not d and np.sqrt(state[1]*state[1]+ state[0]*state[0]) < 0.1:
        d = True
    if d:
        break

env.close()

In [108]:
state

array([ 0.09706602, -0.01125275, -0.6507874 ,  0.07096584,  0.07540077,
       -0.22275634,  0.        ,  0.        ], dtype=float32)