In [1]:
!pip3 install box2d-py
import gym
env = gym.make("LunarLander-v2")

Collecting box2d-py
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 404 kB/s eta 0:00:01
[?25hInstalling collected packages: box2d-py
Successfully installed box2d-py-2.3.8


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as Optim
import numpy as np
from torch.distributions import MultivariateNormal

class Actor(nn.Module):
  """Deterministic policy network mapping a state to a tanh-squashed action.

  Layout: Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm -> ReLU -> Linear -> tanh.
  The two hidden layers are re-initialised uniformly in +/- 1/sqrt(out_features)
  and the output head uniformly in +/- 0.003. The module moves itself to
  ``device`` at the end of construction.
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,action_dims,device):
    super().__init__()

    # Layers are built strictly in this order so that the random draws made
    # during initialisation happen in the same sequence every time.
    self.fc1 = self._uniform_linear(input_dims, fc1_dims)
    self.bn1 = nn.LayerNorm(fc1_dims)

    self.fc2 = self._uniform_linear(fc1_dims, fc2_dims)
    self.bn2 = nn.LayerNorm(fc2_dims)

    # Small output weights keep the initial actions close to zero.
    self.mu = self._uniform_linear(fc2_dims, action_dims, bound=0.003)

    self.device = device
    self.to(self.device)

  @staticmethod
  def _uniform_linear(in_features, out_features, bound=None):
    """Create a Linear layer whose weight and bias are drawn from U(-bound, bound).

    When ``bound`` is omitted it is 1/sqrt(number of rows of the weight
    matrix), i.e. 1/sqrt(out_features).
    """
    layer = nn.Linear(in_features, out_features)
    if bound is None:
      bound = 1 / np.sqrt(layer.weight.data.size()[0])
    torch.nn.init.uniform_(layer.weight.data, -bound, bound)
    torch.nn.init.uniform_(layer.bias.data, -bound, bound)
    return layer

  def forward(self,state):
    """Return the action mean for ``state``; every component lies in [-1, 1]."""
    hidden = F.relu(self.bn1(self.fc1(state)))
    hidden = F.relu(self.bn2(self.fc2(hidden)))
    return torch.tanh(self.mu(hidden))


class Critic(nn.Module):
  """Q-network that scores a (state, action) pair with a single scalar.

  The state passes through Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm;
  the action passes through one Linear -> ReLU. Both branches have width
  ``fc2_dims``, are summed, passed through ReLU and projected to one output.
  The state layers are re-initialised uniformly in +/- 1/sqrt(out_features),
  the output head in +/- 0.003; the action layer keeps PyTorch's default
  initialisation. The module moves itself to ``device`` when constructed.
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,action_dims,device):
    super().__init__()

    # Construction order is fixed so the random initialisation draws happen
    # in the same sequence every time.
    self.fc1 = self._uniform_linear(input_dims, fc1_dims)
    self.bn1 = nn.LayerNorm(fc1_dims)

    self.fc2 = self._uniform_linear(fc1_dims, fc2_dims)
    self.bn2 = nn.LayerNorm(fc2_dims)

    # Projects the action to the same width as the state branch.
    self.action_value_layer = nn.Linear(action_dims,fc2_dims)

    self.q = self._uniform_linear(fc2_dims, 1, bound=0.003)

    self.device = device
    self.to(self.device)

  @staticmethod
  def _uniform_linear(in_features, out_features, bound=None):
    """Create a Linear layer whose weight and bias are drawn from U(-bound, bound).

    When ``bound`` is omitted it is 1/sqrt(number of rows of the weight
    matrix), i.e. 1/sqrt(out_features).
    """
    layer = nn.Linear(in_features, out_features)
    if bound is None:
      bound = 1 / np.sqrt(layer.weight.data.size()[0])
    torch.nn.init.uniform_(layer.weight.data, -bound, bound)
    torch.nn.init.uniform_(layer.bias.data, -bound, bound)
    return layer

  def forward(self,state,action):
    """Return Q(state, action); the last dimension of the result has size 1."""
    state_branch = F.relu(self.bn1(self.fc1(state)))
    # No ReLU on the second state layer: the non-linearity is applied after
    # the action branch has been added.
    state_branch = self.bn2(self.fc2(state_branch))

    action_branch = F.relu(self.action_value_layer(action))

    joint = F.relu(state_branch + action_branch)
    return self.q(joint)
  

class Agent:
  """Actor-critic agent with target networks, Gaussian exploration noise and a replay buffer.

  Holds an online actor/critic pair, target copies that follow them through
  soft updates, one Adam optimizer per online network, and NumPy ring buffers
  storing (state, action, reward, next_state, not_done) transitions.

  Args:
    alpha: learning rate of the actor optimizer.
    beta: learning rate of the critic optimizer.
    input_dims: size of the observation vector.
    fc1_dims: width of the first hidden layer of every network.
    fc2_dims: width of the second hidden layer of every network.
    action_dims: size of the action vector.
    tau: interpolation factor used by the soft target update.
    env: environment; only ``env.action_space.low`` / ``.high`` are read, to
      clamp sampled actions.
    action_std_decay_rate: amount subtracted from the noise std on each decay.
    device: torch device for the networks and the noise variance.
    min_action_std: lower bound for the noise std.
    gamma: discount factor.
    max_size: capacity of the replay buffer (oldest entries are overwritten).
    batch_size: number of transitions sampled per ``learn`` call.
    action_std_init: initial exploration noise std.
  """

  def __init__(self,alpha,beta,input_dims,fc1_dims,fc2_dims,action_dims,tau,env,action_std_decay_rate,device,
                 min_action_std, gamma=0.99,max_size=1000000,batch_size=64,action_std_init=0.6):

    self.action_dims = action_dims
    self.gamma = gamma
    self.tau = tau
    self.env = env
    self.batch_size = batch_size
    self.device = device

    self.action_std_decay_rate = action_std_decay_rate
    self.min_action_std = min_action_std

    self.actor = Actor(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)
    self.target_actor = Actor(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)

    self.critic = Critic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)
    self.target_critic = Critic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)

    self.critic_optimizer = Optim.Adam(self.critic.parameters(),lr=beta)
    self.actor_optimizer = Optim.Adam(self.actor.parameters(),lr=alpha)

    # Sets both self.action_std and the per-dimension variance tensor.
    self.set_action_std(action_std_init)

    # Replay buffer. Everything is stored as float32 because `learn` casts
    # every sampled batch to float32 anyway; float64 storage only doubled
    # the memory footprint without changing any value used for training.
    self.mem_size = max_size
    self.mem_cntr = 0
    self.state_memory = np.zeros((self.mem_size, input_dims), dtype=np.float32)
    self.new_state_memory = np.zeros((self.mem_size, input_dims), dtype=np.float32)
    self.action_memory = np.zeros((self.mem_size,action_dims), dtype=np.float32)
    self.reward_memory = np.zeros((self.mem_size,1), dtype=np.float32)
    self.terminal_memory = np.zeros((self.mem_size,1), dtype=np.float32)

    # tau=1 makes the targets exact copies of the online networks.
    self.update_network_parameters(tau=1)


  def update_network_parameters(self, tau=None):
    """Move the target networks towards the online networks.

    Every target parameter becomes ``tau * online + (1 - tau) * target``.
    ``tau`` defaults to ``self.tau``; ``tau=1`` performs a hard copy.
    """
    if tau is None:
      tau = self.tau

    self._soft_update(self.critic, self.target_critic, tau)
    self._soft_update(self.actor, self.target_actor, tau)


  @staticmethod
  def _soft_update(source, target, tau):
    """Blend ``source``'s parameters into ``target`` in place, matched by name."""
    target_params = dict(target.named_parameters())
    # The blend is bookkeeping, not part of any loss: do it without autograd
    # so no graph is built and the target parameters stay leaf tensors.
    with torch.no_grad():
      for name, param in source.named_parameters():
        target_param = target_params[name]
        target_param.copy_(tau*param + (1-tau)*target_param)


  def set_action_std(self, new_action_std):
    """Set the exploration noise std and rebuild the per-dimension variance tensor."""
    self.action_std = new_action_std
    self.action_var = torch.full((self.action_dims,), self.action_std * self.action_std).to(self.device)


  def decay_action_std(self):
    """Lower the exploration std by ``action_std_decay_rate``, floored at ``min_action_std``.

    The new value is rounded to 4 decimals and reported on stdout.
    """
    print("--------------------------------------------------------------------------------------------")

    self.action_std = round(self.action_std - self.action_std_decay_rate, 4)
    if self.action_std <= self.min_action_std:
      self.action_std = self.min_action_std
      print("setting actor output action_std to min_action_std : ", self.action_std)
    else:
      print("setting actor output action_std to : ", self.action_std)
    self.set_action_std(self.action_std)

    print("--------------------------------------------------------------------------------------------")


  def remember(self,state,action,reward,new_state,done):
    """Store one transition, overwriting the oldest entry once the buffer is full."""
    index = self.mem_cntr % self.mem_size
    self.state_memory[index] = state
    self.action_memory[index] = action
    self.reward_memory[index] = reward
    self.new_state_memory[index] = new_state
    # Stored as a "not done" mask (1 = episode continues) so it can multiply
    # the bootstrap term directly in `learn`.
    self.terminal_memory[index] = float(1- done)
    self.mem_cntr += 1


  def choose_action(self,observation):
    """Sample an action around the actor's output and clamp it to the action bounds.

    The action is drawn from a Gaussian centred on the actor output with
    diagonal covariance ``self.action_var``, clamped to the first component
    of ``env.action_space.low`` / ``.high``, and returned as a flat NumPy array.
    """
    self.actor.eval()
    with torch.no_grad():
      state = torch.FloatTensor(observation).to(self.actor.device)
      action_mean = self.actor.forward(state)
      cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
      action = MultivariateNormal(action_mean, cov_mat).sample()

    low = self.env.action_space.low[0]
    high = self.env.action_space.high[0]
    action = torch.clamp(action,low,high)

    self.actor.train()

    return action.detach().cpu().numpy().flatten()


  def learn(self):
    """Run one critic update, one actor update and one soft target update.

    Does nothing until at least ``batch_size`` transitions have been stored.
    The minibatch is drawn uniformly with replacement from the filled part
    of the buffer.
    """
    if self.mem_cntr < self.batch_size:
      return

    max_mem = min(self.mem_cntr, self.mem_size)
    batch = np.random.choice(max_mem, self.batch_size)

    device = self.critic.device
    states = torch.tensor(self.state_memory[batch], dtype=torch.float).to(device)
    actions = torch.tensor(self.action_memory[batch], dtype=torch.float).to(device)
    rewards = torch.tensor(self.reward_memory[batch], dtype=torch.float).to(device)
    next_states = torch.tensor(self.new_state_memory[batch], dtype=torch.float).to(device)
    not_done = torch.tensor(self.terminal_memory[batch]).to(device)

    self.target_actor.eval()
    self.target_critic.eval()
    self.critic.eval()

    # The TD target is a constant for the critic loss. Building it under
    # no_grad avoids tracking a graph through the target networks and
    # replaces the former torch.tensor(<tensor>) copy, which emitted a
    # "copy construct" UserWarning on every step.
    with torch.no_grad():
      next_actions = self.target_actor.forward(next_states)
      next_critic_value = self.target_critic.forward(next_states,next_actions)
      targets = rewards + self.gamma*next_critic_value*not_done
      targets = targets.view(self.batch_size, 1)

    critic_value = self.critic.forward(states,actions)

    self.critic.train()
    self.critic_optimizer.zero_grad()
    critic_loss = F.mse_loss(critic_value, targets)
    critic_loss.backward()
    self.critic_optimizer.step()

    # Actor update: ascend the critic's estimate of the actor's own actions.
    # Gradients this leaves on the critic are cleared by the zero_grad() at
    # the start of the next critic update.
    self.critic.eval()
    mu = self.actor.forward(states)
    self.actor.train()
    actor_loss = -self.critic.forward(states,mu)
    self.actor_optimizer.zero_grad()
    actor_loss.mean().backward()
    self.actor_optimizer.step()

    self.update_network_parameters()

In [3]:

import gym
import numpy as np

# --- Training -----------------------------------------------------------------
env = gym.make('BipedalWalker-v3')

# Use the GPU when one is present, otherwise fall back to the CPU so that the
# cell also runs on CPU-only machines (hard-coding 'cuda' raised there).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

agent = Agent(alpha=0.000001, beta=0.000005, input_dims=24, tau=0.001, env=env,action_std_decay_rate=0.05,gamma=0.99,
                 min_action_std=0.05,batch_size=256, fc1_dims=512, fc2_dims=256, action_dims=4,action_std_init=0.6,device=device)

# Number of environment steps between two reductions of the exploration std.
action_std_decay_freq = int(1e5)

score_history = []
timesteps = 0
for i in range(5000):
    obs = env.reset()
    done = False
    score = 0
    # One episode: act, store the transition, do one learning step per env step.
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        timesteps += 1

        if timesteps % action_std_decay_freq == 0:
            agent.decay_action_std()
    score_history.append(score)

    # Every 10th episode also report the trailing averages.
    if i % 10 == 0:
        print('episode ', i, 'score %.2f' % score,
          'trailing 10 games avg %.3f' % np.mean(score_history[-10:]),
          'trailing 100 games avg %.3f' % np.mean(score_history[-100:]),
          'timesteps:',timesteps)
    else:
        print('episode ', i, 'score %.2f' % score,
          'timesteps:',timesteps)

    # Stop early once the trailing-100 average reaches the target score.
    avg_100 = np.mean(score_history[-100:])
    if avg_100 >= 290:
      print("####SOLVED!!####")
      break


# --- Evaluation ---------------------------------------------------------------
# Roll out 50 episodes with the agent and print each episode's return followed
# by the mean return. Actions still come from agent.choose_action, so they are
# sampled with the agent's current exploration noise.
avg_score = 0
print("")
for i in range(50):
  obs, done, score = env.reset(), False, 0
  while True:
    act = agent.choose_action(obs)
    new_state, reward, done, info = env.step(act)
    obs = new_state
    score += reward
    if done:
      break
  avg_score += score
  print("Episode Reward:",score)

print("")
print(avg_score/50)
  
    




episode  0 score -115.99 trailing 10 games avg -115.993 trailing 100 games avg -115.993 timesteps: 79




episode  1 score -66.64 timesteps: 1679
episode  2 score -113.76 timesteps: 1749
episode  3 score -114.33 timesteps: 1856
episode  4 score -113.92 timesteps: 1943
episode  5 score -114.51 timesteps: 2017
episode  6 score -111.43 timesteps: 2142
episode  7 score -113.72 timesteps: 2303
episode  8 score -119.02 timesteps: 2450
episode  9 score -114.91 timesteps: 2516
episode  10 score -111.55 trailing 10 games avg -109.378 trailing 100 games avg -109.980 timesteps: 2575
episode  11 score -106.30 timesteps: 2677
episode  12 score -68.34 timesteps: 4277
episode  13 score -109.17 timesteps: 4359
episode  14 score -109.38 timesteps: 4424
episode  15 score -63.14 timesteps: 6024
episode  16 score -109.86 timesteps: 6079
episode  17 score -112.78 timesteps: 6139
episode  18 score -63.04 timesteps: 7739
episode  19 score -65.75 timesteps: 9339
episode  20 score -58.89 trailing 10 games avg -86.665 trailing 100 games avg -98.877 timesteps: 10939
episode  21 score -52.92 timesteps: 12539
episode 