<a href="https://colab.research.google.com/github/ArshT/Reinforcement_Learning_Basic/blob/master/TD3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IPython shell escape (only valid inside a notebook): installs the Box2D
# bindings, presumably required by the LunarLander environments -- confirm.
!pip3 install box2d-py
import gym
# Creates the "LunarLander-v2" environment. NOTE(review): `env` is rebound to
# 'LunarLanderContinuous-v2' by the training cell further down, so this
# instance does not appear to be used afterwards.
env = gym.make("LunarLander-v2")

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/87/34/da5393985c3ff9a76351df6127c275dcb5749ae0abbe8d5210f06d97405d/box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 17.8MB/s eta 0:00:01[K     |█▌                              | 20kB 17.6MB/s eta 0:00:01[K     |██▏                             | 30kB 10.5MB/s eta 0:00:01[K     |███                             | 40kB 8.6MB/s eta 0:00:01[K     |███▋                            | 51kB 4.9MB/s eta 0:00:01[K     |████▍                           | 61kB 5.3MB/s eta 0:00:01[K     |█████▏                          | 71kB 5.7MB/s eta 0:00:01[K     |█████▉                          | 81kB 6.1MB/s eta 0:00:01[K     |██████▋                         | 92kB 6.6MB/s eta 0:00:01[K     |███████▎                        | 102kB 5.2MB/s eta 0:00:01[K     |████████                        | 112kB 5.2MB/s eta 0:00:01[K     |████████▊                       | 

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as Optim
import numpy as np
from torch.distributions import MultivariateNormal

class Actor(nn.Module):
  """Deterministic policy network mapping a state to a tanh-squashed action.

  Layout: Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm -> ReLU ->
  Linear -> tanh, so every output component lies in [-1, 1].

  Args:
    input_dims: size of the state vector.
    fc1_dims: width of the first hidden layer.
    fc2_dims: width of the second hidden layer.
    action_dims: number of action components produced.
    device: torch device (or device string) the module is moved to.
  """

  @staticmethod
  def _uniform_linear(in_features, out_features, bound=None):
    """Build a Linear layer and re-draw weight and bias from U(-bound, bound).

    When ``bound`` is omitted it is 1/sqrt(weight.size()[0]).
    NOTE(review): weight.size()[0] is the layer's *output* width; confirm
    that this (rather than the input width) is the intended scale.
    """
    layer = nn.Linear(in_features, out_features)
    if bound is None:
      bound = 1 / np.sqrt(layer.weight.data.size()[0])
    torch.nn.init.uniform_(layer.weight.data, -bound, bound)
    torch.nn.init.uniform_(layer.bias.data, -bound, bound)
    return layer

  def __init__(self,input_dims,fc1_dims,fc2_dims,action_dims,device):
    super().__init__()

    # Attribute names say "bn" but these are LayerNorm modules.
    self.fc1 = self._uniform_linear(input_dims, fc1_dims)
    self.bn1 = nn.LayerNorm(fc1_dims)

    self.fc2 = self._uniform_linear(fc1_dims, fc2_dims)
    self.bn2 = nn.LayerNorm(fc2_dims)

    # The output layer starts with small weights (|w| <= 0.003).
    self.mu = self._uniform_linear(fc2_dims, action_dims, bound=0.003)

    self.device = device
    self.to(self.device)

  def forward(self,state):
    """Return the action for ``state``; the last dimension has size action_dims."""
    hidden = F.relu(self.bn1(self.fc1(state)))
    hidden = F.relu(self.bn2(self.fc2(hidden)))
    return torch.tanh(self.mu(hidden))


class Critic(nn.Module):
  """Q-network producing one scalar value per (state, action) pair.

  The state passes through Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm.
  The action passes through a single Linear -> ReLU of the same width. The two
  streams are added, rectified and projected to a single output.

  Args:
    input_dims: size of the state vector.
    fc1_dims: width of the first hidden layer of the state stream.
    fc2_dims: width of the second state layer and of the action stream.
    action_dims: number of action components.
    device: torch device (or device string) the module is moved to.
  """

  @staticmethod
  def _uniform_linear(in_features, out_features, bound=None):
    """Build a Linear layer and re-draw weight and bias from U(-bound, bound).

    When ``bound`` is omitted it is 1/sqrt(weight.size()[0]).
    NOTE(review): weight.size()[0] is the layer's *output* width; confirm
    that this (rather than the input width) is the intended scale.
    """
    layer = nn.Linear(in_features, out_features)
    if bound is None:
      bound = 1 / np.sqrt(layer.weight.data.size()[0])
    torch.nn.init.uniform_(layer.weight.data, -bound, bound)
    torch.nn.init.uniform_(layer.bias.data, -bound, bound)
    return layer

  def __init__(self,input_dims,fc1_dims,fc2_dims,action_dims,device):
    super().__init__()

    # Attribute names say "bn" but these are LayerNorm modules.
    self.fc1 = self._uniform_linear(input_dims, fc1_dims)
    self.bn1 = nn.LayerNorm(fc1_dims)

    self.fc2 = self._uniform_linear(fc1_dims, fc2_dims)
    self.bn2 = nn.LayerNorm(fc2_dims)

    # The action stream keeps nn.Linear's default initialisation.
    self.action_value_layer = nn.Linear(action_dims,fc2_dims)

    # The output layer starts with small weights (|w| <= 0.003).
    self.q = self._uniform_linear(fc2_dims, 1, bound=0.003)

    self.device = device
    self.to(self.device)

  def forward(self,state,action):
    """Return Q(state, action); the last dimension of the result has size 1."""
    state_stream = F.relu(self.bn1(self.fc1(state)))
    state_stream = self.bn2(self.fc2(state_stream))

    action_stream = F.relu(self.action_value_layer(action))

    merged = F.relu(torch.add(state_stream, action_stream))
    return self.q(merged)
  

class Agent:
  """TD3-style agent: one actor, twin critics, target copies and a replay buffer.

  The agent owns
    * an actor and two critics, each with a target copy that tracks it via
      Polyak averaging (``update_network_parameters``);
    * a fixed-size ring-buffer replay memory stored in numpy arrays;
    * a Gaussian exploration noise level (``action_std``) that can be decayed.

  Args:
    alpha: learning rate of the actor optimizer.
    beta: learning rate of both critic optimizers.
    input_dims: size of the state vector.
    fc1_dims, fc2_dims: hidden widths passed to Actor and Critic.
    action_dims: number of action components.
    tau: default Polyak coefficient for the target networks.
    env: environment; only ``env.action_space.low[0]`` / ``high[0]`` are read,
      i.e. the bounds of the first action dimension are applied to all.
    action_std_decay_rate: amount subtracted by ``decay_action_std``.
    device: torch device (or device string) used for networks and batches.
    min_action_std: floor for ``action_std``.
    gamma: discount factor.
    max_size: capacity of the replay memory.
    batch_size: number of transitions sampled per update.
    action_std_init: initial exploration standard deviation.
    policy_delay: the actor and the targets are updated on every
      ``policy_delay``-th iteration of ``learn``.
    noise_clip: clipping range of the noise added to target actions.
  """

  def __init__(self,alpha,beta,input_dims,fc1_dims,fc2_dims,action_dims,tau,env,action_std_decay_rate,device,
                 min_action_std, gamma=0.99,max_size=1000000,batch_size=64,action_std_init=0.6,policy_delay=2,noise_clip=0.5):

    self.action_dims = action_dims
    self.gamma = gamma
    self.tau = tau
    self.env = env
    self.batch_size = batch_size
    self.device = device

    self.action_std_decay_rate = action_std_decay_rate
    self.min_action_std = min_action_std

    # Construction order is kept (actor, target actor, critic 1, its target,
    # critic 2, its target) so seeded runs draw the same initial weights.
    net_kwargs = dict(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,
                      action_dims=action_dims,device=device)
    self.actor = Actor(**net_kwargs)
    self.target_actor = Actor(**net_kwargs)

    self.critic_1 = Critic(**net_kwargs)
    self.target_critic_1 = Critic(**net_kwargs)

    self.critic_2 = Critic(**net_kwargs)
    self.target_critic_2 = Critic(**net_kwargs)

    self.critic_optimizer_1 = Optim.Adam(self.critic_1.parameters(),lr=beta)
    self.critic_optimizer_2 = Optim.Adam(self.critic_2.parameters(),lr=beta)
    self.actor_optimizer = Optim.Adam(self.actor.parameters(),lr=alpha)

    # Sets both self.action_std and the per-dimension variance tensor.
    self.set_action_std(action_std_init)

    self.policy_delay = policy_delay
    self.noise_clip = noise_clip

    # Replay memory: ring buffer indexed by mem_cntr % mem_size.
    self.mem_size = max_size
    self.mem_cntr = 0
    self.state_memory = np.zeros((self.mem_size, input_dims))
    self.new_state_memory = np.zeros((self.mem_size, input_dims))
    self.action_memory = np.zeros((self.mem_size,action_dims))
    self.reward_memory = np.zeros((self.mem_size,1))
    # Stores (1 - done), i.e. the factor applied to the bootstrap term.
    self.terminal_memory = np.zeros((self.mem_size,1), dtype=np.float32)

    # tau=1 makes every target network an exact copy of its online network.
    self.update_network_parameters(tau=1)

  def _action_bounds(self):
    """Return (low, high) of the first action dimension as Python floats."""
    space = self.env.action_space
    return float(space.low[0]), float(space.high[0])

  def update_network_parameters(self, tau=None):
    """Polyak-average the online parameters into the target networks.

    Each target parameter becomes ``tau * online + (1 - tau) * target``.
    ``tau`` defaults to ``self.tau``; ``tau=1`` is a hard copy.

    The blend is written in place under ``torch.no_grad()`` so that no
    autograd graph is recorded for it. Only parameters are updated (matched
    by name); module buffers, if any, are not touched.
    """
    if tau is None:
      tau = self.tau

    pairs = ((self.critic_1, self.target_critic_1),
             (self.critic_2, self.target_critic_2),
             (self.actor, self.target_actor))

    with torch.no_grad():
      for online, target in pairs:
        target_params = dict(target.named_parameters())
        for name, online_param in online.named_parameters():
          target_param = target_params[name]
          target_param.copy_(tau*online_param + (1-tau)*target_param)

  def set_action_std(self, new_action_std):
    """Set the exploration std and rebuild the matching variance vector."""
    self.action_std = new_action_std
    self.action_var = torch.full((self.action_dims,), self.action_std * self.action_std).to(self.device)

  def decay_action_std(self):
    """Lower ``action_std`` by ``action_std_decay_rate``, clamped at ``min_action_std``.

    The new value is rounded to 4 decimals before clamping, and the outcome
    is printed between two separator lines.
    """
    print("--------------------------------------------------------------------------------------------")

    self.action_std = round(self.action_std - self.action_std_decay_rate, 4)
    if self.action_std <= self.min_action_std:
      self.action_std = self.min_action_std
      print("setting actor output action_std to min_action_std : ", self.action_std)
    else:
      print("setting actor output action_std to : ", self.action_std)
    self.set_action_std(self.action_std)

    print("--------------------------------------------------------------------------------------------")

  def remember(self,state,action,reward,new_state,done):
    """Store one transition, overwriting the oldest slot once the buffer is full."""
    index = self.mem_cntr % self.mem_size
    self.state_memory[index] = state
    self.action_memory[index] = action
    self.reward_memory[index] = reward
    self.new_state_memory[index] = new_state
    # Stored as "not done" so it can multiply the bootstrap term directly.
    self.terminal_memory[index] = float(1- done)
    self.mem_cntr += 1

  def choose_action(self,observation):
    """Sample a noisy action for a single observation.

    The actor output is used as the mean of a Gaussian with diagonal
    covariance ``action_var``; the sample is clamped to the action bounds
    and returned as a flat numpy array.
    """
    low, high = self._action_bounds()

    self.actor.eval()
    with torch.no_grad():
      state = torch.as_tensor(observation, dtype=torch.float32).to(self.device)
      action_mean = self.actor.forward(state)
      cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
      action = MultivariateNormal(action_mean, cov_mat).sample()
      action = torch.clamp(action,low,high)
    self.actor.train()

    return action.cpu().numpy().flatten()

  def learn(self,n_iter):
    """Run ``n_iter`` update steps on batches sampled from replay memory.

    Does nothing while fewer than ``batch_size`` transitions are stored.
    Every iteration updates both critics towards
    ``r + gamma * min(Q1', Q2')(s', a') * (1 - done)`` where ``a'`` is the
    target actor's action plus clipped Gaussian noise. Every
    ``policy_delay``-th iteration (starting with the first) also updates the
    actor through critic 1 and soft-updates all target networks.
    """
    # The buffer does not change while learning, so check it once.
    if self.mem_cntr < self.batch_size:
      return
    max_mem = min(self.mem_cntr, self.mem_size)
    low, high = self._action_bounds()

    for k in range(n_iter):
      # Uniform sampling with replacement over the filled part of the buffer.
      batch = np.random.choice(max_mem, self.batch_size)

      rewards = torch.tensor(self.reward_memory[batch], dtype=torch.float).to(self.device)
      terminals = torch.tensor(self.terminal_memory[batch]).to(self.device)
      next_states = torch.tensor(self.new_state_memory[batch], dtype=torch.float).to(self.device)
      actions = torch.tensor(self.action_memory[batch], dtype=torch.float).to(self.device)
      states = torch.tensor(self.state_memory[batch], dtype=torch.float).to(self.device)

      self.target_actor.eval()
      self.target_critic_1.eval()
      self.critic_1.eval()
      self.target_critic_2.eval()
      self.critic_2.eval()

      # The bootstrap target must not carry gradients. Building it under
      # no_grad replaces the former torch.tensor(targets) copy, which
      # triggered PyTorch's copy-construct warning.
      with torch.no_grad():
        next_actions = self.target_actor.forward(next_states)
        # Smoothing noise uses the current exploration std.
        noise = torch.normal(0.0,self.action_std, size=actions.shape).to(self.device)
        noise = noise.clamp(-self.noise_clip, self.noise_clip)
        next_actions = torch.clamp(next_actions + noise,low,high)

        next_critic_value_1 = self.target_critic_1.forward(next_states,next_actions)
        next_critic_value_2 = self.target_critic_2.forward(next_states,next_actions)
        next_critic_value = torch.min(next_critic_value_1,next_critic_value_2)

        targets = rewards + self.gamma*next_critic_value*terminals
        targets = targets.view(self.batch_size, 1)

      critic_value_1 = self.critic_1.forward(states,actions)
      critic_value_2 = self.critic_2.forward(states,actions)

      self.critic_1.train()
      self.critic_optimizer_1.zero_grad()
      critic_loss_1 = F.mse_loss(critic_value_1, targets)
      critic_loss_1.backward()
      self.critic_optimizer_1.step()

      self.critic_2.train()
      self.critic_optimizer_2.zero_grad()
      critic_loss_2 = F.mse_loss(critic_value_2, targets)
      critic_loss_2.backward()
      self.critic_optimizer_2.step()

      if k%self.policy_delay == 0:
        # Delayed policy update: ascend Q1(s, actor(s)).
        self.critic_1.eval()
        mu = self.actor.forward(states)
        self.actor.train()
        actor_loss = -self.critic_1.forward(states,mu)
        self.actor_optimizer.zero_grad()
        actor_loss.mean().backward()
        self.actor_optimizer.step()

        self.update_network_parameters()

In [3]:
import gym
import numpy as np

import torch

env = gym.make('LunarLanderContinuous-v2')

# Use the GPU when one is available, otherwise fall back to the CPU so the
# cell does not fail on CPU-only runtimes.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
agent = Agent(alpha=0.001, beta=0.001, input_dims=8, tau=0.001, env=env,action_std_decay_rate=0.05,gamma=0.99,
                 min_action_std=0.05,batch_size=128, fc1_dims=400, fc2_dims=300, action_dims=2,action_std_init=0.2,
                 device=device)

action_std_decay_freq = int(7.5e4)
#action_std_decay_freq = int(2000)
# Exploration-noise decay is switched off. Set to True to call
# agent.decay_action_std() every action_std_decay_freq environment steps.
decay_exploration_noise = False

# ---- training ----
score_history = []
timesteps = 0
for i in range(5000):
    obs = env.reset()
    done = False
    score = 0
    t = 0
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        score += reward
        obs = new_state
        timesteps += 1

        if decay_exploration_noise and timesteps % action_std_decay_freq == 0:
            agent.decay_action_std()

        if done:
            # One batch of updates per episode: t counts the steps taken
            # before the terminal one.
            agent.learn(t)
        else:
            t += 1
    score_history.append(score)

    if i % 10 == 0:
        print('episode ', i, 'score %.2f' % score,
          'trailing 10 games avg %.3f' % np.mean(score_history[-10:]),
          'trailing 100 games avg %.3f' % np.mean(score_history[-100:]),
          'timesteps:',timesteps)
    else:
        print('episode ', i, 'score %.2f' % score,
          'timesteps:',timesteps)

    # NOTE(review): the average is taken over however many episodes exist so
    # far (up to 100), so this can trigger before 100 episodes have been played.
    avg_100 = np.mean(score_history[-100:])
    if avg_100 >= 210:
        print("####SOLVED!!####")
        break


# ---- evaluation over 50 episodes ----
# NOTE(review): actions still come from agent.choose_action, which samples with
# the current exploration noise, so these are not noise-free scores.
avg_score = 0
print("")
for i in range(50):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        obs = new_state
        score += reward
    avg_score += score
    print("Episode Reward:",score)

print("")
print(avg_score/50)

episode  0 score -49.47 trailing 10 games avg -49.474 trailing 100 games avg -49.474 timesteps: 75




episode  1 score -120.41 timesteps: 216
episode  2 score -1052.98 timesteps: 319
episode  3 score -467.02 timesteps: 458
episode  4 score -290.15 timesteps: 602
episode  5 score -846.64 timesteps: 765
episode  6 score -256.29 timesteps: 865
episode  7 score -286.55 timesteps: 972
episode  8 score -190.86 timesteps: 1071
episode  9 score -186.74 timesteps: 1204
episode  10 score -251.95 trailing 10 games avg -394.960 trailing 100 games avg -363.552 timesteps: 1386
episode  11 score -90.29 timesteps: 1611
episode  12 score -158.88 timesteps: 2042
episode  13 score -164.38 timesteps: 2324
episode  14 score -156.36 timesteps: 2517
episode  15 score -225.59 timesteps: 2695
episode  16 score -224.52 timesteps: 2868
episode  17 score -266.45 timesteps: 2988
episode  18 score -635.91 timesteps: 3469
episode  19 score -146.29 timesteps: 3738
episode  20 score -372.58 trailing 10 games avg -244.125 trailing 100 games avg -306.682 timesteps: 3841
episode  21 score -442.53 timesteps: 3960
episode 