<a href="https://colab.research.google.com/github/ArshT/Reinforcement_Learning_Basic/blob/master/PPO_Continuous_Self.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell escape (not Python syntax): installs the box2d-py package
# into the running kernel's environment.
!pip3 install box2d-py
import gym
# Instantiate BipedalWalker-v3 at module level. NOTE(review): presumably a
# check that the install above worked; train_PPO_Continuous creates its own
# environment from the env_name it is given.
env = gym.make('BipedalWalker-v3')

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/87/34/da5393985c3ff9a76351df6127c275dcb5749ae0abbe8d5210f06d97405d/box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 10.8MB/s eta 0:00:01[K     |█▌                              | 20kB 15.5MB/s eta 0:00:01[K     |██▏                             | 30kB 12.6MB/s eta 0:00:01[K     |███                             | 40kB 9.8MB/s eta 0:00:01[K     |███▋                            | 51kB 5.4MB/s eta 0:00:01[K     |████▍                           | 61kB 6.1MB/s eta 0:00:01[K     |█████▏                          | 71kB 5.8MB/s eta 0:00:01[K     |█████▉                          | 81kB 6.4MB/s eta 0:00:01[K     |██████▋                         | 92kB 6.4MB/s eta 0:00:01[K     |███████▎                        | 102kB 5.2MB/s eta 0:00:01[K     |████████                        | 112kB 5.2MB/s eta 0:00:01[K     |████████▊                       | 



In [None]:
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
import numpy as np
import gym
import torch.nn.functional as F

print("============================================================================================")

# Choose the compute device once, at import time: the first CUDA GPU when
# torch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    device = torch.device('cpu')
    print("Device set to : cpu")

print("============================================================================================")



class Actor(nn.Module):
    """Policy network mapping a state to the mean of the action distribution.

    Two ReLU hidden layers feed a tanh output layer, so every component of
    the returned mean lies in [-1, 1].

    Args:
        state_dim: Size of the last dimension of the state input.
        action_dim: Number of action components produced.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
    """

    def __init__(self, state_dim, action_dim, fc1_dims, fc2_dims):
        super(Actor, self).__init__()

        self.fc1 = nn.Linear(state_dim, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.action_layer = nn.Linear(fc2_dims, action_dim)

    def forward(self, state):
        """Return the action mean for ``state`` (last dim ``state_dim``)."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        # torch.tanh replaces the deprecated torch.nn.functional.tanh.
        action_mu = torch.tanh(self.action_layer(x))

        return action_mu

class Critic(nn.Module):
    """State-value network: two ReLU hidden layers and one linear output unit."""

    def __init__(self, state_dim, action_dims, fc1_dims, fc2_dims):
        # ``action_dims`` is accepted but no layer depends on it.
        super().__init__()

        self.fc1 = nn.Linear(state_dim, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.value_layer = nn.Linear(fc2_dims, 1)

    def forward(self, state):
        """Return the value estimate for ``state``; the last dimension has size 1."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.value_layer(hidden)


class ActorCritic(nn.Module):
    """Container that pairs an ``Actor`` with a ``Critic``.

    The sub-networks are meant to be called through ``.actor`` and
    ``.critic``; calling the container itself is not supported.
    """

    def __init__(self, state_dim, action_dim, fc1_dims, fc2_dims):
        super().__init__()

        hidden_sizes = (fc1_dims, fc2_dims)
        # Both networks receive the same constructor arguments.
        self.actor = Actor(state_dim, action_dim, *hidden_sizes)
        self.critic = Critic(state_dim, action_dim, *hidden_sizes)

    def forward(self):
        """Always raises ``NotImplementedError``."""
        raise NotImplementedError


class PPO:
    """Clipped-surrogate PPO agent for continuous action spaces.

    Actions are sampled from a multivariate normal whose mean comes from the
    actor network and whose diagonal covariance is a fixed variance
    (``action_std ** 2``) shared by every action component. The std can be
    lowered over time with ``decay_action_std``.

    Rollout data lives on the agent: ``act`` appends to ``states``,
    ``actions`` and ``logprobs``; the caller appends to ``rewards`` and
    ``is_terminals``; ``update`` consumes and clears all five lists.

    Args:
        state_dim: Input size of the actor and critic networks.
        action_dim: Number of action components.
        fc1_dims: Width of the first hidden layer of both networks.
        fc2_dims: Width of the second hidden layer of both networks.
        lr_actor: Adam learning rate for the actor parameters.
        lr_critic: Adam learning rate for the critic parameters.
        gamma: Discount factor used when computing returns.
        K_epochs: Number of optimisation passes per ``update`` call.
        eps_clip: Clipping range of the probability ratio.
        env: Object exposing ``action_space.low`` / ``action_space.high``;
            the bounds of the first component clamp every sampled action.
        action_std_decay_rate: Amount subtracted from the std on each decay.
        min_action_std: Lower bound for the decayed std.
        action_std_init: Initial std of the action distribution.
    """

    def __init__(self, state_dim, action_dim,fc1_dims,fc2_dims, lr_actor, lr_critic, gamma, K_epochs, eps_clip, env,action_std_decay_rate, min_action_std,action_std_init=0.6):
        self.action_dim = action_dim
        self.action_std = action_std_init
        self.action_var = torch.full((action_dim,), self.action_std * self.action_std).to(device)

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.env = env
        self.action_std_decay_rate = action_std_decay_rate
        self.min_action_std = min_action_std

        # ``policy`` is the network being optimised.
        self.policy = ActorCritic(state_dim, action_dim, fc1_dims, fc2_dims).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic},
        ])

        # ``policy_old`` is the snapshot used for acting; it is re-synced with
        # ``policy`` at the end of every ``update``.
        self.policy_old = ActorCritic(state_dim, action_dim, fc1_dims, fc2_dims).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.critic_loss = nn.MSELoss()

        # Rollout buffers (see class docstring for who fills which list).
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear(self):
        """Empty all rollout buffers in place."""
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]

    def set_action_std(self, new_action_std):
        """Set the action std and rebuild the per-component variance tensor."""
        self.action_std = new_action_std
        self.action_var = torch.full((self.action_dim,), self.action_std * self.action_std).to(device)

    def decay_action_std(self):
        """Lower the action std by ``action_std_decay_rate``, floored at ``min_action_std``."""
        print("--------------------------------------------------------------------------------------------")

        # Round to 4 decimals so repeated subtraction does not accumulate
        # floating-point noise in the printed / compared value.
        new_std = round(self.action_std - self.action_std_decay_rate, 4)
        if new_std <= self.min_action_std:
            new_std = self.min_action_std
            print("setting actor output action_std to min_action_std : ", new_std)
        else:
            print("setting actor output action_std to : ", new_std)
        self.set_action_std(new_std)

        print("--------------------------------------------------------------------------------------------")

    def act(self, state):
        """Sample an action for ``state`` from the old policy and record it.

        The state tensor, the clamped action and its log-probability are
        appended to the rollout buffers.

        Args:
            state: Observation accepted by ``torch.FloatTensor``.

        Returns:
            The clamped action as a flattened NumPy array.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action_mean = self.policy_old.actor(state)
            # Leading batch axis of size 1 on the covariance gives the sample
            # a matching leading axis.
            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
            dist = MultivariateNormal(action_mean, cov_mat)

            action = dist.sample()
            # Only the first component's bounds are read; they are applied to
            # every component of the action.
            low = float(self.env.action_space.low[0])
            high = float(self.env.action_space.high[0])
            action = torch.clamp(action, low, high)
            action_logprob = dist.log_prob(action)

        self.states.append(state)
        self.actions.append(action)
        self.logprobs.append(action_logprob)

        return action.detach().cpu().numpy().flatten()

    def evaluate(self, state, action):
        """Score ``action`` under the current policy.

        Args:
            state: Tensor of states fed to the actor and the critic.
            action: Tensor of actions taken in those states.

        Returns:
            Tuple ``(action_logprobs, state_values, dist_entropy)``.
        """
        action_mean = self.policy.actor(state)
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var).to(device)
        dist = MultivariateNormal(action_mean, cov_mat)

        # A 1-D action batch must be made (N, 1) to match the event shape.
        if self.action_dim == 1:
            action = action.reshape(-1, self.action_dim)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.policy.critic(state)

        return action_logprobs, state_values, dist_entropy

    def _discounted_returns(self):
        """Return the discounted return of every buffered step as a list.

        The running return is reset to zero at each step flagged terminal so
        returns never cross an episode boundary.
        """
        returns = []
        running = 0
        for reward, is_terminal in zip(reversed(self.rewards), reversed(self.is_terminals)):
            if is_terminal:
                running = 0
            running = reward + (self.gamma * running)
            returns.append(running)
        # Built back-to-front with O(1) appends, then flipped once.
        returns.reverse()
        return returns

    def update(self):
        """Run ``K_epochs`` PPO optimisation passes over the buffered rollout.

        Minimises the clipped surrogate loss plus ``0.5 *`` the critic MSE
        minus ``0.01 *`` the entropy, then copies the new weights into
        ``policy_old`` and clears the buffers. Does nothing when no rewards
        have been recorded.
        """
        if not self.rewards:
            return

        rewards = torch.tensor(self._discounted_returns(), dtype=torch.float32).to(device)
        # The std of a single element is NaN, so only normalise real batches.
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # Explicit reshapes (instead of a bare squeeze) keep the batch axis
        # even for a one-step buffer or one-dimensional states/actions.
        n_steps = len(self.states)
        old_states = torch.stack(self.states, dim=0).reshape(n_steps, -1).detach().to(device)
        old_actions = torch.stack(self.actions, dim=0).reshape(n_steps, -1).detach().to(device)
        old_logprobs = torch.stack(self.logprobs, dim=0).reshape(n_steps).detach().to(device)

        for _ in range(self.K_epochs):
            logprobs, state_values, dist_entropy = self.evaluate(old_states, old_actions)
            state_values = state_values.reshape(n_steps)

            # Probability ratio pi_new / pi_old of the taken actions.
            ratios = torch.exp(logprobs - old_logprobs)

            # The critic is trained through the MSE term only, hence detach.
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            loss = -torch.min(surr1, surr2) + 0.5 * self.critic_loss(state_values, rewards) - 0.01 * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        self.policy_old.load_state_dict(self.policy.state_dict())
        self.clear()

Device set to : cpu


In [None]:
def train_PPO_Continuous(env_name,max_ep_len,max_training_episodes,print_ep,action_std,
                         action_std_decay_rate,min_action_std,action_std_decay_freq,update_timestep,
                         K_epochs,eps_clip,gamma,lr_actor,lr_critic,fc1_dims,fc2_dims,solved_reward,
                         n_test_episodes=50):
    """Train a PPO agent on a gym environment, then run test episodes.

    Training stops after ``max_training_episodes`` episodes, or earlier once
    the mean reward of the last 100 episodes exceeds ``solved_reward`` (in
    which case the action std is set to 0.05 before testing). Progress and
    test results are printed; nothing is returned.

    Args:
        env_name: Id passed to ``gym.make``.
        max_ep_len: Maximum number of steps per episode.
        max_training_episodes: Number of training episodes to run at most.
        print_ep: Print the running average reward every this many episodes.
        action_std: Initial std of the action distribution.
        action_std_decay_rate: Amount subtracted from the std on each decay.
        min_action_std: Lower bound for the decayed std.
        action_std_decay_freq: Decay the std every this many timesteps.
        update_timestep: Run a PPO update every this many timesteps.
        K_epochs: Optimisation passes per PPO update.
        eps_clip: PPO ratio clipping range.
        gamma: Discount factor.
        lr_actor: Actor learning rate.
        lr_critic: Critic learning rate.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        solved_reward: 100-episode average reward that counts as solved.
        n_test_episodes: Number of test episodes run after training.
    """
    print("training environment name : " + env_name)

    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        print("--------------------------------------------------------------------------------------------")

        print("max training episodes : ", max_training_episodes)
        print("max timesteps per episode : ", max_ep_len)

        print("printing average reward over episodes in last : " + str(print_ep) + " episodes")

        print("--------------------------------------------------------------------------------------------")

        print("state space dimension : ", state_dim)
        print("action space dimension : ", action_dim)
        print("fc1 dimensions : ",fc1_dims)
        print("fc2 dimensions : ",fc2_dims)

        print("--------------------------------------------------------------------------------------------")

        print("Initializing a continuous action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("starting std of action distribution : ", action_std)
        print("decay rate of std of action distribution : ", action_std_decay_rate)
        print("minimum std of action distribution : ", min_action_std)
        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")

        print("--------------------------------------------------------------------------------------------")

        print("PPO update frequency : " + str(update_timestep) + " timesteps")
        print("PPO K epochs : ", K_epochs)
        print("PPO epsilon clip : ", eps_clip)
        print("discount factor (gamma) : ", gamma)

        print("--------------------------------------------------------------------------------------------")

        print("optimizer learning rate actor : ", lr_actor)
        print("optimizer learning rate critic : ", lr_critic)

        print("============================================================================================")

        ppo_agent = PPO(state_dim=state_dim, action_dim=action_dim, fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                        lr_actor=lr_actor, lr_critic=lr_critic, gamma=gamma, K_epochs=K_epochs, eps_clip=eps_clip,
                        env=env, action_std_decay_rate=action_std_decay_rate, min_action_std=min_action_std,
                        action_std_init=action_std)

        print_running_reward = 0
        print_running_episodes = 0

        time_step = 0
        i_episode = 0
        score_history = []

        # ---- training ----
        while i_episode < max_training_episodes:

            state = env.reset()
            current_ep_reward = 0

            for t in range(1, max_ep_len + 1):

                action = ppo_agent.act(state)
                state, reward, done, _ = env.step(action)

                ppo_agent.rewards.append(reward)
                # A step-limit cut-off is also an episode boundary for the
                # return computation; otherwise the next episode's rewards
                # would be discounted into this one.
                ppo_agent.is_terminals.append(bool(done) or t == max_ep_len)

                time_step += 1
                current_ep_reward += reward

                if time_step % update_timestep == 0:
                    ppo_agent.update()

                if time_step % action_std_decay_freq == 0:
                    ppo_agent.decay_action_std()

                if done:
                    break

            score_history.append(current_ep_reward)
            i_episode += 1
            avg_score_100 = np.mean(score_history[-100:])
            print("Episode : {} \t\t \t\t Episode Reward : {}".format(i_episode,current_ep_reward))

            # Count this episode before the periodic print so the average
            # covers the last print_ep episodes (and never divides by zero).
            print_running_reward += current_ep_reward
            print_running_episodes += 1

            if i_episode % print_ep == 0:
                print_avg_reward = round(print_running_reward / print_running_episodes, 2)
                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {} \t\t Avg_100 Reward : {}".format(i_episode, time_step, print_avg_reward,avg_score_100))

                print_running_reward = 0
                print_running_episodes = 0

            if avg_score_100 > solved_reward:
                ppo_agent.set_action_std(0.05)
                print("######SOLVED######")
                break

        # ---- testing ----
        avg_test_score = 0
        for _ in range(n_test_episodes):

            state = env.reset()
            current_ep_reward = 0

            for t in range(1, max_ep_len + 1):

                action = ppo_agent.act(state)
                state, reward, done, _ = env.step(action)

                current_ep_reward += reward

                if done:
                    break

            # act() records every step in the agent's buffers; no update runs
            # during testing, so drop them instead of letting them pile up.
            ppo_agent.clear()

            print("Episode Reward : {}".format(current_ep_reward))
            avg_test_score += current_ep_reward

        if n_test_episodes > 0:
            print("")
            print("Avg Test Score:",avg_test_score/n_test_episodes)
    finally:
        # Release the environment's resources even if training fails.
        env.close()

In [None]:
# ---- Run configuration: LunarLanderContinuous-v2 ----
env_name = "LunarLanderContinuous-v2"

# Not passed to train_PPO_Continuous below.
has_continuous_action_space = True

# Episode budget.
max_ep_len = 500
max_training_episodes = 5000

# Not passed to train_PPO_Continuous below.
print_freq = max_ep_len * 4

# Action-noise schedule: the std decreases linearly
# (action_std = action_std - action_std_decay_rate).
action_std = 0.6
action_std_decay_rate = 0.05
min_action_std = 0.1
action_std_decay_freq = int(7.5e4)

# Training stops once the 100-episode average reward exceeds this value.
solved_reward = 200

# PPO hyperparameters.
update_timestep = max_ep_len * 4
K_epochs = 20
eps_clip = 0.2
gamma = 0.99

# Optimizer learning rates.
lr_actor = 0.0003
lr_critic = 0.001

# Hidden-layer widths shared by actor and critic.
fc1_dims = 128
fc2_dims = 128

# Print the running average reward every print_ep episodes.
print_ep = 10


train_PPO_Continuous(
    env_name=env_name,
    max_ep_len=max_ep_len,
    max_training_episodes=max_training_episodes,
    print_ep=print_ep,
    action_std=action_std,
    action_std_decay_rate=action_std_decay_rate,
    min_action_std=min_action_std,
    action_std_decay_freq=action_std_decay_freq,
    update_timestep=update_timestep,
    K_epochs=K_epochs,
    eps_clip=eps_clip,
    gamma=gamma,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    fc1_dims=fc1_dims,
    fc2_dims=fc2_dims,
    solved_reward=solved_reward,
)

training environment name : LunarLanderContinuous-v2
--------------------------------------------------------------------------------------------
max training episodes :  5000
max timesteps per episode :  500
printing average reward over episodes in last : 10 episodes
--------------------------------------------------------------------------------------------
state space dimension :  8
action space dimension :  2
fc1 dimensions :  128
fc2 dimensions :  128
--------------------------------------------------------------------------------------------
Initializing a continuous action space policy
--------------------------------------------------------------------------------------------
starting std of action distribution :  0.6
decay rate of std of action distribution :  0.05
minimum std of action distribution :  0.1
decay frequency of std of action distribution : 75000 timesteps
--------------------------------------------------------------------------------------------
PPO update frequ



Episode : 2 		 		 Episode Reward : -307.52162838067494
Episode : 3 		 		 Episode Reward : -204.3997034789097
Episode : 4 		 		 Episode Reward : -571.4950184060957
Episode : 5 		 		 Episode Reward : -127.21199070230435
Episode : 6 		 		 Episode Reward : -329.4860624085935
Episode : 7 		 		 Episode Reward : -341.74444425259867
Episode : 8 		 		 Episode Reward : -457.8182863786199
Episode : 9 		 		 Episode Reward : -29.04081950569403
Episode : 10 		 		 Episode Reward : -282.98194379832387
Episode : 10 		 Timestep : 1200 		 Average Reward : -302.88 		 Avg_100 Reward : -300.8895080997662
Episode : 11 		 		 Episode Reward : -186.75289704722167
Episode : 12 		 		 Episode Reward : -270.0119980429336
Episode : 13 		 		 Episode Reward : -453.7252001513287
Episode : 14 		 		 Episode Reward : -173.29479263132635
Episode : 15 		 		 Episode Reward : -400.5409614225784
Episode : 16 		 		 Episode Reward : -29.97522982052648
Episode : 17 		 		 Episode Reward : -313.8627630337368
Episode : 18 		 		 Epis

In [None]:
# ---- Run configuration: BipedalWalker-v3 ----
env_name = "BipedalWalker-v3"

# Episode budget.
max_ep_len = 1500
max_training_episodes = 5000

# Action-noise schedule: the std decreases linearly
# (action_std = action_std - action_std_decay_rate).
action_std = 0.6
action_std_decay_rate = 0.05
min_action_std = 0.1
action_std_decay_freq = int(2e5)

# Training stops once the 100-episode average reward exceeds this value.
solved_reward = 280

# PPO hyperparameters.
update_timestep = max_ep_len * 4
K_epochs = 40
eps_clip = 0.2
gamma = 0.99

# Optimizer learning rates.
lr_actor = 0.0003
lr_critic = 0.001

# Hidden-layer widths shared by actor and critic.
fc1_dims = 128
fc2_dims = 128

# Print the running average reward every print_ep episodes.
print_ep = 10


train_PPO_Continuous(
    env_name=env_name,
    max_ep_len=max_ep_len,
    max_training_episodes=max_training_episodes,
    print_ep=print_ep,
    action_std=action_std,
    action_std_decay_rate=action_std_decay_rate,
    min_action_std=min_action_std,
    action_std_decay_freq=action_std_decay_freq,
    update_timestep=update_timestep,
    K_epochs=K_epochs,
    eps_clip=eps_clip,
    gamma=gamma,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    fc1_dims=fc1_dims,
    fc2_dims=fc2_dims,
    solved_reward=solved_reward,
)

training environment name : BipedalWalker-v3
--------------------------------------------------------------------------------------------
max training episodes :  5000
max timesteps per episode :  1500
printing average reward over episodes in last : 10 episodes
--------------------------------------------------------------------------------------------
state space dimension :  24
action space dimension :  4
fc1 dimensions :  128
fc2 dimensions :  128
--------------------------------------------------------------------------------------------
Initializing a continuous action space policy
--------------------------------------------------------------------------------------------
starting std of action distribution :  0.6
decay rate of std of action distribution :  0.05
minimum std of action distribution :  0.1
decay frequency of std of action distribution : 200000 timesteps
--------------------------------------------------------------------------------------------
PPO update frequency 



Episode : 2 		 		 Episode Reward : -118.25299035959381
Episode : 3 		 		 Episode Reward : -100.09001227001846
Episode : 4 		 		 Episode Reward : -69.00829162777084
Episode : 5 		 		 Episode Reward : -103.35183716920702
Episode : 6 		 		 Episode Reward : -120.75358435661532
Episode : 7 		 		 Episode Reward : -78.48220640583766
Episode : 8 		 		 Episode Reward : -97.30671279072885
Episode : 9 		 		 Episode Reward : -72.74705046480152
Episode : 10 		 		 Episode Reward : -104.78987930895823
Episode : 10 		 Timestep : 5043 		 Average Reward : -95.19 		 Avg_100 Reward : -96.15007712010681
Episode : 11 		 		 Episode Reward : -71.13241102288939
Episode : 12 		 		 Episode Reward : -75.10108181166922
Episode : 13 		 		 Episode Reward : -67.8127106281246
Episode : 14 		 		 Episode Reward : -116.30955478223227
Episode : 15 		 		 Episode Reward : -62.769902215996176
Episode : 16 		 		 Episode Reward : -105.86666076224111
Episode : 17 		 		 Episode Reward : -98.05257083862772
Episode : 18 		 		 Epis