<a href="https://colab.research.google.com/github/ArshT/Reinforcement_Learning_Basic/blob/master/PPO_self.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IPython shell escape (not plain Python): installs the Box2D bindings,
# presumably required by the LunarLander-v2 environment created below.
!pip3 install box2d-py
import gym
# Create the environment once at notebook start-up; the training functions
# further down build their own environment instances via gym.make as well.
env = gym.make("LunarLander-v2")



In [None]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as Optim
import numpy as np

class Actor(nn.Module):
  """Policy network: a two-hidden-layer MLP producing one raw score per action.

  The output has no activation applied; callers are expected to normalise it
  (e.g. with a softmax) before sampling.

  Args:
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer.
    fc2_dims: Width of the second hidden layer.
    n_actions: Number of output units (one per discrete action).
    device: Anything accepted by ``torch.device`` (e.g. ``'cpu'``, ``'cuda'``).
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    super(Actor, self).__init__()

    self.fc1_action = nn.Linear(input_dims,fc1_dims)
    self.fc2_action = nn.Linear(fc1_dims,fc2_dims)
    self.action_layer = nn.Linear(fc2_dims,n_actions)

    self.device = torch.device(device)
    self.to(self.device)

  def forward(self,observation):
    """Return the raw action scores for ``observation``.

    Args:
      observation: Array-like or tensor whose last dimension is ``input_dims``.

    Returns:
      Tensor of scores with last dimension ``n_actions``.
    """
    # as_tensor handles arrays, sequences and tensors (on any device) alike,
    # so no catch-all ``except`` is needed and genuine input errors surface.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    x = F.relu(self.fc1_action(state))
    x = F.relu(self.fc2_action(x))
    actions = self.action_layer(x)

    return actions

class Critic(nn.Module):
  """Value network: a two-hidden-layer MLP producing one scalar per state.

  Args:
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer.
    fc2_dims: Width of the second hidden layer.
    n_actions: Unused; accepted so the signature matches the actor's.
    device: Anything accepted by ``torch.device`` (e.g. ``'cpu'``, ``'cuda'``).
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    super(Critic, self).__init__()

    self.fc1_value = nn.Linear(input_dims,fc1_dims)
    self.fc2_value = nn.Linear(fc1_dims,fc2_dims)
    self.value_layer = nn.Linear(fc2_dims,1)

    self.device = torch.device(device)
    self.to(self.device)

  def forward(self,observation):
    """Return the value estimate for ``observation``.

    Args:
      observation: Array-like or tensor whose last dimension is ``input_dims``.

    Returns:
      Tensor of value estimates with last dimension 1.
    """
    # as_tensor handles arrays, sequences and tensors (on any device) alike,
    # so no catch-all ``except`` is needed and genuine input errors surface.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    y = F.relu(self.fc1_value(state))
    y = F.relu(self.fc2_value(y))
    state_values = self.value_layer(y)

    return state_values

class ActorCritic(nn.Module):
  """Container pairing an ``Actor`` and a ``Critic`` that see the same input.

  The two sub-networks share no layers; they are exposed as ``self.actor`` and
  ``self.critic`` so an optimizer can give each its own learning rate.

  Args:
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer of both networks.
    fc2_dims: Width of the second hidden layer of both networks.
    n_actions: Number of discrete actions.
    device: Anything accepted by ``torch.device`` (e.g. ``'cpu'``, ``'cuda'``).
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    super(ActorCritic,self).__init__()

    self.actor = Actor(input_dims,fc1_dims,fc2_dims,n_actions,device)
    self.critic = Critic(input_dims,fc1_dims,fc2_dims,n_actions,device)

    # Normalise to a torch.device, matching what the sub-modules store.
    self.device = torch.device(device)
    self.to(self.device)

  def forward(self,observation):
    """Run both networks on ``observation``.

    Args:
      observation: Array-like or tensor whose last dimension is ``input_dims``.

    Returns:
      Tuple ``(actions, state_values)``: the actor's raw action scores and
      the critic's value estimates.
    """
    # Convert once here so both sub-networks receive the same tensor; this
    # replaces a bare ``except`` that silently swallowed conversion errors.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    actions = self.actor(state)
    state_values = self.critic(state)

    return actions,state_values

class PPO:
  """PPO agent with a clipped surrogate objective for discrete actions.

  Rollouts are sampled with a frozen copy of the network (``policy_old``);
  ``update`` trains ``policy`` on the stored rollout for ``k_epochs`` passes
  and then copies the new weights into ``policy_old``.

  Args:
    gamma: Discount factor used for the Monte-Carlo returns.
    critic_alpha: Learning rate of the critic parameters.
    actor_alpha: Learning rate of the actor parameters.
    betas: Adam ``betas`` tuple.
    k_epochs: Number of optimisation passes per ``update`` call.
    eps_clip: Clipping range of the probability ratio.
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer.
    fc2_dims: Width of the second hidden layer.
    n_actions: Number of discrete actions.
    device: Device spec the networks and stored tensors are placed on.
  """

  def __init__(self,gamma,critic_alpha,actor_alpha,betas,k_epochs,eps_clip,
               input_dims,fc1_dims,fc2_dims,n_actions,device):
    self.GAMMA = gamma
    self.ACTOR_ALPHA = actor_alpha
    self.CRITIC_ALPHA = critic_alpha
    self.BETAS = betas
    self.K_epochs = k_epochs
    self.EPS_CLIP = eps_clip
    self.device = device

    self.policy = ActorCritic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,n_actions=n_actions,device=device)
    self.policy_old = ActorCritic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,n_actions=n_actions,device=device)
    self.policy_old.load_state_dict(self.policy.state_dict())

    self.critic_loss = nn.MSELoss()
    # One optimizer, two parameter groups so actor and critic can learn at
    # different rates. ``betas`` was previously stored but never applied.
    self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': self.ACTOR_ALPHA},
                        {'params': self.policy.critic.parameters(), 'lr': self.CRITIC_ALPHA}
                    ],betas=self.BETAS)

    self.clear_memory()

  def clear_memory(self):
    """Drop every stored transition (states, actions, log-probs, rewards, flags)."""
    self.state_memory = []
    self.action_memory = []
    self.logprob_memory = []
    self.reward_memory = []
    self.terminal_memory = []

  def act(self,observation):
    """Sample an action from ``policy_old`` and record the step.

    Appends the state, the sampled action and its log-probability to the
    rollout memory.

    Args:
      observation: Array-like observation for a single environment step.

    Returns:
      The sampled action as a Python int.
    """
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    # The stored log-probs are only ever used detached, so sampling needs no
    # autograd graph; skipping it keeps rollout memory small.
    with torch.no_grad():
      logits,_ = self.policy_old.forward(state)
      action_probs = torch.distributions.Categorical(F.softmax(logits,dim=-1))
      action = action_probs.sample()
      log_prob = action_probs.log_prob(action)

    self.state_memory.append(state)
    self.action_memory.append(action)
    self.logprob_memory.append(log_prob)

    return action.item()

  def store_transitions(self,reward,done):
    """Record the reward and terminal flag of the step last passed to ``act``."""
    self.reward_memory.append(reward)
    self.terminal_memory.append(done)

  def evaluate(self,actions,states):
    """Score stored actions under the current (trainable) policy.

    Args:
      actions: Tensor of action indices, one per state.
      states: Tensor of states, batch dimension first.

    Returns:
      Tuple ``(action_logprobs, state_values, dist_entropy)``.
    """
    logits,state_values = self.policy.forward(states)
    action_probs = torch.distributions.Categorical(F.softmax(logits,dim=-1))

    # squeeze(-1) removes only the trailing value dimension, so a batch of
    # one keeps its batch axis and still lines up with the returns.
    return action_probs.log_prob(actions),state_values.squeeze(-1),action_probs.entropy()

  def _discounted_returns(self):
    """Return the discounted reward-to-go of every stored step as a float tensor."""
    returns = []
    running = 0.0
    for reward,terminal in zip(reversed(self.reward_memory),reversed(self.terminal_memory)):
      if terminal:
        # Episode boundary: nothing after this step belongs to it.
        running = 0.0
      running = reward + (self.GAMMA * running)
      returns.append(running)
    # Built back-to-front with O(1) appends, then flipped once.
    returns.reverse()
    return torch.tensor(returns,dtype=torch.float32,device=self.device)

  def update(self,i_episode):
    """Train the policy on the stored rollout, then clear the memory.

    Does nothing when no rewards have been stored.

    Args:
      i_episode: Current episode number; the entropy bonus is scaled by
        ``0.05 / i_episode`` (values below 1 are treated as 1).
    """
    if not self.reward_memory:
      return

    rewards = self._discounted_returns()
    # std() of a single sample is NaN, so normalise only when it is defined.
    if rewards.numel() > 1:
      rewards = (rewards - rewards.mean())/(rewards.std() + 1e-5)

    old_actions = torch.stack(self.action_memory).to(self.device).detach()
    old_states = torch.stack(self.state_memory).to(self.device).detach()
    old_logprobs = torch.stack(self.logprob_memory).to(self.device).detach()

    entropy_coef = 0.05/max(i_episode,1)

    for _ in range(self.K_epochs):

      new_logprobs,state_values,dist_entropy = self.evaluate(old_actions,old_states)
      ratios = torch.exp(new_logprobs - old_logprobs)
      advantages = rewards - state_values.detach()

      surr1 = ratios*advantages
      surr2 = torch.clamp(ratios,1-self.EPS_CLIP,1+self.EPS_CLIP)*advantages

      loss = -torch.min(surr1,surr2) - entropy_coef*dist_entropy + 0.5*self.critic_loss(state_values,rewards)
      self.optimizer.zero_grad()
      loss.mean().backward()
      self.optimizer.step()

    self.policy_old.load_state_dict(self.policy.state_dict())
    self.clear_memory()

In [None]:
import gym

def train_PPO(env_name,critic_lr,actor_lr,solved_reward,log_interval,update_timestep,max_episodes,fc1_dims,fc2_dims,betas,gamma,K_epochs,eps_clip,device):
    """Train a PPO agent on ``env_name`` and then score it over 50 episodes.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        critic_lr: Learning rate for the critic.
        actor_lr: Learning rate for the actor.
        solved_reward: Training stops once the reward accumulated since the
            last log exceeds ``log_interval * solved_reward``.
        log_interval: Number of episodes between averaged log lines.
        update_timestep: Number of environment steps between policy updates.
        max_episodes: Upper bound on training episodes.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        betas: Adam ``betas`` tuple.
        gamma: Discount factor.
        K_epochs: Optimisation passes per policy update.
        eps_clip: PPO clipping range.
        device: Device spec handed to the agent.
    """
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    render = False

    ppo = PPO(input_dims=state_dim, n_actions = action_dim,fc1_dims=fc1_dims,fc2_dims=fc2_dims,critic_alpha=critic_lr,actor_alpha=actor_lr,betas=betas,gamma=gamma,k_epochs=K_epochs,eps_clip=eps_clip,device=device)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        score = 0
        max_timesteps = max(500,int(500*(i_episode/1500)))

        for t in range(max_timesteps):
            timestep += 1
            # Running policy_old:
            action = ppo.act(state)
            state, reward, done, _ = env.step(action)
            score += reward
            # An episode cut off by the step cap is also a boundary: flag it
            # so discounted returns do not leak in from the next episode.
            ppo.store_transitions(reward, done or t == max_timesteps - 1)

            # update if its time (update() clears the rollout memory itself)
            if timestep % update_timestep == 0:
                ppo.update(i_episode)
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                break

        # t is a zero-based index, so the episode lasted t + 1 steps.
        avg_length += t + 1

        # Evaluate the stop condition before the logging branch resets the
        # accumulator; afterwards the interval total would be gone.
        solved = running_reward > (log_interval*solved_reward)

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = int((running_reward/log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))

            running_reward = 0
            avg_length = 0
        else:
            print("Episode",i_episode,":",score,"Max Timesteps:",max_timesteps)

        # stop training if avg_reward > solved_reward
        if solved:
            print("########## Solved! ##########")
            #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            break

    # Evaluation: 50 episodes with the trained agent.
    avg_score = 0
    for i in range(50):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = ppo.act(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        # act() records every step; nothing is trained on here, so discard
        # the records instead of letting them pile up across episodes.
        ppo.clear_memory()
        print(score)
        avg_score += score

    print()
    print(avg_score / 50)
    env.close()



In [None]:
import torch

# Hyperparameters for the train_PPO run launched at the bottom of this cell.
env_name = "LunarLander-v2"
solved_reward = 199       # stop training if avg_reward > solved_reward
log_interval = 20           # print avg reward in the interval
max_episodes = 5000        # max training episodes
update_timestep = 3000      # update policy every n timesteps
critic_lr = 0.001
actor_lr = 0.0003
fc1_dims = 256
fc2_dims = 256
betas = (0.9, 0.999)
gamma = 0.99                # discount factor
K_epochs = 30                # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
random_seed = None          # not consumed by train_PPO; kept for reference
# Use the GPU when one is present, otherwise fall back to the CPU instead of
# failing on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_PPO(env_name=env_name,critic_lr=critic_lr,actor_lr=actor_lr,solved_reward=solved_reward,
          log_interval=log_interval,update_timestep=update_timestep,max_episodes=max_episodes,
          fc1_dims=fc1_dims,fc2_dims=fc2_dims,betas=betas,gamma=gamma,K_epochs=K_epochs,eps_clip=eps_clip,device=device)



Episode 1 : -156.97682895836178 Max Timesteps: 500
Episode 2 : -78.49896205222382 Max Timesteps: 500
Episode 3 : -337.6575249701582 Max Timesteps: 500
Episode 4 : -198.32561868193363 Max Timesteps: 500
Episode 5 : -137.59762889021385 Max Timesteps: 500
Episode 6 : -382.79032076291594 Max Timesteps: 500
Episode 7 : -391.30721238630014 Max Timesteps: 500
Episode 8 : -285.95264751856587 Max Timesteps: 500
Episode 9 : -311.68990197550954 Max Timesteps: 500
Episode 10 : -99.7288228942237 Max Timesteps: 500
Episode 11 : -146.87420502380235 Max Timesteps: 500
Episode 12 : -491.26755522102826 Max Timesteps: 500
Episode 13 : -259.32777370247493 Max Timesteps: 500
Episode 14 : -357.42396599642586 Max Timesteps: 500
Episode 15 : -445.5489269913943 Max Timesteps: 500
Episode 16 : -152.86826436646413 Max Timesteps: 500
Episode 17 : -102.69425609101017 Max Timesteps: 500
Episode 18 : -54.55365229192883 Max Timesteps: 500
Episode 19 : -143.58085510382796 Max Timesteps: 500
Episode 20 	 avg length: 93



Episode 33 : -151.32011379690923 Max Timesteps: 500
Episode 34 : -80.76989957547462 Max Timesteps: 500
Episode 35 : -159.3157978990423 Max Timesteps: 500
Episode 36 : -96.98543788532822 Max Timesteps: 500
Episode 37 : -98.1749121693795 Max Timesteps: 500
Episode 38 : -130.4896291727511 Max Timesteps: 500
Episode 39 : -79.73557471740031 Max Timesteps: 500
Episode 40 	 avg length: 87 	 reward: -160
Episode 41 : -194.59161452227286 Max Timesteps: 500
Episode 42 : -167.14342618522517 Max Timesteps: 500
Episode 43 : -188.16534604242344 Max Timesteps: 500
Episode 44 : -74.0449020459402 Max Timesteps: 500
Episode 45 : -92.9812110475645 Max Timesteps: 500
Episode 46 : -424.0060089424926 Max Timesteps: 500
Episode 47 : -123.5899414282647 Max Timesteps: 500
Episode 48 : -149.23937113275122 Max Timesteps: 500
Episode 49 : -362.8396534849212 Max Timesteps: 500
Episode 50 : -374.0583986423971 Max Timesteps: 500
Episode 51 : -306.88877203008445 Max Timesteps: 500
Episode 52 : -262.29614913948234 Max

In [None]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as Optim
import numpy as np

class ActorCritic(nn.Module):
  """Actor and critic MLPs held in one module, with no shared layers.

  The ``*_action`` layers produce one raw score per action; the ``*_value``
  layers produce one scalar value estimate per state.

  Args:
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer of both heads.
    fc2_dims: Width of the second hidden layer of both heads.
    n_actions: Number of discrete actions.
    device: Anything accepted by ``torch.device`` (e.g. ``'cpu'``, ``'cuda'``).
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    super(ActorCritic, self).__init__()

    self.fc1_action = nn.Linear(input_dims,fc1_dims)
    self.fc2_action = nn.Linear(fc1_dims,fc2_dims)
    self.action_layer = nn.Linear(fc2_dims,n_actions)

    self.fc1_value = nn.Linear(input_dims,fc1_dims)
    self.fc2_value = nn.Linear(fc1_dims,fc2_dims)
    self.value_layer = nn.Linear(fc2_dims,1)

    self.device = torch.device(device)
    self.to(self.device)

  def forward(self,observation):
    """Run both heads on ``observation``.

    Args:
      observation: Array-like or tensor whose last dimension is ``input_dims``.

    Returns:
      Tuple ``(actions, state_values)``: raw action scores (last dimension
      ``n_actions``) and value estimates (last dimension 1).
    """
    # as_tensor handles arrays, sequences and tensors (on any device) alike,
    # so no catch-all ``except`` is needed and genuine input errors surface.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    x = F.relu(self.fc1_action(state))
    x = F.relu(self.fc2_action(x))
    actions = self.action_layer(x)

    y = F.relu(self.fc1_value(state))
    y = F.relu(self.fc2_value(y))
    state_values = self.value_layer(y)

    return actions,state_values


class PPO:
  """PPO agent with a clipped surrogate objective, using one ``ActorCritic`` net.

  The same network is used both for sampling rollouts and for training; the
  log-probabilities recorded at sampling time serve as the "old" policy in
  the probability ratio.

  Args:
    gamma: Discount factor used for the Monte-Carlo returns.
    alpha: Learning rate of the Adam optimizer.
    betas: Adam ``betas`` tuple.
    k_epochs: Number of optimisation passes per ``update`` call.
    eps_clip: Clipping range of the probability ratio.
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer.
    fc2_dims: Width of the second hidden layer.
    n_actions: Number of discrete actions.
    device: Device spec the network and stored tensors are placed on.
  """

  def __init__(self,gamma,alpha,betas,k_epochs,eps_clip,
               input_dims,fc1_dims,fc2_dims,n_actions,device):
    self.GAMMA = gamma
    self.ALPHA = alpha
    self.BETAS = betas
    self.K_epochs = k_epochs
    self.EPS_CLIP = eps_clip
    self.device = device

    self.policy = ActorCritic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,n_actions=n_actions,device=device)
    self.critic_loss = nn.MSELoss()
    self.optimizer = Optim.Adam(self.policy.parameters(),lr=self.ALPHA,betas=self.BETAS)

    self.clear_memory()

  def clear_memory(self):
    """Drop every stored transition (states, actions, log-probs, rewards, flags)."""
    self.state_memory = []
    self.action_memory = []
    self.logprob_memory = []
    self.reward_memory = []
    self.terminal_memory = []

  def act(self,observation):
    """Sample an action from the policy and record the step.

    Appends the state, the sampled action and its log-probability to the
    rollout memory.

    Args:
      observation: Array-like observation for a single environment step.

    Returns:
      The sampled action as a Python int.
    """
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    # The stored log-probs are only ever used detached, so sampling needs no
    # autograd graph; skipping it keeps rollout memory small.
    with torch.no_grad():
      logits,_ = self.policy.forward(state)
      action_probs = torch.distributions.Categorical(F.softmax(logits,dim=-1))
      action = action_probs.sample()
      log_prob = action_probs.log_prob(action)

    self.state_memory.append(state)
    self.action_memory.append(action)
    self.logprob_memory.append(log_prob)

    return action.item()

  def store_transitions(self,reward,done):
    """Record the reward and terminal flag of the step last passed to ``act``."""
    self.reward_memory.append(reward)
    self.terminal_memory.append(done)

  def evaluate(self,actions,states):
    """Score stored actions under the current policy.

    Args:
      actions: Tensor of action indices, one per state.
      states: Tensor of states, batch dimension first.

    Returns:
      Tuple ``(action_logprobs, state_values, dist_entropy)``.
    """
    logits,state_values = self.policy.forward(states)
    action_probs = torch.distributions.Categorical(F.softmax(logits,dim=-1))

    # squeeze(-1) removes only the trailing value dimension, so a batch of
    # one keeps its batch axis and still lines up with the returns.
    return action_probs.log_prob(actions),state_values.squeeze(-1),action_probs.entropy()

  def _discounted_returns(self):
    """Return the discounted reward-to-go of every stored step as a float tensor."""
    returns = []
    running = 0.0
    for reward,terminal in zip(reversed(self.reward_memory),reversed(self.terminal_memory)):
      if terminal:
        # Episode boundary: nothing after this step belongs to it.
        running = 0.0
      running = reward + (self.GAMMA * running)
      returns.append(running)
    # Built back-to-front with O(1) appends, then flipped once.
    returns.reverse()
    return torch.tensor(returns,dtype=torch.float32,device=self.device)

  def update(self,i_episode):
    """Train the policy on the stored rollout, then clear the memory.

    Does nothing when no rewards have been stored.

    Args:
      i_episode: Current episode number; the entropy bonus is scaled by
        ``0.05 / i_episode`` (values below 1 are treated as 1).
    """
    if not self.reward_memory:
      return

    rewards = self._discounted_returns()
    # std() of a single sample is NaN, so normalise only when it is defined.
    if rewards.numel() > 1:
      rewards = (rewards - rewards.mean())/(rewards.std() + 1e-5)

    old_actions = torch.stack(self.action_memory).to(self.device).detach()
    old_states = torch.stack(self.state_memory).to(self.device).detach()
    old_logprobs = torch.stack(self.logprob_memory).to(self.device).detach()

    entropy_coef = 0.05/max(i_episode,1)

    for _ in range(self.K_epochs):

      new_logprobs,state_values,dist_entropy = self.evaluate(old_actions,old_states)
      ratios = torch.exp(new_logprobs - old_logprobs)
      advantages = rewards - state_values.detach()

      surr1 = ratios*advantages
      surr2 = torch.clamp(ratios,1-self.EPS_CLIP,1+self.EPS_CLIP)*advantages

      loss = -torch.min(surr1,surr2) + 0.5*self.critic_loss(state_values,rewards) - entropy_coef*dist_entropy

      self.optimizer.zero_grad()
      loss.mean().backward()
      self.optimizer.step()

    self.clear_memory()

In [None]:
import gym

def main():
    """Train a PPO agent on LunarLander-v2, then score it over 50 episodes.

    All hyperparameters are set in the block at the top of the function.
    Per-episode scores and interval averages are printed during training;
    the evaluation scores and their mean are printed at the end.
    """
    ############## Hyperparameters ##############
    env_name = "LunarLander-v2"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    render = False
    solved_reward = 199       # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 5000        # max training episodes
    update_timestep = 5000      # update policy every n timesteps
    lr = 0.002
    fc1_dims = 128
    fc2_dims = 64
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 5                # update policy for K epochs
    eps_clip = 0.1              # clip parameter for PPO
    random_seed = None
    # Use the GPU when one is present instead of failing on CPU-only hosts.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #############################################

    # ``is not None`` so that a seed of 0 is honoured as well.
    if random_seed is not None:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    ppo = PPO(input_dims=state_dim, n_actions = action_dim,fc1_dims=fc1_dims,fc2_dims=fc2_dims,alpha=lr,betas=betas,gamma=gamma,k_epochs=K_epochs,eps_clip=eps_clip,device=device)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        score = 0
        max_timesteps = max(500,int(500*(i_episode/1500)))

        for t in range(max_timesteps):
            timestep += 1
            # Sample from the current policy; act() records the step.
            action = ppo.act(state)
            state, reward, done, _ = env.step(action)
            score += reward
            # An episode cut off by the step cap is also a boundary: flag it
            # so discounted returns do not leak in from the next episode.
            ppo.store_transitions(reward, done or t == max_timesteps - 1)

            # update if its time (update() clears the rollout memory itself)
            if timestep % update_timestep == 0:
                ppo.update(i_episode)
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                break

        # t is a zero-based index, so the episode lasted t + 1 steps.
        avg_length += t + 1

        # Evaluate the stop condition before the logging branch resets the
        # accumulator; afterwards the interval total would be gone.
        solved = running_reward > (log_interval*solved_reward)

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = int((running_reward/log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))

            running_reward = 0
            avg_length = 0
        else:
            print("Episode",i_episode,":",score,"Max Timesteps:",max_timesteps)

        # stop training if avg_reward > solved_reward
        if solved:
            print("########## Solved! ##########")
            #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            break

    # Evaluation: 50 episodes with the trained agent.
    avg_score = 0
    for i in range(50):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = ppo.act(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        # act() records every step; nothing is trained on here, so discard
        # the records instead of letting them pile up across episodes.
        ppo.clear_memory()
        print(score)
        avg_score += score

    print()
    print(avg_score / 50)
    env.close()

if __name__ == '__main__':
    main()



Episode 1 : -1083.7469697950664 Max Timesteps: 500
Episode 2 : -877.6788621957446 Max Timesteps: 500
Episode 3 : -442.34100693156563 Max Timesteps: 500
Episode 4 : -499.9495570848839 Max Timesteps: 500
Episode 5 : -814.4237870809235 Max Timesteps: 500
Episode 6 : -716.0070176757886 Max Timesteps: 500
Episode 7 : -740.1186771674313 Max Timesteps: 500
Episode 8 : -833.4105208611567 Max Timesteps: 500
Episode 9 : -606.83981657134 Max Timesteps: 500
Episode 10 : -586.7658105736955 Max Timesteps: 500
Episode 11 : -659.5257259310982 Max Timesteps: 500
Episode 12 : -889.0981180265932 Max Timesteps: 500
Episode 13 : -512.8073120745248 Max Timesteps: 500
Episode 14 : -431.32995848081094 Max Timesteps: 500
Episode 15 : -691.3832039625628 Max Timesteps: 500
Episode 16 : -768.3152976774085 Max Timesteps: 500
Episode 17 : -546.1541440759983 Max Timesteps: 500
Episode 18 : -711.1744269121098 Max Timesteps: 500
Episode 19 : -900.0848859489973 Max Timesteps: 500
Episode 20 	 avg length: 93 	 reward: -



Episode 54 : -408.01511052598283 Max Timesteps: 500
Episode 55 : -717.8379452860144 Max Timesteps: 500
Episode 56 : -559.2661916987549 Max Timesteps: 500
Episode 57 : -420.3418071078043 Max Timesteps: 500
Episode 58 : -488.8791455307749 Max Timesteps: 500
Episode 59 : -770.1183852285304 Max Timesteps: 500
Episode 60 	 avg length: 91 	 reward: -634
Episode 61 : -526.0533218441535 Max Timesteps: 500
Episode 62 : -462.8975047575366 Max Timesteps: 500
Episode 63 : -542.7625709029704 Max Timesteps: 500
Episode 64 : -451.719024304149 Max Timesteps: 500
Episode 65 : -752.2801567577683 Max Timesteps: 500
Episode 66 : -894.6559895934162 Max Timesteps: 500
Episode 67 : -364.9801553039592 Max Timesteps: 500
Episode 68 : -512.8153864356648 Max Timesteps: 500
Episode 69 : -625.4134214486949 Max Timesteps: 500
Episode 70 : -466.8146741734364 Max Timesteps: 500
Episode 71 : -592.90790337431 Max Timesteps: 500
Episode 72 : -831.477909613782 Max Timesteps: 500
Episode 73 : -544.9015905638721 Max Timest

KeyboardInterrupt: ignored

In [None]:
import gym

def main():
    """Train a PPO agent on LunarLander-v2, then score it over 50 episodes.

    All hyperparameters are set in the block at the top of the function.
    Per-episode scores and interval averages are printed during training;
    the evaluation scores and their mean are printed at the end.
    """
    ############## Hyperparameters ##############
    env_name = "LunarLander-v2"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    render = False
    solved_reward = 199       # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 5000        # max training episodes
    update_timestep = 5000      # update policy every n timesteps
    lr = 0.002
    fc1_dims = 128
    fc2_dims = 128
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 5                # update policy for K epochs
    eps_clip = 0.1              # clip parameter for PPO
    random_seed = None
    # Use the GPU when one is present instead of failing on CPU-only hosts.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #############################################

    # ``is not None`` so that a seed of 0 is honoured as well.
    if random_seed is not None:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    ppo = PPO(input_dims=state_dim, n_actions = action_dim,fc1_dims=fc1_dims,fc2_dims=fc2_dims,alpha=lr,betas=betas,gamma=gamma,k_epochs=K_epochs,eps_clip=eps_clip,device=device)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        score = 0
        max_timesteps = max(500,int(500*(i_episode/1500)))

        for t in range(max_timesteps):
            timestep += 1
            # Sample from the current policy; act() records the step.
            action = ppo.act(state)
            state, reward, done, _ = env.step(action)
            score += reward
            # An episode cut off by the step cap is also a boundary: flag it
            # so discounted returns do not leak in from the next episode.
            ppo.store_transitions(reward, done or t == max_timesteps - 1)

            # update if its time (update() clears the rollout memory itself)
            if timestep % update_timestep == 0:
                ppo.update(i_episode)
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                break

        # t is a zero-based index, so the episode lasted t + 1 steps.
        avg_length += t + 1

        # Evaluate the stop condition before the logging branch resets the
        # accumulator; afterwards the interval total would be gone.
        solved = running_reward > (log_interval*solved_reward)

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = int((running_reward/log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))

            running_reward = 0
            avg_length = 0
        else:
            print("Episode",i_episode,":",score,"Max Timesteps:",max_timesteps)

        # stop training if avg_reward > solved_reward
        if solved:
            print("########## Solved! ##########")
            #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            break

    # Evaluation: 50 episodes with the trained agent.
    avg_score = 0
    for i in range(50):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = ppo.act(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        # act() records every step; nothing is trained on here, so discard
        # the records instead of letting them pile up across episodes.
        ppo.clear_memory()
        print(score)
        avg_score += score

    print()
    print(avg_score / 50)
    env.close()

if __name__ == '__main__':
    main()



Episode 1 : -144.06366156707085 Max Timesteps: 500
Episode 2 : -140.6890345331618 Max Timesteps: 500
Episode 3 : -188.0549166598245 Max Timesteps: 500
Episode 4 : -230.459023816539 Max Timesteps: 500
Episode 5 : -130.06683976259785 Max Timesteps: 500
Episode 6 : -99.29099221381064 Max Timesteps: 500
Episode 7 : -115.25543568391001 Max Timesteps: 500
Episode 8 : -183.70532942750413 Max Timesteps: 500
Episode 9 : -11.233787112460305 Max Timesteps: 500
Episode 10 : -278.44550331479445 Max Timesteps: 500
Episode 11 : -81.04830687751954 Max Timesteps: 500
Episode 12 : -401.98810919178464 Max Timesteps: 500
Episode 13 : -313.0626648842938 Max Timesteps: 500
Episode 14 : -130.14463834676766 Max Timesteps: 500
Episode 15 : -176.95127690568313 Max Timesteps: 500
Episode 16 : -285.8623088774971 Max Timesteps: 500
Episode 17 : -96.99122831127521 Max Timesteps: 500
Episode 18 : -134.50820416753044 Max Timesteps: 500
Episode 19 : -64.80206758082946 Max Timesteps: 500
Episode 20 	 avg length: 86 	 r



Episode 58 : -415.84634403026945 Max Timesteps: 500
Episode 59 : -133.21883061403452 Max Timesteps: 500
Episode 60 	 avg length: 86 	 reward: -154
Episode 61 : -71.68693911462469 Max Timesteps: 500
Episode 62 : -139.15521828198217 Max Timesteps: 500
Episode 63 : -230.9692889703149 Max Timesteps: 500
Episode 64 : -94.77270870451373 Max Timesteps: 500
Episode 65 : -95.35179916419035 Max Timesteps: 500
Episode 66 : 15.127816824016406 Max Timesteps: 500
Episode 67 : -118.44792498033414 Max Timesteps: 500
Episode 68 : -350.2240081652222 Max Timesteps: 500
Episode 69 : -159.72201100744823 Max Timesteps: 500
Episode 70 : -87.35323697921604 Max Timesteps: 500
Episode 71 : -140.15555506053784 Max Timesteps: 500
Episode 72 : -85.15408752912275 Max Timesteps: 500
Episode 73 : -105.79253630731954 Max Timesteps: 500
Episode 74 : -262.6904163305966 Max Timesteps: 500
Episode 75 : -130.6397652793474 Max Timesteps: 500
Episode 76 : -142.40210556305215 Max Timesteps: 500
Episode 77 : -149.1675986500403

In [None]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as Optim
import numpy as np

class Actor(nn.Module):
  """Policy network: a two-hidden-layer MLP producing one raw score per action.

  The output has no activation applied; callers are expected to normalise it
  (e.g. with a softmax) before sampling.

  Args:
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer.
    fc2_dims: Width of the second hidden layer.
    n_actions: Number of output units (one per discrete action).
    device: Anything accepted by ``torch.device`` (e.g. ``'cpu'``, ``'cuda'``).
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    super(Actor, self).__init__()

    self.fc1_action = nn.Linear(input_dims,fc1_dims)
    self.fc2_action = nn.Linear(fc1_dims,fc2_dims)
    self.action_layer = nn.Linear(fc2_dims,n_actions)

    self.device = torch.device(device)
    self.to(self.device)

  def forward(self,observation):
    """Return the raw action scores for ``observation``.

    Args:
      observation: Array-like or tensor whose last dimension is ``input_dims``.

    Returns:
      Tensor of scores with last dimension ``n_actions``.
    """
    # as_tensor handles arrays, sequences and tensors (on any device) alike,
    # so no catch-all ``except`` is needed and genuine input errors surface.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    x = F.relu(self.fc1_action(state))
    x = F.relu(self.fc2_action(x))
    actions = self.action_layer(x)

    return actions

class Critic(nn.Module):
  """Value network: a two-hidden-layer MLP producing one scalar per state.

  Args:
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer.
    fc2_dims: Width of the second hidden layer.
    n_actions: Unused; accepted so the signature matches the actor's.
    device: Anything accepted by ``torch.device`` (e.g. ``'cpu'``, ``'cuda'``).
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    super(Critic, self).__init__()

    self.fc1_value = nn.Linear(input_dims,fc1_dims)
    self.fc2_value = nn.Linear(fc1_dims,fc2_dims)
    self.value_layer = nn.Linear(fc2_dims,1)

    self.device = torch.device(device)
    self.to(self.device)

  def forward(self,observation):
    """Return the value estimate for ``observation``.

    Args:
      observation: Array-like or tensor whose last dimension is ``input_dims``.

    Returns:
      Tensor of value estimates with last dimension 1.
    """
    # as_tensor handles arrays, sequences and tensors (on any device) alike,
    # so no catch-all ``except`` is needed and genuine input errors surface.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    y = F.relu(self.fc1_value(state))
    y = F.relu(self.fc2_value(y))
    state_values = self.value_layer(y)

    return state_values


class PPO:
  """PPO agent with separate actor and critic networks and optimizers.

  The actor is trained on the clipped surrogate objective plus an entropy
  bonus; the critic is trained on the mean-squared error against the
  normalised discounted returns. Each has its own Adam optimizer.

  Args:
    gamma: Discount factor used for the Monte-Carlo returns.
    alpha: Learning rate shared by both optimizers.
    betas: Adam ``betas`` tuple shared by both optimizers.
    k_epochs: Number of optimisation passes per ``update`` call.
    eps_clip: Clipping range of the probability ratio.
    input_dims: Size of the observation vector.
    fc1_dims: Width of the first hidden layer.
    fc2_dims: Width of the second hidden layer.
    n_actions: Number of discrete actions.
    device: Device spec the networks and stored tensors are placed on.
  """

  def __init__(self,gamma,alpha,betas,k_epochs,eps_clip,
               input_dims,fc1_dims,fc2_dims,n_actions,device):
    self.GAMMA = gamma
    self.ALPHA = alpha
    self.BETAS = betas
    self.K_epochs = k_epochs
    self.EPS_CLIP = eps_clip
    self.device = device

    self.actor = Actor(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,n_actions=n_actions,device=device)
    self.critic = Critic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,n_actions=n_actions,device=device)
    self.critic_loss = nn.MSELoss()
    self.actor_optimizer = Optim.Adam(self.actor.parameters(),lr=self.ALPHA,betas=self.BETAS)
    self.critic_optimizer = Optim.Adam(self.critic.parameters(),lr=self.ALPHA,betas=self.BETAS)

    self.clear_memory()

  def clear_memory(self):
    """Drop every stored transition (states, actions, log-probs, rewards, flags)."""
    self.state_memory = []
    self.action_memory = []
    self.logprob_memory = []
    self.reward_memory = []
    self.terminal_memory = []

  def act(self,observation):
    """Sample an action from the actor and record the step.

    Appends the state, the sampled action and its log-probability to the
    rollout memory.

    Args:
      observation: Array-like observation for a single environment step.

    Returns:
      The sampled action as a Python int.
    """
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    # The stored log-probs are only ever used detached, so sampling needs no
    # autograd graph; skipping it keeps rollout memory small.
    with torch.no_grad():
      logits = self.actor.forward(state)
      action_probs = torch.distributions.Categorical(F.softmax(logits,dim=-1))
      action = action_probs.sample()
      log_prob = action_probs.log_prob(action)

    self.state_memory.append(state)
    self.action_memory.append(action)
    self.logprob_memory.append(log_prob)

    return action.item()

  def store_transitions(self,reward,done):
    """Record the reward and terminal flag of the step last passed to ``act``."""
    self.reward_memory.append(reward)
    self.terminal_memory.append(done)

  def evaluate(self,actions,states):
    """Score stored actions with the actor and value the states with the critic.

    Args:
      actions: Tensor of action indices, one per state.
      states: Tensor of states, batch dimension first.

    Returns:
      Tuple ``(action_logprobs, state_values, dist_entropy)``.
    """
    logits = self.actor.forward(states)
    state_values = self.critic.forward(states)
    action_probs = torch.distributions.Categorical(F.softmax(logits,dim=-1))

    # squeeze(-1) removes only the trailing value dimension, so a batch of
    # one keeps its batch axis and still lines up with the returns.
    return action_probs.log_prob(actions),state_values.squeeze(-1),action_probs.entropy()

  def _discounted_returns(self):
    """Return the discounted reward-to-go of every stored step as a float tensor."""
    returns = []
    running = 0.0
    for reward,terminal in zip(reversed(self.reward_memory),reversed(self.terminal_memory)):
      if terminal:
        # Episode boundary: nothing after this step belongs to it.
        running = 0.0
      running = reward + (self.GAMMA * running)
      returns.append(running)
    # Built back-to-front with O(1) appends, then flipped once.
    returns.reverse()
    return torch.tensor(returns,dtype=torch.float32,device=self.device)

  def update(self,i_episode):
    """Train critic and actor on the stored rollout, then clear the memory.

    Does nothing when no rewards have been stored.

    Args:
      i_episode: Current episode number; the entropy bonus is scaled by
        ``0.05 / i_episode`` (values below 1 are treated as 1).
    """
    if not self.reward_memory:
      return

    rewards = self._discounted_returns()
    # std() of a single sample is NaN, so normalise only when it is defined.
    if rewards.numel() > 1:
      rewards = (rewards - rewards.mean())/(rewards.std() + 1e-5)

    old_actions = torch.stack(self.action_memory).to(self.device).detach()
    old_states = torch.stack(self.state_memory).to(self.device).detach()
    old_logprobs = torch.stack(self.logprob_memory).to(self.device).detach()

    entropy_coef = 0.05/max(i_episode,1)

    for _ in range(self.K_epochs):

      new_logprobs,state_values,dist_entropy = self.evaluate(old_actions,old_states)
      ratios = torch.exp(new_logprobs - old_logprobs)
      # Detached so the actor loss sends no gradient into the critic.
      advantages = rewards - state_values.detach()

      surr1 = ratios*advantages
      surr2 = torch.clamp(ratios,1-self.EPS_CLIP,1+self.EPS_CLIP)*advantages

      critic_loss = 0.5*self.critic_loss(state_values,rewards)
      self.critic_optimizer.zero_grad()
      critic_loss.mean().backward()
      self.critic_optimizer.step()

      actor_loss = -torch.min(surr1,surr2) - entropy_coef*dist_entropy
      self.actor_optimizer.zero_grad()
      actor_loss.mean().backward()
      self.actor_optimizer.step()

    self.clear_memory()

In [None]:
import gym

def main():
    """Train a PPO agent on LunarLander-v2, then evaluate it.

    Training runs for at most ``max_episodes`` episodes, calling
    ``ppo.update`` every ``update_timestep`` environment steps. Every
    ``log_interval`` episodes the average length and reward are printed and
    training stops early once that average reward exceeds ``solved_reward``.
    Afterwards the agent plays ``n_eval_episodes`` episodes and their scores
    and mean are printed.
    """
    ############## Hyperparameters ##############
    env_name = "LunarLander-v2"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    render = False
    solved_reward = 199         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 5000         # max training episodes
    update_timestep = 5000      # update policy every n timesteps
    lr = 0.002
    fc1_dims = 256
    fc2_dims = 128
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 5                # update policy for K epochs
    eps_clip = 0.1              # clip parameter for PPO
    random_seed = None
    n_eval_episodes = 50        # episodes played after training
    # Fall back to the CPU when no CUDA device is present instead of failing
    # while the networks are being created.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #############################################

    # ``is not None`` so that 0 is accepted as a seed.
    if random_seed is not None:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    ppo = PPO(input_dims=state_dim, n_actions = action_dim,fc1_dims=fc1_dims,fc2_dims=fc2_dims,alpha=lr,betas=betas,gamma=gamma,k_epochs=K_epochs,eps_clip=eps_clip,device=device)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        score = 0
        max_timesteps = max(500,int(500*(i_episode/1500)))

        for t in range(max_timesteps):
            timestep += 1
            action = ppo.act(state)
            state, reward, done, _ = env.step(action)
            score += reward

            # Hitting the step cap ends the trajectory as well. Flag it so
            # the discounted returns computed in update() do not run on into
            # the next episode's rewards.
            episode_over = bool(done) or t == max_timesteps - 1
            ppo.reward_memory.append(reward)
            ppo.terminal_memory.append(episode_over)

            # update if its time
            if timestep % update_timestep == 0:
                ppo.update(i_episode)
                ppo.clear_memory()
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                break

        # t is the zero-based index of the last step taken.
        avg_length += t + 1

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            avg_reward = running_reward/log_interval

            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, int(avg_reward)))

            # stop training if avg_reward > solved_reward; this has to be
            # tested before the accumulators are reset.
            if avg_reward > solved_reward:
                print("########## Solved! ##########")
                #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
                break

            running_reward = 0
            avg_length = 0
        else:
            print("Episode",i_episode,":",score,"Max Timesteps:",max_timesteps)

    avg_score = 0
    for i in range(n_eval_episodes):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = ppo.act(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        # act() appends to the agent's rollout buffers on every call; nothing
        # is trained on evaluation data, so drop it rather than let it grow.
        ppo.clear_memory()
        print(score)
        avg_score += score

    print()
    print(avg_score / n_eval_episodes)

if __name__ == '__main__':
    main()



Episode 1 : -96.11733205917167 Max Timesteps: 500
Episode 2 : -156.5377932248221 Max Timesteps: 500
Episode 3 : -399.0239743866964 Max Timesteps: 500
Episode 4 : -357.2626337810163 Max Timesteps: 500
Episode 5 : -104.20200997189612 Max Timesteps: 500
Episode 6 : -106.83905099293577 Max Timesteps: 500
Episode 7 : -152.20261489453208 Max Timesteps: 500
Episode 8 : -76.70729247344815 Max Timesteps: 500
Episode 9 : -59.23335861809017 Max Timesteps: 500
Episode 10 : -327.83598417405904 Max Timesteps: 500
Episode 11 : -321.1087350921258 Max Timesteps: 500
Episode 12 : -237.25300321655257 Max Timesteps: 500
Episode 13 : -467.83874646928695 Max Timesteps: 500
Episode 14 : -325.6099786273179 Max Timesteps: 500
Episode 15 : -113.26527440733928 Max Timesteps: 500
Episode 16 : -474.6407986208361 Max Timesteps: 500
Episode 17 : -204.1151439966444 Max Timesteps: 500
Episode 18 : -339.74930742274523 Max Timesteps: 500
Episode 19 : -103.34308413018026 Max Timesteps: 500
Episode 20 	 avg length: 90 	 r



Episode 54 : -202.91234969217965 Max Timesteps: 500
Episode 55 : -179.35716898334903 Max Timesteps: 500
Episode 56 : -308.1034898718925 Max Timesteps: 500
Episode 57 : -148.45192700910565 Max Timesteps: 500
Episode 58 : -195.47921765132003 Max Timesteps: 500
Episode 59 : -87.81763515420171 Max Timesteps: 500
Episode 60 	 avg length: 93 	 reward: -163
Episode 61 : -1.6684053474862992 Max Timesteps: 500
Episode 62 : -239.38960363403686 Max Timesteps: 500
Episode 63 : -83.66888208698111 Max Timesteps: 500
Episode 64 : -170.0343833053883 Max Timesteps: 500
Episode 65 : -128.27836176719813 Max Timesteps: 500
Episode 66 : -394.5236483717201 Max Timesteps: 500
Episode 67 : -280.6549473289266 Max Timesteps: 500
Episode 68 : -72.44341697794668 Max Timesteps: 500
Episode 69 : -420.6346960540973 Max Timesteps: 500
Episode 70 : -24.21992089605874 Max Timesteps: 500
Episode 71 : -418.14466677524103 Max Timesteps: 500
Episode 72 : -130.70687516143815 Max Timesteps: 500
Episode 73 : -351.779898091377

In [None]:
import gym

def main():
    """Train a PPO agent (separate actor/critic learning rates) on LunarLander-v2.

    Training runs for at most ``max_episodes`` episodes, calling
    ``ppo.update`` every ``update_timestep`` environment steps. Every
    ``log_interval`` episodes the average length and reward are printed and
    training stops early once that average reward exceeds ``solved_reward``.
    Afterwards the agent plays ``n_eval_episodes`` episodes and their scores
    and mean are printed.
    """
    ############## Hyperparameters ##############
    env_name = "LunarLander-v2"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    render = False
    solved_reward = 199         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 5000         # max training episodes
    update_timestep = 5000      # update policy every n timesteps
    critic_lr = 0.003
    actor_lr = 0.001
    fc1_dims = 256
    fc2_dims = 512
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 5                # update policy for K epochs
    eps_clip = 0.1              # clip parameter for PPO
    random_seed = None
    n_eval_episodes = 50        # episodes played after training
    device = 'cpu'
    #############################################

    # ``is not None`` so that 0 is accepted as a seed.
    if random_seed is not None:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    ppo = PPO(input_dims=state_dim, n_actions = action_dim,fc1_dims=fc1_dims,fc2_dims=fc2_dims,critic_alpha=critic_lr,actor_alpha=actor_lr,betas=betas,gamma=gamma,k_epochs=K_epochs,eps_clip=eps_clip,device=device)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        score = 0
        max_timesteps = max(500,int(500*(i_episode/1500)))

        for t in range(max_timesteps):
            timestep += 1
            action = ppo.act(state)
            state, reward, done, _ = env.step(action)
            score += reward

            # Hitting the step cap ends the trajectory as well. Flag it so
            # the discounted returns computed in update() do not run on into
            # the next episode's rewards.
            episode_over = bool(done) or t == max_timesteps - 1
            ppo.reward_memory.append(reward)
            ppo.terminal_memory.append(episode_over)

            # update if its time
            if timestep % update_timestep == 0:
                ppo.update(i_episode)
                ppo.clear_memory()
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                break

        # t is the zero-based index of the last step taken.
        avg_length += t + 1

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            avg_reward = running_reward/log_interval

            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, int(avg_reward)))

            # stop training if avg_reward > solved_reward; this has to be
            # tested before the accumulators are reset.
            if avg_reward > solved_reward:
                print("########## Solved! ##########")
                #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
                break

            running_reward = 0
            avg_length = 0
        else:
            print("Episode",i_episode,":",score,"Max Timesteps:",max_timesteps)

    avg_score = 0
    for i in range(n_eval_episodes):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = ppo.act(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        # act() appends to the agent's rollout buffers on every call; nothing
        # is trained on evaluation data, so drop it rather than let it grow.
        ppo.clear_memory()
        print(score)
        avg_score += score

    print()
    print(avg_score / n_eval_episodes)

if __name__ == '__main__':
    main()



Episode 1 : -26.557171517703296 Max Timesteps: 500
Episode 2 : -80.98583140270853 Max Timesteps: 500
Episode 3 : -100.47896859559515 Max Timesteps: 500
Episode 4 : -201.38482492810056 Max Timesteps: 500
Episode 5 : -127.49654381211792 Max Timesteps: 500
Episode 6 : -118.21852557511832 Max Timesteps: 500
Episode 7 : -95.3213926461125 Max Timesteps: 500
Episode 8 : -98.08258586194378 Max Timesteps: 500
Episode 9 : -70.58432449770982 Max Timesteps: 500
Episode 10 : -262.04598256375334 Max Timesteps: 500
Episode 11 : -70.93404758885995 Max Timesteps: 500
Episode 12 : -103.54983514121935 Max Timesteps: 500
Episode 13 : -130.17778425222033 Max Timesteps: 500
Episode 14 : -382.65858437719953 Max Timesteps: 500
Episode 15 : -371.4245300151057 Max Timesteps: 500
Episode 16 : -102.9779233993337 Max Timesteps: 500
Episode 17 : -141.2284510641067 Max Timesteps: 500
Episode 18 : -130.57836728808633 Max Timesteps: 500
Episode 19 : -322.654142964451 Max Timesteps: 500
Episode 20 	 avg length: 90 	 re



Episode 57 : -112.26106007207977 Max Timesteps: 500
Episode 58 : -157.85855097787731 Max Timesteps: 500
Episode 59 : -129.0044114146125 Max Timesteps: 500
Episode 60 	 avg length: 85 	 reward: -168
Episode 61 : -143.0154405962912 Max Timesteps: 500
Episode 62 : -200.5053036513749 Max Timesteps: 500
Episode 63 : -134.67953886958824 Max Timesteps: 500
Episode 64 : -30.636682815334495 Max Timesteps: 500
Episode 65 : -263.90932260031735 Max Timesteps: 500
Episode 66 : -80.68857664661512 Max Timesteps: 500
Episode 67 : -124.8098839252765 Max Timesteps: 500
Episode 68 : -80.78072947531746 Max Timesteps: 500
Episode 69 : -86.18057131453806 Max Timesteps: 500
Episode 70 : 5.021869941040919 Max Timesteps: 500
Episode 71 : -194.2729423287534 Max Timesteps: 500
Episode 72 : -138.23694462896367 Max Timesteps: 500
Episode 73 : -328.05616387902217 Max Timesteps: 500
Episode 74 : -125.33676572610429 Max Timesteps: 500
Episode 75 : -292.06411928762606 Max Timesteps: 500
Episode 76 : -107.7870241070801

In [None]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as Optim
import numpy as np

class Actor(nn.Module):
  """Policy network: two ReLU hidden layers and a linear output layer.

  The output holds one unnormalised score per action; callers apply the
  softmax themselves.
  """

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    """Build the layers and move them to ``device``.

    input_dims: size of one observation vector.
    fc1_dims, fc2_dims: widths of the two hidden layers.
    n_actions: number of discrete actions (size of the output).
    device: anything accepted by ``torch.device``.
    """
    super(Actor, self).__init__()

    self.fc1_action = nn.Linear(input_dims,fc1_dims)
    self.fc2_action = nn.Linear(fc1_dims,fc2_dims)
    self.action_layer = nn.Linear(fc2_dims,n_actions)

    self.device = torch.device(device)
    self.to(self.device)

  def forward(self,observation):
    """Return the action scores for ``observation``.

    observation: array-like or tensor whose last dimension is ``input_dims``.
    """
    # as_tensor accepts arrays, sequences and tensors alike, casting and
    # moving only when needed. It replaces a bare ``except`` that hid every
    # conversion error and passed the raw input on to the layers.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    x = F.relu(self.fc1_action(state))
    x = F.relu(self.fc2_action(x))
    return self.action_layer(x)

class Critic(nn.Module):
  """State-value network: two ReLU hidden layers and a single linear output."""

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    """Build the layers and move them to ``device``.

    input_dims: size of one observation vector.
    fc1_dims, fc2_dims: widths of the two hidden layers.
    n_actions: not used by this network; the parameter is kept so the
        constructor can be called with the same arguments as ``Actor``.
    device: anything accepted by ``torch.device``.
    """
    super(Critic, self).__init__()

    self.fc1_value = nn.Linear(input_dims,fc1_dims)
    self.fc2_value = nn.Linear(fc1_dims,fc2_dims)
    self.value_layer = nn.Linear(fc2_dims,1)

    self.device = torch.device(device)
    self.to(self.device)

  def forward(self,observation):
    """Return the value estimate for ``observation`` (trailing dimension of size 1).

    observation: array-like or tensor whose last dimension is ``input_dims``.
    """
    # as_tensor accepts arrays, sequences and tensors alike, casting and
    # moving only when needed. It replaces a bare ``except`` that hid every
    # conversion error and passed the raw input on to the layers.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    y = F.relu(self.fc1_value(state))
    y = F.relu(self.fc2_value(y))
    return self.value_layer(y)

class ActorCritic(nn.Module):
  """Bundle of an ``Actor`` and a ``Critic`` that share one input."""

  def __init__(self,input_dims,fc1_dims,fc2_dims,n_actions,device):
    """Create both sub-networks with identical layer sizes on ``device``.

    input_dims: size of one observation vector.
    fc1_dims, fc2_dims: hidden layer widths passed to both networks.
    n_actions: number of discrete actions.
    device: device specifier handed to both networks and to ``self.to``.
    """
    super(ActorCritic,self).__init__()

    self.actor = Actor(input_dims,fc1_dims,fc2_dims,n_actions,device)
    self.critic = Critic(input_dims,fc1_dims,fc2_dims,n_actions,device)

    self.device = device
    self.to(self.device)

  def forward(self,observation):
    """Return ``(actions, state_values)`` for ``observation``.

    observation: array-like or tensor accepted by both sub-networks.
    """
    # Convert once here instead of guessing with a bare ``except``, which
    # swallowed every conversion error and forwarded the raw input.
    state = torch.as_tensor(observation,dtype=torch.float32,device=self.device)

    actions = self.actor(state)
    state_values = self.critic(state)

    return actions,state_values

class PPO_changed:
  """PPO agent with a clipped surrogate objective and one-step TD value targets.

  ``policy_old`` chooses actions while data is collected; ``policy`` is the
  network being optimised and is copied into ``policy_old`` at the end of
  every ``update``.
  """

  def __init__(self,gamma,critic_alpha,actor_alpha,betas,k_epochs,eps_clip,
               input_dims,fc1_dims,fc2_dims,n_actions,device):
    """Create the two policies, the optimiser and empty rollout buffers.

    gamma: discount factor.
    critic_alpha, actor_alpha: learning rates of the critic and the actor.
    betas: Adam ``betas`` pair.
    k_epochs: optimisation passes per ``update`` call.
    eps_clip: clipping range of the probability ratio.
    input_dims, fc1_dims, fc2_dims, n_actions: network sizes.
    device: device the networks and the stored tensors live on.
    """
    self.GAMMA = gamma
    self.ACTOR_ALPHA = actor_alpha
    self.CRITIC_ALPHA = critic_alpha
    self.BETAS = betas
    self.K_epochs = k_epochs
    self.EPS_CLIP = eps_clip
    self.device = device

    self.policy = ActorCritic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,n_actions=n_actions,device=device)
    self.policy_old = ActorCritic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,n_actions=n_actions,device=device)
    self.policy_old.load_state_dict(self.policy.state_dict())

    self.critic_loss = nn.MSELoss()
    # ``betas`` was accepted and stored but never reached the optimiser.
    self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': self.ACTOR_ALPHA},
                        {'params': self.policy.critic.parameters(), 'lr': self.CRITIC_ALPHA}
                    ],betas=self.BETAS)

    self.clear_memory()

  def clear_memory(self):
    """Reset every rollout buffer to an empty list."""
    self.state_memory = []
    self.action_memory = []
    self.logprob_memory = []
    self.reward_memory = []
    self.terminal_memory = []
    self.next_state_memory = []

  def act(self,observation):
    """Sample an action from ``policy_old`` and record the step.

    observation: one observation, as accepted by ``torch.Tensor``.

    Appends the state, the sampled action and its log-probability to the
    rollout buffers and returns the action as a Python number.
    """
    state = torch.Tensor(observation).float().to(self.device)

    # Data collection needs no autograd graph: update() treats the stored
    # log-probabilities as constants.
    with torch.no_grad():
      logits,_ = self.policy_old(state)
      action_probs = torch.distributions.Categorical(F.softmax(logits,dim=-1))
      action = action_probs.sample()
      log_prob = action_probs.log_prob(action)

    self.state_memory.append(state)
    self.action_memory.append(action)
    self.logprob_memory.append(log_prob)

    return action.item()

  def store_transitions(self,next_state,reward,done):
    """Record the result of the step taken by the last ``act`` call.

    next_state: observation that followed the action.
    reward: scalar reward.
    done: whether the episode ended on this step.

    Everything is stored as a float32 tensor because ``update`` stacks
    these buffers with ``torch.stack``.
    """
    self.next_state_memory.append(torch.Tensor(next_state).float().to(self.device))
    self.reward_memory.append(torch.tensor([reward],dtype=torch.float32))
    self.terminal_memory.append(torch.tensor([float(done)],dtype=torch.float32))

  def evaluate(self,actions,states):
    """Score ``actions`` taken in ``states`` with the current ``policy``.

    Returns ``(action_logprobs, state_values, dist_entropy)``; the value
    tensor has its trailing size-1 dimension removed.
    """
    logits,state_values = self.policy(states)
    action_probs = torch.distributions.Categorical(F.softmax(logits,dim=-1))

    action_logprobs = action_probs.log_prob(actions)
    dist_entropy = action_probs.entropy()

    # squeeze(-1), not squeeze(): a batch of one must stay one-dimensional.
    return action_logprobs,state_values.squeeze(-1),dist_entropy

  def update(self,i_episode):
    """Optimise ``policy`` on the stored rollout, then sync ``policy_old``.

    i_episode: current episode number; the entropy bonus weight is
        ``0.05 / i_episode``.

    Runs ``self.K_epochs`` passes of the clipped PPO loss plus a value loss,
    copies the result into ``policy_old`` and clears the buffers.
    """
    # Rewards and terminal flags are stored as length-1 tensors, so stacking
    # yields (N, 1). Flatten them (and the critic output below) to (N,).
    # Against the (N,) value estimates, a leftover trailing axis would
    # broadcast the advantage to (N, N) and hand mismatched shapes to the
    # MSE loss.
    reward_batch = torch.stack(self.reward_memory).to(self.device).float().reshape(-1)
    old_actions = torch.stack(self.action_memory).to(self.device)
    old_states = torch.stack(self.state_memory).to(self.device)
    old_logprobs = torch.stack(self.logprob_memory).to(self.device).detach()
    next_states = torch.stack(self.next_state_memory).to(self.device)
    terminals = torch.stack(self.terminal_memory).to(self.device).float().reshape(-1)

    # One-step TD target; the bootstrap term is dropped on terminal steps.
    with torch.no_grad():
      next_values = self.policy_old.critic(next_states).reshape(-1)
    est_reward = reward_batch + self.GAMMA * (1 - terminals) * next_values

    # Guard the decay against an episode index of 0.
    entropy_coef = 0.05/max(i_episode,1)

    for k in range(self.K_epochs):

      new_logprobs,state_values,dist_entropy = self.evaluate(old_actions,old_states)
      ratios = torch.exp(new_logprobs - old_logprobs)
      advantages = est_reward - state_values.detach()
      # The standard deviation of a single sample is NaN; skip normalisation then.
      if advantages.numel() > 1:
        advantages = (advantages - advantages.mean())/(advantages.std() + 1e-5)

      # Clipped surrogate objective: unclipped and clipped candidates.
      surr1 = ratios*advantages
      surr2 = torch.clamp(ratios,1-self.EPS_CLIP,1+self.EPS_CLIP)*advantages

      loss = -torch.min(surr1,surr2) - entropy_coef*dist_entropy + 0.5*self.critic_loss(state_values,est_reward)
      self.optimizer.zero_grad()
      loss.mean().backward()
      self.optimizer.step()

    self.policy_old.load_state_dict(self.policy.state_dict())
    self.clear_memory()

In [None]:
import gym

def main():
    """Train a ``PPO_changed`` agent on LunarLander-v2, then evaluate it.

    Training runs for at most ``max_episodes`` episodes, calling
    ``ppo.update`` every ``update_timestep`` environment steps. Every
    ``log_interval`` episodes the average length and reward are printed and
    training stops early once that average reward exceeds ``solved_reward``.
    Afterwards the agent plays ``n_eval_episodes`` episodes and their scores
    and mean are printed.
    """
    ############## Hyperparameters ##############
    env_name = "LunarLander-v2"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    render = False
    solved_reward = 199         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 5000         # max training episodes
    update_timestep = 5000      # update policy every n timesteps
    critic_lr = 0.0003
    actor_lr = 0.0003
    fc1_dims = 256
    fc2_dims = 512
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 5                # update policy for K epochs
    eps_clip = 0.1              # clip parameter for PPO
    random_seed = None
    n_eval_episodes = 50        # episodes played after training
    device = 'cpu'
    #############################################

    # ``is not None`` so that 0 is accepted as a seed.
    if random_seed is not None:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    ppo = PPO_changed(input_dims=state_dim, n_actions = action_dim,fc1_dims=fc1_dims,fc2_dims=fc2_dims,critic_alpha=critic_lr,actor_alpha=actor_lr,betas=betas,gamma=gamma,k_epochs=K_epochs,eps_clip=eps_clip,device=device)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        score = 0
        max_timesteps = max(500,int(500*(i_episode/1750)))

        for t in range(max_timesteps):
            timestep += 1
            # Running policy_old:
            action = ppo.act(state)
            state, reward, done, _ = env.step(action)
            score += reward

            # Stored as float32 tensors because update() stacks the buffers.
            # A step that merely hits the step cap keeps done == False, so
            # its target still bootstraps from the stored next state.
            ppo.reward_memory.append(torch.tensor([reward],dtype=torch.float32))
            ppo.terminal_memory.append(torch.tensor([int(done)],dtype=torch.float32))
            next_state = torch.Tensor(state).float().to(device)
            ppo.next_state_memory.append(next_state)

            # update if its time
            if timestep % update_timestep == 0:
                ppo.update(i_episode)
                ppo.clear_memory()
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                break

        # t is the zero-based index of the last step taken.
        avg_length += t + 1

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            avg_reward = running_reward/log_interval

            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, int(avg_reward)))

            # stop training if avg_reward > solved_reward; this has to be
            # tested before the accumulators are reset.
            if avg_reward > solved_reward:
                print("########## Solved! ##########")
                #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
                break

            running_reward = 0
            avg_length = 0
        else:
            print("Episode",i_episode,":",score,"Max Timesteps:",max_timesteps)

    avg_score = 0
    for i in range(n_eval_episodes):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = ppo.act(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        # act() appends to the agent's rollout buffers on every call; nothing
        # is trained on evaluation data, so drop it rather than let it grow.
        ppo.clear_memory()
        print(score)
        avg_score += score

    print()
    print(avg_score / n_eval_episodes)

if __name__ == '__main__':
    main()



Episode 1 : -188.35393279732614 Max Timesteps: 500
Episode 2 : -189.22530746781104 Max Timesteps: 500
Episode 3 : -399.0323872080756 Max Timesteps: 500
Episode 4 : -203.3144598302797 Max Timesteps: 500
Episode 5 : -89.23776642365804 Max Timesteps: 500
Episode 6 : -196.4963069112091 Max Timesteps: 500
Episode 7 : -59.01040848339267 Max Timesteps: 500
Episode 8 : -53.116409228378835 Max Timesteps: 500
Episode 9 : -115.93877747113174 Max Timesteps: 500
Episode 10 : -156.45098577141783 Max Timesteps: 500
Episode 11 : -26.896213374262146 Max Timesteps: 500
Episode 12 : -393.80125284124824 Max Timesteps: 500
Episode 13 : -162.4979843861023 Max Timesteps: 500
Episode 14 : -111.5321102103802 Max Timesteps: 500
Episode 15 : -196.53332941683382 Max Timesteps: 500
Episode 16 : -88.4823178135164 Max Timesteps: 500
Episode 17 : -131.96086225773703 Max Timesteps: 500
Episode 18 : -323.9571839601474 Max Timesteps: 500
Episode 19 : -368.32248454247184 Max Timesteps: 500
Episode 20 	 avg length: 88 	 r

  return F.mse_loss(input, target, reduction=self.reduction)


Episode 56 : -286.6414656519228 Max Timesteps: 500
Episode 57 : -171.44870686299714 Max Timesteps: 500
Episode 58 : -66.78786416152073 Max Timesteps: 500
Episode 59 : -101.56737906714982 Max Timesteps: 500
Episode 60 	 avg length: 91 	 reward: -168
Episode 61 : -98.66263964534824 Max Timesteps: 500
Episode 62 : -152.21366183303442 Max Timesteps: 500
Episode 63 : -228.87731018162162 Max Timesteps: 500
Episode 64 : -77.03630616246777 Max Timesteps: 500
Episode 65 : -268.1417041001147 Max Timesteps: 500
Episode 66 : -393.77156679019936 Max Timesteps: 500
Episode 67 : -72.15953753984905 Max Timesteps: 500
Episode 68 : -148.44562411462888 Max Timesteps: 500
Episode 69 : -108.76981076966632 Max Timesteps: 500
Episode 70 : -87.8712197594692 Max Timesteps: 500
Episode 71 : -233.49602511678884 Max Timesteps: 500
Episode 72 : -130.76616999172603 Max Timesteps: 500
Episode 73 : -480.61968660740445 Max Timesteps: 500
Episode 74 : -124.85353867697555 Max Timesteps: 500
Episode 75 : -64.550407682428

KeyboardInterrupt: ignored