In [None]:
!pip install gym[atari]
!pip install autorom[accept-rom-license]
!pip install highway-env

In [1]:
import highway_env
import gymnasium as gym
import sys
import pickle
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import namedtuple
import numpy as np
from collections import deque
from torch.autograd import Variable
import torch.nn.functional as F
import tqdm
import os
import cv2
import torch.distributions as distributions
from torch.distributions import Normal
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical
import pickle

In [2]:
class RolloutBuffer:
    """Plain container for the transitions gathered between two policy updates.

    Every attribute is a Python list; entries at the same index belong to the
    same environment step.
    """

    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.state_values = []
        self.is_terminals = []

    def clear(self):
        """Empty every list in place, so existing references to the lists stay valid."""
        for storage in (
            self.actions,
            self.states,
            self.logprobs,
            self.rewards,
            self.state_values,
            self.is_terminals,
        ):
            storage.clear()


In [3]:
class ActorCritic(nn.Module):
    """Separate actor and critic MLPs for a continuous-action Gaussian policy.

    The actor maps a state to the mean of a multivariate normal whose diagonal
    covariance is the fixed (non-learned) ``action_var``; the critic maps a
    state to a single value estimate.

    Args:
        state_dim: Number of input features of both networks.
        action_dim: Number of action components (size of the actor output).
        action_std_init: Initial standard deviation shared by all action components.
    """

    def __init__(self, state_dim, action_dim, action_std_init=0.99):
        super().__init__()

        self.action_dim = action_dim
        self.set_action_std(action_std_init)

        hidden_sizes = (512, 256)
        # The actor is built before the critic so parameter initialisation
        # consumes the RNG in a fixed order.
        self.actor = self._build_mlp(state_dim, hidden_sizes, action_dim, squash_output=True)
        self.critic = self._build_mlp(state_dim, hidden_sizes, 1, squash_output=False)

    @staticmethod
    def _build_mlp(in_features, hidden_sizes, out_features, squash_output):
        """Stack Linear+Tanh layers; optionally finish with a Tanh on the output."""
        layers = []
        width = in_features
        for hidden in hidden_sizes:
            layers.append(nn.Linear(width, hidden))
            layers.append(nn.Tanh())
            width = hidden
        layers.append(nn.Linear(width, out_features))
        if squash_output:
            layers.append(nn.Tanh())
        return nn.Sequential(*layers)

    def set_action_std(self, new_action_std):
        """Replace the per-component action variance with ``new_action_std`` squared."""
        variance = new_action_std * new_action_std
        self.action_var = torch.full((self.action_dim,), variance)

    def forward(self):
        """Not supported; use ``act`` or ``evaluate`` instead."""
        raise NotImplementedError

    def act(self, state):
        """Sample an action for ``state``.

        Returns:
            Tuple ``(action, log_prob, state_value)``, all detached from autograd.
        """
        mean = self.actor(state)
        # Leading axis of size one, equivalent to unsqueeze(dim=0).
        covariance = torch.diag(self.action_var)[None]
        policy_dist = MultivariateNormal(mean, covariance)

        sampled = policy_dist.sample()
        sampled_logprob = policy_dist.log_prob(sampled)
        value = self.critic(state)

        return sampled.detach(), sampled_logprob.detach(), value.detach()

    def evaluate(self, state, action):
        """Score given actions under the current policy.

        Returns:
            Tuple ``(log_probs, state_values, entropy)`` that stays attached to
            autograd so it can be used in a loss.
        """
        mean = self.actor(state)
        covariance = torch.diag_embed(self.action_var.expand_as(mean))
        policy_dist = MultivariateNormal(mean, covariance)

        return policy_dist.log_prob(action), self.critic(state), policy_dist.entropy()


In [4]:

class PPO:
    """Clipped-surrogate PPO agent for continuous action spaces.

    Holds a trainable ``policy``, a copy ``policy_old`` that is used to collect
    rollouts, one Adam optimizer with separate actor/critic learning rates, and
    a ``RolloutBuffer`` that ``select_action`` fills.

    Args:
        state_dim: Number of state features fed to the actor and the critic.
        action_dim: Number of action components.
        lr_actor: Learning rate of the actor parameters.
        lr_critic: Learning rate of the critic parameters.
        gamma: Discount factor of the Monte Carlo returns.
        K_epochs: Number of optimisation passes over the buffer per ``update``.
        eps_clip: Half-width of the clipping interval around a ratio of 1.
        action_std_init: Initial standard deviation of the action distribution.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std_init=0.99):
        self.action_std = action_std_init

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, action_std_init)
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        # Rollouts are collected with a copy of the policy that is only
        # synchronised at the end of ``update``.
        self.policy_old = ActorCritic(state_dim, action_dim, action_std_init)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def set_action_std(self, new_action_std):
        """Set the action standard deviation on the agent and on both policies."""
        self.action_std = new_action_std
        self.policy.set_action_std(new_action_std)
        self.policy_old.set_action_std(new_action_std)

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        """Subtract ``action_std_decay_rate`` from the action std, floored at ``min_action_std``.

        The decay is linear (a subtraction), not multiplicative. The new value
        is rounded to four decimals before the floor is applied.
        """
        self.action_std = self.action_std - action_std_decay_rate
        self.action_std = round(self.action_std, 4)
        if (self.action_std <= min_action_std):
            self.action_std = min_action_std

        self.set_action_std(self.action_std)

    def select_action(self, state):
        """Sample an action from ``policy_old`` and record the step in the buffer.

        Stores the state, action, log-probability and value estimate. The
        caller must append the matching reward and terminal flag to
        ``self.buffer.rewards`` / ``self.buffer.is_terminals`` itself.

        Args:
            state: Data accepted by ``torch.FloatTensor``.

        Returns:
            The sampled action as a flat ``numpy`` array.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state)
            action, action_logprob, state_val = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)
        self.buffer.state_values.append(state_val)

        return action.detach().cpu().numpy().flatten()

    def update(self):
        """Run ``K_epochs`` PPO optimisation passes over the buffer, then clear it.

        Returns:
            A detached 1-D tensor with one entry per buffered step, holding the
            per-step loss summed over all epochs.
        """
        n_steps = len(self.buffer.states)

        # Monte Carlo estimate of returns, accumulated backwards in time and
        # reset at every terminal flag. Built with append + reverse instead of
        # insert(0, ...), which is quadratic in the buffer length.
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalizing the returns. The sample std of a single element is NaN,
        # so a one-step buffer is only centred.
        rewards = torch.tensor(returns, dtype=torch.float32)
        spread = rewards.std() if rewards.numel() > 1 else torch.zeros(())
        rewards = (rewards - rewards.mean()) / (spread + 1e-7)

        # Convert lists to tensors with an explicit batch axis. A dimension-less
        # squeeze would also drop the batch axis when the buffer holds a single
        # step, or the action axis when action_dim == 1.
        old_states = torch.stack(self.buffer.states, dim=0).detach().reshape(n_steps, -1)
        old_actions = torch.stack(self.buffer.actions, dim=0).detach().reshape(n_steps, -1)
        old_logprobs = torch.stack(self.buffer.logprobs, dim=0).detach().reshape(n_steps)
        old_state_values = torch.stack(self.buffer.state_values, dim=0).detach().reshape(n_steps)

        # calculate advantages
        advantages = rewards - old_state_values

        # Accumulator for reporting only; kept outside the autograd graph.
        total_loss = torch.zeros(n_steps)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = state_values.reshape(n_steps)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

            # Detach before accumulating so the graph of every epoch can be
            # freed and the returned tensor does not require grad.
            total_loss += loss.detach()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()
        return total_loss

In [None]:

# ---------------------------------------------------------------------------
# Train the PPO agent on highway-env's "parking-v0".
# ---------------------------------------------------------------------------

# --- PPO hyperparameters ---
K_epochs = 40               # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
gamma = 0.99                # discount factor

lr_actor = 0.0003       # learning rate for actor network
lr_critic = 0.001       # learning rate for critic network

# --- schedule ---
max_episodes = 5000
MAX_EPISODE_STEPS = 10000        # hard cap on environment steps per episode
UPDATE_EVERY_STEPS = 1000        # environment steps between two PPO updates
STD_DECAY_EVERY_STEPS = 1000     # environment steps between two action-std decays
RENDER_EVERY_EPISODES = 499      # episodes whose frames are recorded

# NOTE(review): PPO.decay_action_std *subtracts* the rate and then floors the
# result, so with the initial std of 0.99 a rate of 0.98 drops the std straight
# to MIN_ACTION_STD on the first decay. Confirm this is intended (a much
# smaller rate would give a gradual schedule).
ACTION_STD_DECAY_RATE = 0.98
MIN_ACTION_STD = 0.1

env = gym.make("parking-v0", render_mode="rgb_array")
env.unwrapped.config['add_walls']=False
env.unwrapped.config['duration']=40

# state space dimension
# presumably the length of the 'observation' vector of parking-v0 — TODO derive
# it from env.observation_space instead of hard-coding it
state_dim = 6
action_dim = env.action_space.shape[0]

# initialize a PPO agent
ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip)

# per-episode statistics
lengths_list = []
rewards_list = []
losses_list = []
rendered_frames = []

time_step = 0       # environment steps taken across all episodes
loss = 0.0          # summed loss of the most recent PPO update (0.0 before the first one)

pbar = tqdm.trange(max_episodes)

for episode in pbar:
    # NOTE(review): passing the same seed on every reset presumably makes every
    # episode start from the same configuration — confirm this is intended.
    state, _ = env.reset(seed=0)
    state = state['observation']
    current_ep_reward = 0
    step = 0

    for _ in range(MAX_EPISODE_STEPS):
        # select action with policy
        action = ppo_agent.select_action(state)

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        state, reward, terminated, truncated, _ = env.step(action)
        state = state['observation']
        episode_over = terminated or truncated

        # Saving reward and is_terminals. A truncated episode must also be
        # flagged, otherwise PPO.update() lets the discounted return of the
        # next episode leak into this one.
        ppo_agent.buffer.rewards.append(reward)
        ppo_agent.buffer.is_terminals.append(episode_over)

        # Count each environment step exactly once so the schedules below fire
        # at the documented intervals.
        time_step += 1
        step += 1
        current_ep_reward += reward

        # update PPO agent
        if time_step % UPDATE_EVERY_STEPS == 0:
            # update() returns a per-step loss tensor; reduce it to one float.
            loss = float(torch.as_tensor(ppo_agent.update()).detach().sum())

        # continuous action space: decay action std of the output action distribution
        if time_step % STD_DECAY_EVERY_STEPS == 0:
            ppo_agent.decay_action_std(ACTION_STD_DECAY_RATE, MIN_ACTION_STD)

        if episode % RENDER_EVERY_EPISODES == 0:
            rendered_frames.append(env.render())

        if episode_over:
            break

    losses_list.append(loss)
    lengths_list.append(step)
    rewards_list.append(current_ep_reward)
    pbar.set_description(
        f'Episode: {episode} | Steps: {step} | Return: {current_ep_reward:5.2f} |Loss: {loss}| STD: { ppo_agent.action_std} '
    )


In [6]:
def moving_average(data, *, window_size = 50):
    """Smooths 1-D data using a trailing moving average.

    Element ``i`` of the result is the mean of the last ``window_size`` values
    up to and including ``data[i]``; the first ``window_size - 1`` elements
    average over the shorter prefix that is available.

    Args:
        data: 1-D sequence of numbers (list or numpy.array).
        window_size: Size of the smoothing window; must be at least 1.

    Returns:
        smooth_data: A 1-D numpy.array with the same length as data.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    values = np.asarray(data)
    if values.size == 0:
        # np.convolve rejects empty input; nothing to smooth.
        return np.zeros(0)

    kernel = np.ones(window_size)
    # Dividing by the convolved ones gives the number of samples that actually
    # fall inside the window at each position (fewer at the start).
    smooth_data = np.convolve(values, kernel) / np.convolve(
        np.ones_like(values), kernel
    )
    # Keep the first len(data) entries. Slicing with ``-window_size + 1`` would
    # be ``[:0]`` (an empty result) for window_size == 1.
    return smooth_data[: len(values)]

In [8]:
# Pickle files (relative to the working directory) that hold the training
# statistics: episode returns, losses, episode lengths and rendered frames.
(
    file_path_PPO_returns,
    file_path_PPO_losses,
    file_path_PPO_lengths,
    file_path_PPO_frames,
) = (
    "PPO_Returns.pkl",
    "PPO_Losses.pkl",
    "PPO_Lengths.pkl",
    "PPO_frames.pkl",
)

In [9]:
# Persist every training statistic to its own pickle file
# (order: returns, losses, frames, lengths).
for _path, _payload in (
    (file_path_PPO_returns, rewards_list),
    (file_path_PPO_losses, losses_list),
    (file_path_PPO_frames, rendered_frames),
    (file_path_PPO_lengths, lengths_list),
):
    with open(_path, 'wb') as f:
        pickle.dump(_payload, f)

In [10]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Only use this on files written by this notebook: unpickling untrusted
    data can execute arbitrary code.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Read the statistics back (order: returns, losses, frames, lengths).
PPO_returns = _load_pickle(file_path_PPO_returns)
PPO_losses = _load_pickle(file_path_PPO_losses)
PPO_frames = _load_pickle(file_path_PPO_frames)
PPO_lengths = _load_pickle(file_path_PPO_lengths)

In [None]:
# YOUR PLOTTING CODE HERE
plt.figure(figsize=(10, 6))
# plt.subplot(3, 1, 1)
plt.plot(PPO_returns, label='Returns (Raw Data)', alpha=0.5)
plt.plot(moving_average(PPO_returns), label='Returns (Moving Average)', color='orange')
plt.title('Returns')
plt.xlabel('Episode')
plt.ylabel('Return')
plt.legend()
plt.show()
plt.close()

plt.figure(figsize=(10, 6))
plt.plot(PPO_lengths, label='Lengths (Raw Data)', alpha=0.5)
plt.plot(moving_average(PPO_lengths), label='Lengths (Moving Average)', color='orange')
plt.title('Lengths')
plt.xlabel('Episode')
plt.ylabel('Length')
plt.legend()
plt.show()
plt.close()

plt.figure(figsize=(10, 6))
plt.plot(PPO_losses, label='Losses (Raw Data)')
plt.plot(moving_average(PPO_losses), label='Losses (Moving Average)', color='orange')
plt.title('Losses')
plt.xlabel('Batch')
plt.ylabel('Loss')
plt.legend()
plt.show()