In [None]:
!pip install gym[atari]
!pip install autorom[accept-rom-license]
!pip install highway-env


In [4]:
import highway_env
import gymnasium as gym
import sys
import pickle
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import namedtuple
import numpy as np
from collections import deque
from torch.autograd import Variable
import torch.nn.functional as F
import tqdm
import os
import cv2


In [6]:
env = gym.make("parking-v0", render_mode="rgb_array")

In [7]:
env.reset(seed=0)
rendered_frames = []

# Perform 100 random steps in the environment and store rendered frames
for _ in range(100):
    action = env.action_space.sample()  # Sample a random action
    obs, reward, done, truncated, info = env.step(action)
    rendered_frames.append(env.render())

    if done or truncated==True:
      break
# Close the environment
env.close()

In [None]:
obs,_=env.reset(seed=0)

obs

In [9]:
# Actor Model
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layer1 = nn.Linear(state_dim, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, action_dim)
        self.max_action = max_action

    def forward(self, state):
        x = F.relu(self.layer1(state))
        x = F.relu(self.layer2(x))
        x = self.max_action * torch.tanh(self.layer3(x))
        return x

# Critic Model
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.layer1 = nn.Linear(state_dim + action_dim, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], 1)
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x



In [10]:
# Replay Buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done)

    def __len__(self):
        return len(self.buffer)

In [11]:
# Ornstein-Uhlenbeck Noise for exploration
class OUNoise:
    def __init__(self, action_dimension, scale=0.1, mu=0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(action_dimension) * mu
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def evolve_state(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state

    def get_action(self, action):
        return self.scale * self.evolve_state() + action

In [12]:
# DDPG Agent
class DDPGAgent:
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action)
        self.actor_target = Actor(state_dim, action_dim, max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters())

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters())

        self.replay_buffer = ReplayBuffer(capacity=100000)
        self.ounoise = OUNoise(action_dim)
        self.max_action = max_action

    def select_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1))
        action = self.actor(state).cpu().data.numpy().flatten()

        return self.ounoise.get_action(action)

    def train(self, batch_size=64):
        critic_loss_val=0
        actor_loss_val=0
        if len(self.replay_buffer) < batch_size:
            return 0,0

        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
        state = torch.FloatTensor(state)
        action = torch.FloatTensor(action)
        reward = torch.FloatTensor(reward).unsqueeze(1)
        next_state = torch.FloatTensor(next_state)
        done = torch.FloatTensor(done).unsqueeze(1)

        # Compute the target Q value
        target_actions = self.actor_target(next_state)
        target_Q = self.critic_target(next_state, target_actions)
        target_Q = reward + (0.99 * target_Q * (1 - done))

        # Get current Q estimate
        current_Q = self.critic(state, action)


        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q.detach())

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Compute actor loss
        actor_loss = -self.critic(state, self.actor(state)).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(param.data * 0.005 + target_param.data * (1 - 0.005))

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(param.data * 0.005 + target_param.data * (1 - 0.005))

        return critic_loss.item(),actor_loss.item()

In [None]:
# Prepare the environment
env=gym.make("parking-v0", render_mode="rgb_array")


env.unwrapped.config['add_walls']=False
env.unwrapped.config['duration']=40
state_dim = 6
action_dim = 2
max_action = float(env.action_space.high[0])
rendered_frames=[]
agent = DDPGAgent(state_dim, action_dim, max_action)
num_steps = 5000
i_episode=0
rewards=[]
lengths=[]
losses=[]
pbar = tqdm.trange(num_steps)
for t_total in pbar:
    state,_ = env.reset(seed=0)
    state=state['observation']
    episode_reward = 0
    done = False
    truncated= False
    step=0
    critic_loss_sum=0
    actor_loss_sum=0
    training_steps=0

    for _ in range(1000):
        step+=1
        action = agent.select_action(np.array(state))
        next_state, reward, done, truncated ,info = env.step(action)
        next_state=next_state['observation']


        agent.replay_buffer.add(state, action, reward, next_state, done)

        critic_loss,actor_loss=agent.train(batch_size=64)
        critic_loss_sum += critic_loss
        actor_loss_sum += actor_loss
        if critic_loss and actor_loss:
            training_steps +=1

        state = next_state
        episode_reward += reward

        if t_total%499 == 0:
          rendered_frames.append(env.render())


        if done or truncated:
          avg_critic_loss =critic_loss_sum/training_steps if training_steps else 0
          avg_actor_loss=actor_loss_sum/training_steps if training_steps else 0
          total_loss=avg_critic_loss+avg_critic_loss
          pbar.set_description(
                      f'Episode: {i_episode} | Steps: {step + 1} | Return: {episode_reward:5.2f} |Loss: {total_loss} '
            )
          lengths.append(step+1)
          losses.append(total_loss)
          rewards.append(episode_reward)
          break
    i_episode+=1
env.close()


Episode: 146 | Steps: 201 | Return: -80.63 |Loss: 0.0817616552952677 :   3%|▎         | 147/5000 [10:04<5:21:48,  3.98s/it]  

In [None]:
def moving_average(data, *, window_size = 50):
    """Smooths 1-D data array using a moving average.

    Args:
        data: 1-D numpy.array
        window_size: Size of the smoothing window

    Returns:
        smooth_data: A 1-d numpy.array with the same size as data
    """
    kernel = np.ones(window_size)
    smooth_data = np.convolve(data, kernel) / np.convolve(
        np.ones_like(data), kernel
    )
    return smooth_data[: -window_size + 1]

In [None]:
file_path_DDPG_returns = "DDPG_Returns.pkl"
file_path_DDPG_losses="DDPG_Losses.pkl"
file_path_DDPG_frames="DDPG_frames.pkl"

In [None]:
with open(file_path_DDPG_returns, 'wb') as f:
    pickle.dump(rewards, f)
with open(file_path_DDPG_losses, 'wb') as f:
    pickle.dump(losses, f)
with open(file_path_DDPG_frames, 'wb') as f:
    pickle.dump(rendered_frames, f)

In [None]:
with open(file_path_DDPG_returns, 'rb') as f :
  DDPG_returns = pickle.load(f)
with open(file_path_DDPG_losses, 'rb') as f :
  DDPG_losses = pickle.load(f)
with open(file_path_DDPG_frames, 'rb') as f :
  DDPG_frames = pickle.load(f)

In [None]:
# YOUR PLOTTING CODE HERE
plt.figure(figsize=(10, 6))
# plt.subplot(3, 1, 1)
plt.plot(DDPG_returns, label='Returns (Raw Data)', alpha=0.5)
plt.plot(moving_average(DDPG_returns), label='Returns (Moving Average)', color='orange')
plt.title('Returns')
plt.xlabel('Episode')
plt.ylabel('Return')
plt.legend()
plt.show()
plt.close()

plt.figure(figsize=(10, 6))
plt.plot(lengths, label='Lengths (Raw Data)', alpha=0.5)
plt.plot(moving_average(lengths), label='Lengths (Moving Average)', color='orange')
plt.title('Lengths')
plt.xlabel('Episode')
plt.ylabel('Length')
plt.legend()
plt.show()
plt.close()

plt.figure(figsize=(10, 6))
plt.plot(DDPG_losses, label='Losses (Raw Data)')
plt.plot(moving_average(DDPG_losses), label='Losses (Moving Average)', color='orange')
plt.title('Losses')
plt.xlabel('Batch')
plt.ylabel('Loss')
plt.legend()
plt.show()