In [17]:
import gymnasium as gym
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

import copy
import keyboard
from time import sleep
from matplotlib import pyplot as plt
from easydict import EasyDict as edict

# Pick the compute device once: the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [7]:
class ActorCritic(nn.Module):
    """Independent convolutional actor and critic networks for image observations.

    Both networks share the same architecture (two conv/ReLU/max-pool stages
    followed by two linear layers) but not their weights. The first linear
    layer expects 1960 features (10 channels * 14 * 14), which corresponds to
    96x96 RGB inputs.

    * ``actor_Net`` emits 6 values per observation.
    * ``critic_Net`` emits 1 value per observation.
    """

    def __init__(self, env):
        """Build both networks.

        Args:
            env: Accepted for interface compatibility; it is not used here.
        """
        super().__init__()
        self.actor_Net = self._build_net(6)
        self.critic_Net = self._build_net(1)

    @staticmethod
    def _build_net(n_out):
        """Return the conv trunk + MLP head producing ``n_out`` values per image."""
        return nn.Sequential(
            nn.Conv2d(3, 5, 5, stride=1, bias=True),   # (in_ch, out_ch, kernel) -> 92x92x5
            nn.ReLU(),
            nn.MaxPool2d(2),                           # -> 46x46x5
            nn.Conv2d(5, 10, 5, stride=1, bias=True),  # -> 42x42x10
            nn.ReLU(),
            nn.MaxPool2d(3),                           # -> 14x14x10
            nn.Flatten(start_dim=1),                   # keep the batch dim, flatten the rest
            nn.Linear(1960, 500),
            nn.ReLU(),
            nn.Linear(500, n_out),
        )

    def data_prep(self, obs):
        """Convert raw observations into a normalised, channel-first float batch.

        Args:
            obs: NumPy array or tensor shaped (H, W, C) or (N, H, W, C) holding
                pixel values on a 0-255 scale.

        Returns:
            float32 tensor shaped (N, C, H, W), scaled by 1/255, on the same
            device as this module's parameters.
        """
        # Use the module's own device rather than a notebook-level global so the
        # model keeps working after `.to(...)` and outside the notebook.
        target_device = next(self.parameters()).device
        # Handles NumPy arrays and tensors alike (any dtype, any device).
        obs = torch.as_tensor(obs, dtype=torch.float32, device=target_device)

        if obs.dim() == 3:  # single observation -> batch of one
            obs = obs.unsqueeze(0)
        obs = obs.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)

        return obs / 255.0

    def actor_forward(self, obs):
        """Run the actor network on raw observations; returns an (N, 6) tensor."""
        return self.actor_Net(self.data_prep(obs))

    def critic_forward(self, obs):
        """Run the critic network on raw observations; returns an (N, 1) tensor."""
        return self.critic_Net(self.data_prep(obs))

In [8]:
class RolloutBuffer():
    """Small NumPy-backed store for the most recent transitions.

    Every field is a NumPy array whose first axis indexes stored rows; `add`
    appends along that axis and keeps only the newest `buffer_size` rows.
    `get` returns the contents as tensors on `device`.
    """

    def __init__(self, buffer_size, obs_space, act_space, device, n_env):
        """
        Args:
            buffer_size: Maximum number of rows kept per field.
            obs_space: Shape of one observation (unpacked into the array shape).
            act_space: Width of one action / log-prob row.
            device: Device the tensors returned by `get` are moved to.
            n_env: Stored as `self.n_env`; not otherwise used by this class.
        """
        self.buffer_size = buffer_size
        self.n_env = n_env
        self.device = device
        self.obs_space = obs_space
        self.act_space = act_space
        self.flush()  # allocate the empty arrays in one place

    def to_numpy(self, data):
        """Return `data` as a NumPy array if it is a tensor, else unchanged."""
        if isinstance(data, torch.Tensor):
            # detach() so tensors that still carry a graph can be converted too.
            return data.detach().cpu().numpy()
        return data

    def add(self, obs, next_obs, act, log_prob, rew, dones):
        """Append one batch of transitions and trim to the newest `buffer_size` rows.

        Each argument may be a tensor or array-like whose first axis is the
        batch axis; `rew` and `dones` may also be scalars.
        """
        obs = self.to_numpy(obs)
        next_obs = self.to_numpy(next_obs)
        act = self.to_numpy(act)
        log_prob = self.to_numpy(log_prob)
        # Scalars are promoted to length-1 arrays so they can be concatenated.
        rew = np.atleast_1d(self.to_numpy(rew))
        # The converted value must be the one that is stored (the original code
        # converted into `done` but concatenated the raw `dones`).
        dones = np.atleast_1d(self.to_numpy(dones))

        self.obs = np.concatenate((self.obs, obs), axis=0)
        self.next_obs = np.concatenate((self.next_obs, next_obs), axis=0)
        self.act = np.concatenate((self.act, act), axis=0)
        self.log_prob = np.concatenate((self.log_prob, log_prob), axis=0)
        self.rew = np.concatenate((self.rew, rew), axis=0)
        self.dones = np.concatenate((self.dones, dones), axis=0)

        if len(self.obs) > self.buffer_size:
            keep = slice(-self.buffer_size, None)
            self.obs = self.obs[keep]
            self.next_obs = self.next_obs[keep]
            self.act = self.act[keep]
            self.log_prob = self.log_prob[keep]
            self.rew = self.rew[keep]
            self.dones = self.dones[keep]

    def show(self):
        """Print every stored field."""
        print("Observations: ", self.obs)
        print("Next observations: ", self.next_obs)
        print("Actions: ", self.act)
        print("Log Probabilities: ", self.log_prob)
        print("Rewards: ", self.rew)
        print("Dones: ", self.dones)

    def flush(self):
        """Reset every field to an empty array of the right shape and dtype."""
        self.obs = np.empty((0, *self.obs_space), dtype=np.float32)
        self.next_obs = np.empty((0, *self.obs_space), dtype=np.float32)
        self.act = np.empty((0, self.act_space), dtype=np.float32)
        self.log_prob = np.empty((0, self.act_space), dtype=np.float32)
        self.rew = np.empty(0, dtype=np.float32)
        self.dones = np.empty(0, dtype=bool)

    def get(self):
        """Return the buffer contents as tensors on `self.device`.

        Returns:
            A fresh EasyDict with keys `obs`, `next_obs`, `act`, `log_prob`,
            `rew` (float32 for act/log_prob/rew) and `dones`.
        """
        # Instantiate the container: assigning the bare class would write the
        # tensors onto the shared EasyDict class and hand that class back.
        data = edict()
        data.obs = torch.from_numpy(self.obs).to(self.device)
        data.next_obs = torch.from_numpy(self.next_obs).to(self.device)
        data.act = torch.from_numpy(self.act).to(torch.float32).to(self.device)
        data.log_prob = torch.from_numpy(self.log_prob).to(torch.float32).to(self.device)
        data.rew = torch.from_numpy(self.rew).to(torch.float32).to(self.device)
        data.dones = torch.from_numpy(self.dones).to(self.device)
        return data

In [26]:
#Sample action
def action_sampler(AC, obs, num_env):
    """Sample actions from the diagonal Gaussian described by the actor output.

    The actor output is split in half along the last axis: the first half is
    the mean and the second half the log standard deviation of each action
    dimension.

    Args:
        AC: Object exposing `actor_forward(obs)` that returns a tensor whose
            last axis has an even width (2 * action_dim).
        obs: Observations forwarded unchanged to `AC.actor_forward`.
        num_env: Unused; kept for interface compatibility.

    Returns:
        (act, log_prob): two NumPy arrays shaped like the mean, holding the
        sampled actions and their per-dimension log-probabilities.

    Raises:
        ValueError: If the actor output width is odd and cannot be split.
    """
    # Sampling never needs gradients; skip building the autograd graph.
    with torch.no_grad():
        param = AC.actor_forward(obs)
        if param.shape[-1] % 2 != 0:
            raise ValueError(
                f"actor output width must be even (mean + log_std), got {param.shape[-1]}"
            )
        # Split by halves instead of a hard-coded 3 so any action size works.
        mean, log_std = param.chunk(2, dim=-1)

        normal_dist = Normal(mean, torch.exp(log_std))
        act = normal_dist.sample()
        log_prob = normal_dist.log_prob(act)

    return act.cpu().numpy(), log_prob.cpu().numpy()

In [10]:
def actor_critic_loss(data, AC, gamma=None):
    """Compute n-step actor and critic losses for one rollout window.

    The critic target is the discounted sum of `data.rew` plus, unless the
    last stored transition is flagged done, the discounted critic value of the
    last next-observation. Both losses are evaluated at the first stored
    observation/action.

    Args:
        data: Object with tensor attributes `obs`, `next_obs`, `act`, `rew`
            and `dones`, each indexed by time step along the first axis.
        AC: Object exposing `actor_forward(obs)` (mean and log-std halves) and
            `critic_forward(obs)`.
        gamma: Discount factor. Defaults to the notebook-level `hyper.gamma`.

    Returns:
        (loss_critic, loss_actor): scalar MSE critic loss and a
        single-element policy-gradient loss.
    """
    if gamma is None:
        gamma = hyper.gamma

    n_steps = len(data.rew)
    with torch.no_grad():
        # sum_k gamma^k * r_k over the stored window
        discounts = gamma ** torch.arange(n_steps, dtype=data.rew.dtype, device=data.rew.device)
        target = (discounts * data.rew).sum()
        # Bootstrap from the critic only when the window did not end in a terminal state.
        not_terminal = 1 - data.dones.int()[-1]
        target = target + (gamma ** n_steps) * AC.critic_forward(data.next_obs[-1]) * not_terminal

    value = AC.critic_forward(data.obs[0])
    loss_critic = F.mse_loss(value, target)

    # The advantage is a constant weight for the policy gradient; detaching it
    # keeps the actor loss from back-propagating into the critic.
    adv = (target - value).detach()

    # Re-evaluate the log-probability of the stored action under the current
    # policy. The log-prob kept in the buffer went through NumPy and carries no
    # graph, so using it would give the actor a zero gradient.
    param = AC.actor_forward(data.obs[0])
    mean, log_std = param.chunk(2, dim=-1)
    log_prob = Normal(mean, torch.exp(log_std)).log_prob(data.act[0]).sum()

    loss_actor = -adv * log_prob
    return loss_critic, loss_actor

In [27]:
# Hyperparameters, gathered in a single attribute-style dict.
hyper = edict(
    batch_size=10000,   # Size of mini batch
    gamma=0.90,         # Discount
    actor_lr=0.0001,    # Actor learning rate
    critic_lr=0.0001,   # Critic learning rate
    n_step=5,           # The training loop runs an update every n_step steps

    total_time=10,      # Total time steps
    buffer=10,          # Size of replay buffer
    train_freq=10,      # Frequency of training after replay buffer filled

    log=1000,           # Frequency of return logging
    num_envs=1,         # Number of parallel environments
)

In [28]:
# --- Environment -------------------------------------------------------------
env = gym.make_vec(
    "CarRacing-v3",
    render_mode="human",
    num_envs=hyper.num_envs,
    lap_complete_percent=0.95,
    domain_randomize=False,
    vectorization_mode="async",
)
# NOTE(review): this sets a private attribute on the vector env; whether it
# actually changes the per-environment time limit is unclear — confirm.
env._max_episode_steps = 4000

# --- Model, rollout buffer, optimizers -----------------------------------------
AC = ActorCritic(env).to(device)
rb = RolloutBuffer(
    hyper.buffer,
    env.single_observation_space.shape,
    np.squeeze(env.single_action_space.shape),
    device,
    hyper.num_envs,
)
optimizer_1 = torch.optim.Adam(AC.actor_Net.parameters(), lr=hyper.actor_lr)   # actor
optimizer_2 = torch.optim.Adam(AC.critic_Net.parameters(), lr=hyper.critic_lr)  # critic

ret = 0
done = False

# --- Training loop -------------------------------------------------------------
# try/finally so the environment (and its worker processes/window) is always
# released, even when a step or an update raises.
try:
    obs, _ = env.reset(options={"randomize": False})

    for i in range(hyper.total_time):
        act, log_prob = action_sampler(AC, obs, hyper.num_envs)
        next_obs, reward, done, trunc, info = env.step(act)
        rb.add(obs, next_obs, act, log_prob, reward, done)
        obs = next_obs

        # `done` / `trunc` are arrays with one entry per environment; reduce
        # them explicitly instead of relying on the truth value of an array.
        # NOTE(review): the n-step loss indexes the buffer by time step only,
        # so the update logic below still assumes hyper.num_envs == 1.
        episode_over = bool(np.any(done) or np.any(trunc))

        # Update every n_step steps, and also at any episode end (terminated or
        # truncated) so an n-step window never spans two episodes.
        if (i + 1) % hyper.n_step == 0 or episode_over:
            data = rb.get()
            #Actor-Critic loss
            c_loss, a_loss = actor_critic_loss(data, AC)

            #Actor optimize
            optimizer_1.zero_grad()
            # Keep the graph alive in case both losses share nodes (e.g. the
            # critic value inside the advantage); the critic pass runs next.
            a_loss.backward(retain_graph=True)
            optimizer_1.step()

            #Critic optimize
            optimizer_2.zero_grad()
            c_loss.backward()
            optimizer_2.step()

            rb.flush()  #clear the n-step buffer

        ret = ret + reward
        if episode_over:
            print("Episode return:", ret)
            print("Critic loss:", c_loss.item())
            print("Actor loss:", a_loss.item())
            ret = 0
finally:
    env.close()

In [None]:
# Inspect the rewards of the most recent batch fetched from the rollout buffer.
# (The original `print(data.)` was an incomplete expression and a SyntaxError.)
print(data.rew)