<a href="https://colab.research.google.com/github/AndyBaiMQC/rl-project-20/blob/master/Car_Racing_ppo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reinforcement Learning (COMP-767) Project

---

[CarRacing-v0](https://gym.openai.com/envs/CarRacing-v0/)

---



In [1]:
!nvidia-smi
!pip install box2d-py
!pip install gym[Box_2D]
!apt-get install x11-utils

# !apt-get -qq -y install libcusparse8.0 libnvrtc8.0 libnvtoolsext1 > /dev/null
# !ln -snf /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so.8.0 /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so
# !apt-get -qq -y install xvfb freeglut3-dev ffmpeg> /dev/null
# !pip -q install pyopengl pyvirtualdisplay pyglet gym
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install gym pyvirtualdisplay > /dev/null 2>&1

import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display
display = Display(visible=0, size=(400, 300))
display.start()
# https://stackoverflow.com/questions/50107530/how-to-render-openai-gym-in-google-colab

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.

Reading package lists... Done
Building dependency tree       
Reading state information... Done
x11-utils is already the newest version (7.7+3build1).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '400x300x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '400x300x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
import numpy as np
import gym
import tensorflow as tf
import tensorflow.nn as nn
from tensorflow.keras import Model
from tensorflow.keras.layers import Conv2D, Dense
import tensorflow.keras.backend as K
import tensorflow_probability as tfp
from tensorflow.keras import optimizers
from torch.distributions import Beta
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from tqdm.notebook import tqdm

In [0]:
tf.keras.backend.clear_session()  # For easy reset of notebook state.
tf.keras.backend.set_floatx('float64')

In [0]:
gamma = 0.99
action_repeat = 8
img_stack = 4
seed = 0
render = False
log_interval = 10
transition = np.dtype([('s', np.float64, (96, 96, img_stack)), ('a', np.float64, (3,)), ('a_logp', np.float64),
                       ('r', np.float64), ('s_', np.float64, (96, 96, img_stack))])

In [0]:
class Env():
    """
    Environment wrapper for CarRacing 
    """

    def __init__(self):
        self.env = gym.make('CarRacing-v0')
        self.env.seed(seed)
        self.reward_threshold = self.env.spec.reward_threshold

    def reset(self):
        self.counter = 0
        self.av_r = self.reward_memory()

        self.die = False
        img_rgb = self.env.reset()
        img_gray = self.rgb2gray(img_rgb)
        self.stack = [img_gray] * img_stack  # four frames for decision
        return np.array(self.stack)

    def step(self, action):
        total_reward = 0
        for i in range(action_repeat):
            img_rgb, reward, die, _ = self.env.step(action)
            # don't penalize "die state"
            if die:
                reward += 100
            # green penalty
            if np.mean(img_rgb[:, :, 1]) > 185.0:
                reward -= 0.05
            total_reward += reward
            # if no reward recently, end the episode
            done = True if self.av_r(reward) <= -0.1 else False
            if done or die:
                break
        img_gray = self.rgb2gray(img_rgb)
        self.stack.pop(0)
        self.stack.append(img_gray)
        assert len(self.stack) == img_stack
        return np.array(self.stack), total_reward, done, die

    def render(self, *arg):
        self.env.render(*arg)

    @staticmethod
    def rgb2gray(rgb, norm=True):
        # rgb image -> gray [0, 1]
        gray = np.dot(rgb[..., :], [0.299, 0.587, 0.114])
        if norm:
            # normalize
            gray = gray / 128. - 1.
        return gray

    @staticmethod
    def reward_memory():
        # record reward for last 100 steps
        count = 0
        length = 100
        history = np.zeros(length)

        def memory(reward):
            nonlocal count
            history[count] = reward
            count = (count + 1) % length
            return np.mean(history)

        return memory

In [6]:
class Net(nn.Module):
    """
    Actor-Critic Network for PPO
    """

    def __init__(self):
        super(Net, self).__init__()
        self.cnn_base = nn.Sequential(  # input shape (4, 96, 96)
            nn.Conv2d(args.img_stack, 8, kernel_size=4, stride=2),
            nn.ReLU(),  # activation
            nn.Conv2d(8, 16, kernel_size=3, stride=2),  # (8, 47, 47)
            nn.ReLU(),  # activation
            nn.Conv2d(16, 32, kernel_size=3, stride=2),  # (16, 23, 23)
            nn.ReLU(),  # activation
            nn.Conv2d(32, 64, kernel_size=3, stride=2),  # (32, 11, 11)
            nn.ReLU(),  # activation
            nn.Conv2d(64, 128, kernel_size=3, stride=1),  # (64, 5, 5)
            nn.ReLU(),  # activation
            nn.Conv2d(128, 256, kernel_size=3, stride=1),  # (128, 3, 3)
            nn.ReLU(),  # activation
        )  # output shape (256, 1, 1)
        self.v = nn.Sequential(nn.Linear(256, 100), nn.ReLU(), nn.Linear(100, 1))
        self.fc = nn.Sequential(nn.Linear(256, 100), nn.ReLU())
        self.alpha_head = nn.Sequential(nn.Linear(100, 3), nn.Softplus())
        self.beta_head = nn.Sequential(nn.Linear(100, 3), nn.Softplus())
        self.apply(self._weights_init)

    @staticmethod
    def _weights_init(m):
        if isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('relu'))
            nn.init.constant_(m.bias, 0.1)

    def forward(self, x):
        x = self.cnn_base(x)
        x = x.view(-1, 256)
        v = self.v(x)
        x = self.fc(x)
        alpha = self.alpha_head(x) + 1
        beta = self.beta_head(x) + 1

        return (alpha, beta), v

AttributeError: ignored

In [0]:
class ActorCriticModel(tf.keras.Model):
    """
    Actor-Critic Network for PPO
    """

    def __init__(self):
        super(ActorCriticModel, self).__init__()
        self.cnn_base = tf.keras.Sequential() # # input shape (4, 96, 96)
        self.cnn_base.add(Conv2D(8,kernel_size=4, strides=2, activation='relu', input_shape=(96, 96, 4),)) # data_format='channels_first'))
        self.cnn_base.add(Conv2D(16,kernel_size=3, strides=2, activation='relu',)) #data_format='channels_first')) # (8, 47, 47) 
        self.cnn_base.add(Conv2D(32,kernel_size=3, strides=2, activation='relu',)) # data_format='channels_first')) # (16, 23, 23)
        self.cnn_base.add(Conv2D(64,kernel_size=3, strides=2, activation='relu',)) # data_format='channels_first')) # (32, 11, 11)
        self.cnn_base.add(Conv2D(128,kernel_size=3, strides=1, activation='relu',)) # data_format='channels_first')) # (64, 5, 5)
        self.cnn_base.add(Conv2D(256,kernel_size=3, strides=1, activation='relu',)) # data_format='channels_first')) # (128, 3, 3)
        # output shape (256, 1, 1)

        print(self.cnn_base.summary())
        self.v = tf.keras.Sequential([ # input shape 256
          Dense(100,activation='relu',input_shape=(256,)),
          Dense(1)
        ])
        self.fc = tf.keras.Sequential([ # input shape 256
          Dense(100,input_shape=(256,))
        ])
        self.alpha_head = tf.keras.Sequential([ # input shape 100
          Dense(3, activation='softplus', input_shape=(100,))
        ])
        self.beta_head = tf.keras.Sequential([ #input shape 100
          Dense(3, activation='softplus', input_shape=(100,)),
        ])
        # self.apply(self._weights_init)

    # @staticmethod
    # def _weights_init(m):
    #     if isinstance(m, nn.Conv2d):
    #         nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('relu'))
    #         nn.init.constant_(m.bias, 0.1)

    def call(self, x):
        x = self.cnn_base(x)
        # print("cnn_base:",x.shape)
        x = tf.reshape(x,[-1,256])
        # x = tf.expand_dims(x,0)
        # print("squeezed:",x.shape)
        v = self.v(x)
        x = self.fc(x)
        alpha = self.alpha_head(x) + 1
        beta = self.beta_head(x) + 1
        return (alpha, beta), v

In [0]:
class Agent():
    """
    Agent for training
    """
    max_grad_norm = 0.5
    clip_param = 0.1  # epsilon in clipped loss
    ppo_epoch = 10
    buffer_capacity, batch_size = 150, 32 # 2000, 128

    def __init__(self):
        self.training_step = 0
        self.net = ActorCriticModel()
        self.buffer = np.empty(self.buffer_capacity, dtype=transition)
        self.counter = 0

        self.optimizer = optimizers.Adam(lr=1e-3)
        self.loss = tf.keras.losses.Huber()
        self.train_loss = tf.keras.metrics.Mean(name='train_loss')

    def select_action(self, state):
        # state = torch.from_numpy(state).double().to(device).unsqueeze(0)
        # with torch.no_grad():
        alpha, beta = self.net(state)[0]
        dist = tfp.distributions.Beta(alpha, beta)
        action = dist.sample()
        a_logp = dist.log_prob(action)
        a_logp = tf.reduce_sum(a_logp, axis=1)

        action = tf.squeeze(action).numpy()
        a_logp = a_logp.numpy()
        return action, a_logp

    def save_param(self):
        torch.save(self.net.state_dict(), 'param/ppo_net_params.pkl')

    def store(self, transition):
        self.buffer[self.counter] = transition
        self.counter += 1
        if self.counter == self.buffer_capacity:
            self.counter = 0
            return True
        else:
            return False

    def update2(self,):
        self.training_step += 1

        s = tf.constant(self.buffer['s'], dtype=tf.float64)
        a = tf.constant(self.buffer['a'], dtype=tf.float64)
        r = tf.constant(self.buffer['r'], dtype=tf.float64)
        r = tf.reshape(r,(-1, 1))
        s_ = tf.constant(self.buffer['s_'], dtype=tf.float64)

        old_a_logp = tf.constant(self.buffer['a_logp'], dtype=tf.float64)#.to(device).view(-1, 1)
        print("s shape:",s.shape)
        # with torch.no_grad():
        target_v = r + gamma * self.net(s_)[1]
        adv = target_v - self.net(s)[1]
        # dataset = list(dataset)
        dataset = tf.data.Dataset.from_tensor_slices({"s": s, "a": a, "old_a_logp": old_a_logp,
                                                      "target_v":target_v, "adv":adv}).batch(self.batch_size).as_numpy_iterator()
        dataset = list(dataset)
        for _ in range(self.ppo_epoch):
            for data in dataset: #BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False):
                # print("Input shape1:",s[index].shape)
                s = data["s"]
                a = data["a"]
                old_a_logp = data["old_a_logp"]
                target_v = data["target_v"]
                adv = data["adv"]
                self.train_step2(s, a , old_a_logp, adv, target_v)

    def train_step2(self,s, a, old_a_logp, adv, target_v):
        with tf.GradientTape() as tape:
            # training=True is only needed if there are layers with different
            # behavior during training versus inference (e.g. Dropout).
            s = tf.constant(s,dtype=tf.float64)
            a = tf.constant(a,dtype=tf.float64)
            old_a_logp = tf.constant(old_a_logp,dtype=tf.float64)
            adv = tf.constant(adv,dtype=tf.float64)
            target_v = tf.constant(target_v,dtype=tf.float64)
            # print("Input shape:",s.shape)

            (alpha, beta), v = self.net(s)
            dist = tfp.distributions.Beta(alpha, beta)
            # a_logp = dist.log_prob(a).sum(axis=1, keepdim=True)
            a_logp = dist.log_prob(a)
            a_logp = tf.reduce_sum(a_logp, axis=1, keepdims=True)
            
            ratio = tf.math.exp(a_logp - old_a_logp)
            surr1 = ratio * adv
            surr2 = tf.clip_by_value(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv
            action_loss = tf.reduce_mean(-tf.math.minimum(surr1, surr2))
            # print("v:",v.dtype,"target_v:",target_v.dtype)
            # print("v:",type(v),"target_v:",type(target_v))
            v = tf.squeeze(v)
            target_v = tf.squeeze(target_v)
            value_loss = self.loss(v, target_v)
            # print("Action Loss:",action_loss,"Value Loss:",value_loss)
            loss = action_loss + 2. * value_loss
            # loss = loss_object(labels, predictions)
        gradients = tape.gradient(loss, self.net.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.net.trainable_variables))
        self.train_loss(loss)

In [36]:
agent = Agent()
env = Env()
# if args.vis:
#     draw_reward = DrawLine(env="car", title="PPO", xlabel="Episode", ylabel="Moving averaged episode reward")

training_records = []
running_score = 0
state = env.reset()
for i_ep in range(100000):
    score = 0
    state = env.reset()
    state = np.moveaxis(state, 0, -1)
    state = np.expand_dims(state, axis=0).astype(np.float64)
    for t in range(1000):
        # print(state.shape)
        action, a_logp = agent.select_action(state)
        state_, reward, done, die = env.step(action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
        state_ = np.moveaxis(state_, 0, -1)
        state_ = np.expand_dims(state_, axis=0).astype(np.float64)
        if render:
            env.render()
        if agent.store((state, action, a_logp, reward, state_)):
            print('updating')
            # dataset = tf.data.Dataset.range(150).batch(32).as_numpy_iterator()
            # dataset = list(dataset)
            # agent.train_step(dataset)
            agent.update2()
        score += reward
        state = state_
        if done or die:
            break
    running_score = running_score * 0.99 + score * 0.01

    if i_ep % log_interval == 0 or True:
        # if args.vis:
        #     draw_reward(xdata=i_ep, ydata=running_score)
        loss = agent.train_loss.result()
        print('Ep {}\tLast score: {:.2f}\tMoving average score: {:.2f}\tLoss: {:.5f}'.format(i_ep, score, running_score, loss))
        # agent.save_param()
    if running_score > env.reward_threshold:
        print("Solved! Running reward is now {} and the last episode runs to {}!".format(running_score, score))
        break

Model: "sequential_60"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_72 (Conv2D)           (None, 47, 47, 8)         520       
_________________________________________________________________
conv2d_73 (Conv2D)           (None, 23, 23, 16)        1168      
_________________________________________________________________
conv2d_74 (Conv2D)           (None, 11, 11, 32)        4640      
_________________________________________________________________
conv2d_75 (Conv2D)           (None, 5, 5, 64)          18496     
_________________________________________________________________
conv2d_76 (Conv2D)           (None, 3, 3, 128)         73856     
_________________________________________________________________
conv2d_77 (Conv2D)           (None, 1, 1, 256)         295168    
Total params: 393,848
Trainable params: 393,848
Non-trainable params: 0
_______________________________________________



Track generation: 1087..1369 -> 282-tiles track
Ep 0	Last score: 99.64	Moving average score: 1.00	Loss: 0.00000
Track generation: 964..1212 -> 248-tiles track
retry to generate track (normal if there are not many of this messages)
Track generation: 1176..1474 -> 298-tiles track
updating
s shape: (150, 96, 96, 4)
Input shape: (32, 96, 96, 4)
v: <dtype: 'float64'> target_v: <dtype: 'float64'>
v: <class 'tensorflow.python.framework.ops.EagerTensor'> target_v: <class 'tensorflow.python.framework.ops.EagerTensor'>
Action Loss: tf.Tensor(0.034350731854293416, shape=(), dtype=float64) Value Loss: tf.Tensor(0.9159199595451355, shape=(), dtype=float64)
Input shape: (32, 96, 96, 4)
v: <dtype: 'float64'> target_v: <dtype: 'float64'>
v: <class 'tensorflow.python.framework.ops.EagerTensor'> target_v: <class 'tensorflow.python.framework.ops.EagerTensor'>
Action Loss: tf.Tensor(0.24088300420105646, shape=(), dtype=float64) Value Loss: tf.Tensor(0.7645790576934814, shape=(), dtype=float64)
Input shape

KeyboardInterrupt: ignored