## This notebook implements the DDPG algorithm for continuous control 
![](images/DDPG.png)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt
import gym
import pybullet_envs

import numpy as np

import collections
import matplotlib.pyplot as plt

# Sanity check: build the environment once and print the shapes of its
# observation and action spaces.
env_name = "HopperBulletEnv-v0"
env = gym.make(env_name)
state = env.reset()
print(env.observation_space.shape)
print(env.action_space.shape)

from torch.utils.tensorboard import SummaryWriter
# Device string handed to torch.device(...) by the cells below.
device = 'cpu'

In [None]:
class Critic(nn.Module):
    """Q-network mapping a (state, action) pair to one scalar value.

    The state passes through the first dense layer on its own; the action is
    concatenated to that layer's activations before the second dense layer.
    """

    def __init__(self, input_shape, n_actions):
        """Create the three dense layers.

        Args:
            input_shape: number of state features (used as ``in_features``).
            n_actions: number of action dimensions appended after ``fc1``.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_shape, out_features=64)
        self.fc2 = nn.Linear(in_features=64 + n_actions, out_features=32)
        self.out = nn.Linear(in_features=32, out_features=1)

    def forward(self, x, a):
        """Return the value estimate for states ``x`` and actions ``a``.

        Args:
            x: state tensor whose last dimension is ``input_shape``; it must
                be 2-D because the action is concatenated along ``dim=1``.
            a: action tensor with the same leading dimension as ``x``, or a
                plain Python ``float`` for a single one-dimensional action.

        Returns:
            Tensor with one value per row of ``x`` (last dimension is 1).
        """
        x = F.relu(self.fc1(x))
        if isinstance(a, float):
            # Promote a scalar action to a (1, 1) tensor living next to x so
            # the concatenation works regardless of x's dtype or device.
            a = torch.tensor([[a]], dtype=x.dtype, device=x.device)
        x = F.relu(self.fc2(torch.cat([x, a], dim=1)))
        return self.out(x)

In [None]:
class Actor(nn.Module):
    """Deterministic policy network: state in, tanh-squashed action out."""

    def __init__(self, input_shape, n_actions):
        """Create the three dense layers.

        Args:
            input_shape: number of state features (used as ``in_features``).
            n_actions: number of action dimensions produced by the output layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_shape, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.out = nn.Linear(in_features=32, out_features=n_actions)

    def forward(self, x):
        """Return the action for state(s) ``x``.

        Args:
            x: state tensor whose last dimension is ``input_shape``.

        Returns:
            Tensor whose last dimension is ``n_actions``; every component
            lies in [-1, 1] because of the final ``tanh``.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.out(x))

In [None]:
class ReplayBuffer():
    """Fixed-capacity FIFO store of experiences with uniform random sampling."""

    def __init__(self, length):
        """Create an empty buffer that keeps at most ``length`` experiences."""
        self.length = length
        # deque(maxlen=...) silently drops the oldest entry once it is full.
        self.buffer = collections.deque(maxlen=length)

    def sample(self, size):
        """Draw ``size`` stored experiences uniformly, with replacement.

        Indices are drawn from the number of experiences actually stored, so
        sampling also works while the buffer is only partly filled.

        Args:
            size: number of experiences to draw.

        Returns:
            A list of ``size`` stored experiences, returned as the same
            objects that were appended.

        Raises:
            ValueError: if the buffer is empty.
        """
        stored = len(self.buffer)
        if stored == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        indices = np.random.randint(low=0, high=stored, size=size)
        # Pick the entries directly instead of converting the whole deque to
        # an array on every call (slow, and it fails for ragged experiences).
        return [self.buffer[i] for i in indices]

    def append(self, e):
        """Store experience ``e``, evicting the oldest one when full."""
        self.buffer.append(e)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def __repr__(self):
        return "Replay Buffer \n {}".format(self.buffer)

In [None]:
class SampleGeneration():
    """Helpers that produce training samples from rollouts or a replay buffer."""

    @staticmethod
    @torch.no_grad()
    def generate_samples(network, env='CartPole-v1', N=4):
        """Roll out ``N`` episodes with a stochastic discrete policy.

        Actions are drawn with ``torch.multinomial`` from
        ``network.softmax(network(state))``, so ``network`` must expose a
        ``softmax`` attribute callable on its own output.

        Args:
            network: policy network as described above.
            env: environment id passed to ``gym.make``.
            N: number of episodes to run.

        Returns:
            Tuple ``(states, rewards, actions)``: the visited states stacked
            into one array (terminal states are not included), a list with one
            reward array per episode, and an array of all actions taken.
            With ``N == 0`` the state array is empty.
        """
        states_N = []
        actions_N = []
        rewards_N = []
        env = gym.make(env)
        try:
            for _ in range(N):
                state = env.reset()
                states_N.append(state)
                rewards = []
                done = False
                while not done:
                    state_t = torch.tensor(state.astype(np.float32)).unsqueeze(0)
                    action_logits = network(state_t)
                    actions_prob = network.softmax(action_logits)
                    action = torch.multinomial(actions_prob, 1).item()
                    actions_N.append(action)
                    state, reward, done, _ = env.step(action)
                    rewards.append(reward)
                    if not done:
                        states_N.append(state)
                rewards_N.append(np.array(rewards))
        finally:
            # Release the environment even if a rollout step raises.
            env.close()
        # Stack once after all rollouts instead of after every episode.
        state_stack = np.stack(states_N) if states_N else np.empty((0,))
        return (state_stack, rewards_N, np.array(actions_N))

    @staticmethod
    @torch.no_grad()
    def generate_samples_from_buffer(buffer, mini_batch_size):
        """Return ``buffer.sample(mini_batch_size)``."""
        return buffer.sample(mini_batch_size)

In [None]:
class ReturnEstimator():
    """Return / target computations and the critic regression step."""

    @staticmethod
    def estimate_return(rewards, gamma=0.99):
        """Compute the discounted reward-to-go for every step of every trajectory.

        For a trajectory ``r_0 .. r_T`` entry ``t`` of the result is
        ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...``.

        Args:
            rewards: sequence of trajectories, each a sequence of rewards.
            gamma: discount factor.

        Returns:
            ``np.ndarray`` with one row per trajectory. When trajectories
            differ in length a 1-D object array of lists is returned instead.
        """
        res = []
        for trajectory in rewards:
            running = 0.0
            reward_to_go = []
            # Walk backwards so each step adds its own reward to the
            # discounted return of everything that follows it.
            for r in reversed(trajectory):
                running = r + gamma * running
                reward_to_go.append(running)
            reward_to_go.reverse()
            res.append(reward_to_go)
        if len({len(row) for row in res}) > 1:
            # Ragged input: recent NumPy refuses to build this implicitly.
            return np.array(res, dtype=object)
        return np.array(res)

    @staticmethod
    @torch.no_grad()
    def calc_y(act_t, crt_t, states, rewards, n_states, dones, gamma=0.99):
        """Compute the TD targets ``r + gamma * Q'(s', mu'(s')) * (1 - done)``.

        Args:
            act_t: target actor, called on the next states.
            crt_t: target critic, called on (next states, target actions).
            states: unused; kept so existing callers keep working.
            rewards: per-transition rewards (array-like).
            n_states: next states (array-like, one row per transition).
            dones: per-transition terminal flags (boolean or 0/1).
            gamma: discount factor.

        Returns:
            Tensor with one target per transition, reshaped to a column
            to line up with the critic output.
        """
        dev = torch.device(device)
        n_states_t = torch.FloatTensor(n_states).to(dev)
        rewards_t = torch.FloatTensor(rewards).to(dev).reshape(-1, 1)

        q_next_state = crt_t(n_states_t, act_t(n_states_t))

        # 1 for non-terminal transitions, 0 for terminal ones. Built
        # arithmetically so integer 0/1 flags behave like booleans.
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)
        not_done_t = torch.FloatTensor(not_done).to(dev).reshape(-1, 1)

        return rewards_t + gamma * q_next_state * not_done_t

    @staticmethod
    def fit_Q(network, states, actions, y, optimizer, t):
        """Take one MSE regression step of the critic towards targets ``y``.

        Logs ``critic_loss`` and ``Q mean`` at step ``t`` through the
        module-level writer ``tb``.

        Args:
            network: critic being trained, called as ``network(states, actions)``.
            states: batch of states (array-like).
            actions: batch of actions (array-like).
            y: target tensor; it is detached before the loss is computed.
            optimizer: optimizer holding the critic's parameters.
            t: global step used for logging.
        """
        optimizer.zero_grad()
        dev = torch.device(device)
        states_t = torch.FloatTensor(states).to(dev)
        actions_t = torch.FloatTensor(actions).to(dev)

        Q_output = network(states_t, actions_t).to(dev)
        loss = F.mse_loss(Q_output, y.detach())
        tb.add_scalar('critic_loss', loss, t)
        tb.add_scalar('Q mean', Q_output.mean(), t)
        loss.backward()
        optimizer.step()

In [None]:
def improve_policy(act_network, crt_network,states, act_optimizer, t):
    """Take one gradient step on the actor to increase the critic's value.

    The loss is the negated mean of the critic's output for the actor's own
    actions on ``states``; it is logged as ``actor_loss`` at step ``t``.
    """
    act_optimizer.zero_grad()
    target_device = torch.device(device)
    state_batch = torch.FloatTensor(states).to(target_device)
    proposed_actions = act_network(state_batch)
    q_values = crt_network(state_batch, proposed_actions).to(target_device)
    # Minimising -Q is the same as maximising Q.
    actor_loss = -q_values.mean()
    actor_loss.backward()
    act_optimizer.step()
    tb.add_scalar('actor_loss', actor_loss, t)

In [None]:
# Ornstein-Uhlenbeck exploration noise added to the policy output
@torch.no_grad()
def OU_process(mu_t, prev_noise, ou_mu = 0.0,theta=0.15, sigma=0.2, eps=1):
    """Advance the noise process one step and perturb the policy output.

    Args:
        mu_t: policy output tensor; its first dimension sets the noise size.
        prev_noise: noise value from the previous step.
        ou_mu: value the noise is pulled towards.
        theta: strength of the pull towards ``ou_mu``.
        sigma: scale of the fresh Gaussian term.
        eps: multiplier applied to the noise before adding it to the action.

    Returns:
        Tuple ``(action, noise)``, both NumPy arrays.
    """
    mean_action = mu_t.detach().cpu().numpy()
    dim = mean_action.shape[0]
    pull = theta * (ou_mu - prev_noise)
    shock = sigma * np.random.normal(size=dim)
    next_noise = prev_noise + pull + shock
    return mean_action + eps * next_noise, next_noise

In [None]:
@torch.no_grad()
def Play_step(network, env, obs, buffer,e, total_reward, prev_noise):
    """Act once in ``env`` with exploration noise and store the transition.

    Args:
        network: actor called on the current observation.
        env: environment stepped with the noisy action.
        obs: current observation.
        buffer: replay buffer receiving the new experience.
        e: factory called as ``e(obs, action, reward, next_obs, done)``.
        total_reward: running episode reward before this step.
        prev_noise: previous noise value, or ``None`` to start from zeros.

    Returns:
        Tuple ``(next_obs, done, total_reward, noise)`` after the step.
    """
    policy_mean = network(torch.FloatTensor(obs).to(torch.device(device)))
    if prev_noise is None:
        # Start of an episode: begin the noise process at zero.
        noise_state = np.zeros(shape=policy_mean.shape[0])
    else:
        noise_state = prev_noise
    action, noise_state = OU_process(policy_mean, noise_state)
    next_obs, step_reward, finished, _ = env.step(action)
    buffer.append(e(obs, action, step_reward, next_obs, finished))
    return next_obs, finished, total_reward + step_reward, noise_state

In [None]:
@torch.no_grad()
def test_policy(network, env_name, render=False, runs=5):
    """Evaluate ``network`` without exploration noise.

    A fresh environment is created for the evaluation and is always closed,
    even if a rollout raises.

    Args:
        network: actor called on each observation; its output is passed to
            ``env.step`` as a NumPy array.
        env_name: environment id passed to ``gym.make``.
        render: call ``env.render()`` before every step when true.
        runs: number of evaluation episodes to average over.

    Returns:
        Mean total episode reward over ``runs`` episodes.

    Raises:
        ValueError: if ``runs`` is not positive.
    """
    if runs <= 0:
        raise ValueError("runs must be a positive integer")
    total_reward = 0.0
    env = gym.make(env_name)
    try:
        for _ in range(runs):
            state = env.reset()
            done = False
            while not done:
                if render:
                    env.render()
                state_t = torch.FloatTensor(state).to(torch.device(device))
                action = network(state_t).cpu().numpy()
                state, reward, done, _ = env.step(action)
                total_reward += reward
    finally:
        env.close()
    return total_reward / runs

In [None]:
def sample_random_minibatch(buffer, sample_size):
    """Sample experiences from ``buffer`` and split them into per-field arrays.

    Each sampled experience is indexed positionally as
    (state, action, reward, next_state, done).

    Returns:
        Tuple of five NumPy arrays in that same field order.
    """
    batch = buffer.sample(sample_size)
    # One list per field, filled column by column.
    columns = [[experience[field] for experience in batch] for field in range(5)]
    states, actions, rewards, n_states, dones = (np.array(col) for col in columns)
    return states, actions, rewards, n_states, dones

In [None]:
# https://stackoverflow.com/questions/48560227/how-to-take-the-average-of-the-weights-of-two-networks
@torch.no_grad()
def polyak(net, target_net, taw=0.001):
    """Soft-update ``target_net`` towards ``net`` in place.

    Every parameter of ``target_net`` that has a same-named parameter in
    ``net`` becomes ``taw * net_param + (1 - taw) * target_param``.
    Only parameters are blended; buffers are left untouched.

    Args:
        net: source network; it is not modified.
        target_net: network updated in place.
        taw: interpolation weight given to ``net`` (1.0 copies it outright).
    """
    target_params = dict(target_net.named_parameters())
    for name, param in net.named_parameters():
        target_param = target_params.get(name)
        if target_param is None:
            continue
        # The in-place copy already updates the target network, so no
        # load_state_dict call is needed afterwards.
        target_param.copy_(taw * param + (1 - taw) * target_param)

In [None]:
# Hyperparameters
lr = 1e-3
batch_size = 64
Buffer_size = 100000
# Period of the alternative hard target update; unused by the Polyak update below.
target_sync = 1000

# Create the environment, the networks, and the optimizers
env_name = 'HopperBulletEnv-v0'
env = gym.make(env_name)
obs_space = env.observation_space.shape
action_space = env.action_space.shape

act = Actor(obs_space[0], action_space[0]).to(torch.device(device))
crt = Critic(obs_space[0], action_space[0]).to(torch.device(device))

act_target = Actor(obs_space[0], action_space[0]).to(torch.device(device))
crt_target = Critic(obs_space[0], action_space[0]).to(torch.device(device))
# Start the targets as exact copies of the online networks; otherwise the
# first TD targets come from unrelated random weights.
act_target.load_state_dict(act.state_dict())
crt_target.load_state_dict(crt.state_dict())

act_optimizer = opt.Adam(act.parameters(), lr=lr)
crt_optimizer = opt.Adam(crt.parameters(), lr=lr)


buffer = ReplayBuffer(Buffer_size)

experience = collections.namedtuple('Experience', ['obs', 'action', 'reward', 'next_obs', 'done'])

t = 0
rewards_100 = []
# Writing to the summary
obs = env.reset()
tb = SummaryWriter(comment=f"-DDPG-env={env_name}-lr={lr}")
# Add the actor graph to the summary
tb.add_graph(act, torch.FloatTensor(obs).unsqueeze(0).to(torch.device(device)))
reward = 0
# -inf so the first completed 100-episode window always produces a checkpoint,
# even when its mean reward is zero or negative.
max_reward_100 = float('-inf')
# Exploration noise
noise = None
eps_len = 0

# Fill the replay buffer before any learning happens. The buffer never
# shrinks, so this only has to run once, ahead of the training loop.
while len(buffer) < Buffer_size:
    obs = env.reset()
    for i in range(1000):
        n_obs, done, reward, noise = Play_step(act, env, obs, buffer, experience, reward, noise)
        obs = n_obs
        if done:
            done = False
            obs = env.reset()
            reward = 0
            noise = None
            break

while True:
    t += 1
    # Interact with the environment for one step
    obs, done, reward, noise = Play_step(act, env, obs, buffer, experience, reward, noise)
    eps_len += 1
    if done:
        obs = env.reset()
        noise = None
        tb.add_scalar("eps steps", eps_len, t)
        tb.add_scalar("training reward", reward, t)
        eps_len = 0
        # Keep the finished episode's reward for the progress print below.
        train_reward = reward
        reward = 0
        test_reward = test_policy(act, env_name)
        rewards_100.append(test_reward)
        tb.add_scalar("testing reward", test_reward, t)
        # Write the average test reward of the last 100 episodes to tensorboard
        if len(rewards_100) == 100:
            reward_100 = sum(rewards_100)/100
            if reward_100 > max_reward_100:
                torch.save(act.state_dict(),"act_network.pt")
                max_reward_100 = reward_100
                print("new max reward = ",max_reward_100)
            print(f"{t}, testing_reward: {test_reward}, training_reward: {train_reward}, mean_rewad: {reward_100}")
            tb.add_scalar('reward_100', reward_100, t)
            rewards_100 = []
            if reward_100 >= 800:
                print("done")
                break

    # sample a batch
    states, actions, rewards, n_states, dones = sample_random_minibatch(buffer, batch_size)
    # calculate y
    y = ReturnEstimator.calc_y(act_target, crt_target, states, rewards, n_states, dones)
    # update the critic network
    ReturnEstimator.fit_Q(crt, states, actions, y, crt_optimizer, t)
    # update the actor network
    improve_policy(act, crt, states, act_optimizer,t)
    # update the target networks by applying polyak average
    polyak(act, act_target, taw=0.001)
    polyak(crt, crt_target, taw = 0.001)

# Flush pending summaries and release the training environment.
tb.close()
env.close()
        
    

## Training Dynamics
<img  src="images/Q_actor_loss.png" > 
<img  src="images/critic_loss_eps_steps.png" > 
<img  src="images/rewards.png" > 

## Agent Performance
<video  controls width="320" height="240" src="recordings/test.mp4" /> 

In [None]:
# Test the trained agent for one episode and record its performance
env = gym.wrappers.Monitor(gym.make(env_name),"/home/faisal/Documents/ML practice/Deep Learning/pytorch/DDPG/recordings")

obs_space = env.observation_space.shape
action_space = env.action_space.shape
act = Actor(obs_space[0], action_space[0]).to(torch.device(device))
act.load_state_dict(torch.load("act_network.pt"))
state = env.reset()
done = False
total_reward = 0
try:
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        while not done:
            state_t = torch.FloatTensor(state).to(torch.device(device))
            action = act(state_t).cpu().numpy()
            state, reward, done, _ = env.step(action)
            total_reward += reward
finally:
    # Closing the wrapped environment lets the monitor finish its recording.
    env.close()
total_reward

## References 
Timothy P. Lillicrap et al., "Continuous control with deep reinforcement learning", https://arxiv.org/abs/1509.02971 <br>
CS 285 at UC Berkeley
Deep Reinforcement Learning lecture 8 http://rail.eecs.berkeley.edu/deeprlcourse/static/slides/lec-8.pdf