In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import deque, namedtuple
from matplotlib import pyplot as plt
import numpy as np
import random
import gym
import pdb

from src.utils.OUNoise import OUNoise 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Define Networks

In [4]:
class Actor(nn.Module):
    def __init__(self, state_space, action_space):
        super(Actor, self).__init__()
        
        self.noise = OUNoise(action_space)
        
        self.head = nn.Sequential(
            nn.Linear(state_space, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, action_space),
            nn.Tanh()
        )
    
    def forward(self, x):
        return self.head(x)
    
    def act(self, state, add_noise=True):
        
        state = torch.from_numpy(state).float()
        
        action = self.forward(state).cpu().data.numpy()
        if add_noise:
            action += self.noise.noise()

        return np.clip(action, -1, 1)


In [5]:
# class Critic(nn.Module):
#     def __init__(self, state_space, action_space):
#         super(Critic, self).__init__()
        
#         self.head = nn.Sequential(
#             nn.Linear(state_space, 1024),
#             nn.ReLU(),
#         )
        
#         self.body = nn.Sequential(
#             nn.Linear(1024 + action_space, 512),
#             nn.ReLU(),
#             nn.Linear(512, 300),
#             nn.ReLU(),
#             nn.Linear(300, 1),
#         )
        
#         self.single = nn.Sequential(
#             nn.Linear(state_space + action_space, 256),
#             nn.ReLU(),
#             nn.Linear(256, 256),
#             nn.ReLU(),
#             nn.Linear(256, 1),
#         )
    
#     def forward(self, x, actions):
#         actions = torch.tensor(actions).float()
#         x = torch.tensor(x).float()
        
#         x = self.head(x)
#         x = self.body(torch.cat((x, actions), dim=1))
        
# #         x = self.single(torch.cat((x, actions), dim=1))
#         return x

In [6]:
class Critic(nn.Module):

    def __init__(self, obs_dim, action_dim):
        super(Critic, self).__init__()

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.linear1 = nn.Linear(self.obs_dim, 1024)
        self.linear2 = nn.Linear(1024 + self.action_dim, 512)
        self.linear3 = nn.Linear(512, 300)
        self.linear4 = nn.Linear(300, 1)

    def forward(self, x, a):
        x = F.relu(self.linear1(x))
        xa_cat = torch.cat([x,a], 1)
        xa = F.relu(self.linear2(xa_cat))
        xa = F.relu(self.linear3(xa))
        qval = self.linear4(xa)

        return qval

## Create environment with Agents

In [7]:
import gym

# env = gym.make("MountainCarContinuous-v0")
env = gym.make("Pendulum-v0")

state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]

print("State space: {}".format(state_space))
print("Action space: {}".format(action_space))

State space: 3
Action space: 1


  result = entry_point.load(False)


In [8]:
actor = Actor(state_space, action_space).to(device)
critic = Critic(state_space, action_space).to(device)

actor_target = Actor(state_space, action_space).to(device)
critic_target = Critic(state_space, action_space).to(device)

### Replay Buffer

In [9]:
class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=int(buffer_size))
        self.Experience = namedtuple("experience", ["state", "action", "reward", "next_state", "done"])
    
    def add(self, state, action, reward, next_state, done):
        e = self.Experience(state, action, reward, next_state ,done)
        self.buffer.append(e)
    
    def sample(self, batch_size):
        samples = random.sample(self.buffer, batch_size)
        
#         states = torch.stack([ torch.tensor(exp.state) for exp in samples]).float().to(device)
#         actions = torch.stack([ torch.tensor(exp.action) for exp in samples]).float().to(device)
#         rewards = torch.stack([ torch.tensor(exp.reward) for exp in samples]).float().to(device)
#         next_states = torch.stack([ torch.tensor(exp.next_state) for exp in samples]).float().to(device)
#         dones = torch.stack([ torch.tensor(exp.done) for exp in samples]).float().to(device)
        
        states = [ exp.state for exp in samples]
        actions = [ exp.action for exp in samples]
        rewards = [ exp.reward for exp in samples]
        next_states = [ exp.next_state for exp in samples]
        dones = [ exp.done for exp in samples]
        
        return (states, actions, rewards, next_states, dones)
    
    def __len__(self):
        return len(self.buffer)

# class ReplayBuffer:

#   def __init__(self, max_size):
#       self.max_size = max_size
#       self.buffer = deque(maxlen=max_size)

#   def add(self, state, action, reward, next_state, done):
#       experience = (state, action, np.array([reward]), next_state, done)
#       self.buffer.append(experience)

#   def sample(self, batch_size):
#       state_batch = []
#       action_batch = []
#       reward_batch = []
#       next_state_batch = []
#       done_batch = []

#       batch = random.sample(self.buffer, batch_size)

#       for experience in batch:
#           state, action, reward, next_state, done = experience
#           state_batch.append(state)
#           action_batch.append(action)
#           reward_batch.append(reward)
#           next_state_batch.append(next_state)
#           done_batch.append(done)

#       return (state_batch, action_batch, reward_batch, next_state_batch, done_batch)

#   def __len__(self):
#       return len(self.buffer)

## Computing loss and updating Networks

In [10]:
actor_optimiser = optim.Adam(actor.parameters(), lr=1e-3)
critic_optimiser = optim.Adam(critic.parameters(), lr=1e-3)

In [18]:
def learn():

    state_batch, action_batch, reward_batch, next_state_batch, masks = mem.sample(batch_size)
    
#     pdb.set_trace()
    
    state_batch = torch.FloatTensor(state_batch)
    action_batch = torch.FloatTensor(action_batch)
    reward_batch = torch.FloatTensor(reward_batch)
    next_state_batch = torch.FloatTensor(next_state_batch)
    masks = torch.FloatTensor(masks)

    update_critic(
        states=state_batch,
        actions=action_batch,
        rewards=reward_batch,
        next_states=next_state_batch,
        dones=masks
    )
    update_actor(states=state_batch)
    update_target_networks()

### Actor Update

<img src="./img/ddpg/actor_update.png" alt="Drawing" style="height: 50px;"/>

In [12]:
def update_actor(states):
    actions_pred = actor(states)  
#     actions_pred = torch.from_numpy(actions_pred).float()
    loss = -critic(states, actions_pred).mean()
    
    actor_optimiser.zero_grad()
    loss.backward()
    actor_optimiser.step()

### Critic Update

Critic Loss:
<img src="./img/ddpg/critic_loss.png" alt="Drawing" style="height: 30px;"/>

Critic $y_i$:
<img src="./img/ddpg/critic_yi.png" alt="Drawing" style="height: 35px;"/>

In [13]:
def update_critic(states, actions, rewards, next_states, dones):
#     pdb.set_trace()
    next_actions = actor_target.forward(next_states)
    
#     y_i =  rewards + ( gamma * critic_target(next_states, next_actions).squeeze() * (1-dones ))
#     expected_Q = critic(states, actions).squeeze()
    
    y_i =  rewards + ( gamma * critic_target(next_states, next_actions) * (1-dones ))
    expected_Q = critic(states, actions)

    loss = F.mse_loss(y_i, expected_Q)
    
    critic_optimiser.zero_grad()
    loss.backward()
    critic_optimiser.step()

### Copy Weights Over

In [14]:
def update_target_networks():
    for target, local in zip(actor_target.parameters(), actor.parameters()):
        target.data.copy_(tau*local.data + (1.0-tau)*target.data)
        
    for target, local in zip(critic_target.parameters(), critic.parameters()):
        target.data.copy_(tau*local.data + (1.0-tau)*target.data)

## Runner

In [15]:
max_e = 30
max_t = 500
buffer_size = 100000
batch_size = 32
learn_every = 1

gamma = 0.99
tau = 1e-2

In [16]:
mem = ReplayBuffer(buffer_size)

score_log = []
score_window = deque(maxlen=100)

In [None]:
for episode in range(max_e):
    state = env.reset()
    score = 0
    for t in range(max_t):
        action = actor.act(state, add_noise=False)
        next_state, reward, done, _ = env.step(action)
#         mem.add(state, action, reward, next_state, done)
        mem.add(state, action, reward, next_state, done)
        score += reward


        
        if len(mem) > batch_size and t % learn_every == 0:
            learn()

        if done:
            break;
        
        state = next_state
    
    score_log.append(score)
    score_window.append(score)
    
#     print("\rEpsiode: {:.1f}\tWindow Score: {:.4f}\tScore: {:.4f}".format(episode, np.mean(score_window), score), end="")    
#     if (episode % 100 == 0):
#         print("\rEpsiode: {:.1f}\tWindow Score: {:.4f}\tScore: {:.4f}".format(episode, np.mean(score_window), score))
    print("\rEpsiode: {:.1f}\tWindow Score: {:.4f}\tScore: {:.4f}".format(episode, np.mean(score_window), score))



> <ipython-input-18-e7c3897805cb>(7)learn()
-> state_batch = torch.FloatTensor(state_batch)
(Pdb) state_batch
[array([-0.71126355,  0.70292543, -7.86754903]), array([-0.97205774, -0.2347419 ,  0.84490126]), array([-0.9970827 , -0.07632877, -4.10122866]), array([0.58574083, 0.81049841, 5.98689864]), array([-0.16776309,  0.98582734, -2.25997446]), array([ 0.22030381,  0.9754313 , -4.18445993]), array([-0.84392079, -0.5364678 ,  3.77926961]), array([ 0.03187483, -0.99949187,  1.12673023]), array([-0.95132502, -0.3081894 ,  1.45544488]), array([-0.51708513, -0.85593397,  7.66735147]), array([-0.03191645, -0.99949054, -3.93868436]), array([-0.97107518,  0.23877395, -6.09972824]), array([-0.97547275,  0.22012022,  6.41757077]), array([ 0.7450116 , -0.66705151, -1.03615545]), array([0.91424705, 0.40515717, 1.89215589]), array([-0.99091649, -0.13447867, -0.82814506]), array([-0.25477774, -0.96699964,  3.71658265]), array([ 0.71136657, -0.70282117, -0.58231535]), array([ 0.24803471, -0.96875115

In [None]:
plt.plot(score_log)