# D4PG

DDPG with 
- dueling distributional critic network. 
- Prioritized replay 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import math
import operator
import numpy as np
import random
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import imageio
import copy

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

import gym

# Exploration Noise

In [2]:
class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
        self.state = x + dx
        return self.state

In [3]:
class GaussianNoise(object):
    def __init__(self, dimension, num_epochs, mu=0.0, var=1):
        self.mu = mu
        self.var = var
        self.dimension = dimension
        self.epochs = 0
        self.num_epochs = num_epochs
        self.min_epsilon = 0.01 # minimum exploration probability
        self.epsilon = 0.3
        self.decay_rate = 5.0/num_epochs # exponential decay rate for exploration prob
        self.iter = 0

    def sample(self):
        x = self.epsilon * np.random.normal(self.mu, self.var, size=self.dimension)
        return x

    def reset(self):
        self.epsilon = self.min_epsilon + (1.0 - self.min_epsilon)*np.exp(-self.decay_rate*self.iter)

In [44]:
class AdaptiveParamNoise(object):
    def __init__(self, initial_stddev=0.1,desired_action_stddev=0.2,adaptation_coefficient=1.01):

        self.initial_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

        self.current_stddev = initial_stddev

    def adapt(self,distance):
        if distance > self.desired_action_stddev:
            # Decrease stddev
            self.current_stddev /= self.adaptation_coefficient
        else:
            # Increase stddev
            self.current_stddev *= self.adaptation_coefficient

    def get_stats(self):
        stats = {
            'param_noise_stddev':self.current_stddev,
        }
        return stats
    
    def __repr__(self):
        fmt = "AdaptiveNoiseParam(initial_stddev={},desired_action_stddev={},adaptation_coefficient={})"
        return fmt.format(self.initial_stddev,self.desired_action_stddev,self.adaptation_coefficient)

    


# Networks

In [45]:
def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

def hard_update(source,target):
    for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

In [150]:
class Critic(nn.Module):
    def __init__(self,seed,nS,nA,hidden_dims=(256,128)):
        super(Critic,self).__init__()
        self.seed = torch.manual_seed(seed)
        self.nS = nS
        self.nA = nA
        
        self.input_layer = nn.Linear(nS,hidden_dims[0])
        self.input_bn = nn.BatchNorm1d(hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        self.hidden_layers.append(nn.Linear(hidden_dims[0]+nA,hidden_dims[1]))
        for i in range(1,len(hidden_dims)-1):
            hidden_layer = nn.Linear(hidden_dims[i],hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
        # self.fc1 = nn.Linear(hidden_dims[0]+nA,hidden_dims[1])
        # self.fc1_bn = nn.BatchNorm1d(hidden_dims[1])
        self.output_layer = nn.Linear(hidden_dims[-1],1)
        self.reset_parameters()
        
    def reset_parameters(self):
        self.input_layer.weight.data.uniform_(*hidden_init(self.input_layer))
        for hidden_layer in self.hidden_layers:
            hidden_layer.weight.data.uniform_(*hidden_init(hidden_layer))
        self.output_layer.weight.data.uniform_(-3e-3,3e-3)
        
    def forward(self,obs,action):
        # With batchnorm
        # xs = self.input_bn(F.relu(self.input_layer(state)))
        # x = torch.cat((xs,action),dim=1)
        # x = self.fc1_bn(F.relu(self.fc1(x)))
        assert isinstance(obs,torch.Tensor)
        xs = F.relu(self.input_layer(obs))
        x = torch.cat((xs, action), dim=-1)
        for hidden_layer in self.hidden_layers:
            x = F.relu(hidden_layer(x))
        return self.output_layer(x)

In [151]:
class Actor(nn.Module):
    def __init__(self,seed,nS,nA,hidden_dims=(256,128)):
        super(Actor,self).__init__()
        
        self.seed = torch.manual_seed(seed)
        self.nS = nS
        self.nA = nA
        self.std = nn.Parameter(torch.zeros(1, nA))

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.input_layer = nn.Linear(nS,hidden_dims[0])
        self.fc1 = nn.Linear(hidden_dims[0],hidden_dims[1])
        self.output_layer = nn.Linear(hidden_dims[1],nA)
        self.reset_parameters()
        
    def reset_parameters(self):
        self.input_layer.weight.data.uniform_(*hidden_init(self.input_layer))
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.output_layer.weight.data.uniform_(-3e-3,3e-3)
        
    def forward(self,state):
        assert isinstance(state,torch.Tensor)
        x = F.relu(self.input_layer(state))
        x = F.relu(self.fc1(x))
        return torch.tanh(self.output_layer(x))

# Priority Experience Replay (PER)

In [190]:
"""
Priority Tree.
3 tiered tree structure containing
Root node (Object. sum of all lower values)
Intermediate Node (Object. Root as parent, sums a given slice of the priority array)
Priority Array (Array of priorities, length buffer_size)

The number of Intermediate nodes is calculated by the buffer_size / batch_size.

I_episode: current episode of training

Index: is calculated by i_episode % buffer_size. This loops the index after exceeding the buffer_size.

Indicies: (List) of memory/priority entries

intermediate_dict: maps index to intermediate node. Since each Intermediate node is responsible 
for a given slice of the priority array, given a particular index, it will return the Intermediate node
'responsible' for that index.

## Functions:

Add:
Calculates the priority of each TD error -> (abs(TD_error)+epsilon)**alpha
Stores the priority in the Priority_array.
Updates the sum_tree with the new priority

Update_Priorities:
Updates the index with the latest priority of that sample. As priorities can change over training
for a particular experience

Sample:
Splits the current priority_array based on the number of entries, by the batch_size.
Returns the indicies of those samples and the priorities.

Propogate:
Propogates the new priority value up through the tree
"""

class PriorityTree(object):
    def __init__(self,buffer_size,batch_size,alpha,epsilon):
        self.alpha = alpha
        self.epsilon = epsilon
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.batch_indicies = np.arange(0,self.batch_size)

        self.num_intermediate_nodes = math.ceil(buffer_size / batch_size)
        self.current_intermediate_node = 0
        self.root = Node(None)
        self.intermediate_nodes = [Intermediate(self.root,batch_size*x,batch_size*(x+1)) for x in range(self.num_intermediate_nodes)]
        self.priority_array = np.zeros(buffer_size)
        self.intermediate_dict = {}
        for index,node in enumerate(self.intermediate_nodes):
            for key in range((batch_size*(index+1))-batch_size,batch_size*(index+1)):
                self.intermediate_dict[key] = node
        print('Priority Tree: Batch Size {} Buffer size {} Number of intermediate Nodes {}'.format(batch_size,buffer_size,self.num_intermediate_nodes))
        
    def add(self,TD_error,index):
        priority = (abs(TD_error)+self.epsilon)**self.alpha
        self.priority_array[index] = priority
        # Update sum
        propogate(self.intermediate_dict[index],self.priority_array)
    
    def sample(self,index,limit):
        # Sample one experience uniformly from each slice of the priorities
        # if index >= self.buffer_size:
        #     indicies = [random.sample(list(range(sample*self.num_intermediate_nodes,(sample+1)*self.num_intermediate_nodes)),1)[0] for sample in range(self.batch_size)]
        #     # indicies = np.random.sample(np.arange(sample*self.num_intermediate_nodes,(sample+1)*self.num_intermediate_nodes))
        # else:
        spacing = np.linspace(0,limit-self.batch_size,self.batch_size,dtype=np.int)
        random_indicies = np.random.choice(self.batch_indicies,size=self.batch_size)
        indicies = random_indicies + spacing


        # interval = int(index / self.batch_size)
        # indicies = [random.sample(list(range(sample*interval,(sample+1)*interval)),1)[0] for sample in range(self.batch_size)]
#         print('indicies',indicies)
        priorities = self.priority_array[indicies]
        return priorities,indicies
    
    def update_priorities(self,TD_errors,indicies):
#         print('TD_errors',TD_errors)
#         print('TD_errors shape',TD_errors.shape)
        priorities = (np.abs(TD_errors)+self.epsilon)**self.alpha
#         print('priorities shape',priorities.shape)
#         print('indicies shape',len(indicies))
#         print('self.priority_array shape',self.priority_array.shape)
        self.priority_array[indicies] = priorities
        # Update sum
        nodes = [self.intermediate_dict[index] for index in indicies] 
        intermediate_nodes = set(nodes)
        [propogate(node,self.priority_array) for node in intermediate_nodes]
    
class Node(object):
    def __init__(self,parent):
        self.parent = parent
        self.children = []
        self.value = 0
            
    def add_child(self,child):
        self.children.append(child)
    
    def set_value(self,value):
        self.value = value
    
    def sum_children(self):
        return sum([child.value for child in self.children])
            
    def __len__(self):
        return len(self.children)

class Intermediate(Node):
    def __init__(self,parent,start,end):
        self.parent = parent
        self.start = start
        self.end = end
        self.value = 0
        parent.add_child(self)
    
    def sum_leafs(self,arr):
        return np.sum(arr[self.start:self.end])

def propogate(node,arr):
    if node.parent != None:
        node.value = node.sum_leafs(arr)
        propogate(node.parent,arr)
    else:
        node.value = node.sum_children()

In [191]:
"""
Priority Buffer HyperParameters
alpha(priority or w) dictates how biased the sampling should be towards the TD error. 0 < a < 1
beta(IS) informs the importance of the sample update

The paper uses a sum tree to calculate the priority sum in O(log n) time. As such, i've implemented my own version
of the sum_tree which i call priority tree.

We're increasing beta(IS) from 0.5 to 1 over time
alpha(priority) we're holding constant at 0.5
"""

class PriorityReplayBuffer(object):
    def __init__(self,buffer_size,batch_size,seed,alpha=0.5,beta=0.5,beta_end=1,beta_duration=1e+5,epsilon=7e-5,device=None):
        
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta
        self.beta_end = beta_end
        self.beta_duration = beta_duration
        self.beta_increment = (beta_end - beta) / beta_duration
        self.max_w = 0
        self.epsilon = epsilon
        self.TD_sum = 0
        self.index = 0
        if device == None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = device

        self.experience = namedtuple('experience',field_names=['state','action','reward','next_state','done'])
        self.sum_tree = PriorityTree(buffer_size,batch_size,alpha,epsilon)
        self.memory = {}
    
    def add(self,state,action,reward,next_state,done,TD_error):
        e = self.experience(state,action,reward,next_state,done)
        # add memory to memory and add corresponding priority to the priority tree
        self.memory[self.index] = e
        self.sum_tree.add(TD_error,self.index)
        self.index = (self.index + 1) % self.buffer_size 

    def sample(self):
        # We times the error by these weights for the updates
        # Super inefficient to sum everytime. We could implement the tree sum structure. 
        # Or we could sum once on the first sample and then keep track of what we add and lose from the buffer.
        # priority^a over the sum of the priorities^a = likelyhood of the given choice
        # Anneal beta
        self.update_beta()
        # Get the samples and indicies
        priorities,indicies = self.sum_tree.sample(self.index,len(self))
        # Normalize with the sum
        norm_priorities = priorities / self.sum_tree.root.value
        samples = [self.memory[index] for index in indicies]
        # Importance weights
        importances = [(priority * self.buffer_size)**-self.beta for priority in norm_priorities]
        self.max_w = max(self.max_w,max(importances))
        # Normalize importance weights
#         print('importances',importances)
#         print('self.max_w',self.max_w)
        norm_importances = [importance / self.max_w for importance in importances]
#         print('norm_importances',norm_importances)

        states, actions, rewards, next_states, dones = zip(*samples)

        states = torch.stack(states).float().to(self.device)
        actions = torch.stack(actions).float().to(self.device)
        rewards = torch.from_numpy(np.vstack(rewards)).float().to(self.device)
        next_states = torch.stack(next_states).float().to(self.device)
        dones = torch.from_numpy(np.vstack(dones)).float().to(self.device)

        # states = torch.from_numpy(np.vstack([e.state for e in samples if e is not None])).float().to(self.device)
        # actions = torch.from_numpy(np.vstack([e.action for e in samples if e is not None])).float().to(self.device)
        # rewards = torch.from_numpy(np.vstack([e.reward for e in samples if e is not None])).float().to(self.device)
        # next_states = torch.from_numpy(np.vstack([e.next_state for e in samples if e is not None])).float().to(self.device)
        # dones = torch.from_numpy(np.vstack([e.done for e in samples if e is not None]).astype(int)).float().to(self.device)
        
        return (states,actions,rewards,next_states,dones),indicies,norm_importances

    def update_beta(self):
        self.beta += self.beta_increment
        self.beta = min(self.beta,self.beta_end)
    
    def __len__(self):
        return len(self.memory.keys())

In [192]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, device, buffer_size, batch_size):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size: maximum size of buffer
            batch_size: size of each training batch
        """
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"]
            )
        self.device = device

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self, batch_size=64):
        """Randomly sample a batch of experiences from memory."""
        samples = random.sample(self.memory, k=self.batch_size)
        states, actions, rewards, next_states, dones = zip(*samples)

        states = torch.tensor(states).float().to(self.device)
        actions = torch.tensor(actions).float().to(self.device)
        rewards = torch.tensor(rewards).float().to(self.device)
        next_states = torch.tensor(next_states).float().to(self.device)
        dones = torch.tensor(dones).float().to(self.device)

        return (states, actions, rewards, next_states, dones)


    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

# Config

In [199]:
"""
Config file for loading hyperparams
"""

class Config(object):
    def __init__(self,agent):
        if agent == "d4pg":
            self.seed = 99
            self.name = agent
            self.num_agents = 2
            self.QLR = 0.001
            self.ALR = 0.0001
            self.gamma = 0.99
            self.L2 = 0 # 0.1
            self.tau=0.01 # 0.001
            self.noise_decay=0.995
            self.gae_lambda = 0.97
            self.clip_norm = 10
            # Buffer
            self.buffer_size = int(1e4)
            self.min_buffer_size = int(1e3)
            self.batch_size = 256
            # Priority Replay
            self.ALPHA = 0.6 # 0.7 or 0.6
            self.START_BETA = 0.5 # from 0.5-1
            self.END_BETA = 1
            # distributional
            self.N_atoms = 51
            self.v_min = -100
            self.v_max = 100
            self.delta_z = (self.v_min - self.v_max) / (self.N_atoms - 1)
            # pendulum
            self.action_low=-1.0 
            self.action_high=1.0
            self.winning_condition = -200
            # Training
            self.episodes = 4000
            self.tmax = 2000
            self.print_every = 4
            self.SGD_epoch = 1
            self.checkpoint_path = 'model_weights/ddpg.ckpt'
        else:
            raise ValueError('Agent not implemented')

# Agent

In [200]:
class D4PG(object):
    def __init__(self, nS, nA,config):
        self.nS = nS
        self.nA = nA
        self.action_low = config.action_low
        self.action_high = config.action_high
        self.seed = config.seed

        self.clip_norm = config.clip_norm
        self.tau = config.tau
        self.gamma = config.gamma
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.L2 = config.L2
        self.SGD_epoch = config.SGD_epoch
        
        # For distributional
        self.N_atoms = config.N_atoms # 51 for C51
        self.v_max = config.v_max # Max possible score
        self.v_min = config.v_min # Min possible score
        self.delta_z = (self.v_max - self.v_min) / float(self.N_atoms - 1)
        self.atoms = np.linspace(self.v_min,self.v_max,self.N_atoms)
        
        # noise
        self.noise = OUNoise(nA,config.seed)
        self.noise_scale = 1.0
        self.noise_decay = config.noise_decay

        # Priority Replay Buffer
        self.batch_size = config.batch_size
        self.buffer_size = config.buffer_size
        self.alpha = config.ALPHA
        self.beta = self.start_beta = config.START_BETA
        self.end_beta = config.END_BETA

        # actors networks
        self.actor = Actor(self.seed,nS, nA).to(self.device)
        self.actor_target = Actor(self.seed,nS, nA).to(self.device)

        # Param noise
        self.param_noise = AdaptiveParamNoise()
        self.actor_perturbed = Actor(self.seed,nS, nA).to(self.device)

        # critic networks
        self.critic = Critic(self.seed,nS, nA).to(self.device)
        self.critic_target = Critic(self.seed,nS, nA).to(self.device)

        # Copy the weights from local to target
        hard_update(self.critic,self.critic_target)
        hard_update(self.actor,self.actor_target)

        # optimizer
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4, weight_decay=self.L2)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=1e-3, weight_decay=self.L2)

        # replay buffer
        self.PER = PriorityReplayBuffer(self.buffer_size, self.batch_size,self.seed,alpha=self.alpha,device=self.device)

        # reset agent for training
        self.reset_episode()
        self.it = 0

    def save_weights(self,path):
        params = {}
        params['actor'] = self.actor.state_dict()
        params['critic'] = self.critic.state_dict()
        torch.save(params, path)

    def load_weights(self,path):
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic'])

    def reset_episode(self):
        self.noise.reset()

    def ddpg_distance_metric(actions1,actions2):
        """
        Computes distance between actions taken by two different policies
        Expects numpy arrays
        """
        diff = actions1-actions2
        mean_diff = np.mean(np.square(diff),axis=0)
        dist = sqrt(np.mean(mean_diff))
        return dist

    def act(self, state):
        with torch.no_grad():
            action = self.actor(self.tensor(state)).cpu().numpy()
        action += self.noise.sample() * self.noise_scale
        self.actor.train()
        return np.clip(action, self.action_low, self.action_high)

    def act_perturbed(self,state):
        with torch.no_grad():
            action = self.actor_perturbed(self.tensor(state)).cpu().numpy()
        return action

    def perturbed_update(self):
        hard_update(self.actor,self.actor_perturbed)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                pass
            param = params[name]
            random = torch.randn(param.shape).to(self.device)
            param += random * self.param_noise.current_stddev
            

    def evaluate(self,state):
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(self.tensor(state)).cpu().numpy()
        return action

    def step(self, obs, actions, rewards, next_obs, dones):
        # Step noise
        self.noise_scale = max(self.noise_scale * self.noise_decay, 0.01)
        # cast as torch tensors
        next_obs = torch.from_numpy(next_obs).float().to(self.device)
        obs = torch.from_numpy(obs).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
        # Calc TD error
        next_action = self.actor(next_obs)
        next_value = self.critic_target(next_obs,next_action)
        target = rewards + self.gamma * next_value * dones
        local = self.critic(obs,actions)
        TD_error = (target - local).squeeze(0)
        self.PER.add(obs, actions, rewards, next_obs, dones, TD_error)
        for _ in range(self.SGD_epoch):
            samples,indicies,importances = self.PER.sample()
            self.learn(samples,indicies,importances)

    def add_replay_warmup(self,obs,actions,rewards,next_obs,dones):
        next_obs = torch.from_numpy(next_obs).float().to(self.device)
        obs = torch.from_numpy(obs).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
#         dones = torch.from_numpy(dones).float().to(self.device)
#         rewards = torch.from_numpy(rewards).float().to(self.device)
        # Calculate TD_error
        next_action = self.actor(next_obs)
        next_value = self.critic_target(next_obs,next_action)
        target = rewards + self.gamma * next_value * dones
        local = self.critic(obs,actions)
        TD_error = (target - local).squeeze(0)
        self.PER.add(obs,actions,np.max(rewards),next_obs,np.max(dones),TD_error)

    def learn(self,samples,indicies,importances):
        
        states, actions, rewards, next_states, dones = samples

        with torch.no_grad():
              target_actions = self.actor_target(next_states)
        next_values = self.critic_target(next_states,target_actions)
        y_target = rewards + self.gamma * next_values * (1-dones)
        y_current = self.critic(states, actions)
        TD_error = y_current - y_target
        # update critic
        critic_loss = ((torch.tensor(importances).to(self.device)*TD_error)**2).mean()
        self.critic.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic.parameters(),self.clip_norm)
        self.critic_opt.step()

        # update actor
        local_actions = self.actor(states)
        actor_loss = -self.critic(states, local_actions).mean()
        self.actor.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(),self.clip_norm)
        self.actor_opt.step()

        # Update PER
        TD_errors = TD_error.squeeze(1).detach().cpu().numpy()
        self.PER.sum_tree.update_priorities(TD_errors,indicies)

        # soft update networks
        self.soft_update()

    def soft_update(self):
        """Soft update of target network
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(self.tau*param.data+(1-self.tau)*target_param.data)
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(self.tau*param.data+(1-self.tau)*target_param.data)

    def tensor(self, x):
        return torch.from_numpy(x).float().to(self.device)


# Train

In [195]:
def seed_replay_buffer(agent,env,min_buffer_size):
    state = env.reset()
    while len(agent.PER) < min_buffer_size:
        # Random actions between 1 and -1
        action = ((np.random.rand(1)*2)-1)
        next_state,reward,done,_ = env.step(action)
        # reshape
        agent.add_replay_warmup(state,action,reward,next_state,done)
        # Store experience
        if done:
            state = env.reset()
        state = next_state
    print('finished replay warm up')

In [196]:
def train(agent,env,epsilon=1,noise_decay=70,n_episodes=100, tmax=200):
    """Deep Deterministic Policy Gradients.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        Instead of updating target every (int) steps, using 'soft' updating of .1 to gradually merge the networks
        Index: current index for replacing memories in the priority replay buffer
    """
    scores = []
    scores_window = deque(maxlen=100)
    N = OUNoise(agent.nA,123)
    for e in range(1,n_episodes):
        state = env.reset()
        score = 0
        for t in range(1,tmax):
            action = agent.act(state)
            next_state,reward,done,_ = env.step(action)
            # store memory and learn
            agent.step(state,action,reward,next_state,done)
            # Bookkeeping
            state = next_state
            score += reward
        scores.append(score)
        scores_window.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(e, np.mean(scores_window)),end="")
        if e % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(e, np.mean(scores_window)))
        if np.mean(scores_window) == 100.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(e, np.mean(scores_window)))
#             torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break

# Main and params

In [197]:
BUFFER_SIZE = 10000
MIN_BUFFER_SIZE = 200
BATCH_SIZE = 50
ALPHA = 0.6 # 0.7 or 0.6
START_BETA = 0.5 # from 0.5-1
END_BETA = 1
QLR = 0.001
ALR = 0.0001
EPSILON = 1
MIN_EPSILON = 0.01
GAMMA = 0.99
TAU = 0.001
L2 = 0.01
UPDATE_EVERY = 4
CLIP_NORM = 10
V_MAX = 100
V_MIN = -100
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
def main():
    seed = 7
    config = Config('d4pg')
    env = gym.make('Pendulum-v0')
    env.seed(seed)
    nA = env.action_space.shape[0]
    nS = env.observation_space.shape[0]
    print('Observation Space {}, Action Space {}'.format(nS,nA))
    agent = D4PG(nS,nA,config)
    seed_replay_buffer(agent,env,config.min_buffer_size)
    scores = train(agent,env)
    return scores

In [198]:
scores = main()
plt.plot(scores)

Observation Space 3, Action Space 1
Priority Tree: Batch Size 256 Buffer size 10000 Number of intermediate Nodes 40
finished replay warm up
Episode 44	Average Score: -1170.34

KeyboardInterrupt: 