Necessary imports.

In [1]:
from collections import deque
from task import Task
from keras import layers, models, optimizers
from keras import backend
import numpy as np
import copy
import sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Define the Experience Replay Memory buffer:

In [2]:
class ExperienceReplayBuffer:
    def __init__(self, capacity, batch_size):
        self.batch_size = batch_size
        self.mem = deque(maxlen=capacity)
        
    def add_env_reaction(self, env_reaction):
        # St, At, Rt1, Dt, St1.
        self.mem.append(env_reaction)
    
    def sample_batch(self):
        indexes = np.random.choice(a=np.arange(len(self.mem)), size=batch_size, replace=False)
        states = list()
        actions = list()
        rewards = list()
        dones = list()
        next_states = list()
        for index in indexes:
            st, at, rt, dt, st_1 = self.mem[index]
            states.append(st)
            actions.append(at)
            rewards.append(rt)
            dones.append(dt)
            next_states.append(st_1)      
        return np.array(states), np.array(actions), np.array(rewards), np.array(dones), np.array(next_states)


### Actor:
    Define NN for policy approximation and specify loss, backprop with action gradients dL/dA from Critc.

In [3]:
class Actor:
    def __init__(self, state_space, action_space, action_range, action_min, hidden_units, name):
        self.state_space = state_space
        self.action_space = action_space
        self.action_range = action_range
        self.action_min = action_min
        self.name = name
        
        # Neural Network definition.
        
        # Network Architecture.
        input_states = layers.Input(shape=(self.state_space,), dtype=np.float32, name='input_states')
        fc1 = layers.Dense(units=hidden_units, activation='relu', name='fc1')(input_states)
        fc2 = layers.Dense(units=2*hidden_units, activation='relu', name='fc2')(fc1)
        fc3 = layers.Dense(units=hidden_units, activation='relu', name='fc3')(fc2)
        norm_action = layers.Dense(self.action_space, activation='sigmoid', name='norm_action')(fc3)
        
        # Adapt actions for the range in which rotors work.
        actions = layers.Lambda(lambda x: x*self.action_range + action_min, name='actions')(norm_action)
        self.actor_model = models.Model(input=[input_states], output=[actions])
        
        # Define Loss
        input_act_grad = layers.Input(shape=(self.action_space,), dtype=np.float32, name='input_act_grad')
        loss = backend.mean(-input_act_grad*actions)
        
        # Get trainable parameters and define backprop optimization.
        adam_optimizer = optimizers.Adam()
        train_param = adam_optimizer.get_updates(params=self.actor_model.trainable_weights, loss=loss)
        # keras.backend.learning_phase() gives a flag to be passed as input
        # to any Keras function that uses a different behavior at train time and test time.
        self.train_nn = backend.function(inputs=[input_states, input_act_grad, backend.learning_phase()],\
                                         outputs=[], updates=train_param)
        

### Critic:
    Define NN for Action value approximation and specify action gradients dL/dA to pass to Actor.

In [4]:
class Critic:
    def __init__(self, state_space, action_space, hidden_units):
        self.state_space = state_space
        self.action_space = action_space
        self.hidden_units = hidden_units
        
        # Neural Network definition.
        
        # Network Architecture.
        input_states = layers.Input(shape=(self.state_space,), dtype=np.float32, name='input_states')
        fc_states1 = layers.Dense(units=hidden_units, activation='relu')(input_states)
        fc_states2 = layers.Dense(units=2*hidden_units, activation='relu')(fc_states1)
        
        input_actions = layers.Input(shape=(self.action_space,), dtype=np.float32, name='input_actions')
        fc_actions1 = layers.Dense(units=hidden_units, activation='relu')(input_actions)
        fc_actions2 = layers.Dense(units=2*hidden_units, activation='relu')(fc_actions1)
        
        # Advantage function.
        fc_sa1 = layers.Add()([fc_states2, fc_actions2])
        fc_sa2 = layers.Activation('relu')(fc_sa1)
        
        q_values = layers.Dense(units=1, activation='relu', name='q_values')(fc_sa2)
        self.critic_model = models.Model(inputs=[input_states, input_actions], outputs=[q_values])
        
        # Optimizer and Loss.
        adam_optimizer = optimizers.Adam()
        self.critic_model.compile(loss='mean_squared_error', optimizer=adam_optimizer)
        
        # Define function to get action gradients.
        action_gradients = backend.gradients(loss=q_values, variables=[input_actions])
        self.get_action_gradients = backend.function(inputs=[input_states, input_actions, backend.learning_phase()], \
                                                    outputs=action_gradients)

### Ornstein–Uhlenbeck process definition for exploration:

In [5]:
class OUNoise:
    def __init__(self, action_space, mean, sigma, theta):
        self.mean = mean*np.ones(action_space)
        self.sigma = sigma
        self.theta = theta
        self.restart()
        
    def restart(self):
        self.current = copy.copy(self.mean)
        
    def sample(self):
        x = self.current
        dx = self.theta*(self.mean-x) + self.sigma*np.random.randn(len(x))
        self.current = x+dx
        return x+dx

### Deep Deterministic Policy Gradient, DDPG Agent:
    Agent definition following DDPG

In [43]:
class DDPG_Agent:
    def __init__(self, task, noise, memory, rl_param, nn_hidden):
        self.task = task
        self.action_low = self.task.action_low
        self.action_high = self.task.action_high
        self.state_space = self.task.state_size
        self.action_space = self.task.action_size
        
        # Instantiate Actors and Critics.
        self.actor = Actor(self.state_space, self.action_space, self.action_high-self.action_low, self.action_low,\
                          hidden_units=nn_hidden[0], name='actor')
        self.actor_target = Actor(self.state_space, self.action_space, self.action_high-self.action_low, \
                                  self.action_low, hidden_units=nn_hidden[1], name='actor_target')        
        self.critic = Critic(self.state_space, self.action_space, hidden_units=32)
        self.critic_target = Critic(self.state_space, self.action_space, hidden_units=32)
        
        # Set same weights in target.
        self.actor_target.actor_model.set_weights(self.actor.actor_model.get_weights())
        self.critic_target.critic_model.set_weights(self.critic.critic_model.get_weights())
        
        # Noise for exploration.
        self.mean = noise[0]
        self.sigma = noise[1]
        self.theta = noise[2]
        self.ounoise = OUNoise(self.action_space, self.mean, self.sigma, self.theta)
        
        # Experience Replay memory.
        self.capacity = memory[0]
        self.batch_size = memory[1]
        self.er_buffer = ExperienceReplayBuffer(capacity=self.capacity, batch_size=self.batch_size)
        
        # RL parameters.
        self.gamma = rl_param[0]
        self.t = rl_param[1]
        
        # Keeping track of learning.
        self.learning_rewards = list()
        self.total_reward = None
        self.best_reward = -np.inf
        
    def restart_task(self):
        if self.total_reward is not None:
            self.learning_rewards.append(self.total_reward)
            if self.total_reward > self.best_reward: self.best_reward = self.total_reward
        self.total_reward = 0
        self.state = self.task.reset()
        self.ounoise.restart()
        return self.state
        
    def act(self, state):
        action = self.actor.actor_model.predict(np.reshape(state, newshape=(-1, self.state_space)))
        self.step_noise = self.ounoise.sample()
        action = action + self.step_noise
        return action[0]
        
    # Saves expirience into memory and updates actor-critic weights.
    def store_learn(self, state, action, reward, done, next_state):
        
        # Store experience into exp replay memory.
        self.er_buffer.add_env_reaction((state, action, reward, done, next_state))
        
        # Learn if agent has enough experiences.
        if len(self.er_buffer.mem) > self.batch_size:
            self.learn()
        
        self.total_reward += reward
        # Update to the current state of the enviroment.
        self.state = next_state
     
    def soft_update(self):
        actor_current = np.array(self.actor.actor_model.get_weights())
        critic_current = np.array(self.critic.critic_model.get_weights())
        actor_target = np.array(self.actor_target.actor_model.get_weights())
        critic_target = np.array(self.critic_target.critic_model.get_weights())
        
        self.actor_target.actor_model.set_weights(actor_target*(1-self.t) + self.t*actor_current)
        self.critic_target.critic_model.set_weights(critic_target*(1-self.t) + self.t*critic_current)
    
    # Learn step of the agent, update weights of actor-critic and actor-critic target NN.
    def learn(self):
        states, actions, rewards, dones, next_states = self.er_buffer.sample_batch()
        
        # Get action for deterministic policy.
        next_actions = self.actor_target.actor_model.predict_on_batch(next_states)
        next_q_values = self.critic_target.critic_model.predict_on_batch([next_states, next_actions])
        next_q_values = next_q_values.reshape((self.batch_size,))
        
        # Need to handle the done case.
        targets = rewards + self.gamma*next_q_values*(1-dones)
        self.critic.critic_model.train_on_batch(x=[states, actions],y=[targets])
        
        
        # Learning Phase = 0 (Test), we just want the gradient, no update on weights.
        action_gradients = self.critic.get_action_gradients([states, actions, 0])
        self.actor.train_nn([states, action_gradients[0], 1])
        
        
        # Do soft update on weigths.
        self.soft_update()
        
        

Function to track progress per episode:

In [44]:
import csv

def track_quad(task, labels, action):
    results = {x : [] for x in labels}
    line = [task.sim.time] + list(task.sim.pose) + list(task.sim.v) + list(task.sim.angular_v) + list(action)
    for ii in range(len(labels)):
            results[labels[ii]].append(line[ii])
    return line


### Run agent on the enviroment:

In [45]:
# NN sizes
actor_hidden = 32
critic_hidden = 32
nn_hidden = [actor_hidden, critic_hidden]

# Noise for exploration.
mean = 0
sigma = 0.15
theta = 0.2
noise = [mean, sigma, theta]

# RL parameters.
gamma = 0.99
t = 0.01
rl_param = [gamma, t]

# Experience Replay memory.
capacity = 100000
batch_size = 2
memory = [capacity, batch_size]

# Task parameters and instance.
runtime = 10.                                     # time limit of the episode
init_pose = np.array([0., 0., 10., 0., 0., 0.])   # initial pose
init_velocities = np.array([0., 0., 0.])          # initial velocities
init_angle_velocities = np.array([0., 0., 0.])    # initial angle velocities
target_pos = np.array([0., 0., 10.])              # target position
task = Task(init_pose, init_velocities, init_angle_velocities, runtime, target_pos)

# Labels for reporting
labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity', \
          'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity', \
          'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4']

# Pending items.
# 4. Need to add batch norm to function approximation NN.
quadcopter_agent = DDPG_Agent(task, noise, memory, rl_param, nn_hidden)



In [46]:
num_episodes = 1000

for episode in range(1, num_episodes+1):
    # Run the simulation, and save the results.
    file_output = 'data/data_%s.txt' % episode
    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)
        
        state = quadcopter_agent.restart_task()
        done = False
        while not done:
            
            action = quadcopter_agent.act(state)
            next_state, reward, done = task.step(action)
            quadcopter_agent.store_learn(state, action, reward, done, next_state)
            
            # Keep track of position
            line = track_quad(task, labels, action)
            writer.writerow(line)
            
        print("Episode = {:4d}, score = {:7.3f} (best = {:7.3f}), noise_scale = {}".format(
                episode, quadcopter_agent.total_reward, quadcopter_agent.best_reward, \
                quadcopter_agent.step_noise))

Episode =    1, score =  -6.707 (best =    -inf), noise_scale = [ 0.57218848 -0.30071559 -0.25493322  0.20607662]
Episode =    2, score = -40.497 (best =  -6.707), noise_scale = [-0.03058389 -0.15538878  0.52280688 -0.17516529]
Episode =    3, score = -41.656 (best =  -6.707), noise_scale = [ 0.179579   -0.04311952 -0.07087218  0.28400132]
Episode =    4, score =  -3.401 (best =  -6.707), noise_scale = [0.19376464 0.16646904 0.11506206 0.17138674]
Episode =    5, score = -32.267 (best =  -3.401), noise_scale = [ 0.24125853 -0.05042658  0.28779595 -0.26054911]
Episode =    6, score = -24748.153 (best =  -3.401), noise_scale = [-0.05886992 -0.1783156  -0.01012092  0.51289482]
Episode =    7, score = -6484.105 (best =  -3.401), noise_scale = [ 0.24843546 -0.5075559   0.17390145 -0.24565065]
Episode =    8, score = -7496.133 (best =  -3.401), noise_scale = [ 0.22349342 -0.44664893 -0.04033971  0.02941326]
Episode =    9, score = -10102.881 (best =  -3.401), noise_scale = [ 0.05569914  0.09

Episode =   72, score = -266.584 (best =  -3.401), noise_scale = [ 0.08363624 -0.41039537 -0.1204606   0.10975946]
Episode =   73, score = -263.375 (best =  -3.401), noise_scale = [ 0.13131514  0.13140364  0.21548108 -0.11508562]
Episode =   74, score = -266.141 (best =  -3.401), noise_scale = [0.42089741 0.15292017 0.13855263 0.00239757]
Episode =   75, score = -265.201 (best =  -3.401), noise_scale = [-0.24811593 -0.04060153 -0.16490896  0.3215142 ]
Episode =   76, score = -266.572 (best =  -3.401), noise_scale = [-0.17338343 -0.19502063  0.2149294  -0.10113775]
Episode =   77, score = -134.023 (best =  -3.401), noise_scale = [-0.2976796  -0.13236891  0.25698141  0.07675824]
Episode =   78, score = -48.239 (best =  -3.401), noise_scale = [-0.00817678  0.16299522  0.19309731 -0.06619984]
Episode =   79, score = -46.826 (best =  -3.401), noise_scale = [-0.27358113 -0.09790952 -0.02449017 -0.09874566]
Episode =   80, score = -46.721 (best =  -3.401), noise_scale = [ 0.20847308 -0.438495

Episode =  145, score = -45.823 (best =  -3.401), noise_scale = [-0.08138716 -0.0795359  -0.1288471  -0.01950319]
Episode =  146, score = -46.599 (best =  -3.401), noise_scale = [ 0.24778913 -0.21486705 -0.15033509 -0.36732246]
Episode =  147, score = -46.877 (best =  -3.401), noise_scale = [-0.17788745  0.03844577 -0.34436892 -0.11947922]
Episode =  148, score = -46.507 (best =  -3.401), noise_scale = [-0.45484037 -0.33038769  0.19064503 -0.22650288]
Episode =  149, score = -46.340 (best =  -3.401), noise_scale = [-0.12313762 -0.22322866  0.04612507  0.03362473]
Episode =  150, score = -46.542 (best =  -3.401), noise_scale = [-0.10517335 -0.13404747  0.05422315  0.01183274]
Episode =  151, score = -46.436 (best =  -3.401), noise_scale = [-0.00071024 -0.03473946  0.5469951   0.01037256]
Episode =  152, score = -46.118 (best =  -3.401), noise_scale = [ 0.23112859 -0.28848949  0.40072833  0.38640325]
Episode =  153, score = -46.406 (best =  -3.401), noise_scale = [ 0.23498482 -0.00253008

Episode =  217, score = -46.782 (best =  -3.401), noise_scale = [-0.30363194 -0.17108686  0.00996922 -0.04863757]
Episode =  218, score = -46.935 (best =  -3.401), noise_scale = [ 0.25763898  0.0873834  -0.47063058  0.42785264]
Episode =  219, score = -46.593 (best =  -3.401), noise_scale = [ 0.18567149  0.13908986  0.04803966 -0.19139078]
Episode =  220, score = -46.085 (best =  -3.401), noise_scale = [-0.37606569  0.1487139  -0.33136476 -0.03260383]
Episode =  221, score = -46.154 (best =  -3.401), noise_scale = [-0.02646895  0.16523486 -0.32823413 -0.00462183]
Episode =  222, score = -45.875 (best =  -3.401), noise_scale = [0.02644709 0.2961712  0.27368558 0.21555608]
Episode =  223, score = -46.951 (best =  -3.401), noise_scale = [-0.19137493 -0.26686747  0.05287605  0.22064256]
Episode =  224, score = -46.268 (best =  -3.401), noise_scale = [ 0.01605157  0.18378045  0.00814203 -0.07125185]
Episode =  225, score = -46.564 (best =  -3.401), noise_scale = [ 0.07163807  0.17204893 -0.

Episode =  290, score = -46.890 (best =  -3.401), noise_scale = [ 0.19294639 -0.05345725 -0.05556429  0.17028758]
Episode =  291, score = -46.851 (best =  -3.401), noise_scale = [-0.17440249 -0.51702179 -0.00440398 -0.14566506]
Episode =  292, score = -46.460 (best =  -3.401), noise_scale = [ 0.06473419  0.10980141 -0.08804646  0.46468486]
Episode =  293, score = -46.176 (best =  -3.401), noise_scale = [-0.22437943 -0.29053847 -0.07648037  0.08222446]
Episode =  294, score = -46.296 (best =  -3.401), noise_scale = [ 0.51019276  0.42323724 -0.11154981 -0.42565468]
Episode =  295, score = -46.079 (best =  -3.401), noise_scale = [-0.21170308  0.14768345  0.45274286  0.3502701 ]
Episode =  296, score = -46.409 (best =  -3.401), noise_scale = [ 0.43397925 -0.28932996 -0.22019931 -0.52176777]
Episode =  297, score = -46.022 (best =  -3.401), noise_scale = [-0.26118784 -0.18609108  0.12741422 -0.34759185]
Episode =  298, score = -45.776 (best =  -3.401), noise_scale = [-0.10742422  0.17810101

Episode =  362, score = -47.272 (best =  -3.401), noise_scale = [ 0.1094988   0.30847525 -0.03662165 -0.16466733]
Episode =  363, score = -46.561 (best =  -3.401), noise_scale = [-0.21810667  0.33944186  0.21666406  0.21177724]
Episode =  364, score = -46.835 (best =  -3.401), noise_scale = [ 0.67657873 -0.22415647 -0.41257017 -0.25534668]
Episode =  365, score = -46.373 (best =  -3.401), noise_scale = [-0.29319292 -0.08658372  0.1815119  -0.08112261]
Episode =  366, score = -46.888 (best =  -3.401), noise_scale = [ 0.14545717  0.19162973 -0.37360617  0.29288572]
Episode =  367, score = -46.993 (best =  -3.401), noise_scale = [-0.03165772 -0.03372026  0.16345306 -0.27273996]
Episode =  368, score = -46.525 (best =  -3.401), noise_scale = [ 0.06572101  0.451526    0.36758888 -0.01240844]
Episode =  369, score = -46.771 (best =  -3.401), noise_scale = [-0.0055402   0.17010238  0.04321485  0.23388787]
Episode =  370, score = -47.321 (best =  -3.401), noise_scale = [-0.42873989 -0.24960672

Episode =  434, score = -46.494 (best =  -3.401), noise_scale = [-0.21569048 -0.21050136  0.07612448  0.05309118]
Episode =  435, score = -46.985 (best =  -3.401), noise_scale = [-0.00279716 -0.03887086  0.02026527 -0.29716706]
Episode =  436, score = -46.430 (best =  -3.401), noise_scale = [-0.22525555  0.42436424 -0.05355555 -0.10103749]
Episode =  437, score = -46.418 (best =  -3.401), noise_scale = [ 0.12042419  0.23539998 -0.12892263  0.26127616]
Episode =  438, score = -47.347 (best =  -3.401), noise_scale = [ 0.11224245  0.09795505 -0.01531577 -0.09330489]
Episode =  439, score = -46.013 (best =  -3.401), noise_scale = [-0.20375855  0.14179821 -0.25170849 -0.37308922]
Episode =  440, score = -46.622 (best =  -3.401), noise_scale = [0.49281895 0.46250238 0.14541834 0.19169343]
Episode =  441, score = -46.510 (best =  -3.401), noise_scale = [ 0.39988703 -0.01254687 -0.34517383 -0.12689056]
Episode =  442, score = -45.660 (best =  -3.401), noise_scale = [-0.28267624  0.09030594 -0.

Episode =  507, score = -46.916 (best =  -3.401), noise_scale = [-0.25521525 -0.31502734 -0.10032422 -0.0969412 ]
Episode =  508, score = -47.195 (best =  -3.401), noise_scale = [-0.03577635 -0.16264973 -0.14110845  0.02320233]
Episode =  509, score = -45.661 (best =  -3.401), noise_scale = [-0.11008551 -0.09254855  0.05855043  0.23154286]
Episode =  510, score = -46.264 (best =  -3.401), noise_scale = [ 0.05480788 -0.07934011  0.2710423  -0.12760646]
Episode =  511, score = -46.003 (best =  -3.401), noise_scale = [ 0.04782667 -0.23366773  0.17237565  0.07326109]
Episode =  512, score = -45.445 (best =  -3.401), noise_scale = [0.06848313 0.14823377 0.01208674 0.13161521]
Episode =  513, score = -46.673 (best =  -3.401), noise_scale = [-0.34972092 -0.08717772 -0.39223485  0.22836469]
Episode =  514, score = -46.208 (best =  -3.401), noise_scale = [-0.1987886   0.21116778  0.1732436   0.18047852]
Episode =  515, score = -46.590 (best =  -3.401), noise_scale = [-0.19382874 -0.14324873 -0.

Episode =  580, score = -47.203 (best =  -3.401), noise_scale = [-0.14282795  0.24167321 -0.17972514 -0.35294156]
Episode =  581, score = -46.769 (best =  -3.401), noise_scale = [-0.14924552  0.15260205  0.17201536 -0.26675451]
Episode =  582, score = -46.496 (best =  -3.401), noise_scale = [-0.05293026 -0.18545552  0.22849991 -0.09194553]
Episode =  583, score = -46.787 (best =  -3.401), noise_scale = [-0.11028737  0.10927786  0.35107585  0.03767448]
Episode =  584, score = -47.169 (best =  -3.401), noise_scale = [-0.08098884  0.22795766 -0.07757924  0.09161984]
Episode =  585, score = -46.473 (best =  -3.401), noise_scale = [0.18409511 0.03808227 0.07804847 0.13776134]
Episode =  586, score = -47.149 (best =  -3.401), noise_scale = [-0.25620932  0.16840353 -0.05764615 -0.0079603 ]
Episode =  587, score = -46.989 (best =  -3.401), noise_scale = [-0.02874747  0.32745415 -0.24144655 -0.21600463]
Episode =  588, score = -47.030 (best =  -3.401), noise_scale = [-0.09130738 -0.2868248  -0.

Episode =  653, score = -46.669 (best =  -3.401), noise_scale = [-0.04263287 -0.45935644 -0.11503287 -0.09026119]
Episode =  654, score = -45.583 (best =  -3.401), noise_scale = [ 0.06573119  0.01044049 -0.32237448  0.11574766]
Episode =  655, score = -47.002 (best =  -3.401), noise_scale = [ 0.17381753 -0.0515457   0.08631973 -0.0345759 ]
Episode =  656, score = -46.590 (best =  -3.401), noise_scale = [-0.70443363  0.02297602 -0.04533078 -0.03641372]
Episode =  657, score = -46.273 (best =  -3.401), noise_scale = [ 0.43659072 -0.16087181 -0.11307904  0.1640398 ]
Episode =  658, score = -46.144 (best =  -3.401), noise_scale = [-0.3022385   0.02609838  0.12505658  0.12802466]
Episode =  659, score = -45.797 (best =  -3.401), noise_scale = [ 0.17403443 -0.25942556  0.33536629 -0.18653629]
Episode =  660, score = -46.255 (best =  -3.401), noise_scale = [ 0.06237331 -0.03177903 -0.0863529   0.20406581]
Episode =  661, score = -46.983 (best =  -3.401), noise_scale = [ 0.36788191 -0.24764514

Episode =  726, score = -45.674 (best =  -3.401), noise_scale = [-0.23442546  0.19801705 -0.21090743  0.09834547]
Episode =  727, score = -46.176 (best =  -3.401), noise_scale = [ 0.3795115   0.1205032   0.32633976 -0.11327641]
Episode =  728, score = -46.061 (best =  -3.401), noise_scale = [-0.17336438  0.07303566 -0.10144428  0.5633651 ]
Episode =  729, score = -46.451 (best =  -3.401), noise_scale = [-0.32427015 -0.51878727 -0.02415215 -0.46280437]
Episode =  730, score = -46.459 (best =  -3.401), noise_scale = [-0.14793468 -0.04071943  0.01967661 -0.15920907]
Episode =  731, score = -46.340 (best =  -3.401), noise_scale = [-0.30344236 -0.11513239  0.31519867 -0.13895053]
Episode =  732, score = -47.298 (best =  -3.401), noise_scale = [-0.01558715 -0.09153871 -0.18815082  0.08508518]
Episode =  733, score = -46.195 (best =  -3.401), noise_scale = [-0.27861884 -0.17497323 -0.35824136  0.60840678]
Episode =  734, score = -46.045 (best =  -3.401), noise_scale = [-0.26567256 -0.23594748

Episode =  799, score = -47.364 (best =  -3.401), noise_scale = [-0.37168707 -0.27437318  0.1719992  -0.04691887]
Episode =  800, score = -46.235 (best =  -3.401), noise_scale = [ 0.08541664  0.09867923  0.23922981 -0.07383126]
Episode =  801, score = -47.133 (best =  -3.401), noise_scale = [-0.12048961 -0.16336711 -0.02024692  0.30056619]
Episode =  802, score = -46.421 (best =  -3.401), noise_scale = [ 0.05402137  0.13857769 -0.08242957  0.17390846]
Episode =  803, score = -45.405 (best =  -3.401), noise_scale = [ 0.19611886  0.60917566 -0.18162572  0.31751086]
Episode =  804, score = -46.837 (best =  -3.401), noise_scale = [ 0.18462627  0.14145363  0.13296026 -0.10605493]
Episode =  805, score = -46.548 (best =  -3.401), noise_scale = [ 0.12247347  0.08560132 -0.21965163 -0.09000458]
Episode =  806, score = -46.218 (best =  -3.401), noise_scale = [-0.08847531  0.10952081 -0.02901639  0.18937272]
Episode =  807, score = -46.216 (best =  -3.401), noise_scale = [ 0.29523869 -0.20314313

Episode =  871, score = -46.935 (best =  -3.401), noise_scale = [-0.29103167 -0.32551596 -0.03964336 -0.35072447]
Episode =  872, score = -47.164 (best =  -3.401), noise_scale = [ 0.21371202 -0.02554482  0.21334709  0.32861659]
Episode =  873, score = -46.469 (best =  -3.401), noise_scale = [-0.11868922  0.06845814 -0.23957989  0.28347642]
Episode =  874, score = -47.556 (best =  -3.401), noise_scale = [ 0.01853665 -0.25812728 -0.20464163  0.14526669]
Episode =  875, score = -46.454 (best =  -3.401), noise_scale = [-0.2916837   0.05791072 -0.12386212 -0.13405895]
Episode =  876, score = -46.620 (best =  -3.401), noise_scale = [ 0.0238649   0.2505783   0.0718941  -0.07984021]
Episode =  877, score = -46.615 (best =  -3.401), noise_scale = [ 0.08703825  0.50933819 -0.28840849  0.09975157]
Episode =  878, score = -45.966 (best =  -3.401), noise_scale = [0.106337   0.23687976 0.14880691 0.33540422]
Episode =  879, score = -46.800 (best =  -3.401), noise_scale = [-0.05199452  0.05705788  0.

Episode =  944, score = -46.343 (best =  -3.401), noise_scale = [ 0.13192624 -0.66090939  0.21713629 -0.07788855]
Episode =  945, score = -47.205 (best =  -3.401), noise_scale = [ 0.10665169 -0.08435981 -0.05621648 -0.04643042]
Episode =  946, score = -47.058 (best =  -3.401), noise_scale = [-0.06339874  0.08986498  0.05773845 -0.01102614]
Episode =  947, score = -47.623 (best =  -3.401), noise_scale = [-0.10242254 -0.01233696  0.01768375  0.01736207]
Episode =  948, score = -46.740 (best =  -3.401), noise_scale = [0.64046547 0.21875605 0.14788341 0.04912434]
Episode =  949, score = -46.773 (best =  -3.401), noise_scale = [-0.25497236  0.19159703 -0.56345405  0.0478023 ]
Episode =  950, score = -47.305 (best =  -3.401), noise_scale = [ 0.03445324  0.22943812 -0.21753749 -0.1835514 ]
Episode =  951, score = -46.798 (best =  -3.401), noise_scale = [ 0.16830526  0.13557951 -0.01922925 -0.06955554]
Episode =  952, score = -45.779 (best =  -3.401), noise_scale = [-0.08419057  0.38642332  0.