In [None]:
#=======================================#
# Yes, this notebook is over-commented. #
#=======================================#

In [1]:
# Make notebook span entire screen, horizontally.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import gym
import numpy as np
import tensorflow as tf

In [13]:
#===================#
# Utility functions #
#===================#

def process_rewards(rewards, decay, norm=True):
        discounted = np.zeros_like(rewards)
        running_reward = 0
        
        for idx in reversed(range(len(rewards))):
            running_reward += rewards[idx]
            running_reward *= decay
            discounted[idx] = running_reward
            
        if norm:
            discounted -= np.mean(discounted)
            if np.std(discounted) != 0:
                discounted /= np.std(discounted)

        return discounted.tolist()

In [152]:
class PolicyAgent(object):
    def __init__(self, sess):
        self.num_actions = 4
        self._build()
        self.sess = sess
        
    def _build(self):
        self.actions      = tf.placeholder(tf.int32, (None, 1))
        self.columns      = tf.placeholder(tf.int32, (None, 1))
        self.e_encr       = tf.placeholder(tf.float32)
        self.e_dscr       = tf.placeholder(tf.float32)
        self.l_rate       = tf.placeholder(tf.float32)
        self.observations = tf.placeholder(tf.float32, (None, 8))
        self.target       = tf.placeholder(tf.float32, (None, 1))
        self.training     = tf.placeholder(tf.bool)
        
        with tf.variable_scope('actor-hidden'):
            h1    = tf.layers.dense(self.observations, 128, 
                                    activation=tf.nn.relu, 
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(), 
                                    name='h1')
            
            drop1 = tf.layers.dropout(h1, training=self.training, name='drop1')
            
            h2    = tf.layers.dense(drop1, 64,
                                    activation=tf.nn.relu, 
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(), 
                                    name='h2')
            
            drop2 = tf.layers.dropout(h2, training=self.training, name='drop2')
            
            h3    = tf.layers.dense(drop2, 64,
                                    activation=tf.nn.relu, 
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(), 
                                    name='h3')
            
            drop3 = tf.layers.dropout(h3, training=self.training, name='dropout')
        
            self.out = tf.layers.dense(drop3, self.num_actions,
                                       kernel_initializer=tf.random_normal_initializer(), 
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=0.1),
                                       name='out')
        
        # Compute probabilities associated with each action.
        self.probabilities = tf.clip_by_value(tf.nn.softmax(self.out), 1e-10, 1.0)
        
        # Compute entropy based on action probabilities.
        self.entropy = -tf.reduce_sum(self.probabilities * tf.log(self.probabilities), 1, name="entropy")
        
        # Compute losses of action probabilities associated with each observation in a single batch.
        indices = tf.concat(values=[self.columns, self.actions], axis=1)
        self.picked_action_prob = tf.gather_nd(self.probabilities, indices)
        min_max_punishment = tf.reduce_max(self.probabilities,axis=1) - tf.reduce_min(self.probabilities,axis=1)
        min_max_weight = tf.reduce_max(self.out, axis=1) - tf.reduce_min(self.out, axis=1)
        self.losses = -tf.log(self.picked_action_prob) * self.target - self.entropy * self.e_encr \
                        + min_max_punishment * min_max_weight*10
        #self.losses = -tf.log(self.picked_action_prob) * self.target + tf.losses.get_regularization_loss()
        
        # Compute batch loss.
        self.loss = tf.reduce_mean(self.losses)
        
        # Set optimizer.
        self.train_op = tf.train.RMSPropOptimizer(self.l_rate).minimize(self.loss)
    
    # NOTE: computing `out` from `self.out` is not necessary -- just for debugging
    def choose_action(self, obs, verbose=False):
        # Compute probabilities associated with each action and output layer node values.
        out, probs, ent = self.sess.run([self.out, self.probabilities, self.entropy], feed_dict={
            self.observations: np.array(obs).reshape(-1, 8),
            self.training:     False
        })
        
        if verbose: print(probs, out, ent)
            
        # Choose action based on computed probabilities.
        return np.random.choice(range(probs.shape[1]), p=probs.ravel())
    
    def train(self, act, obs, target, l_rate, e_encr, e_dscr):
        length = np.array(act).reshape(-1, 1).shape[0]
        
        inp = (self.train_op, self.loss, self.probabilities, self.entropy, self.out)
        
        _, *results = self.sess.run(inp, feed_dict={
            self.actions:      np.array(act).reshape(-1, 1),
            self.columns:      np.arange(length).reshape(-1, 1),
            self.e_encr:       e_encr,
            self.e_dscr:       e_dscr,
            self.l_rate:       l_rate,
            self.observations: np.array(obs).reshape(-1, 8),
            self.target:       np.array(target).reshape(-1, 1),
            self.training:     True
        })
        
        # print('-' * 32)
        # print('\n'.join(results))
        # print('-' * 32)

        return results[0]


In [153]:
class Critic(object):
    def __init__(self, sess):
        self.sess = sess
        self._build()
        
    def _build(self):
        self.l_rate       = tf.placeholder(tf.float32)
        self.observations = tf.placeholder(tf.float32, (None, 8))
        self.target       = tf.placeholder(tf.float32, (None, 1))
        self.training     = tf.placeholder(tf.bool)
        
        with tf.variable_scope('critic-hidden'):
            h1    = tf.layers.dense(self.observations, 128,
                                    activation=tf.nn.relu,
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name='h1')
            
            drop1 = tf.layers.dropout(h1, name='drop1', training=self.training)
            
            h2    = tf.layers.dense(drop1, 64,
                                    activation=tf.nn.relu,
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name='h2')
            
            drop2 = tf.layers.dropout(h2, name='drop2', training=self.training)
            
            out   = tf.layers.dense(drop2, 1,
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name='out')
            
        self.value_estimate = tf.squeeze(out) # [[num]] -> num
        self.losses = tf.squared_difference(self.value_estimate, self.target)
        self.loss = tf.reduce_mean(self.losses)
        self.train_op = tf.train.AdamOptimizer(self.l_rate).minimize(self.loss)
        
    def predict(self, obs):
        return sess.run(self.value_estimate, feed_dict={
            self.observations: np.array(obs).reshape(-1, 8),
            self.training:     False
        })
    
    def update(self, obs, target, l_rate=0.01):
        inp = (self.train_op, self.loss)
        
        _, *results = self.sess.run(inp, feed_dict={
            self.l_rate:       l_rate,
            self.observations: np.array(obs).reshape(-1, 8),
            self.target:       np.array(target).reshape(-1, 1),
            self.training:     True
        })
        
        return results[0]
    

In [154]:
class ACHandler(object):
    def __init__(self, actor, critic, env, sess, path='./.model.ckpt'):
        self.actor = actor
        self.critic = critic
        self.env = env
        self.sess = sess
    
        self.saver = tf.train.Saver()
        self.path = path

    def init_vars(self):
        self.sess.run(tf.global_variables_initializer())

    def run(self, train_func, rollout=100, a_rate=0.001, c_rate=0.005, decay=0.99, render=False, e_encr=0.007, e_dscr=0.01, **kwargs):
        assert isinstance(train_func, str) and train_func.startswith('train_'), \
               'invalid train_func name specified'
        getattr(self, train_func)(self.rollout(rollout, render, decay), a_rate, c_rate, e_encr, e_dscr, **kwargs)
        
        # Close the display window
        if render: self.env.close()
            
    def run_constant_training(self, num_episodes, a_rate=0.001, c_rate=0.005, decay=0.99, e_encr=0.007, e_dscr=0.01, render=False, verbose=False):
        """
        Runs training and updates both networks during every time step
        """
        
        for _ in range(num_episodes):
            obs_curr = env.reset()
            done = False

            if verbose:
                rewards = 0
                a_episode_loss = []
                c_episode_loss = []
            
            while not done:

                if render: self.env.render()
                action = self.actor.choose_action(obs_curr)

                # Take action in environment.
                next_obs, reward, done, _ = self.env.step(action)

                next_estimate = self.critic.predict(next_obs)
                td_target = reward + decay * next_estimate
                td_error = td_target - self.critic.predict(obs_curr)
                c_loss = self.critic.update(obs_curr, td_target, c_rate)
                a_loss = self.actor.train(action, obs_curr, td_error, a_rate, e_encr, e_dscr)
                
                if verbose:
                    rewards += reward
                    a_episode_loss.append(a_loss)
                    c_episode_loss.append(c_loss)

                obs_curr = next_obs
                
            if verbose:
                print('Actor Loss: {0:5f}'.format(np.mean(a_episode_loss)), end='; ')
                print('Critic Loss: {0:5f}'.format(np.mean(c_episode_loss)), end='; ')
                print('Reward: {0:5f}'.format(rewards))
                
    def play(self, verbose=False):
        """
        Runs a single instance of the game without training or storing training information
        Always displays the game and closes the window afterward
        """
        obs_curr = self.env.reset()
        done = False
        
        while not done:
            self.env.render()

            # Agent chooses action based on difference frame.
            action = self.actor.choose_action(obs_curr, verbose=verbose)

            # Take action in environment.
            obs_curr, reward, done, _ = self.env.step(action)
            
        env.close()
        
    def train_rsample(self, batch, a_rate, c_rate, e_encr, e_dscr, num_epochs=50, mini_batch_size=100):
        """
        Performs random mini-batch training on both networks from a given
          set of batch information
        """
        for x in range(num_epochs):
            indices = np.random.randint(len(batch['obs']), size=mini_batch_size)
            loss = self.actor.train([batch['act'][i] for i in indices],
                             [batch['obs'][i] for i in indices],
                             [batch['advantage'][i] for i in indices],
                             a_rate,
                             e_encr,
                             e_dscr)
            self.critic.update([batch['obs'][i] for i in indices],
                               [batch['td_target'][i] for i in indices],
                               c_rate)
 
    def train_all(self, batch, a_rate, c_rate, e_encr, e_dscr, verbose=False):
        """
        Trains both networks on all peices of inromation in the batch
        """
        a_loss = self.actor.train(batch['act'],
                         batch['obs'],
                         batch['advantage'],
                         a_rate,
                         e_encr,
                         e_dscr)
        c_loss = self.critic.update(batch['obs'],
                           batch['td_target'],
                           c_rate)
        
        if verbose:
            print('Actor Loss: {:14.7f}'.format(a_loss), end='; ')
            print('Critic Loss: {:14.7f}'.format(c_loss), end='; ')
            print('Batch Reward: {:14.7f}'.format(batch['avg_rew']))
    
    def compute_advantage(self, obs, rewards, decay):
        disc_rewards = process_rewards(rewards, decay, norm=False)

        policy_target = np.zeros_like(disc_rewards)
        value_target = np.zeros_like(disc_rewards)
        running_reward = 0

        for idx in range(len(disc_rewards)):
            estimate = self.critic.predict(obs[idx])
            td_target = disc_rewards[idx]
            td_error = td_target - estimate

            
            policy_target[idx] = td_error
            value_target[idx] = td_target
        
        return policy_target.tolist(), value_target.tolist()    

    def save(self):
        self.saver.save(self.sess, self.path)
        
    def load(self):
        self.saver.restore(self.sess, self.path)
            
    def rollout(self, count, render, decay):
        batch = {'act': [], 'obs': [], 'rew': [], 'advantage':[], 'td_target':[]}
        rewards = 0
        
        for episode in range(count):
            # Stores all the stuff
            history = {'act': [], 'obs': [], 'rew': [], 'advantage':[], 'td_target':[]}
            
            obs_curr = env.reset()
            done = False

            while not done:
                
                if render: self.env.render()
                # Agent chooses action based on difference frame.
                action = self.actor.choose_action(obs_curr, False)
        
                # Take action in environment.
                next_obs, reward, done, _ = self.env.step(action)
                
                history['act'].append(action)
                history['obs'].append(obs_curr)
                history['rew'].append(reward)
                
                rewards += reward
                
                obs_curr = next_obs

            # Process rewards per episode.
            history['advantage'], history['td_target'] = self.compute_advantage(history['obs'] + obs_curr, history['rew'], decay)
            
            # Add episode to batch.
            for key in batch:
                batch[key].extend(history[key])
                
        batch['avg_rew'] = rewards / count
        
        return batch

        

In [155]:
tf.reset_default_graph()
env = gym.make('LunarLander-v2') # RGB observation space

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [156]:
sess = tf.Session()
actor = PolicyAgent(sess)
critic = Critic(sess)

In [157]:
handler = ACHandler(actor, critic, env, sess, '.models/l1.cpt')

In [158]:
handler.init_vars()

In [None]:
while(True):
    for _ in range(100):
        handler.run('train_rsample', render=True)
        print('-',end='')
    print('\nCompleted 100 Training Iterations\n')
    handler.save()

In [159]:
while(True):
    for _ in range(100):
        handler.run('train_all', rollout=100, a_rate=0.001, c_rate=0.012, decay=0.98, render=False, verbose=True)
        handler.play()
    print('Completed 100 Training Iterations\n')
    handler.save()

Actor Loss:     14.2725716; Critic Loss:   6674.8085938; Batch Reward:   -182.7051531
Actor Loss:      7.7797799; Critic Loss:   4844.3916016; Batch Reward:   -164.1492123
Actor Loss:      2.8584499; Critic Loss:   5623.4047852; Batch Reward:   -177.5454094
Actor Loss:     -3.4049821; Critic Loss:   5042.0375977; Batch Reward:   -177.1636169
Actor Loss:    -12.2737789; Critic Loss:   5371.4853516; Batch Reward:   -176.9775272
Actor Loss:    -19.4657402; Critic Loss:   5300.9829102; Batch Reward:   -179.8123028
Actor Loss:    -28.8472519; Critic Loss:   4514.5292969; Batch Reward:   -191.3567105
Actor Loss:    -43.9192963; Critic Loss:   4272.0781250; Batch Reward:   -184.4519796
Actor Loss:    -62.4619446; Critic Loss:   4082.3461914; Batch Reward:   -188.1801007
Actor Loss:    -80.9830704; Critic Loss:   3399.4272461; Batch Reward:   -190.5290059
Actor Loss:   -105.0353394; Critic Loss:   2769.9719238; Batch Reward:   -190.2266706
Actor Loss:   -147.8627319; Critic Loss:   2968.564453

Actor Loss:  -2180.0361328; Critic Loss:  32712.3457031; Batch Reward:   -638.9216252
Actor Loss:  -2691.4772949; Critic Loss:  18813.1777344; Batch Reward:   -609.0558543
Actor Loss:  -2227.9538574; Critic Loss:   5440.8989258; Batch Reward:   -401.4494535
Actor Loss:  -2151.0075684; Critic Loss:   2995.3618164; Batch Reward:   -167.2871622
Completed 100 Training Iterations

Actor Loss:  -2401.8774414; Critic Loss:   5234.5380859; Batch Reward:   -171.5977396
Actor Loss:  -2367.8454590; Critic Loss:   7426.3652344; Batch Reward:   -215.7665945
Actor Loss:  -3007.7465820; Critic Loss:   6856.4960938; Batch Reward:   -156.4754155
Actor Loss:  -2468.8437500; Critic Loss:   7094.5834961; Batch Reward:   -167.5690815
Actor Loss:  -2879.8032227; Critic Loss:   4806.9545898; Batch Reward:   -273.2228528
Actor Loss:  -3832.5632324; Critic Loss:   6303.0620117; Batch Reward:   -477.0620758
Actor Loss:  -2755.3666992; Critic Loss:   3427.3625488; Batch Reward:   -188.7677794
Actor Loss:  -2288.

Actor Loss:  -2290.4135742; Critic Loss:   6777.8256836; Batch Reward:   -208.4818041
Actor Loss:  -2843.0493164; Critic Loss:   4994.7099609; Batch Reward:   -149.6797872
Actor Loss:  -2565.4260254; Critic Loss:   5333.7163086; Batch Reward:   -177.6173619
Actor Loss:  -2522.3020020; Critic Loss:   5878.9086914; Batch Reward:   -233.7169445
Actor Loss:  -3133.5537109; Critic Loss:   4271.8144531; Batch Reward:   -336.9106583
Actor Loss:  -2473.1477051; Critic Loss:   3315.8000488; Batch Reward:   -187.3184157
Actor Loss:  -2229.0512695; Critic Loss:   2818.5148926; Batch Reward:   -189.7300486
Actor Loss:  -2151.5485840; Critic Loss:   2913.5053711; Batch Reward:   -210.1832741
Actor Loss:  -1883.9318848; Critic Loss:   2682.9248047; Batch Reward:   -190.3716504
Completed 100 Training Iterations

Actor Loss:  -1803.4573975; Critic Loss:   1547.8236084; Batch Reward:   -168.8103182
Actor Loss:  -1545.0113525; Critic Loss:   1237.2103271; Batch Reward:   -163.9435422
Actor Loss:  -1359.

Actor Loss:  -2141.2304688; Critic Loss:   2608.0324707; Batch Reward:   -201.0916862
Actor Loss:  -2303.8442383; Critic Loss:   3581.1613770; Batch Reward:   -302.0670851
Actor Loss:  -2346.5219727; Critic Loss:   1901.6900635; Batch Reward:   -220.4104912
Actor Loss:  -1931.6516113; Critic Loss:   1921.7589111; Batch Reward:   -206.0112789
Actor Loss:  -1957.2836914; Critic Loss:   2002.2246094; Batch Reward:   -223.5684471
Actor Loss:  -1886.7103271; Critic Loss:   2193.8205566; Batch Reward:   -217.8426391
Actor Loss:  -1697.2019043; Critic Loss:   1704.8646240; Batch Reward:   -193.1580142
Actor Loss:  -1638.5701904; Critic Loss:   2081.0153809; Batch Reward:   -205.7224278
Actor Loss:  -1546.6608887; Critic Loss:   1907.3679199; Batch Reward:   -190.6288145
Actor Loss:  -1520.6593018; Critic Loss:   1218.6881104; Batch Reward:   -194.7872847
Actor Loss:  -1437.3735352; Critic Loss:   2614.1872559; Batch Reward:   -231.8147297
Actor Loss:  -1513.1201172; Critic Loss:   1928.060180

Actor Loss:  -1199.4456787; Critic Loss:   1806.2487793; Batch Reward:   -185.9615937
Actor Loss:  -1109.6431885; Critic Loss:   2627.0480957; Batch Reward:   -226.0261347
Actor Loss:  -1285.2249756; Critic Loss:   1285.9381104; Batch Reward:   -173.9255299
Actor Loss:  -1492.6556396; Critic Loss:   6709.8188477; Batch Reward:   -364.1687829
Actor Loss:  -1430.5322266; Critic Loss:   1585.5247803; Batch Reward:   -187.1177138
Actor Loss:  -1978.4020996; Critic Loss:   6469.2241211; Batch Reward:   -383.2078244
Actor Loss:  -1654.2299805; Critic Loss:   3136.8649902; Batch Reward:   -239.8812086
Actor Loss:  -2436.9494629; Critic Loss:   6508.2641602; Batch Reward:   -413.7002804
Actor Loss:  -1724.6215820; Critic Loss:   2140.4147949; Batch Reward:   -205.6575738
Actor Loss:  -1683.7933350; Critic Loss:   1554.1553955; Batch Reward:   -182.2060610
Actor Loss:  -2238.5341797; Critic Loss:   3004.7297363; Batch Reward:   -314.0782035
Actor Loss:  -1710.3905029; Critic Loss:   2295.418945

Actor Loss:  -2106.5185547; Critic Loss:   3226.5368652; Batch Reward:   -353.4763035
Actor Loss:  -1585.8537598; Critic Loss:   2060.5383301; Batch Reward:   -175.8987587
Actor Loss:  -1597.9851074; Critic Loss:   2813.8398438; Batch Reward:   -235.5551370
Actor Loss:  -2123.2526855; Critic Loss:   4237.6826172; Batch Reward:   -344.0866790
Actor Loss:  -1577.4833984; Critic Loss:   1400.6983643; Batch Reward:   -177.7753954
Actor Loss:  -2720.8403320; Critic Loss:   5021.3964844; Batch Reward:   -432.1472578
Actor Loss:  -1504.2384033; Critic Loss:   2467.8376465; Batch Reward:   -213.2222942
Actor Loss:  -2084.6850586; Critic Loss:   3416.4682617; Batch Reward:   -290.1741274
Actor Loss:  -2689.4592285; Critic Loss:   3530.4975586; Batch Reward:   -407.0304380
Actor Loss:  -1421.7253418; Critic Loss:   3329.2336426; Batch Reward:   -238.8265226
Actor Loss:  -1921.2342529; Critic Loss:   1587.5911865; Batch Reward:   -186.1586495
Actor Loss:  -2344.3774414; Critic Loss:   3768.640869

Actor Loss:  -1589.1784668; Critic Loss:   1514.4897461; Batch Reward:   -198.2097276
Actor Loss:  -1448.5083008; Critic Loss:   1820.1807861; Batch Reward:   -219.4574161
Actor Loss:  -1502.1944580; Critic Loss:   3837.5480957; Batch Reward:   -341.6277342
Actor Loss:  -1142.9907227; Critic Loss:   2337.3442383; Batch Reward:   -212.0277774
Actor Loss:  -1392.3955078; Critic Loss:   1700.3935547; Batch Reward:   -203.7117516
Actor Loss:  -1427.1914062; Critic Loss:   2648.6838379; Batch Reward:   -228.6184525
Actor Loss:  -1348.6450195; Critic Loss:   1772.9803467; Batch Reward:   -187.5381460
Actor Loss:  -1584.7917480; Critic Loss:   4664.4394531; Batch Reward:   -359.0690908
Actor Loss:  -1235.7685547; Critic Loss:   1952.6529541; Batch Reward:   -216.3148371
Actor Loss:  -1495.8917236; Critic Loss:   2704.0485840; Batch Reward:   -272.0842155
Actor Loss:  -1471.7686768; Critic Loss:   3691.5097656; Batch Reward:   -329.4517130
Actor Loss:  -1336.4036865; Critic Loss:   1878.972900

Actor Loss:  -2094.4174805; Critic Loss:   3128.7468262; Batch Reward:   -267.1245167
Actor Loss:  -2073.0302734; Critic Loss:   1771.2359619; Batch Reward:   -195.3375229
Actor Loss:  -2023.1950684; Critic Loss:   2212.9157715; Batch Reward:   -250.3128146
Actor Loss:  -2038.4438477; Critic Loss:   2128.4780273; Batch Reward:   -204.1148408
Actor Loss:  -1767.5330811; Critic Loss:   2133.3571777; Batch Reward:   -213.3116398
Actor Loss:  -1708.4010010; Critic Loss:   1780.8090820; Batch Reward:   -196.2607651
Actor Loss:  -1638.1207275; Critic Loss:   2634.3049316; Batch Reward:   -243.2914788
Actor Loss:  -1703.0776367; Critic Loss:   1452.0113525; Batch Reward:   -204.1287293
Actor Loss:  -1654.8857422; Critic Loss:   2059.1989746; Batch Reward:   -230.9250396
Actor Loss:  -1447.1323242; Critic Loss:   1577.9160156; Batch Reward:   -217.9427839
Actor Loss:  -1551.0017090; Critic Loss:   2949.7714844; Batch Reward:   -275.9741871
Actor Loss:  -1386.2708740; Critic Loss:   2076.013183

Actor Loss:  -2084.5241699; Critic Loss:   2736.7680664; Batch Reward:   -271.3242226
Actor Loss:  -2145.5561523; Critic Loss:   2899.7614746; Batch Reward:   -278.2909208
Actor Loss:  -2198.0349121; Critic Loss:   2581.1550293; Batch Reward:   -261.5106560
Actor Loss:  -2520.6064453; Critic Loss:   4000.9428711; Batch Reward:   -347.3194463
Actor Loss:  -2303.3288574; Critic Loss:   2591.1013184; Batch Reward:   -219.2209409
Actor Loss:  -2497.6459961; Critic Loss:   3318.0520020; Batch Reward:   -289.1907139
Actor Loss:  -2631.6076660; Critic Loss:   2796.4301758; Batch Reward:   -306.6507422
Actor Loss:  -2527.8283691; Critic Loss:   3223.1455078; Batch Reward:   -268.3789497
Actor Loss:  -2366.9038086; Critic Loss:   2177.4414062; Batch Reward:   -233.3662863
Actor Loss:  -3143.9641113; Critic Loss:   5232.8178711; Batch Reward:   -423.7786430
Actor Loss:  -2209.3164062; Critic Loss:   1754.4448242; Batch Reward:   -206.2336140
Actor Loss:  -2861.0644531; Critic Loss:   3641.965332

Actor Loss:  -2795.4121094; Critic Loss:   3735.9770508; Batch Reward:   -264.9322461
Actor Loss:  -2757.1435547; Critic Loss:   3512.6162109; Batch Reward:   -283.1818087
Actor Loss:  -2786.3781738; Critic Loss:   3862.1923828; Batch Reward:   -307.7442401
Actor Loss:  -2536.6723633; Critic Loss:   2513.7832031; Batch Reward:   -247.7796758
Actor Loss:  -3016.5690918; Critic Loss:   4399.2539062; Batch Reward:   -372.1301502
Actor Loss:  -2227.0373535; Critic Loss:   2328.9892578; Batch Reward:   -228.6633323
Actor Loss:  -2381.5773926; Critic Loss:   3264.9001465; Batch Reward:   -328.4603473
Actor Loss:  -2289.1555176; Critic Loss:   3699.7104492; Batch Reward:   -298.3476951
Actor Loss:  -1969.0769043; Critic Loss:   2015.1667480; Batch Reward:   -213.3406958
Actor Loss:  -2735.7224121; Critic Loss:   6934.0708008; Batch Reward:   -416.5890184
Actor Loss:  -1865.9675293; Critic Loss:   1862.4725342; Batch Reward:   -218.3374652
Actor Loss:  -2697.1311035; Critic Loss:   5074.156250

Actor Loss:  -2124.1904297; Critic Loss:   2044.5101318; Batch Reward:   -208.3320737
Actor Loss:  -2429.0322266; Critic Loss:   3249.2749023; Batch Reward:   -293.4996837
Actor Loss:  -2052.9819336; Critic Loss:   2888.8806152; Batch Reward:   -247.0036197
Actor Loss:  -2061.4003906; Critic Loss:   1452.9227295; Batch Reward:   -201.0230806
Actor Loss:  -1970.6909180; Critic Loss:   2708.6054688; Batch Reward:   -258.9483678
Actor Loss:  -2032.0249023; Critic Loss:   2798.7451172; Batch Reward:   -291.5850562
Actor Loss:  -1838.0849609; Critic Loss:   2519.3222656; Batch Reward:   -228.2981461
Actor Loss:  -1877.0322266; Critic Loss:   1652.8181152; Batch Reward:   -184.9241306
Actor Loss:  -2277.9916992; Critic Loss:   6200.6708984; Batch Reward:   -369.9570436
Actor Loss:  -1763.1456299; Critic Loss:   1564.7484131; Batch Reward:   -206.1135542
Actor Loss:  -2229.2802734; Critic Loss:   6073.1625977; Batch Reward:   -379.2193339
Actor Loss:  -1704.6336670; Critic Loss:   2647.036865

Actor Loss:  -1973.1182861; Critic Loss:   1595.4129639; Batch Reward:   -203.4670624
Actor Loss:  -2069.3674316; Critic Loss:   1561.4425049; Batch Reward:   -215.7016820
Actor Loss:  -2041.1887207; Critic Loss:   3439.7685547; Batch Reward:   -285.5167854
Actor Loss:  -2052.1652832; Critic Loss:   4436.3422852; Batch Reward:   -294.1473578
Actor Loss:  -1910.0872803; Critic Loss:   2520.2226562; Batch Reward:   -247.1445230
Actor Loss:  -2067.6667480; Critic Loss:   2914.5292969; Batch Reward:   -284.8146316
Actor Loss:  -2119.0302734; Critic Loss:   1760.4749756; Batch Reward:   -243.8358833
Actor Loss:  -2214.9299316; Critic Loss:   2208.8872070; Batch Reward:   -245.3070822
Actor Loss:  -2205.9082031; Critic Loss:   1549.6560059; Batch Reward:   -212.1680405
Actor Loss:  -2246.2734375; Critic Loss:   2283.1716309; Batch Reward:   -251.5683724
Actor Loss:  -2056.0935059; Critic Loss:   1910.9770508; Batch Reward:   -240.0192961
Actor Loss:  -2290.2011719; Critic Loss:   2276.077392

Actor Loss:  -2550.5205078; Critic Loss:   2882.6899414; Batch Reward:   -201.0869511
Actor Loss:  -2830.6516113; Critic Loss:   3025.1291504; Batch Reward:   -270.9509120
Actor Loss:  -2518.1730957; Critic Loss:   2496.9975586; Batch Reward:   -220.0819904
Actor Loss:  -2302.2426758; Critic Loss:   2399.6901855; Batch Reward:   -218.8518335
Actor Loss:  -2382.1435547; Critic Loss:   2544.4145508; Batch Reward:   -273.5994010
Actor Loss:  -2019.9420166; Critic Loss:   1661.2149658; Batch Reward:   -181.8505976
Actor Loss:  -1951.5200195; Critic Loss:   1633.2729492; Batch Reward:   -230.9905925
Actor Loss:  -1731.2535400; Critic Loss:   1518.4052734; Batch Reward:   -220.8764807
Actor Loss:  -1639.8549805; Critic Loss:   3489.7636719; Batch Reward:   -275.8553817
Actor Loss:  -1533.3803711; Critic Loss:   1447.6541748; Batch Reward:   -187.1719603
Actor Loss:  -1430.1086426; Critic Loss:   1976.3601074; Batch Reward:   -213.4888476
Actor Loss:  -1420.6682129; Critic Loss:   1552.647338

Actor Loss:  -2184.2021484; Critic Loss:   3181.9611816; Batch Reward:   -314.5716767
Actor Loss:  -1874.1457520; Critic Loss:   2232.5524902; Batch Reward:   -229.3510713
Actor Loss:  -2393.8771973; Critic Loss:   3064.8977051; Batch Reward:   -324.7242336
Actor Loss:  -1960.5672607; Critic Loss:   2111.5537109; Batch Reward:   -215.1568016
Actor Loss:  -2621.2658691; Critic Loss:   3770.1003418; Batch Reward:   -343.7480147
Actor Loss:  -2001.4631348; Critic Loss:   1578.4790039; Batch Reward:   -192.8270344
Actor Loss:  -2620.1306152; Critic Loss:   3458.7668457; Batch Reward:   -343.8039708
Actor Loss:  -1991.5057373; Critic Loss:   2237.0371094; Batch Reward:   -221.9880216
Actor Loss:  -2321.6364746; Critic Loss:   2870.5187988; Batch Reward:   -300.7433924
Actor Loss:  -2133.7324219; Critic Loss:   2590.8405762; Batch Reward:   -262.3260396
Actor Loss:  -2218.9672852; Critic Loss:   2834.9677734; Batch Reward:   -300.8204205
Actor Loss:  -1972.6784668; Critic Loss:   1742.485351

Actor Loss:  -2318.6491699; Critic Loss:   3944.9396973; Batch Reward:   -331.8769629
Actor Loss:  -2012.1745605; Critic Loss:   1742.5347900; Batch Reward:   -194.6369970
Actor Loss:  -2486.2175293; Critic Loss:   5051.3139648; Batch Reward:   -390.9909011
Actor Loss:  -2150.6647949; Critic Loss:   1464.2624512; Batch Reward:   -199.2789583
Actor Loss:  -2512.8955078; Critic Loss:   3354.5568848; Batch Reward:   -322.8959498
Actor Loss:  -2457.3085938; Critic Loss:   1994.5610352; Batch Reward:   -282.0453680
Actor Loss:  -2601.4150391; Critic Loss:   3647.8459473; Batch Reward:   -321.9550424
Actor Loss:  -2457.9748535; Critic Loss:   2877.0249023; Batch Reward:   -248.8600146
Actor Loss:  -2488.9536133; Critic Loss:   2811.0925293; Batch Reward:   -256.2830507
Actor Loss:  -2651.6791992; Critic Loss:   3246.7795410; Batch Reward:   -321.7211010
Actor Loss:  -2292.0842285; Critic Loss:   2489.4060059; Batch Reward:   -255.6392466
Actor Loss:  -2585.0566406; Critic Loss:   3639.264892

Actor Loss:  -2211.9575195; Critic Loss:   2011.1906738; Batch Reward:   -257.8600433
Actor Loss:  -2342.9453125; Critic Loss:   3323.8615723; Batch Reward:   -307.1455922
Actor Loss:  -2321.8503418; Critic Loss:   3046.5942383; Batch Reward:   -299.3985623
Actor Loss:  -2364.0593262; Critic Loss:   2682.8293457; Batch Reward:   -313.3304455
Actor Loss:  -2286.3044434; Critic Loss:   3015.5336914; Batch Reward:   -309.9009666
Actor Loss:  -2501.9528809; Critic Loss:   2978.8969727; Batch Reward:   -306.9757330
Actor Loss:  -2577.5910645; Critic Loss:   2539.5183105; Batch Reward:   -313.0790535
Actor Loss:  -2510.5187988; Critic Loss:   2747.1589355; Batch Reward:   -317.6315198
Actor Loss:  -2727.8303223; Critic Loss:   2943.1691895; Batch Reward:   -306.4366027
Actor Loss:  -2590.8417969; Critic Loss:   2616.3981934; Batch Reward:   -285.9577052
Actor Loss:  -2780.1069336; Critic Loss:   3295.5241699; Batch Reward:   -314.5458011
Actor Loss:  -2526.6828613; Critic Loss:   3003.649414

Actor Loss:  -2336.1391602; Critic Loss:   2645.4729004; Batch Reward:   -261.9895067
Actor Loss:  -2538.1237793; Critic Loss:   4514.9458008; Batch Reward:   -368.6345387
Actor Loss:  -2232.5812988; Critic Loss:   2733.5639648; Batch Reward:   -244.5436446
Actor Loss:  -2497.2185059; Critic Loss:   4487.9018555; Batch Reward:   -381.8271038
Actor Loss:  -2242.1684570; Critic Loss:   2467.2675781; Batch Reward:   -269.0936591
Actor Loss:  -2448.7067871; Critic Loss:   3116.1845703; Batch Reward:   -350.1247984
Actor Loss:  -2426.1501465; Critic Loss:   2898.1462402; Batch Reward:   -340.6944306
Actor Loss:  -2419.6313477; Critic Loss:   2568.9465332; Batch Reward:   -275.8582972
Actor Loss:  -2759.0993652; Critic Loss:   3536.9443359; Batch Reward:   -363.0963281
Actor Loss:  -2512.6240234; Critic Loss:   2651.8093262; Batch Reward:   -253.2647993
Actor Loss:  -2844.7834473; Critic Loss:   3868.8549805; Batch Reward:   -362.0323600
Actor Loss:  -2569.1457520; Critic Loss:   3011.799804

Actor Loss:  -3256.1208496; Critic Loss:   4891.6206055; Batch Reward:   -426.3851060
Actor Loss:  -2679.6530762; Critic Loss:   3083.7536621; Batch Reward:   -249.8881587
Actor Loss:  -3157.9362793; Critic Loss:   4415.2744141; Batch Reward:   -384.6000101
Actor Loss:  -2898.5368652; Critic Loss:   3245.9897461; Batch Reward:   -320.0187747
Actor Loss:  -2592.5515137; Critic Loss:   3210.8525391; Batch Reward:   -341.3499625
Actor Loss:  -2677.5722656; Critic Loss:   2978.8103027; Batch Reward:   -313.9854044
Actor Loss:  -2539.4543457; Critic Loss:   4397.3427734; Batch Reward:   -369.0603134
Actor Loss:  -2426.0788574; Critic Loss:   2702.8454590; Batch Reward:   -280.4964736
Actor Loss:  -2647.3310547; Critic Loss:   3392.3691406; Batch Reward:   -369.8015013
Actor Loss:  -2429.7531738; Critic Loss:   2659.8249512; Batch Reward:   -269.6050127
Actor Loss:  -2493.1557617; Critic Loss:   3577.6528320; Batch Reward:   -364.4627450
Actor Loss:  -2503.5668945; Critic Loss:   2442.768310

KeyboardInterrupt: 

In [None]:
handler.run_constant_training(1000, render=False, decay=0.99, a_rate=0.002, c_rate=0.01, e_encr=0.008, e_dscr=0.5, verbose=True)

In [131]:
handler.load()

INFO:tensorflow:Restoring parameters from .models/l1.cpt


INFO:tensorflow:Restoring parameters from .models/l1.cpt


In [123]:
handler.save()

In [None]:
env.unwrapped.close()

In [104]:
while True: handler.play(verbose=True)

[[8.1062035e-06 2.1275442e-05 5.6120130e-05 9.9991453e-01]] [[-4.670416  -3.7054915 -2.7355506  7.0523796]] [0.00095868]
[[7.8590192e-06 2.0009598e-05 5.2955500e-05 9.9991918e-01]] [[-4.675467  -3.7409163 -2.767677   7.078301 ]] [0.00091109]
[[7.6236875e-06 1.8806415e-05 4.9962448e-05 9.9992359e-01]] [[-4.679423  -3.7764852 -2.7994115  7.1047506]] [0.00086573]
[[7.5754178e-06 1.7795739e-05 4.7487902e-05 9.9992716e-01]] [[-4.666027  -3.811976  -2.8304603  7.1245027]] [0.00082952]
[[7.5232933e-06 1.6968739e-05 4.5331431e-05 9.9993014e-01]] [[-4.6527047 -3.8393364 -2.8567085  7.144732 ]] [0.00079838]
[[7.3798687e-06 1.5923879e-05 4.2898635e-05 9.9993384e-01]] [[-4.6407466 -3.8716817 -2.8806615  7.1759424]] [0.0007607]
[[7.19089394e-06 1.49880525e-05 4.06964318e-05 9.99937177e-01]] [[-4.6342506 -3.899813  -2.9009259  7.208381 ]] [0.00072589]
[[6.9537482e-06 1.4069132e-05 3.8550577e-05 9.9994040e-01]] [[-4.6333547 -3.9286523 -2.9206645  7.2428155]] [0.00069117]
[[6.7149381e-06 1.3205517e-05

[[2.7771599e-05 3.1256684e-05 1.5614994e-04 9.9978489e-01]] [[-3.76889   -3.6506708 -2.0420866  6.7223916]] [0.0021993]
[[2.6898926e-05 2.8600434e-05 1.4679669e-04 9.9979776e-01]] [[-3.7522483 -3.6909125 -2.0552855  6.770974 ]] [0.0020802]
[[2.6026477e-05 2.6602622e-05 1.3686056e-04 9.9981052e-01]] [[-3.7363014 -3.7144065 -2.076453   6.8199053]] [0.00196204]
[[2.5450310e-05 2.5209209e-05 1.2822710e-04 9.9982113e-01]] [[-3.7165165 -3.7260349 -2.099441   6.8620877]] [0.00186415]
[[2.43509985e-05 2.41933176e-05 1.19729695e-04 9.99831676e-01]] [[-3.7162144 -3.7227106 -2.123551   6.906555 ]] [0.00176534]
[[2.2989590e-05 2.3535133e-05 1.1315743e-04 9.9984026e-01]] [[-3.7236786 -3.7002258 -2.12994    6.95663  ]] [0.00168431]
[[2.1036951e-05 2.2223687e-05 1.0187664e-04 9.9985480e-01]] [[-3.741583  -3.6867042 -2.1641002  7.027502 ]] [0.00154627]
[[1.9382080e-05 2.0805819e-05 9.2010370e-05 9.9986780e-01]] [[-3.7593172 -3.6884334 -2.201765   7.091712 ]] [0.00142191]
[[1.8229504e-05 1.9780391e-05 

[[8.8093418e-07 1.1538654e-06 2.9454138e-06 9.9999499e-01]] [[-4.6586704 -4.388781  -3.4516487  9.2836075]] [7.057573e-05]
[[7.1298257e-07 1.0026361e-06 2.3750929e-06 9.9999595e-01]] [[-4.726253  -4.3853216 -3.5229182  9.427552 ]] [5.8752397e-05]
[[5.7114579e-07 8.6207302e-07 1.8933069e-06 9.9999666e-01]] [[-4.7975464 -4.3858514 -3.5991104  9.578072 ]] [4.8534814e-05]
[[4.5685110e-07 7.4323037e-07 1.5068232e-06 9.9999726e-01]] [[-4.8708277 -4.3841796 -3.677426   9.728078 ]] [4.0099727e-05]
[[3.8818524e-07 6.7981887e-07 1.2674324e-06 9.9999762e-01]] [[-4.923238  -4.362894  -3.7399719  9.838543 ]] [3.4978748e-05]
[[3.2835118e-07 6.2261279e-07 1.0716783e-06 9.9999797e-01]] [[-4.977185  -4.3373446 -3.7942872  9.951995 ]] [3.055689e-05]
[[2.8507415e-07 5.8842880e-07 9.3380629e-07 9.9999821e-01]] [[-5.0225005 -4.297794  -3.8359804 10.048015 ]] [2.7490803e-05]
[[2.3534342e-07 5.3300744e-07 7.7787382e-07 9.9999845e-01]] [[-5.087093  -4.269603  -3.8915734 10.175126 ]] [2.378285e-05]
[[1.8872331

[[4.6533321e-05 1.1254563e-04 2.7016376e-04 9.9957079e-01]] [[-3.856328  -2.9731379 -2.0974681  6.1185846]] [0.00413638]
[[4.9053349e-05 1.0918181e-04 2.6740201e-04 9.9957436e-01]] [[-3.8111904 -3.0110848 -2.1153462  6.1109858]] [0.00410814]
[[5.06041251e-05 1.06344014e-04 2.63478229e-04 9.99579608e-01]] [[-3.7795432 -3.0368974 -2.1296062  6.1115136]] [0.00406524]
[[5.19160822e-05 1.03988685e-04 2.57972599e-04 9.99586165e-01]] [[-3.7485402 -3.053887  -2.1453156  6.116928 ]] [0.00401119]
[[5.1797208e-05 1.0096229e-04 2.5128687e-04 9.9959594e-01]] [[-3.7372699 -3.0698593 -2.1580112  6.1305003]] [0.00392695]
[[5.1002695e-05 9.5160714e-05 2.4325482e-04 9.9961060e-01]] [[-3.7225618 -3.0988727 -2.1603308  6.160681 ]] [0.00379882]
[[4.9593371e-05 9.0847556e-05 2.3689569e-04 9.9962258e-01]] [[-3.7257495 -3.1204233 -2.1619873  6.1855264]] [0.00369193]
[[4.8300346e-05 8.6631415e-05 2.3077709e-04 9.9963427e-01]] [[-3.7268317 -3.1426084 -2.1628182  6.210874 ]] [0.00358856]
[[4.7231213e-05 8.217724

[[3.7341233e-05 7.1306393e-05 1.8386240e-04 9.9970752e-01]] [[-3.9230227 -3.2761352 -2.328934   6.2720966]] [0.00293548]
[[3.7189930e-05 6.8864203e-05 1.8006761e-04 9.9971384e-01]] [[-3.9115493 -3.2954502 -2.334255   6.287637 ]] [0.00287797]
[[3.6810001e-05 6.5864937e-05 1.7707565e-04 9.9972028e-01]] [[-3.9007332 -3.318897  -2.3299263  6.3087277]] [0.00281939]
[[3.6585436e-05 6.3355430e-05 1.7468956e-04 9.9972540e-01]] [[-3.888399  -3.3392885 -2.325039   6.3271866]] [0.00277226]
[[3.6421799e-05 6.1130719e-05 1.7275887e-04 9.9972969e-01]] [[-3.8759747 -3.3581278 -2.319245   6.344098 ]] [0.00273235]
[[3.6140722e-05 5.9397335e-05 1.7144642e-04 9.9973303e-01]] [[-3.871272  -3.3744433 -2.3144212  6.356551 ]] [0.00270125]
[[3.5994432e-05 5.7454134e-05 1.7074688e-04 9.9973577e-01]] [[-3.8606696 -3.3930469 -2.303851   6.371213 ]] [0.00267479]
[[3.6218491e-05 5.5916997e-05 1.7100111e-04 9.9973685e-01]] [[-3.8436868 -3.4093883 -2.2915869  6.3819904]] [0.00266424]
[[3.6987174e-05 5.4725675e-05 1.

[[1.1422811e-05 1.7828173e-05 5.6732537e-05 9.9991405e-01]] [[-4.271473  -3.8263044 -2.6687365  7.10834  ]] [0.00096557]
[[1.1099445e-05 1.6985625e-05 5.5209999e-05 9.9991667e-01]] [[-4.269976  -3.8445036 -2.6657276  7.138556 ]] [0.00093781]
[[1.0738885e-05 1.6067961e-05 5.3620686e-05 9.9991953e-01]] [[-4.2675996 -3.864644  -2.6595366  7.173959 ]] [0.00090799]
[[1.0421979e-05 1.5288741e-05 5.2242900e-05 9.9992204e-01]] [[-4.2646594 -3.8814592 -2.6526723  7.2068563]] [0.00088214]
[[1.00685038e-05 1.45223075e-05 5.05580138e-05 9.99924898e-01]] [[-4.2653866 -3.8991132 -2.651677   7.240637 ]] [0.00085286]
[[9.7868960e-06 1.3944332e-05 4.9119448e-05 9.9992716e-01]] [[-4.2651443 -3.9111161 -2.6519332  7.269249 ]] [0.00082895]
[[9.3578274e-06 1.3281927e-05 4.7314315e-05 9.9993002e-01]] [[-4.2734356 -3.9232445 -2.652836   7.3057914]] [0.00079866]
[[8.9409159e-06 1.2650653e-05 4.5405246e-05 9.9993300e-01]] [[-4.2827873 -3.9357166 -2.6577976  7.3420186]] [0.00076765]
[[8.6359732e-06 1.2115655e-0

[[4.1022849e-05 6.1485589e-05 1.9987173e-04 9.9969769e-01]] [[-3.6241913 -3.2195177 -2.040645   6.4768877]] [0.00301534]
[[3.9354294e-05 5.9000879e-05 1.9461724e-04 9.9970704e-01]] [[-3.6342666 -3.229319  -2.035837   6.5083456]] [0.00292953]
[[3.9101993e-05 5.8490943e-05 1.9261090e-04 9.9970978e-01]] [[-3.630749  -3.2280502 -2.0362504  6.518298 ]] [0.00290488]
[[3.8552866e-05 5.7281890e-05 1.8958026e-04 9.9971455e-01]] [[-3.6289265 -3.2329721 -2.0361445  6.5342684]] [0.00286157]
[[3.7313850e-05 5.4998924e-05 1.8416368e-04 9.9972349e-01]] [[-3.6337004 -3.245751  -2.0372403  6.562169 ]] [0.00278012]
[[3.6689438e-05 5.4079032e-05 1.8131448e-04 9.9972790e-01]] [[-3.6330714 -3.2451131 -2.0353265  6.5796785]] [0.00274017]
[[3.5340581e-05 5.1460716e-05 1.7526864e-04 9.9973792e-01]] [[-3.639122  -3.2633357 -2.0378344  6.6110945]] [0.0026484]
[[3.4046348e-05 4.8685823e-05 1.6833049e-04 9.9974889e-01]] [[-3.6471195 -3.2894547 -2.048913   6.640417 ]] [0.00254752]
[[3.2988399e-05 4.6649358e-05 1.6

[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.1201105 -6.50337   -5.672362  17.94848  ]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.145108  -6.528775  -5.7296624 17.992054 ]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.167099 -6.545199 -5.765886 18.03469 ]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.182697  -6.556741  -5.7976327 18.064957 ]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.197754  -6.567313  -5.8312325 18.093739 ]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.203061 -6.570788 -5.856919 18.104193]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.1937065 -6.5655227 -5.8702955 18.08938  ]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.174487  -6.556907  -5.8819084 18.080278 ]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.16263  -6.55955  -5.899492 18.095003]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.151593 -6.564656 -5.920339 18.11616 ]] [6.9077553e-09]
[[1.e-10 1.e-10 1.e-10 1.e+00]] [[-8.151011  -6.5771217 

KeyboardInterrupt: 