In [1]:
import os

import glob
import io
import base64
import numpy as np
import gym as gym
from gym.wrappers import Monitor
from gym import logger as gymlogger
# gymlogger.set_level(40) #error only
from collections import namedtuple, deque
import matplotlib
import matplotlib.pyplot as plt
import time
from IPython.display import HTML
from IPython import display as ipythondisplay
import random
import copy
import tensorflow.contrib.eager as tfe
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.WARN)
tf.enable_eager_execution()

%matplotlib inline


In [2]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                 </video>'''.format(encoded.decode('ascii'))))
    else: 
        print("Could not find video")


def wrap_env(env):
    env = Monitor(env, './video', force=True)
    return env


In [3]:
class RewardHistory(list):
    def plot(self):
        R = np.array(self)
        mu = np.mean(R, axis=0)
        std = np.std(R, axis=0)
        f, axarr = plt.subplots(1, 2)
        f.set_figheight(5)
        f.set_figwidth(20)
        f.subplots_adjust(hspace=0.2)
        axarr[0].plot(mu)
        axarr[0].set_title('Mean reward')
        axarr[0].set_xlabel("Episodes")
        axarr[0].set_ylabel("Reward")
        axarr[1].set_title('Std')
        axarr[1].set_xlabel("Episodes")
        axarr[1].set_ylabel("std")
        axarr[1].plot(std)
        plt.show()


In [4]:
Transition = namedtuple("Transition",['s','a','s_1','r','done'])

class PolicyGradientMemory(object):
    def __init__(self,gamma = 0.9):
        self.memory = []
        self.gamma = gamma
        
    def reset(self):
        self.memory = []
        
    def append(self, transition):
        self.memory.append(transition)
        
    def sample(self):
        
        batched = Transition(*zip(*self.memory))
        
        s = np.array(list(batched.s))
        a = np.array(list(batched.a))
        s_1 = np.array(list(batched.s_1))
        r = np.array(list(batched.r), dtype = 'float32')
        done = np.array(list(batched.done))
        
        reward = 0.
        for i in reversed(range(len(r))):
            moving_reward = self.gamma * reward
            reward = r[i] + moving_reward
            r[i] = reward
#             print(i,r)
        r = np.expand_dims(r, axis = 1)
        

        return [s, a, s_1, r, done]
            
    def __len__(self):
        return len(self.memory)
    
    def __str__(self):
        result = []
        
        for i in range(self.__len__()):
            result.append(self.memory[i].__str__()+'\n')
        return "".join(result)
    
    

In [5]:
"""
Play around with the memory to understand how it works. 

"""

# Learn how memory works
memory = PolicyGradientMemory()

memory.append(Transition([1, 2, 0], 1, [4, 5, 6],  0, False))
memory.append(Transition([1, 2, 1], 0, [4, 5, 6],  0, False))
memory.append(Transition([1, 2, 2], 0, [4, 5, 6],  1, False))
memory.append(Transition([1, 2, 3], 1, [4, 5, 6],  0, False))
memory.append(Transition([1, 2, 4], 1, [4, 5, 6],  0, False))

memory.append(Transition([1, 2, 5], 1, [4, 5, 6], -1, True))

In [6]:
class SimplePolicyModel(tf.keras.Model):
    def __init__(self, outputs = 2):
        super().__init__()
        self.layer = tf.keras.layers.Dense(units = outputs, activation= tf.nn.softmax)
        
        self.layer1 = tf.keras.layers.Dense(units = 64, activation= tf.nn.relu)
        self.layer2 = tf.keras.layers.Dense(units = 64, activation= tf.nn.relu)
        self.layer3 = tf.keras.layers.Dense(units = outputs, activation= tf.nn.softmax)
        
    def call(self,inputs):
#         result = self.layer(inputs)
        result = self.layer1(inputs)
        result = self.layer2(result)
        result = self.layer3(result)

        return result

In [65]:
class SimplePolicyModel(tf.keras.Model):
    def __init__(self, outputs = 2):
        super().__init__()
        self.layer = tf.keras.layers.Dense(units = outputs, activation= tf.nn.softmax)

        
    def call(self,inputs):
        result = self.layer(inputs)

        return result

In [53]:

class Agent():
    def __init__(self, env = None):
        self.env = env
        self.episode_durations = []
        self.episode_loss = []
        self.action_space = self.env.action_space.n
        self.model = SimplePolicyModel(self.action_space)
        self.memory = PolicyGradientMemory()
        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.1)

        self.act = [i for i in range(env.action_space.n)]
        
    def get_action(self, s):
        s = tf.convert_to_tensor([s], dtype = tf.float32)
        action_prob = self.model(s)
        
        action_prob = np.round(action_prob[0],6)
#         print(action_prob)
        
        action = np.random.choice(self.act, p = action_prob)
        return action
    
    def train(self, episodes = 2):
        all_returns = [[] for _ in range(2000)]
        
        for i in range(episodes):
            s = self.env.reset()
            step = 0
            
            while True:
                action = self.get_action(s)
#                 print(action)
                s_1, reward, done, _ = self.env.step(action)
                
                # Redesign Reward
                
#                 r0 = 0
#                 if reward > -1:
#                     r0 = (reward +1) * 2000
                    
#                 reward = abs(s_1[0] - (-0.5)) + r0
        
                self.memory.append(Transition(s, action, s_1, reward, done))
                s = s_1

                step += 1
                if done:
                    break
            
            s, a, s_1, r, done  = self.memory.sample()
#             print(type(s),s.shape)
            
            s = tf.convert_to_tensor(s,dtype = tf.float32)
            
            a = tf.one_hot(a, depth=self.action_space, dtype=tf.int32)
            b = tf.zeros_like(r).numpy()
            
            
            for t in range(len(r)):
                all_returns[t].append(r[t])
                b[t] = np.mean(all_returns[t])
#                 print('r:',r[t],'AllR: ',all_returns[t])
            
#             print('b:',b)
            
            r = tf.constant(r, dtype = tf.float32)
            baseline = tf.constant(b, dtype = tf.float32)
            
#             print('baseline',baseline)
            
            with tfe.GradientTape() as tape:
                action_prob = self.model(s)
                
                loss = action_prob * tf.cast(a, dtype = tf.float32)
                loss = tf.reduce_sum(loss, reduction_indices=1)
                loss = tf.log(loss)
                loss_value = - tf.reduce_mean(loss*(r-b)) # why r - b?
#                 print('r: ', r,'\nb: ',b ,'\nloss: ', loss, '\nloss_value: ', loss_value)
            grads = tape.gradient(loss_value, self.model.variables)
            self.optimizer.apply_gradients(zip(grads, self.model.variables),
                                           global_step= tf.train.get_or_create_global_step())
            
            self.memory.reset()
            
            self.episode_durations.append(step)
            
#             print('running... episode =', i, 'loss = ', loss_value.numpy().item())
        
    def run(self,env):
        self.env = env
        s = self.env.reset()
        step = 0

        while True:
            self.env.render()
            action = self.get_action(s)
            s_1, r, done, _ = self.env.step(action)
            s = s_1

#             r0 = 0
#             if r > -1:
#                 r0 = (r +1) * 2000
                    
#             r = abs(s_1[0] - (-0.5)) + r0
            
            
            step += 1

            if done:
                print("Episode finished successfully after {} timesteps".format(step))
                break;

        self.env.close()

In [50]:
env = gym.make('MountainCar-v0')

env = gym.make('CartPole-v0')

In [51]:
PolicyAgent = Agent(env)
PolicyAgent.train(1000)

In [117]:

PolicyAgent.train(1000)

In [52]:
env = wrap_env(gym.make('CartPole-v0'))

PolicyAgent.run(env)
show_video()

Episode finished successfully after 200 timesteps


In [66]:
class ValueModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units = 64,activation='relu')
        self.dense2 = tf.keras.layers.Dense(units = 1)
        
    def call(self,inputs):
        result = self.dense1(inputs)
        result = self.dense2(result)
        return result


In [76]:
class ValueNetAgent():
    def __init__(self, env = None):
        self.env = env
        self.episode_duration = []
        self.episode_loss = []
        self.action_space = self.env.action_space.n
        self.policy_model = SimplePolicyModel(self.action_space)
        self.value_model = ValueModel()
        self.memory = PolicyGradientMemory()
        self.policy_optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
        self.value_optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
        self.act = [i for i in range(env.action_space.n)]
        
        
    def get_actions(self, s):
        s = tf.convert_to_tensor([s], dtype = tf.float32)
        
        self.action_prob = self.policy_model(s)
        action_prob = np.squeeze(VAgent.action_prob)
        action_prob = np.round(action_prob,6)
#         print(action_prob)
        action = np.random.choice(self.act, p = action_prob)
        return action
    
    def train(self, episodes = 2):
        
        for i in range(episodes):
            s = self.env.reset()
            step = 0
            
            while True:
                action = self.get_actions(s)
                s_1, r, done, info = self.env.step(action)
                self.memory.append(Transition(s, action, s_1, r, done))
                s = s_1
                step += 1 
                
                if done:
                    break
            
            s, a, s_1, r, done= self.memory.sample()
            s = tf.constant(s,dtype = tf.float32)
            a = tf.one_hot(a, depth = self.action_space, dtype = tf.int32)
            r = tf.constant(r, dtype = tf.float32)
            
            # V_s 代替 Baseline
            V_s = self.value_model(s)
            
            with tfe.GradientTape() as tape:
                v = self.value_model(s)
                loss = (r - v) ** 2
                loss_value = tf.reduce_mean(loss)
                
            grads = tape.gradient(loss_value, self.value_model.variables)
            self.value_optimizer.apply_gradients(zip(grads, self.value_model.variables),
                                                 global_step= tf.train.get_or_create_global_step())
            
            
            
            
            with tfe.GradientTape() as tape:
                action_prob = self.policy_model(s)
                loss = action_prob * tf.cast(a, dtype = tf.float32)
                loss = tf.reduce_sum(loss, reduction_indices=1)
                loss = tf.log(loss)
                loss_value = - tf.reduce_mean(loss * (r - V_s))
                
                
            grads = tape.gradient(loss_value, self.policy_model.variables)
            self.value_optimizer.apply_gradients(zip(grads, self.policy_model.variables),
                                                 global_step= tf.train.get_or_create_global_step())            
            
            self.memory.reset()
            self.episode_duration.append(step)
            
            
    def run(self, env = gym.make('CartPole-v0')):
        self.env = env
        s = self.env.reset()
        step = 0
        while True:
            action = self.get_actions(s)
            s_1, reward, done, info = self.env.step(action)
            s = s_1
            step += 1

            if done:
                print("Episode finished successfully after {} timesteps".format(step))
                break

        self.env.close()

In [77]:
env = gym.make('CartPole-v0')
VAgent = ValueNetAgent(env)
VAgent.train(500)

In [78]:
env = wrap_env(gym.make("CartPole-v0"))

VAgent.run(env)
show_video()

Episode finished successfully after 200 timesteps


In [177]:
class SimplePolicyModel(tf.keras.Model):
    def __init__(self, output = 2):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units = 128, activation = tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units = output, activation = tf.nn.softmax)
        
    def call(self, inputs):
        result = self.dense1(inputs)
        result = self.dense2(result)    
        return result

    
class ValueModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units = 256, activation = tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units = 1)
        
    def call(self, inputs):
        result = self.dense1(inputs)
        result = self.dense2(result)    
        return result


In [145]:
class SimplePolicyModel(tf.keras.Model):
    def __init__(self, output = 2):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units = 64, activation = tf.nn.elu, kernel_initializer = tf.initializers.glorot_uniform)
        self.dense2 = tf.keras.layers.Dense(units = 64, activation = tf.nn.elu, kernel_initializer = tf.initializers.glorot_uniform)
        self.dense3 = tf.keras.layers.Dense(units = output, activation = tf.nn.softmax)
        
    def call(self, inputs):
        result = self.dense1(inputs)
        result = self.dense2(result)  
        result = self.dense3(result) 
        return result

    
class ValueModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units = 256, activation = tf.nn.elu, kernel_initializer = tf.initializers.glorot_uniform)
        self.dense2 = tf.keras.layers.Dense(units = 256, activation = tf.nn.elu, kernel_initializer = tf.initializers.glorot_uniform)
        self.dense3 = tf.keras.layers.Dense(units = 1)
        
    def call(self, inputs):
        result = self.dense1(inputs)
        result = self.dense2(result)  
        result = self.dense3(result)  
        return result

In [149]:
class PolicyGradientMemoryNoDiscount():
    def __init__(self, gamma = 0.9):
        self.memory = []
        self.gamma = gamma
        
    def reset(self):
        self.memory = []
    
    def append(self, transition):
        self.memory.append(transition)
        
        
    def sample(self):
        batched = Transition(*zip(*self.memory))
        s = np.array(list(batched.s))
        a = np.array(list(batched.a))
        s_1 = np.array(list(batched.s_1))
        r = np.array(list(batched.r), dtype="float32")
        done = np.array(list(batched.done))
        r = np.expand_dims(r, axis=1)
        return [s, a, s_1, r, done]
    
    def __len__(self):
        return len(self.memory)
    
    def __str__(self):
        result = []
        for i in range(self.__len__()):
            result.append(self.memory[i].__str__() + " \n")
        return "".join(result)


In [252]:
class A2C_Agent():
    def __init__(self, env):
        self.env = env
        self.episode_durations = []
        self.episode_loss = []
        self.gamma = 0.99
        self.action_space = self.env.action_space.n
        self.policy_model = SimplePolicyModel(self.action_space)
        self.value_model = ValueModel()
        self.memory = PolicyGradientMemoryNoDiscount()
        self.value_optimizer = tf.train.AdamOptimizer(learning_rate = 0.01)
        self.policy_optimizer = tf.train.AdamOptimizer(learning_rate = 0.01)
    
        self.act = [i for i in range(self.action_space)]
        
        
    def get_actions(self, s):
        
        s = tf.convert_to_tensor([s], dtype = tf.float32)
        action_prob = self.policy_model(s).numpy()
        action_prob = np.round(action_prob.squeeze(),6)
        action = np.random.choice(self.act, p = action_prob)
        return action
    
    def train(self, episodes = 2):
        
        for i in range(episodes):
            
            self.memory.reset()
            s = self.env.reset()
            step = 0
            
            while True:
                action = self.get_actions(s)
                s_1, r, done, info = self.env.step(action)
                
                # Redesign Reward
                
#                 r0 = 0
#                 if r > -1:
#                     r0 = (r +1) * 2000
#                 r = abs(s_1[0] - (-0.5)) + r0
                
                self.memory.append(Transition(s, action, s_1, r, done))
                
                s = s_1
                step += 1
                
                if done:
                    break
                    
            s, a, s_1, r, done =  self.memory.sample()
            
            s = tf.constant(s, dtype = tf.float32)
            a = tf.one_hot(a, depth=self.action_space, dtype = tf.int32)
            r = tf.constant(r, dtype = tf.float32)
            s_1 = tf.constant(s_1, dtype = tf.float32)
            
            d = tf.expand_dims(tf.constant(1-done, dtype = tf.float32), axis = 1) # stop s_1 reward if done = True
            
            
            ## key changes 
            # ADV Fomula:   ADV = (r + gamma * V(s_1)) - v(s)
            # ADV = (Reward + discounted next critic value of state) - (this critic value of state)  
            v = self.value_model(s)
            v_prime = self.value_model(s_1)
            
            q = r + self.gamma * d * v_prime # stop s_1 reward if done = True
            
            adv = q - v
            print('d :', d,'\n\nv_prime :',v_prime,'\n\nq :',q,'\n\nv :',v,'\n\nadv :',adv)
            
            with tf.GradientTape() as tape:
                v = self.value_model(s)
                loss = (q - v) ** 2
                loss_value = tf.reduce_mean(loss)
            grads = tape.gradient(loss_value, self.value_model.variables)
            self.value_optimizer.apply_gradients(zip(grads, self.value_model.variables),
                                                 global_step = tf.train.get_or_create_global_step())
                
            
            with tf.GradientTape() as tape:
                action_prob = self.policy_model(s)
                
                loss = action_prob * tf.cast(a, dtype = tf.float32)
                loss = tf.reduce_sum(loss, reduction_indices=1)
                loss = tf.log(loss)
                loss_value = - tf.reduce_mean(loss * adv) ## KEY ACTION
                
            grads = tape.gradient(loss_value, self.policy_model.variables)
            self.policy_optimizer.apply_gradients(zip(grads, self.policy_model.variables),
                                                 global_step = tf.train.get_or_create_global_step())
            
            self.episode_durations.append(step)
            
    def run(self, env):
        self.env = env
        s = self.env.reset()
        step = 0
        while True:
            self.env.render()
            action = self.get_actions(s)
            s_1, r, done, info = self.env.step(action)
            
            s = s_1
            step += 1
            if done:
                print("Episode finished successfully after {} timesteps".format(step))
                break
        self.env.close()

In [253]:
env = gym.make('CartPole-v0')
A2C_PolicyAgent = A2C_Agent(env)
A2C_PolicyAgent.train(10)

d : tf.Tensor(
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]], shape=(13, 1), dtype=float32) 

v_prime : tf.Tensor(
[[0.01215432]
 [0.02886874]
 [0.04535594]
 [0.06186518]
 [0.04653038]
 [0.0327043 ]
 [0.05053117]
 [0.03822826]
 [0.02719213]
 [0.04592883]
 [0.06477666]
 [0.055074  ]
 [0.07443726]], shape=(13, 1), dtype=float32) 

q : tf.Tensor(
[[1.0120327]
 [1.0285801]
 [1.0449023]
 [1.0612465]
 [1.0460651]
 [1.0323772]
 [1.0500258]
 [1.037846 ]
 [1.0269202]
 [1.0454695]
 [1.0641289]
 [1.0545232]
 [1.       ]], shape=(13, 1), dtype=float32) 

v : tf.Tensor(
[[-0.00344039]
 [ 0.01215432]
 [ 0.02886874]
 [ 0.04535594]
 [ 0.06186518]
 [ 0.04653038]
 [ 0.0327043 ]
 [ 0.05053117]
 [ 0.03822826]
 [ 0.02719213]
 [ 0.04592883]
 [ 0.06477666]
 [ 0.055074  ]], shape=(13, 1), dtype=float32) 

adv : tf.Tensor(
[[1.0154731]
 [1.0164257]
 [1.0160335]
 [1.0158906]
 [0.9841999]
 [0.9858469]
 [1.0173215]
 [0.9873148]
 [0.9886919]
 [1.0182774]
 [1.0182   ]
 [0.9897466]
 

d : tf.Tensor(
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]], shape=(17, 1), dtype=float32) 

v_prime : tf.Tensor(
[[0.64397687]
 [0.6336972 ]
 [0.7575054 ]
 [0.63291836]
 [0.75167936]
 [1.0608149 ]
 [1.3935616 ]
 [1.7330905 ]
 [1.4184921 ]
 [1.768259  ]
 [2.1266892 ]
 [2.4957664 ]
 [2.2119582 ]
 [1.9355538 ]
 [1.6656151 ]
 [2.056433  ]
 [1.8005538 ]], shape=(17, 1), dtype=float32) 

q : tf.Tensor(
[[1.6375371]
 [1.6273602]
 [1.7499304]
 [1.6265892]
 [1.7441626]
 [2.0502067]
 [2.379626 ]
 [2.7157598]
 [2.4043071]
 [2.7505765]
 [3.1054223]
 [3.4708087]
 [3.1898386]
 [2.9161983]
 [2.648959 ]
 [3.0358686]
 [1.       ]], shape=(17, 1), dtype=float32) 

v : tf.Tensor(
[[0.6392518 ]
 [0.64397687]
 [0.6336972 ]
 [0.7575054 ]
 [0.63291836]
 [0.75167936]
 [1.0608149 ]
 [1.3935616 ]
 [1.7330905 ]
 [1.4184921 ]
 [1.768259  ]
 [2.1266892 ]
 [2.4957664 ]
 [2.2119582 ]
 [1.9355538 ]
 [1.6656151 ]
 [2.056433  ]], shape=(17, 1), dtype=float32) 


In [239]:
A2C_PolicyAgent.train(100)

In [241]:
env = wrap_env(gym.make("CartPole-v0"))

A2C_PolicyAgent.run(env)
show_video()

Episode finished successfully after 200 timesteps


In [240]:
A2C_PolicyAgent.episode_durations

[18,
 42,
 45,
 38,
 22,
 33,
 33,
 77,
 27,
 61,
 31,
 31,
 20,
 28,
 10,
 39,
 38,
 59,
 26,
 51,
 31,
 26,
 17,
 54,
 16,
 11,
 14,
 17,
 15,
 27,
 39,
 22,
 41,
 31,
 19,
 16,
 63,
 45,
 24,
 58,
 19,
 38,
 18,
 32,
 27,
 52,
 21,
 21,
 58,
 28,
 39,
 22,
 37,
 20,
 40,
 35,
 9,
 32,
 30,
 65,
 47,
 26,
 66,
 26,
 40,
 17,
 56,
 46,
 41,
 36,
 26,
 27,
 23,
 25,
 18,
 35,
 50,
 46,
 33,
 46,
 71,
 51,
 107,
 57,
 70,
 29,
 51,
 67,
 56,
 55,
 38,
 44,
 56,
 50,
 73,
 86,
 49,
 41,
 43,
 42,
 37,
 58,
 49,
 58,
 46,
 65,
 61,
 58,
 56,
 74,
 79,
 48,
 37,
 59,
 73,
 36,
 62,
 57,
 27,
 71,
 79,
 51,
 47,
 50,
 73,
 50,
 43,
 47,
 64,
 43,
 64,
 92,
 39,
 40,
 46,
 37,
 73,
 54,
 34,
 49,
 36,
 82,
 50,
 64,
 39,
 61,
 36,
 50,
 38,
 36,
 27,
 55,
 66,
 45,
 66,
 56,
 59,
 74,
 55,
 91,
 68,
 200,
 54,
 102,
 56,
 119,
 51,
 76,
 115,
 46,
 76,
 80,
 55,
 107,
 75,
 150,
 56,
 64,
 90,
 145,
 93,
 32,
 74,
 78,
 74,
 151,
 122,
 124,
 71,
 184,
 100,
 82,
 88,
 113,
 84,
 74,
 76,
 7

In [164]:
env = gym.make('MountainCar-v0')
A2C_PolicyAgent = A2C_Agent(env)
A2C_PolicyAgent.train(500)

In [174]:
A2C_PolicyAgent.train(500)

In [175]:
env = wrap_env(gym.make('MountainCar-v0'))

A2C_PolicyAgent.run(env)
show_video()

Episode finished successfully after 200 timesteps


In [157]:
pd.Series(A2C_PolicyAgent.episode_durations).min()

200

In [171]:
s, a, s1, r, d = A2C_PolicyAgent.memory.sample()

In [172]:
r

array([[0.06526598],
       [0.06664776],
       [0.06870757],
       [0.07143009],
       [0.07479511],
       [0.07877766],
       [0.08334827],
       [0.08847314],
       [0.09411453],
       [0.10023098],
       [0.10677774],
       [0.1137071 ],
       [0.12096886],
       [0.12851068],
       [0.1362786 ],
       [0.14421742],
       [0.15227121],
       [0.16038375],
       [0.16849895],
       [0.1765613 ],
       [0.18451627],
       [0.19231069],
       [0.19989312],
       [0.20721412],
       [0.2142266 ],
       [0.22088604],
       [0.22715071],
       [0.23298188],
       [0.23834391],
       [0.24320447],
       [0.24753459],
       [0.2513087 ],
       [0.25450474],
       [0.25710422],
       [0.25909218],
       [0.26045722],
       [0.26119158],
       [0.2612911 ],
       [0.2607552 ],
       [0.25958696],
       [0.25779298],
       [0.25538352],
       [0.2523724 ],
       [0.24877708],
       [0.24461845],
       [0.23992099],
       [0.23471263],
       [0.229

In [176]:
3**10


59049