In [2]:
import gym
import time
import numpy as np
import tensorflow as tf
from tensorflow.python.client import device_lib
from tensorflow import keras
import copy

print(tf.__version__)
print(device_lib.list_local_devices())

2.8.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 916079750291223612
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3667263488
locality {
  bus_id: 1
  links {
  }
}
incarnation: 14999928193529444350
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


In [22]:
def make_dense_model(hidden_structure,input_shape,act_function,output_shape):
    inputs = tf.keras.Input(shape=input_shape)
    
    for i, val in enumerate(hidden_structure):
        n_percep = val 
        if i == 0:
            # x = keras.layers.Dense(n_percep, activation= act_function)(inputs)
            x = keras.layers.Dense(n_percep)(inputs)

        elif  i != 0 and i != len(hidden_structure)-1:
            x = keras.layers.Dense(n_percep, activation= act_function)(x)
            # x = keras.layers.Dense(n_percep)(x)
            

        elif  i == len(hidden_structure)-1:
            x = keras.layers.Dense(n_percep,activation= act_function)(x)
            # x = keras.layers.Dense(n_percep)(x)
    x = keras.layers.Dense(output_shape, activation='softmax')(x)

    dense_model = tf.keras.Model(inputs=inputs, outputs=x)
    return dense_model

def sample_noise(net):
    pos = [] 
    neg = []
    for layer in net.trainable_weights:
        noise = tf.convert_to_tensor(np.random.normal(loc=0.0, scale=NOISE_STD, size=layer.shape),dtype=tf.float32)
        pos.append(noise)
        neg.append(-noise)
    return pos, neg

def evaluate(env, net):
    s = env.reset()
    reward = 0.0
    steps = 0
    s_v = tf.convert_to_tensor([s],dtype=tf.float32)
    a = net(s_v).numpy().argmax()
    while True:
        ns, r, done, _ = env.step(a)
        reward += r
        steps += 1
        if done:
            break
    return reward, steps

def eval_with_noise(env, net, noise):
    old_params = copy.deepcopy(net.weights)
    for w, w_n in zip(net.trainable_variables, noise):
        w.assign_add(w_n)
    r, s = evaluate(env, net)
    net.set_weights(old_params)
    return r, s

def train_step(net, batch_noise, batch_reward, step_idx):
    norm_reward = np.array(batch_reward)
    norm_reward -= np.mean(norm_reward)
    s = np.std(norm_reward) 
    if abs(s) > 1e-6:
        norm_reward /= s
    weighted_noise = None
    for noise, reward in zip(batch_noise, norm_reward):
        if weighted_noise is None:
            weighted_noise = [reward * p_n for p_n in noise]
        else:
            for w_n, p_n in zip(weighted_noise, noise):
                w_n += reward * p_n
    
    for p, p_update in zip(net.trainable_variables, weighted_noise):
        update = p_update / (len(batch_reward) * NOISE_STD)
        grad = tf.convert_to_tensor(LEARNING_RATE * update,dtype=tf.float32)
        p.assign_add(grad)
        

MAX_BATCH_EPISODES = 100
MAX_BATCH_STEPS = 10000
NOISE_STD = 0.1
LEARNING_RATE = 0.001
hidden_structure= [32]

input = 4
output = 2

env = gym.make('CartPole-v1')

model = make_dense_model(hidden_structure, input, 'relu', output)

In [23]:
step_idx = 0
while True:
    t_start = time.time()
    batch_noise = []
    batch_reward = []
    batch_steps = 0
    for _ in range(MAX_BATCH_EPISODES):
        noise, neg_noise = sample_noise(model)
        batch_noise.append(noise)
        batch_noise.append(neg_noise)
        reward, steps = eval_with_noise(env, model, noise)
        batch_reward.append(reward)
        batch_steps += steps
        reward, steps = eval_with_noise(env, model, neg_noise)
        batch_reward.append(reward)
        batch_steps += steps
        if batch_steps > MAX_BATCH_STEPS:
            break
    step_idx += 1
    m_reward = np.mean(batch_reward)
    if m_reward > 199:
        print("Solved in %d steps" % step_idx)
        break
    train_step(model, batch_noise, batch_reward,step_idx)
    print(f"reward_avg: {m_reward}")
    print(f"batch_steps: {batch_steps}, step_idx: {step_idx}")
    print(f"iter per s: {batch_steps/(time.time()-t_start)}")









reward_avg: 9.385
batch_steps: 1877, step_idx: 1
iter per s: 1259.732140279798
reward_avg: 9.365
batch_steps: 1873, step_idx: 2
iter per s: 1298.8894352046752
reward_avg: 9.2
batch_steps: 1840, step_idx: 3
iter per s: 1226.667251587193
reward_avg: 9.415
batch_steps: 1883, step_idx: 4
iter per s: 1265.4578716640603
reward_avg: 9.33
batch_steps: 1866, step_idx: 5
iter per s: 1280.7118942385262
reward_avg: 9.385
batch_steps: 1877, step_idx: 6
iter per s: 1276.873208593141
reward_avg: 9.405
batch_steps: 1881, step_idx: 7
iter per s: 1312.6314191901251
reward_avg: 9.415
batch_steps: 1883, step_idx: 8
iter per s: 1259.529860326794
reward_avg: 9.29
batch_steps: 1858, step_idx: 9
iter per s: 1240.3218864693317


KeyboardInterrupt: 