In [1]:
import gym
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
## Basic NN with one hidden layer 
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(32, input_dim = 4, activation='relu'))
model.add(tf.keras.layers.Dense(2, activation = "softmax"))
model.build()
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
compute_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [3]:
def discount_rewards(r, gamma = 0.8):
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

In [4]:
env = gym.make('CartPole-v0')
episodes = 2000
scores = []
update_every = 5

gradBuffer = model.trainable_variables
for ix,grad in enumerate(gradBuffer):
    gradBuffer[ix] = grad * 0  
    
for e in range(episodes):
    s = env.reset()
    ep_memory = []
    ep_score = 0
    done = False 
    while not done: 
        s = s.reshape([1,4])
        with tf.GradientTape() as tape:
            #forward pass
            logits = model(s)
            a_dist = logits.numpy()
            # Choose random action with p = action dist
            a = np.random.choice(a_dist[0],p=a_dist[0])
            a = np.argmax(a_dist == a)
            loss = compute_loss([a], logits)
        # make the choosen action 
        s, r, done, _ = env.step(a)
        ep_score +=r
        if done: r-=10
        grads = tape.gradient(loss, model.trainable_variables)
        ep_memory.append([grads,r])
        scores.append(ep_score)
    # Discound the rewards 
    ep_memory = np.array(ep_memory)
    ep_memory[:,1] = discount_rewards(ep_memory[:,1])

    for grads, r in ep_memory:
        for ix,grad in enumerate(grads):
            gradBuffer[ix] += grad * r

    if e % update_every == 0:
        optimizer.apply_gradients(zip(gradBuffer, model.trainable_variables))
        for ix,grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0

    if e % 100 == 0:
        print("Episode  {}  Score  {}".format(e, np.mean(scores[-100:])))

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode  0  Score  14.0
Episode  100  Score  20.22
Episode  200  Score  35.29
Episode  300  Score  36.55
Episode  400  Score  54.4
Episode  500  Score  68.5
Episode  600  Score  150.5
Episode  700  Score  150.5
Episode  800  Score  45.94
Episode  900  Score  39.7
Episode  1000  Score  42.82
Episode  1100  Score  133.5
Episode  1200  Score  122.5
Episode  1300  Score  130.5
Episode  1400  Score  150.5
Episode  1500  Score  150.5
Episode  1600  Score  150.5
Episode  1700  Score  150.5
Episode  1800  Score  81.5
Episode  1900  Score  67.5
