In [1]:
import gym
# import matplotlib
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Sequential

In [2]:
# Create the CartPole-v1 environment; the cells below seed, reset and step this `env`.
env = gym.make('CartPole-v1')

In [3]:
def basic_policy(obs):
    """Hard-coded baseline policy driven only by the angle component of `obs`.

    Args:
        obs: indexable observation; only ``obs[2]`` (the angle) is read.

    Returns:
        0 when the angle is strictly negative, otherwise 1.
    """
    pole_angle = obs[2]
    if pole_angle < 0:
        return 0
    return 1
# Baseline run: play 500 episodes with `basic_policy`, capped at 200 steps each,
# and collect the summed reward of every episode in `totals`.
totals = []
for episode in range(500):
    env.seed(episode)  # the episode index doubles as the environment seed
    obs = env.reset()
    episode_rewards = 0
    for step in range(200):
        action = basic_policy(obs)
        step_result = env.step(action)
        obs, reward, done, info = step_result
        episode_rewards = episode_rewards + reward
        if not done:
            continue
        break  # the environment reported the end of the episode
    totals.append(episode_rewards)

In [4]:
# Reset the environment and display the initial observation it returns.
obs = env.reset()
obs

array([-0.04408514, -0.03586387, -0.01099802, -0.00518236], dtype=float32)

In [5]:
# Summary of the baseline policy's per-episode reward totals: mean, std, min, max.
print(np.mean(totals) , np.std(totals) , min(totals) , max(totals))

42.47 9.303176876744846 24.0 71.0


In [6]:
# Policy network: one hidden Dense layer (4 units, ReLU) followed by a single
# sigmoid unit. `play_one_step` treats that output as the probability of
# choosing action 0.
# The container is built with tf.keras.Sequential so that it comes from the same
# Keras implementation as the tf.keras layers it holds; mixing the standalone
# `keras` package with `tf.keras` objects is fragile when the two are not the
# same underlying version.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [7]:
def play_one_step(env, obs, loss_fn):
    """Sample one action from the policy network, step the env, and return gradients.

    Reads the module-level `model`. The network output is interpreted as the
    probability of action 0; the sampled action is then treated as the target,
    and the gradients of the resulting loss are returned (they are not applied
    here).

    Args:
        env: environment exposing ``step(action)`` that returns
            ``(obs, reward, done, info)``.
        obs: current observation; a batch axis is prepended before it is fed
            to `model`.
        loss_fn: callable ``loss_fn(y_target, y_pred)``; its result is averaged
            with ``tf.reduce_mean``.

    Returns:
        Tuple ``(obs, reward, done, info, grads)`` where the first four items
        come from ``env.step`` and `grads` holds the gradients of the loss with
        respect to ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        left_proba = model(obs[np.newaxis])
        # Sample the action: False (0) with probability left_proba, True (1) otherwise.
        action = (tf.random.uniform([1,1]) > left_proba)
        # Target for the network output given the sampled action:
        # 1.0 when action 0 was chosen, 0.0 when action 1 was chosen.
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target , left_proba))
    grads = tape.gradient(loss , model.trainable_variables)
    # `action` has shape (1, 1). Pull out the single element explicitly instead
    # of calling int() on the whole tensor, which relies on NumPy's deprecated
    # implicit conversion of a non-0-d array to a scalar.
    chosen_action = int(action[0, 0].numpy())
    obs , reward , done , info = env.step(chosen_action)
    return obs, reward, done, info, grads

def play_multiple_episodes(env , n_episodes , n_max_steps , loss_fn):
    """Play several episodes and collect per-step rewards and gradients.

    Args:
        env: environment; it is reset at the start of every episode.
        n_episodes: number of episodes to play.
        n_max_steps: maximum number of steps per episode.
        loss_fn: loss callable forwarded unchanged to `play_one_step`.

    Returns:
        Tuple ``(all_rewards, all_grads)``. Each is a list with one entry per
        episode, and each entry is a list with one item per step played
        (the step reward, and the gradients returned by `play_one_step`).
    """
    all_rewards, all_grads = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        obs = env.reset()
        for _ in range(n_max_steps):
            obs, reward, done, _info, step_grads = play_one_step(env, obs, loss_fn)
            episode_rewards.append(reward)
            episode_grads.append(step_grads)
            if done:
                # Episode ended before reaching the step cap.
                break
        all_rewards.append(episode_rewards)
        all_grads.append(episode_grads)
    return all_rewards, all_grads

def discount_rewards(rewards, discount_factor):
    """Compute the discounted sum of future rewards for every step.

    Args:
        rewards: sequence of per-step rewards for one episode.
        discount_factor: multiplier applied to the accumulated future reward
            at each step when walking backwards.

    Returns:
        float32 NumPy array shaped like `rewards`, where entry ``i`` equals
        ``rewards[i] + discount_factor * result[i + 1]`` (the last entry is
        just the last reward).
    """
    returns = np.zeros_like(rewards, dtype=np.float32)
    running_return = 0.0
    # Walk from the final step back to the first so each step can reuse the
    # already-accumulated return of the step after it.
    for step in range(len(rewards) - 1, -1, -1):
        running_return = running_return * discount_factor + rewards[step]
        returns[step] = running_return
    return returns

def discount_and_normalize(all_rewards, discount_factor):
    """Discount each episode's rewards, then standardize across all episodes.

    Args:
        all_rewards: list of per-episode reward sequences.
        discount_factor: forwarded to `discount_rewards`.

    Returns:
        List with one array per episode: the discounted rewards minus the mean
        of all discounted rewards (pooled over every episode), divided by their
        pooled standard deviation. When every discounted reward is identical
        (standard deviation 0), the result is all zeros rather than NaN.

    Raises:
        ValueError: from ``np.concatenate`` when `all_rewards` is empty.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    rewards_mean = flat_rewards.mean()
    rewards_std = flat_rewards.std()
    if rewards_std == 0:
        # All discounted rewards are equal, so every centered value is 0 and
        # dividing by the zero std would yield 0/0 = NaN, which would then
        # propagate into the gradients. Divide by 1 to return zeros instead.
        rewards_std = 1.0
    return [(discounted_rewards-rewards_mean)/rewards_std for discounted_rewards in all_discounted_rewards]

# Training hyperparameters.
n_iterations = 150            # number of policy updates
n_episodes_per_update = 10    # episodes played before each update
n_max_steps = 100             # step cap per episode
discount_factor = 0.95

optimizer = tf.keras.optimizers.Nadam(learning_rate = 0.01)
loss_fn = tf.keras.losses.binary_crossentropy

# Each iteration: play a batch of episodes, turn the rewards into normalized
# discounted returns, weight every step's gradients by its return, average them
# per trainable variable, and apply a single optimizer update.
for iteration in range(n_iterations):
    all_rewards , all_grads = play_multiple_episodes(env , n_episodes_per_update , n_max_steps , loss_fn)
    all_final_rewards = discount_and_normalize(all_rewards , discount_factor)
    all_mean_grad = []
    for var_index in range(len(model.trainable_variables)):
        # `final_rewards` is one episode's array and `final_reward` is a single
        # step's value; keeping the two names distinct avoids rebinding the
        # outer loop variable inside the inner loop.
        mean_grads = tf.reduce_mean([final_reward * all_grads[episode_index][step][var_index]
                                    for episode_index, final_rewards in enumerate(all_final_rewards)
                                    for step, final_reward in enumerate(final_rewards)],axis=0)
        all_mean_grad.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grad, model.trainable_variables))
    

In [8]:
# Persist the trained policy network to 'models/cartpole_v1.h5'
# (the .h5 extension presumably selects Keras' HDF5 format — confirm for the installed version).
model.save('models/cartpole_v1.h5')

