In [11]:
# Importing the environment and other libraries
import numpy as np
from collections import defaultdict
import sys
import gym

# Create the CartPole environment that the cells below pass around as `env`.
env = gym.make('CartPole-v0')

# Each observation has four components, in this order:
# cart position, cart velocity, pole angle, pole velocity at tip.
# NOTE(review): unpacking reset() into four values assumes the classic Gym API
# in which reset() returns only the observation -- confirm the installed version.
cart_pos, cart_vel, pole_ang, pole_vel =  env.reset()
print(cart_pos, cart_vel, pole_ang, pole_vel)

-0.029105537378187364 0.032860572248687256 -0.022353455871520246 -0.0439213817646438


In [12]:
def generate_episode_random(env):
    """Roll out one episode in which every action is sampled at random.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning a
            ``(next_state, reward, done, info)`` 4-tuple, and
            ``action_space.sample()``.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken.
        ``state`` is the observation seen *before* the step, with each
        component rounded to one decimal place and stored as a float.
    """
    def discretize(observation):
        # Round through "%.1f" formatting so the continuous observation
        # collapses onto a coarse grid that can be used as a dictionary key.
        return tuple(float("%.1f" % component) for component in observation)

    trajectory = []
    observation = env.reset()
    finished = False
    while not finished:
        chosen_action = env.action_space.sample()
        upcoming, reward, finished, _info = env.step(chosen_action)
        trajectory.append((discretize(observation), chosen_action, reward))
        observation = upcoming
    return trajectory


In [13]:
def action_values(env, num_episodes, generate_episode, gamma=1.0):
    """Estimate action values with every-visit Monte Carlo prediction.

    Args:
        env: Environment; only ``env.action_space.n`` is read here, the rest is
            up to ``generate_episode``.
        num_episodes: Number of episodes to sample.
        generate_episode: Callable ``generate_episode(env)`` returning a list of
            ``(state, action, reward)`` tuples. States must be hashable.
        gamma: Discount factor applied to future rewards.

    Returns:
        A ``defaultdict`` mapping each visited state to an array of length
        ``env.action_space.n`` holding the average discounted return observed
        after taking each action in that state (zero for untried actions).
    """
    n_actions = env.action_space.n
    # Per state: summed returns, visit counts and their ratio, one slot per action.
    returns_sum = defaultdict(lambda: np.zeros(n_actions))
    N = defaultdict(lambda: np.zeros(n_actions))
    Q = defaultdict(lambda: np.zeros(n_actions))

    for i_episode in range(1, num_episodes + 1):
        # Monitor progress.
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        episode = generate_episode(env)
        if not episode:
            # Nothing to learn from an episode with no steps.
            continue

        # Walk the episode backwards so the discounted return from each step is
        # obtained in O(1) from the return of the following step:
        #     G_t = r_t + gamma * G_{t+1}
        # instead of re-summing the whole reward tail at every step.
        discounted_return = 0.0
        for state, action, reward in reversed(episode):
            discounted_return = reward + gamma * discounted_return
            returns_sum[state][action] += discounted_return
            N[state][action] += 1.0
            Q[state][action] = returns_sum[state][action] / N[state][action]
    return Q

In [14]:
def generate_episode_epsilon_greedy_policy(bj_env, epsilon, Q):
    """Roll out one episode following an epsilon-greedy policy derived from Q.

    Args:
        bj_env: Environment exposing ``reset()``, ``step(action)`` returning a
            ``(next_state, reward, done, info)`` 4-tuple, and
            ``action_space.n``.
        epsilon: Exploration rate; the greedy action is chosen with probability
            ``1 - epsilon + epsilon / n`` and every other action with
            ``epsilon / n``.
        Q: Mapping from discretised state to an array of action values. It is
            indexed with ``Q[state]`` for every visited state, so it must
            either be a ``defaultdict`` or already contain those states.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken.
        ``state`` is a tuple of strings, each observation component formatted
        with one decimal place.

    NOTE(review): states are keyed by tuples of *strings* here, whereas
    ``generate_episode_random`` emits tuples of *floats*; a Q table built from
    one generator will never match states produced by the other.
    """
    # Use the environment that was passed in, not a module-level global.
    n_actions = bj_env.action_space.n
    episode = []
    state = bj_env.reset()
    while True:
        state = tuple("%.1f" % component for component in state)
        q_row = Q[state]
        if (q_row == q_row[0]).all():
            # No action stands out (e.g. unseen state): choose uniformly.
            probs = np.full(n_actions, 1.0 / n_actions)
        else:
            # Exactly one greedy action (first maximum on ties) so that the
            # probabilities always sum to one.
            probs = np.full(n_actions, epsilon / n_actions)
            probs[np.argmax(q_row)] += 1.0 - epsilon
        action = np.random.choice(n_actions, p=probs)
        next_state, reward, done, info = bj_env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode

In [15]:
# NOTE(review): `optimal_policy_evaluation` is not defined in any cell shown in
# this notebook, so this call fails with the NameError recorded below. Restore
# or define that function before this cell; `simulation(env, 100, optimal_policy)`
# further down depends on the name bound here.
optimal_policy = optimal_policy_evaluation(env, 100000, gamma=1.0)

NameError: name 'optimal_policy_evaluation' is not defined

In [9]:
def simulation(env, num_episodes, policy):
    """Render ``num_episodes`` episodes under ``policy`` and print statistics.

    Args:
        env: Environment exposing ``reset()``, ``render()``, ``close()``,
            ``step(action)`` returning a ``(next_state, reward, done, info)``
            4-tuple, and ``action_space.sample()``.
        num_episodes: Number of episodes to play.
        policy: Mapping from discretised state to action. States missing from
            the mapping fall back to a randomly sampled action.

    Returns:
        None. Per-episode length and total reward are printed, followed by
        their averages over all episodes.

    NOTE(review): states are looked up as tuples of *strings* (one decimal
    place per component), whereas ``generate_episode_random`` keys states by
    tuples of *floats*; a policy derived from those episodes will never match.
    """
    ep_ter = []
    ep_rwd = []
    try:
        for i_episode in range(num_episodes):
            state = env.reset()
            t = 0
            total_reward = 0
            done = False
            while not done:
                state = tuple("%.1f" % component for component in state)
                env.render()
                if state in policy:
                    action = policy[state]
                else:
                    action = env.action_space.sample()
                state, reward, done, info = env.step(action)
                t += 1
                total_reward += reward
            # `t` already counts every step taken, so report it as is.
            print("Episode finished after {} timesteps, total reward = {}".format(t, total_reward))
            ep_ter.append(t)
            ep_rwd.append(total_reward)
    finally:
        # Release the renderer once, after the last episode (or on error),
        # rather than closing the environment and then reusing it.
        env.close()
    print("\n")
    print("Number of trials = {}".format(num_episodes))
    if ep_ter:
        print("Average episode termination after {} timesteps following the given policy, average reward = {}".format
              (np.mean(ep_ter), np.mean(ep_rwd)))
    else:
        # Averaging an empty list would only yield nan plus a warning.
        print("No episodes were run, so no averages are available.")

In [10]:
# Play 100 rendered episodes using whatever policy is bound to `optimal_policy`.
# NOTE(review): `optimal_policy` is only assigned by the cell that calls
# `optimal_policy_evaluation`, which itself fails; the NameError recorded below
# and the out-of-order execution counts show this cell ran without it.
simulation(env, 100, optimal_policy)

NameError: name 'optimal_policy' is not defined