In [1]:
# Importing the environment and other libraries
import numpy as np
from collections import defaultdict
import sys
import gym

env = gym.make('CartPole-v0')

# Following are the state's elements:
# Cart position, Cart Velocity, Pole Angle, Pole Velocity at Tip.
cart_pos, cart_vel, pole_ang, pole_vel =  env.reset()
print(cart_pos, cart_vel, pole_ang, pole_vel)

-0.02837771969829929 0.042875861838613166 0.04832027725337411 -0.012611676416141214


In [2]:
def generate_episode_random(env):
    float_formatter = lambda x: "%.1f" % x
    episode = []
    state = env.reset()
    while True:
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        episode.append((tuple([float(float_formatter(i)) for i in state]), action, reward))
        state = next_state
        if done:
            break
    return episode


In [3]:
def generate_episode_epsilon_greedy_policy(env, epsilon, Q):
    float_formatter = lambda x: "%.1f" % x
    episode = []
    state = env.reset()
    while True:
        state = tuple([float_formatter(i) for i in state])
        if (Q[state] == Q[state][0]).all():
            probs = [0.5]*env.action_space.n
        else:
            greedy_action_prob = (1-epsilon)+(epsilon/env.action_space.n)
            non_greedy_action_prob = (epsilon/env.action_space.n)
            probs = np.array([greedy_action_prob if i==Q[state].max() else non_greedy_action_prob for i in Q[state]])
        action = np.random.choice(np.arange(env.action_space.n), p=probs)
        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode

In [19]:
def monte_carlo(env, num_episodes, generate_episode, gamma=1.0, eps=None):
    # Initialize empty dictionaries of arrays.
    returns_sum = defaultdict(lambda: np.zeros(env.action_space.n))
    N = defaultdict(lambda: np.zeros(env.action_space.n))
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # Loop over episodes.
    for i_episode in range(1, num_episodes+1):
        # Monitor progress.
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        
        epsilon = 1/((i_episode/10)+1)
        if eps != None:
            epsilon = eps
        # Generate an episode.
        episode = generate_episode(env, epsilon, Q)
        # Get states, actions and rewards from the episode.
        states, actions, rewards = zip(*episode)
        # Prepare for discounting.
        discounts = np.array([gamma**i for i in range(len(rewards)+1)])
        # Update the sum of the returns, number of visits, and action-value 
        # function estimates for each state-action pair in the episode.
        for i, state in enumerate(states):
            returns_sum[state][actions[i]] += sum(rewards[i:]*discounts[:-(i+1)])
            N[state][actions[i]] += 1.0
            Q[state][actions[i]] = returns_sum[state][actions[i]]/N[state][actions[i]]
    
    policy = {k:np.argmax(v) for k, v in Q.items()}
    return policy

In [20]:
optimal_policy = monte_carlo(env, 10000, generate_episode_epsilon_greedy_policy, eps = 0.01)

Episode 10000/10000.

In [21]:
def simulation(env, num_episodes, policy):
    float_formatter = lambda x: "%.1f" % x
    ep_ter = []
    ep_rwd = []
    for i_episode in range(num_episodes):
        state = env.reset()
        t = 0
        total_reward = 0
        while(True):
            state = tuple([float_formatter(i) for i in state])
            env.render()
            if state in policy.keys():
                action = policy[state]
            else:
                action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            t += 1
            total_reward += reward
            if done:
                print("Episode finished after {} timesteps, total reward = {}".format(t+1, total_reward))
                ep_ter.append(t+1)
                ep_rwd.append(total_reward)
                env.close()
                break
    print("\n")
    print("Number of trials = {}".format(num_episodes))
    print("Average episode termination after {} timesteps following the given policy, average reward = {}".format
          (np.mean(ep_ter), np.mean(ep_rwd)))

In [22]:
simulation(env, 10, optimal_policy)

Episode finished after 67 timesteps, total reward = 66.0
Episode finished after 150 timesteps, total reward = 149.0
Episode finished after 113 timesteps, total reward = 112.0
Episode finished after 129 timesteps, total reward = 128.0
Episode finished after 53 timesteps, total reward = 52.0
Episode finished after 114 timesteps, total reward = 113.0
Episode finished after 85 timesteps, total reward = 84.0
Episode finished after 170 timesteps, total reward = 169.0
Episode finished after 121 timesteps, total reward = 120.0
Episode finished after 38 timesteps, total reward = 37.0


Number of trials = 10
Average episode termination after 104.0 timesteps following the given policy, average reward = 103.0
