In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pylab import *

In [2]:
# Import definitions of the environments.
import RL_worlds as worlds

# Import helper functions for plotting.
from plot_util import *

In [3]:
def epsilon_greedy(state, value, params):
    """
    Choose an action with an epsilon-greedy policy.
    Args:
        state: integer corresponding to the current state.
        value: a matrix indexed by state and action.
        params: a dictionary containing the default parameters; this
            function reads params["epsilon"] and
            params["environment"].n_actions.
    Returns:
        an integer corresponding to the action chosen.
    """
    # Exploit with probability 1 - epsilon, otherwise explore uniformly.
    if np.random.rand() < 1 - params["epsilon"]:
        # Call np.argmax explicitly instead of a bare `argmax`, which is
        # only in scope through a star import. Ties resolve to the lowest
        # action index.
        a = np.argmax(value[state, :])
    else:
        a = np.random.randint(params["environment"].n_actions)
    return a

In [4]:
def init_state(params):
    """
    Pick the state in which an episode starts.
    Args:
        params: a dictionary containing the default parameters; only
            params['environment'].name is consulted.
    Returns:
        an integer corresponding to the initial state, or None when the
        environment name is not one of the known worlds.
    """
    # Starting state of each known world, keyed by environment name.
    start_states = {
        'windy_cliff_grid': 0,
        'n_armed_bandit': 0,
        'cheese_world': 0,
        'cliff_world': 0,
        'quentins_world': 54,
    }
    world_name = params['environment'].name
    return start_states.get(world_name)

def update_state(state, action, params):
    """
    Ask the environment what happens when an action is taken in a state.
    Args:
        state: integer corresponding to the current state.
        action: integer corresponding to the action taken.
        params: a dictionary containing the default parameters; the
            transition is delegated to params['environment'].get_outcome.
    Returns:
        a pair (next state, reward) as produced by the environment.
    """
    world = params['environment']
    new_state, payoff = world.get_outcome(state, action)
    return new_state, payoff
    
def call_policy(state, value, params):
    """
    Call a policy to choose actions, given current state and value function.
    Args:
        state: integer corresponding to the current state.
        value: a matrix indexed by state and action.
        params: a dictionary containing the default parameters; reads
            params['policy'] and, for the random fallback,
            params['environment'].n_actions.
    Returns:
        an integer corresponding to the action chosen according to the policy.
    """
    # multiple options for policy
    policy = params['policy']
    if policy == 'epsilon_greedy':
        return epsilon_greedy(state, value, params)
    elif policy == 'softmax':
        # NOTE(review): `softmax` is not defined in the visible part of this
        # notebook -- confirm it is provided elsewhere before selecting it.
        return softmax(state, value, params)
    else:  # random policy (if policy not recognized, choose randomly)
        # Use np.random.randint explicitly (as epsilon_greedy does) rather
        # than a bare `randint` that only exists through a star import.
        return np.random.randint(params['environment'].n_actions)

def update_value(prev_state, action, reward, state, value, params):
    """
    Update the value function.
    Args:
        prev_state: an integer corresponding to the previous state.
        action: an integer corresponding to the action taken.
        reward: a float corresponding to the reward received.
        state: an integer corresponding to the current state;
          should be None if the episode ended.
        value: a matrix indexed by state and action.
        params: a dictionary containing the default parameters;
          params['learning_rule'] selects the update rule.
    Returns:
        the updated value function (matrix indexed by state and action);
        if the learning rule is not recognized, a message is printed and
        `value` is returned unchanged.
    """
    rule = params['learning_rule']
    if rule == 'q_learning':
        # off policy learning
        return q_learning(prev_state, action, reward, state, value, params)
    elif rule == 'sarsa':
        # on policy learning
        return sarsa(prev_state, action, reward, state, value, params)
    else:
        print('Learning rule not recognized')
        # Hand back the table untouched instead of falling through to an
        # implicit None, which a caller doing `value = update_value(...)`
        # would otherwise store as its value function.
        return value

def default_params(environment):
    """
    Build the dictionary of default parameters for an environment.
    Args:
        environment: an object corresponding to the environment; its
            `name` attribute selects the discount factor.
    Returns:
        a dictionary mapping parameter names (strings) to their default
            values. The 'gamma' entry is only present when the
            environment name is one of the known worlds.
    """
    # Temporal discount factor per known world (1.0 means no discounting).
    discount_by_world = {
        'windy_cliff_grid': 0.6,
        'n_armed_bandit': 0.9,
        'cliff_world': 1.0,
        'cheese_world': 0.5,
        'quentins_world': 0.9,
    }

    settings = {
        'environment': environment,
        'alpha': 0.1,  # learning rate
        'beta': 10,  # inverse temperature
        'policy': 'epsilon_greedy',
        'epsilon': 0.05,  # epsilon-greedy policy
        'learning_rule': 'q_learning',
        'epsilon_decay': 0.9,
    }

    world_name = environment.name
    if world_name in discount_by_world:
        settings['gamma'] = discount_by_world[world_name]

    return settings

In [5]:
def RL(params, n_episodes=500, n_steps=1000):
    """
    Run the agent through a number of episodes, learning a value function.
    Args:
        params: a dictionary containing the default parameters;
            params["environment"] must expose n_states and n_actions.
        n_episodes: number of episodes to run.
        n_steps: maximum number of steps taken within one episode.
    Returns:
        the value function after the last update (it starts out as a
            zero matrix indexed by state and action);
        a list holding the summed reward of each episode.
    """
    env = params["environment"]
    value = np.zeros((env.n_states, env.n_actions))
    rewards = []
    for episode in range(n_episodes):
        state = init_state(params)
        reward_sum = 0
        for _ in range(n_steps):
            action = call_policy(state, value, params)
            next_state, reward = update_state(state, action, params)
            reward_sum += reward
            # The update still runs on the final transition, where
            # next_state is None.
            value = update_value(state, action, reward, next_state, value, params)
            state = next_state
            # A None next state ends the episode; test identity, not
            # equality, when comparing against the None singleton.
            if next_state is None:
                break
        rewards.append(reward_sum)
    return value, rewards

In [7]:
import numpy as np
from scipy import sparse
from scipy.sparse import csr_matrix

In [20]:
# Build the default parameter dictionary for the cheese-world environment.
params = default_params(worlds.cheese_world())

4

In [25]:
# Allocate the state-by-action table as a SciPy sparse LIL matrix with one row
# per state and one column per action of the environment.
Q = sparse.lil_matrix((params["environment"].n_states, params["environment"].n_actions))

In [26]:
# Roll the agent through short episodes using the policy selected in `params`.
# No value update is performed in this loop, so Q is only read, never written.
n_episodes = 100
n_steps = 10
for episode in range(n_episodes):
    state = init_state(params)
    # NOTE(review): reward_sum is reset each episode but never accumulated
    # or stored in this loop.
    reward_sum = 0
    for step in range(n_steps):
        action = call_policy(state, Q, params)
        next_state, reward = update_state(state, action, params)
        state = next_state
        # A None next state ends the episode; compare by identity with the
        # None singleton rather than by equality.
        if next_state is None:
            break

In [13]:
# NOTE(review): the dangling attribute access was a syntax error; summing the
# table is a guess that is consistent with the recorded 0.0 output -- confirm.
Q.sum()

0.0