In [1]:
from typing import Mapping, Optional, Set, Callable
import numpy as np
from utils.generic_typevars import S, A
from utils.mp_funcs import get_rv_gen_func_single

### Implement Forward-View TD(Lambda) algorithm for Value Function Prediction

In [2]:
class MDPforRLTab:
    '''
        Tabular MDP container used by the model-free RL routines in this notebook.

        It stores a policy, the per-state action sets, the terminal states, a
        nested table of (next state, reward) samplers and the discount factor.
    '''
    def __init__(self, policy, actions: Mapping[S, Set[A]], terminal_states: Set[S], state_reward_gen_dict, gamma: float):
        '''
        Args:
            policy: stored as-is; no method of this class reads it.
            actions: mapping from each state to the set of actions available there.
            terminal_states: states for which get_terminal_states() returns True.
            state_reward_gen_dict: nested mapping state -> action -> zero-argument
                callable; get_state_reward_gen_func() calls it and returns its result.
            gamma: discount factor; stored as-is.
        '''
        self.policy = policy
        self.actions = actions
        self.terminal_states = terminal_states
        self.state_reward_gen_dict = state_reward_gen_dict # a dictionary of functions that generate the next state and reward
        self.gamma = gamma

    def get_actions(self, s):
        '''Return the action set registered for state `s` (KeyError if `s` is unknown).'''
        return self.actions[s]

    def get_terminal_states(self, s):
        '''Return True when `s` is one of the terminal states (a membership test, despite the plural name).'''
        return s in self.terminal_states

    def get_state_reward_gen_func(self, s, a):
        '''
        Draw one sample for taking action `a` in state `s`.

        Despite the name, this *calls* the stored sampler and returns its result;
        callers in this notebook unpack that result as (next_state, reward).
        '''
        return self.state_reward_gen_dict[s][a]()

    def init_state_gen(self):
        '''
        Build the start-state distribution: uniform over the keys of `actions`.

        Returns:
            Whatever get_rv_gen_func_single produces for the {state: probability}
            dict (presumably a zero-argument sampler -- confirm against utils.mp_funcs).
        '''
        n_states = len(self.actions)
        uniform = {s: 1. / n_states for s in self.actions}
        return get_rv_gen_func_single(uniform)

    def init_state_action_gen(self):
        '''
        Build the start (state, action) distribution: uniform over every
        (state, action) pair listed in `actions`.

        Returns:
            Whatever get_rv_gen_func_single produces for the {(state, action): probability}
            dict. The original built the dict but returned nothing.
        '''
        pairs = [(s, a) for s, acts in self.actions.items() for a in acts]
        # The pair count is loop-invariant; compute it once instead of per pair.
        n_pairs = len(pairs)
        uniform = {pair: 1. / n_pairs for pair in pairs}
        return get_rv_gen_func_single(uniform)
                

class RLTabInterface:
    '''
    A model-free RL interface that does not need the state-transition probability model or the reward model

    It wraps an MDP object and reaches it only through sampling-style calls
    (start state, next (state, reward) pair, terminal check, action sets).
    '''

    def __init__(self, mdp_for_rl_tab: MDPforRLTab, exploring_start: bool, softmax: bool, epsilon: float,
                 epsilon_half_life: float, num_episodes: int, max_steps: int):
        '''
        Args:
            mdp_for_rl_tab: the wrapped MDP; every method below delegates to it.
            exploring_start, softmax, epsilon, epsilon_half_life, num_episodes, max_steps:
                stored unchanged as attributes of the same name; no method of
                this class reads them.
        '''
        # The original assigned the undefined name `mdp_rep_for_rl`, which raised
        # NameError on every construction.
        self.mdp = mdp_for_rl_tab
        self.exploring_start = exploring_start
        self.softmax = softmax
        self.epsilon = epsilon
        self.epsilon_half_life = epsilon_half_life
        self.num_episodes = num_episodes
        self.max_steps = max_steps

    # The episode/prediction functions in this notebook call `.actions`,
    # `.init_state_gen()` and `.get_state_reward_gen_func()` on arguments
    # annotated as RLTabInterface, so expose them as pass-throughs.
    @property
    def actions(self) -> Mapping[S, Set[A]]:
        '''State -> action-set mapping of the wrapped MDP.'''
        return self.mdp.actions

    def init_state_gen(self):
        '''Delegate to the wrapped MDP's init_state_gen().'''
        return self.mdp.init_state_gen()

    def get_state_reward_gen_func(self, s, a):
        '''Delegate to the wrapped MDP's get_state_reward_gen_func(s, a).'''
        return self.mdp.get_state_reward_gen_func(s, a)

    # get a state-action dictionary
    def get_actions(self) -> Mapping[S, Set[A]]:
        return self.mdp.actions

    # check whether a state is a terminal state
    def get_terminal_states(self, s) -> bool:
        return self.mdp.get_terminal_states(s)

    # get a sampling of the (next state, reward) pair
    def get_next_pair(self, s, a):
        next_state, reward = self.mdp.get_state_reward_gen_func(s, a)
        # The original sampled the pair and then dropped it (implicit None).
        return next_state, reward
        

In [3]:
def generateEpisode(mdp: RLTabInterface, num_episode: int, max_steps: int, get_action: Callable[[int], int]):
    '''
    Roll out `num_episode` episodes by following `get_action`.

    Args:
        mdp: environment object. It is used duck-typed and must provide
            init_state_gen(), get_terminal_states(s) and
            get_state_reward_gen_func(s, a) returning (next_state, reward).
        num_episode: number of episodes to generate.
        max_steps: upper bound on the number of transitions per episode.
        get_action: maps the current state to the action to take.

    Returns:
        A list with one entry per episode. Each entry is the list of
        (state, action, reward) triples in visiting order. An episode stops
        early once a terminal state is reached, and is empty when its start
        state is already terminal.
    '''
    paths = []
    for _ in range(num_episode):
        start = mdp.init_state_gen()
        # MDPforRLTab.init_state_gen returns the result of get_rv_gen_func_single,
        # which looks like a sampler rather than a state -- draw from it when it
        # is callable. TODO confirm against utils.mp_funcs.
        cur_state = start() if callable(start) else start

        path = []
        # One loop for all steps: the original took one step before its loop and
        # so recorded max_steps + 1 transitions.
        for _ in range(max_steps):
            if mdp.get_terminal_states(cur_state):
                break  # no action is taken from a terminal state
            action = get_action(cur_state)
            next_state, reward = mdp.get_state_reward_gen_func(cur_state, action)
            path.append((cur_state, action, reward))
            cur_state = next_state
        paths.append(path)

    return paths

In [4]:
def foward_pred(mdp: RLTabInterface, num_episode: int, max_steps: int, alpha: float, gamma: float, lamb: float,
                 get_action: Callable[[int], int]):
    '''
    Forward-View TD(Lambda) algorithm for Value Function Prediction

    Episodes are generated up front with generateEpisode(); after that every
    visited state is moved towards its lambda-return.

    Args:
        mdp: environment object; must expose `.actions` (state -> action set)
            and whatever generateEpisode() needs.
        num_episode, max_steps, get_action: forwarded to generateEpisode().
        alpha: step size of the value update.
        gamma: discount factor.
        lamb: trace-decay parameter lambda in [0, 1]; 0 gives one-step TD
            targets, 1 gives Monte-Carlo returns.

    Returns:
        dict mapping state -> estimated value, initialised to 0.0 for every
        key of `mdp.actions`.

    Notes:
        Episodes only record (state, action, reward), so the state following
        the last recorded transition is unknown here and is treated as having
        value 0 (exact for terminated episodes, an approximation for episodes
        cut off at max_steps).
    '''

    paths = generateEpisode(mdp, num_episode, max_steps, get_action)

    vf_pred = {s: 0.0 for s in mdp.actions.keys()}
    for path in paths:
        # Lambda-returns via the backward recursion
        #   G_t = r_t + gamma * ((1 - lamb) * V(s_{t+1}) + lamb * G_{t+1}),
        # which equals (1 - lamb) * sum_n lamb^(n-1) * G_t^(n) plus the tail
        # weight on the full return. All targets use the value estimates from
        # before this episode's updates.
        targets = [0.0] * len(path)
        lambda_return = 0.0   # G_{t+1}; 0 beyond the end of the episode
        next_value = 0.0      # V(s_{t+1}); 0 beyond the end of the episode
        for t in range(len(path) - 1, -1, -1):
            state, _, reward = path[t]
            lambda_return = reward + gamma * ((1 - lamb) * next_value + lamb * lambda_return)
            targets[t] = lambda_return
            # .get(): a visited state might not be a key of mdp.actions.
            next_value = vf_pred.get(state, 0.0)

        # update the value function for every visited state, not only the first
        for (state, _, _), target in zip(path, targets):
            old_value = vf_pred.get(state, 0.0)
            vf_pred[state] = old_value + alpha * (target - old_value)
    return vf_pred


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
forward_pred = foward_pred

### Implement Backward-View TD(Lambda), i.e., the Eligibility Traces algorithm, for Value Function Prediction

In [5]:
def backward_pred(mdp: RLTabInterface, num_episode: int, max_steps: int, alpha: float, gamma: float, lamb: float,
                   get_action: Callable[[int], int]):
    '''
    Backward-View TD(Lambda) algorithm for Value Function Prediction

    Online TD(lambda) with accumulating eligibility traces.

    Args:
        mdp: environment object. It is used duck-typed and must provide
            `.actions`, init_state_gen(), get_terminal_states(s) and
            get_state_reward_gen_func(s, a) returning (next_state, reward).
        num_episode: number of episodes to run.
        max_steps: upper bound on the number of transitions per episode.
        alpha: step size.
        gamma: discount factor.
        lamb: trace-decay parameter lambda.
        get_action: maps the current state to the action to take.

    Returns:
        numpy array of length len(mdp.actions) holding the value estimates.
        States index this array directly, so non-terminal states are assumed
        to be the integers 0..len(mdp.actions)-1 -- TODO confirm with callers.
    '''
    # The original sized these arrays from an undefined global `tb_rl`.
    n_states = len(mdp.actions)
    vf_pred = np.zeros(n_states)

    for _ in range(num_episode):
        # Eligibility traces belong to one episode; start each episode from zero.
        e_t = np.zeros(n_states)

        start = mdp.init_state_gen()
        # init_state_gen may hand back a sampler instead of a state (see
        # MDPforRLTab.init_state_gen) -- draw from it when it is callable.
        cur_state = start() if callable(start) else start
        if mdp.get_terminal_states(cur_state):
            continue

        for _ in range(max_steps):
            action = get_action(cur_state)
            next_state, reward = mdp.get_state_reward_gen_func(cur_state, action)

            # Terminal states have value 0 and are never used as an array index.
            done = mdp.get_terminal_states(next_state)
            next_value = 0.0 if done else vf_pred[next_state]
            err = reward + gamma * next_value - vf_pred[cur_state]

            e_t *= lamb * gamma
            e_t[cur_state] += 1.0
            # Single trace-weighted update; the original also applied a separate
            # TD(0) update to cur_state, counting the current step twice.
            vf_pred += alpha * err * e_t

            if done:
                break
            cur_state = next_state

    return vf_pred