In [1]:
from typing import Mapping, Optional, Set, Callable
import numpy as np
from utils.generic_typevars import S, A
from utils.mp_funcs import get_rv_gen_func_single

### Write code for the interface for tabular RL algorithms. The core of this interface should be a mapping from a (state, action) pair to a sample of the (next state, reward) pair. It is important that this interface does not expose the state-transition probability model or the reward model.

In [2]:
class MDPforRLTab:
    '''
        Sampling-only representation of an MDP for tabular RL.

        The object stores a policy, the actions available in each state, the
        set of terminal states, a table of sampling functions and the discount
        factor. Transition probabilities and the reward model are never
        stored; the only access to the dynamics is drawing samples through
        state_reward_gen_dict.
    '''
    def __init__(self, policy, actions: Mapping[S, Set[A]], terminal_states: Set[S], state_reward_gen_dict, gamma: float):
        '''
        policy: mapping state -> {action: probability}
        actions: mapping state -> collection of actions available in that state
        terminal_states: set of states treated as terminal
        state_reward_gen_dict: state_reward_gen_dict[s][a] is a zero-argument
            function; each call returns one sampled (next_state, reward) pair
        gamma: discount factor
        '''
        self.policy = policy
        self.actions = actions
        self.terminal_states = terminal_states
        self.state_reward_gen_dict = state_reward_gen_dict # a dictionary of functions that generate the next state and reward
        self.gamma = gamma

    def get_actions(self, s):
        '''Return the actions available in state s.'''
        return self.actions[s]

    def get_terminal_states(self, s):
        '''Return True if s is a terminal state, False otherwise.'''
        return s in self.terminal_states

    def get_state_reward_gen_func(self, s, a):
        '''
        Draw one sample for the pair (s, a).

        Despite the name, this calls the stored generator function and returns
        its result, i.e. a sampled (next_state, reward) pair.
        '''
        return self.state_reward_gen_dict[s][a]()

    def init_state_gen(self):
        '''
        Build a sampler over start states, uniform over the keys of self.actions.

        Returns whatever get_rv_gen_func_single builds from the distribution
        (assumed to be a zero-argument function that draws one state per
        call -- confirm against utils.mp_funcs).
        '''
        num_states = len(self.actions)
        return get_rv_gen_func_single({s: 1. / num_states for s in self.actions})

    def init_state_action_gen(self):
        '''
        Build a sampler over start (state, action) pairs, uniform over every
        pair listed in self.actions.

        Returns whatever get_rv_gen_func_single builds from the distribution,
        mirroring init_state_gen. Previously the distribution was built and
        then dropped, so the method returned None.
        '''
        # The number of pairs does not change inside the loop: compute it once.
        num_pairs = sum(len(acts) for acts in self.actions.values())
        dic = {(s, a): 1. / num_pairs
               for s, acts in self.actions.items()
               for a in acts}
        return get_rv_gen_func_single(dic)
                



In [3]:
class RLTabInterface:
    '''
    A model-free RL interface that does not need the state-transition probability model or the reward model.

    It wraps an MDP object and exposes only the available actions, the
    terminal-state test and sampling of (next state, reward) pairs.
    '''

    def __init__(self, mdp_for_rl_tab: MDPforRLTab, exploring_start: bool, softmax: bool, epsilon: float,
                 epsilon_half_life: float, num_episodes: int, max_steps: int):
        '''
        mdp_for_rl_tab: the wrapped MDP; must provide `actions`,
            `get_terminal_states(s)` and `get_state_reward_gen_func(s, a)`
        exploring_start, softmax, epsilon, epsilon_half_life, num_episodes,
        max_steps: algorithm settings, stored unchanged as attributes of the
            same name for the algorithms built on this interface
        '''
        # The original body read an undefined name here (NameError on construction).
        self.mdp = mdp_for_rl_tab
        # Keep the settings instead of silently discarding them.
        self.exploring_start = exploring_start
        self.softmax = softmax
        self.epsilon = epsilon
        self.epsilon_half_life = epsilon_half_life
        self.num_episodes = num_episodes
        self.max_steps = max_steps

    # get a state-action dictionary
    def get_actions(self) -> Mapping[S, Set[A]]:
        '''Return the mapping state -> available actions of the wrapped MDP.'''
        return self.mdp.actions

    # check whether a state is a terminal state
    def get_terminal_states(self, s) -> bool:
        '''Return True if s is a terminal state of the wrapped MDP.'''
        return self.mdp.get_terminal_states(s)

    # get a sampling of the (next state, reward) pair
    def get_next_pair(self, s, a):
        '''
        Sample one transition for the pair (s, a).

        Returns the tuple (next_state, reward) produced by the wrapped MDP.
        Previously the sample was unpacked and then dropped.
        '''
        next_state, reward = self.mdp.get_state_reward_gen_func(s, a)
        return next_state, reward
        

### Implement any tabular Monte-Carlo algorithm for Value Function prediction

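Implement the first-visit Monte-Carlo algorithm: at the first time-step $t$ at which state $s$ is visited in an episode, increment the visit counter $N(s) \leftarrow N(s) + 1$ and move the estimate toward the observed return, $V(s) \leftarrow V(s) + \frac{1}{N(s)}\left(G_t - V(s)\right)$. Later visits to $s$ within the same episode are ignored.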

In [7]:
from utils.helper_funcs import get_returns_from_rewards_terminating
from utils.helper_funcs import get_returns_from_rewards_non_terminating

class MonteCarlo:
    '''
        First visit Monte-Carlo algorithm for value-function prediction.

        Episodes are sampled by following mdp.policy; for every state the
        return observed from its first visit in each episode is averaged.
    '''
    def __init__(self, mdp: MDPforRLTab, num_episodes: int, max_steps: int,
                 nt_return_eval_steps: Optional[int] = None):
        '''
        mdp: MDP object providing policy, actions, terminal_states,
            state_reward_gen_dict, gamma and init_state_gen()
        num_episodes: number of episodes to sample
        max_steps: maximum number of steps per episode
        nt_return_eval_steps: for episodes cut off by max_steps (last recorded
            state not terminal), only the first nt_return_eval_steps time
            steps contribute a return; None (default) uses every step, which
            gives the late steps heavily truncated returns
        '''
        self.mdp = mdp
        self.max_steps = max_steps
        self.num_episodes = num_episodes
        # Was read by get_value_func_dict but never assigned.
        self.nt_return_eval_steps = nt_return_eval_steps

    def generateEpisode(self, start_state, start_action = None):
        '''
        generate a single episode

        Returns a list of (state, action, reward, first) tuples, one per step,
        where `first` is True when the state had not been visited earlier in
        this episode. The episode stops after max_steps steps, or right after
        a step taken FROM a terminal state (that step is still recorded).
        '''
        res = []
        state = start_state
        steps = 0
        visited = set()
        # One action sampler per state, built from the policy's action distribution.
        # Assumes get_rv_gen_func_single returns a zero-argument sampling
        # function -- confirm against utils.mp_funcs.
        act_gen_dict = {s: get_rv_gen_func_single(self.mdp.policy[s]) for s in self.mdp.actions.keys()}

        while True:
            # check whether the state has been visited
            first = state not in visited
            visited.add(state)
            # Compare against None: a truthiness test would ignore action 0.
            if steps == 0 and start_action is not None:
                action = start_action
            else:
                # Call the sampler; the original stored the sampler itself as the action.
                action = act_gen_dict[state]()
            next_state, reward = self.mdp.state_reward_gen_dict[state][action]()
            res.append((state, action, reward, first))
            steps += 1
            if steps >= self.max_steps or state in self.mdp.terminal_states:
                break
            state = next_state
        return res

    @staticmethod
    def _discounted_returns(rewards, gamma):
        '''Return the list G with G[t] = sum_k gamma**k * rewards[t + k].'''
        returns = [0.0] * len(rewards)
        running = 0.0
        # Backward pass: G[t] = r[t] + gamma * G[t + 1].
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    def get_value_func_dict(self):
        '''
        predict value function

        Returns a dict state -> estimated value: the average, over episodes,
        of the return following the first visit to that state. States that
        are never visited keep the initial value 0.0.
        '''
        vf_dict = {s: 0.0 for s in self.mdp.actions.keys()}
        counts = {s: 0 for s in vf_dict}
        # init_state_gen() returns a sampler; it has to be called to obtain a state.
        start_state_gen = self.mdp.init_state_gen()

        for _ in range(self.num_episodes):
            path = self.generateEpisode(start_state_gen(), start_action=None)
            rewards = [r for _, _, r, _ in path]
            returns = self._discounted_returns(rewards, self.mdp.gamma)

            terminated = path[-1][0] in self.mdp.terminal_states
            if terminated or self.nt_return_eval_steps is None:
                usable = len(path)
            else:
                # Truncated episode: only the leading steps have enough look-ahead.
                usable = self.nt_return_eval_steps

            for (s, _, _, first), g in zip(path[:usable], returns):
                if first:
                    counts[s] += 1
                    # Incremental mean of the first-visit returns.
                    vf_dict[s] += (g - vf_dict[s]) / counts[s]

        return vf_dict



### Implement tabular 1-step TD algorithm for Value Function prediction

In [10]:
def TD(mdp: MDPforRLTab, num_episodes, max_steps, gamma, alpha):
    '''
    Tabular 1-step TD (TD(0)) prediction of the value function of mdp.policy.

    mdp: MDP object providing policy, actions, terminal_states,
        state_reward_gen_dict and init_state_gen()
    num_episodes: number of episodes to sample
    max_steps: maximum number of transitions per episode
    gamma: discount factor used in the TD target
    alpha: fixed learning rate (step size)

    Returns a dict state -> estimated value. After every sampled transition
    (s, r, s') the update is V(s) += alpha * (r + gamma * V(s') - V(s)).
    Terminal states are never updated, so their value stays 0.
    '''
    vf_dict = {s: 0.0 for s in mdp.actions.keys()}
    # One action sampler per state, built from the policy's action distribution.
    # Assumes get_rv_gen_func_single returns a zero-argument sampling
    # function -- confirm against utils.mp_funcs.
    act_gen_dict = {s: get_rv_gen_func_single(probs) for s, probs in mdp.policy.items()}
    # init_state_gen() returns a sampler; it has to be called to obtain a state.
    start_state_gen = mdp.init_state_gen()

    for _ in range(num_episodes):
        state = start_state_gen()
        for _ in range(max_steps):
            # Stop as soon as a terminal state is reached; episodes may be
            # shorter than max_steps.
            if state in mdp.terminal_states:
                break
            action = act_gen_dict[state]()
            next_state, reward = mdp.state_reward_gen_dict[state][action]()
            # Bootstrap from the state actually reached (0 if it has no entry).
            td_target = reward + gamma * vf_dict.get(next_state, 0.0)
            vf_dict[state] += alpha * (td_target - vf_dict[state])
            state = next_state

    return vf_dict


### Test the above implementation of Monte-Carlo and TD Value Function prediction algorithms versus DP Policy Evaluation algorithm on an example MDP

In [13]:
from utils.processes.mp_funcs import get_state_reward_gen_dict

# Example MDP with three states (0, 1, 2); actions 0, 1 and 2 are listed for every state.
actions = {state: [0, 1, 2] for state in range(3)}

# Nested as transitions[state][action][next_state]; presumably transition
# probabilities (every innermost row sums to 1).
transitions = {
    0: {
        0: {0: 0.3, 1: 0.6, 2: 0.1},
        1: {0: 0.2, 1: 0.4, 2: 0.4},
        2: {0: 0.1, 1: 0.5, 2: 0.4},
    },
    1: {
        0: {0: 0.4, 1: 0.5, 2: 0.1},
        1: {0: 0.2, 1: 0.5, 2: 0.3},
        2: {0: 0.3, 1: 0.2, 2: 0.5},
    },
    2: {
        0: {0: 0.3, 1: 0.3, 2: 0.4},
        1: {0: 0.2, 1: 0.6, 2: 0.2},
        2: {0: 0.3, 1: 0.4, 2: 0.3},
    },
}

# Same nesting as `transitions`; presumably the reward attached to each
# (state, action, next_state) triple.
rewards = {
    0: {
        0: {0: 1.0, 1: 2.0, 2: 3.0},
        1: {0: 2.0, 1: 3.0, 2: 1.0},
        2: {0: 3.0, 1: 2.0, 2: 1.0},
    },
    1: {
        0: {0: 2.0, 1: 2.4, 2: 3.5},
        1: {0: 4.0, 1: 3.9, 2: 5.0},
        2: {0: 3.2, 1: 7.0, 2: -2.0},
    },
    2: {
        0: {0: 4.0, 1: 2.5, 2: -2.0},
        1: {0: 3.8, 1: 4.0, 2: 1.6},
        2: {0: 3.1, 1: -2.0, 2: 1.5},
    },
}

# Policy under evaluation: state -> {action: probability}.
policy = {
    0: {0: 0.5, 1: 0.1, 2: 0.4},
    1: {0: 0.4, 2: 0.6},
    2: {1: 1.0},
}

# Learning rate, discount factor and episode budget for the run below.
alpha = 0.1
gamma = 1
num_episodes = 10
max_steps = 10

# Table of sampling functions built from the two model tables above.
state_reward_gen_dict = get_state_reward_gen_dict(transitions, rewards)

# No terminal states: an empty set is passed.
mdp = MDPforRLTab(policy, actions, set(), state_reward_gen_dict, gamma)
print(TD(mdp, num_episodes, max_steps, gamma, alpha))

{0: 6.516608108123774, 1: 6.991429305383525, 2: 8.610530839162942}


### Prove that fixed learning rate (step size alpha) for MC is equivalent to an exponentially decaying average of episode returns

For $k^{th}$ update,
$$ \begin{split}
V^k(S_t) &= V^{k-1}(S_t) + \alpha (G^{k-1}_t -V^{k-1}(S_t) )\\
&= (1-\alpha)V^{k-1}(S_t)+\alpha G^{k-1}_t \\
&= (1-\alpha)((1-\alpha)V^{k-2}(S_t)+ \alpha G^{k-2}_t)+ \alpha G^{k-1}_t \\
&= \alpha G^{k-1}_t + (1-\alpha)\alpha G^{k-2}_t + ... +(1-\alpha)^{k-1} \alpha G^{0}_t + (1-\alpha)^{k} V^{0}(S_t)
\end{split}$$ 
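The weights $\alpha(1-\alpha)^{j}$ on $G^{k-1-j}_t$ (for $j = 0, \dots, k-1$) decay geometrically with the age $j$ of the return, and together with the weight $(1-\alpha)^{k}$ on the initial value $V^{0}(S_t)$ they sum to $1$, since $\alpha \sum_{j=0}^{k-1} (1-\alpha)^{j} = 1 - (1-\alpha)^{k}$. Therefore, a fixed learning rate $\alpha$ is equivalent to an exponentially decaying (recency-weighted) average of episode returns.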