In [25]:
import copy
import math
from typing import Any, Callable, Dict, Generic, Mapping, Sequence, Set, Tuple, TypeVar

import numpy as np
from scipy.linalg import eig
X = TypeVar('X')
A = TypeVar('A')
S = TypeVar('S')


#### Question 1
Implement the forward-view TD(λ) algorithm for value function prediction.

In [24]:
from scipy.stats import rv_discrete
def random_generator_based_on_prob(prob_dict: Mapping[S, float]) -> Callable[[], S]:
    """
    Build a zero-argument sampler over the keys of ``prob_dict``.

    @param prob_dict: maps each outcome (a state, or a (state, action) pair)
        to its probability; the probabilities are handed to
        ``scipy.stats.rv_discrete`` unchanged.
    @return a callable that takes no arguments and returns one key of
        ``prob_dict`` drawn according to the given probabilities
    @raise ValueError: if ``prob_dict`` is empty
    """
    if not prob_dict:
        raise ValueError("prob_dict must contain at least one outcome")

    outcomes, probabilities = zip(*prob_dict.items())
    # rv_discrete can build an arbitrary discrete distribution from a list of
    # support points and their probabilities. The support here is the integer
    # positions 0..n-1, which are mapped back to the real outcomes after sampling
    # (the outcomes themselves may be arbitrary hashable objects).
    rvd = rv_discrete(values=(range(len(outcomes)), probabilities))

    # The closure captures `rvd` and `outcomes`, so callers invoke it with no
    # arguments, as the declared return type Callable[[], S] promises.
    return lambda: outcomes[rvd.rvs(size=1)[0]]



In [5]:
# TODO: this class needs more thought and will be restructured
# TODO: state_reward_gen_dict still needs to be substantiated
# referenced from https://github.com/coverdrive/MDP-DP-RL/
from typing import Mapping, Set, Tuple, Sequence, Any, Callable, TypeVar
X = TypeVar('X')
A = TypeVar('A')
S = TypeVar('S')

class MDP:
    """
    A Markov decision process described by sampling functions rather than
    explicit transition probabilities.

    The MDP is initialized with a state -> actions dictionary and a
    state -> action -> sampler dictionary, which is the form needed by
    sample-based algorithms such as Monte Carlo or TD learning.

    Attributes set by ``__init__``:
        state_action_dict: the actions available in each state
        terminal_states: the set of terminal states
        state_reward_gen_dict: ``state_reward_gen_dict[s][a]()`` samples one
            transition; TDLambda unpacks the result as ``(next_state, reward)``
        gamma: the discount factor
        state_action_func(s): the actions available in ``s``
        terminal_state_func(s): whether ``s`` is terminal
        state_reward_gen_func(s, a): one sampled transition from ``(s, a)``
        init_state_generator(): a start state, uniform over all states
        init_state_action_generator(): a ``(state, action)`` pair, uniform
            over all pairs
    """
    def __init__(self,state_action_dict: Mapping[S, Set[A]], terminal_states: Set[S], state_reward_gen_dict, gamma: float) -> None:
        if not state_action_dict:
            raise ValueError("state_action_dict must contain at least one state")

        self.state_action_dict: Mapping[S, Set[A]] = state_action_dict
        self.terminal_states: Set[S] = terminal_states
        self.state_reward_gen_dict: Mapping[S, Mapping[A, Callable[[], Tuple[S, float]]]] = state_reward_gen_dict
        # NOTE: none of the assignments below may end with a comma -- a trailing
        # comma would store a 1-tuple instead of the value / callable itself.
        self.state_action_func = lambda x: self.state_action_dict[x]
        self.gamma = gamma
        self.terminal_state_func = lambda x: x in self.terminal_states
        self.state_reward_gen_func = lambda x, y: self.state_reward_gen_dict[x][y]()

        # initialize the state generator with equal probability of each state
        num_states = len(self.state_action_dict)
        self.init_state_generator = random_generator_based_on_prob(
            {s: 1. / num_states for s in self.state_action_dict.keys()}
        )

        # initialize the (state, action) generator with equal probability of each pair
        # (the pair count is loop-invariant, so it is computed once up front)
        num_pairs = sum(len(v) for v in self.state_action_dict.values())
        self.init_state_action_generator = random_generator_based_on_prob(
            {(s, a): 1. / num_pairs
             for s, v1 in self.state_action_dict.items() for a in v1}
        )



In [None]:
# Class prototypes used below, e.g. MDP and Policy.
# The Policy definition follows https://github.com/coverdrive/MDP-DP-RL/
# with a simplified test harness.
from operator import itemgetter
from typing import Mapping, Set, Tuple, Sequence, Any, Callable, TypeVar

X = TypeVar('X')
A = TypeVar('A')
S = TypeVar('S')

def epsilon_greedy(action_value_dict, epsilon: float) -> Mapping[A, float]:
    """
    Turn action values into epsilon-greedy action-selection probabilities.

    @param action_value_dict: maps each available action to its value
    @param epsilon: exploration rate; 0 means purely greedy
    @return: a dictionary mapping actions to their selection probability.
        With epsilon == 0 only the greedy action is present (probability 1).
        Otherwise every action gets epsilon / m, and the greedy action gets an
        additional 1 - epsilon, where m is the number of actions.
    @raise ValueError: if ``action_value_dict`` is empty
    """
    if not action_value_dict:
        raise ValueError("action_value_dict must contain at least one action")

    # greedy action = the one with the largest value (first one wins on ties)
    max_act = max(action_value_dict.items(), key=itemgetter(1))[0]
    if epsilon == 0:
        return {max_act: 1.}

    m = len(action_value_dict)
    # with probability epsilon pick uniformly among all m actions,
    # with probability 1 - epsilon pick the greedy action
    return {action: epsilon / m + (1. - epsilon if action == max_act else 0.)
            for action in action_value_dict.keys()}




class Policy(Generic[S, A]):
    """
    A stochastic policy stored as ``state -> {action: probability}``.

    The dictionary passed to the constructor is stored by reference (not
    copied), so ``edit_state_action_to_epsilon_greedy`` also changes the
    caller's dictionary.
    """

    def __init__(self, data: Dict[S, Mapping[A, float]]) -> None:
        """Wrap ``data``, a mapping from state to action probabilities."""
        self.policy_data = data

    def get_state_probabilities(self, state: S) -> Mapping[A, float]:
        """Return the action-probability mapping of ``state`` (KeyError if unknown)."""
        return self.policy_data[state]

    def get_state_action_probability(self, state: S, action: A) -> float:
        """Return the probability of ``action`` in ``state``; 0 if the action is not listed."""
        return self.get_state_probabilities(state).get(action, 0.)

    def edit_state_action_to_epsilon_greedy(
        self,
        state: S,
        action_value_dict: Mapping[A, float],
        epsilon: float
    ) -> None:
        """Replace the entry of ``state`` by the epsilon-greedy distribution of ``action_value_dict``."""
        self.policy_data[state] = epsilon_greedy(action_value_dict, epsilon)

    def __repr__(self):
        return repr(self.policy_data)

    def __str__(self):
        return str(self.policy_data)


In [27]:
class TDLambda:
    """
    Forward-view TD(lambda) prediction of the state-value function of a policy.

    Each episode is rolled out completely; afterwards every visited time step
    is moved towards its lambda-return

        G_t^lambda = (1 - lambda) * sum_{n=1}^{N-1} lambda^(n-1) * G_t^(n)
                     + lambda^(N-1) * G_t^(N)

    where G_t^(n) is the n-step return (n discounted rewards plus the
    discounted value estimate of the state reached after n steps) and N is the
    number of steps left in the episode.

    ``epsilon`` and ``learning_rate_decay`` are stored but not used by any
    method of this class; the learning rate is constant.
    """

    def __init__(self, mdp: "MDP", epsilon: float, learning_rate: float, learning_rate_decay: float, \
                 lambd: float, num_episodes: int, max_steps: int):
        """
        @param mdp: the MDP to sample from; this class reads its attributes
            gamma, terminal_states, state_action_dict, state_reward_gen_dict
            and init_state_generator
        @param epsilon: stored only
        @param learning_rate: step size of every value update
        @param learning_rate_decay: stored only (no decay is applied)
        @param lambd: the lambda of the lambda-return
        @param num_episodes: number of episodes run by ``update``
        @param max_steps: maximum number of transitions per episode
        """
        # NOTE: no trailing commas here -- they would store 1-tuples.
        self.mdp = mdp
        self.lambd = lambd
        self.epsilon = epsilon
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.learning_rate: float = learning_rate
        self.learning_rate_decay: float = learning_rate_decay
        self.gamma_lambda = self.mdp.gamma * lambd

    # a helper function used to calculate the geometric weighting of G_t_lambd
    @staticmethod
    def calculate_g_t_lambd(step, g_t_list, lambd):
        """
        Return ``(1 - lambd) * sum_{i >= step} lambd^(i - step) * g_t_list[i]``.

        @param step: index of the first entry of ``g_t_list`` to include
        @param g_t_list: sequence of returns to be weighted
        @param lambd: geometric decay of the weights
        @return the weighted sum; 0 when no entry is at or after ``step``
        """
        g_t_lambd = 0.
        current_coeff = 1.
        for i in range(step, len(g_t_list)):
            g_t_lambd += current_coeff * g_t_list[i]
            current_coeff *= lambd
        return (1.0 - lambd) * g_t_lambd

    def get_one_TDLambda_update(self, pol: "Policy", state_value_dict, action_generator_for_each_state):
        """
        Run one episode and return the value estimates after the update.

        @param pol: not read here; actions come from
            ``action_generator_for_each_state``
        @param state_value_dict: current state -> value estimates. It is not
            modified; it supplies the bootstrap values of the whole episode.
        @param action_generator_for_each_state: state -> zero-argument callable
            returning the action to take in that state
        @return a deep copy of ``state_value_dict`` in which every visited time
            step has been moved towards its lambda-return (every-visit update)
        """
        gamma = self.mdp.gamma
        # work on a copy so the caller's estimates stay untouched
        state_value = copy.deepcopy(state_value_dict)

        # generate the trace of this episode
        state_list = []
        reward_list = []

        # use the initial state generated by the MDP
        current_state = self.mdp.init_state_generator()

        # Value estimate of whatever follows the last recorded transition:
        # 0 when the episode ended in a terminal state, otherwise (episode cut
        # off by max_steps) the current estimate of the state reached last.
        tail_value = 0.
        for _ in range(self.max_steps):
            action_to_take = action_generator_for_each_state[current_state]()
            next_state, reward = self.mdp.state_reward_gen_dict[current_state][action_to_take]()
            state_list.append(current_state)
            reward_list.append(reward)
            # The transition out of a terminal state is recorded before the
            # episode stops, so terminal states need an action generator and a
            # transition sampler as well.
            if current_state in self.mdp.terminal_states:
                break
            # increment by one step
            current_state = next_state
        else:
            if state_list:
                tail_value = state_value_dict[current_state]

        # update the value function; only states visited in this episode change
        num_steps = len(state_list)
        for step in range(num_steps):
            # n_step_returns[j] is the (j + 1)-step return from `step`:
            # discounted rewards plus the discounted bootstrap value
            n_step_returns = []
            discounted_rewards = 0.
            discount = 1.
            for k in range(step, num_steps):
                discounted_rewards += discount * reward_list[k]
                discount *= gamma
                if k == num_steps - 1:
                    bootstrap = tail_value
                else:
                    # state_list[k + 1] is the state reached by transition k
                    bootstrap = state_value_dict[state_list[k + 1]]
                n_step_returns.append(discounted_rewards + discount * bootstrap)

            # every return but the longest gets weight (1 - lambda) * lambda^(n-1);
            # the longest one gets all the remaining weight lambda^(N-1)
            g_t_lambd = (
                self.calculate_g_t_lambd(0, n_step_returns[:-1], self.lambd)
                + self.lambd ** (len(n_step_returns) - 1) * n_step_returns[-1]
            )
            visited = state_list[step]
            state_value[visited] += self.learning_rate * (g_t_lambd - state_value[visited])

        return state_value

    # the update function running over all the episodes
    def update(self, pol: "Policy"):
        """
        Estimate the value function of ``pol`` from ``num_episodes`` episodes.

        @param pol: the policy to evaluate; ``pol.get_state_probabilities(s)``
            is read once per state to build the action samplers
        @return state -> value estimate, for every state of the MDP
        """
        states = list(self.mdp.state_action_dict.keys())
        # initiate the value for each state as 0
        V_s = {s: 0. for s in states}

        # action generator for each state
        action_generator_for_each_state = {
            s: random_generator_based_on_prob(pol.get_state_probabilities(s))
            for s in states
        }

        for _ in range(self.num_episodes):
            # get_one_TDLambda_update copies its input, so no extra copy here
            V_s = self.get_one_TDLambda_update(pol, V_s, action_generator_for_each_state)

        return V_s
    