### Assignment 11

#### Question 1
Write code for the interface for RL algorithms with value function approximation. The core of this interface should be a function from a (state, action) pair to a sampling of the (next state, reward) pair. It is important that this interface doesn't present the state-transition probability model or the reward model.

In [7]:
import numpy as np
import pandas as pd
import scipy
from typing import TypeVar,Mapping, Set, Generic, Sequence, Callable, Tuple, Dict
import torch

In [11]:
from torch import nn
X = TypeVar('X')
A = TypeVar('A')
S = TypeVar('S')

In [9]:
# Policy approximation for Pi function
class Pi_NN(nn.Module):
    def __init__(self, input_size, state_size, hidden_size = 50):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.approximator = nn.Sequential(
            nn.Linear(input_size, 2 * hidden_size, bias = True),
            nn.ReLU(),
            nn.Linear(2 * hidden_size, hidden_size, bias = True),
            nn.ReLU(),
            nn.Linear(hidden_size, state_size, bias = True)
        )
        self.softmax = torch.nn.Softmax()
    def forward(self, feature):
        out = self.approximator(feature)
        out = self.softmax(out)
        return out

# Function approximation for the Q function     
class Q_NN(nn.Module):
    def __init__(self, input_size, hidden_size = 50):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.approximator = nn.Sequential(
            nn.Linear(input_size, 2 * hidden_size, bias = True),
            nn.ReLU(),
            nn.Linear(2 * hidden_size, hidden_size, bias = True),
            nn.ReLU(),
            nn.Linear(hidden_size, 1, bias = True)
        )
    def forward(self, feature):
        out = self.approximator(feature)
        return out

# Function approximation for V function    
class V_NN(nn.Module):
    def __init__(self, input_size, hidden_size = 50):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.approximator = nn.Sequential(
            nn.Linear(input_size, 2 * hidden_size, bias = True),
            nn.ReLU(),
            nn.Linear(2 * hidden_size, hidden_size, bias = True),
            nn.ReLU(),
            nn.Linear(hidden_size, 1, bias = True)
        )
    def forward(self, feature):
        out = self.approximator(feature)
        return out
            

In [14]:
class Policy:
    """
    The stochasic policy that maps a state to the transition probability
    The policy is a mapping from a state to an action, i.e. given a state, figuring out the probability of selecting some actions
    """
    def __init__(self, data: Dict[S, Mapping[A, float]]) -> None:
        self.data = data

    def get_state_probabilities(self, state: S) -> Mapping[A, float]:
        return self.data[state]

    def get_state_action_probability(self, state: S, action: A) -> float:
        return self.get_state_probabilities(state).get(action, 0.)

    # update state/ action probability using epsilon greedy
    def update_state_action_to_epsilon_greedy(self, state: S, action_value_dict: Mapping[A, float], epsilon: float):
        max_act = max(action_value_dict.items(), key=itemgetter(1))[0]
        if epsilon == 0:
            ret = {max_act: 1.}
        else:
            ret = {a: epsilon / len(action_value_dict) + (1. - epsilon if a == max_act else 0.) for a in action_value_dict.keys()}
        self.policy_data[state] = ret

    def __repr__(self):
        return self.policy_data.__repr__()

    def __str__(self):
        return self.policy_data.__str__()

class RLFuncApproxBase:

    NUM_SAMPLES_PER_ACTION = 10

    def __init__(self, mdp_rep_for_rl, epsilon: float, num_episodes: int, max_steps: int, fa_spec):

        self.mdp_rep: MDPRepForRLFA = mdp_rep_for_rl
        
        # TODO: epsilon with decay should be allowed
        self.epsilon = epsilon
        self.num_episodes: int = num_episodes
        self.max_steps: int = max_steps
        self.vf_fa: FuncApproxBase = V_NN(len(self.mdp_rep.states))
        self.qvf_fa: FuncApproxBase = Q_NN(len(self.mdp_rep.states))
        # the simulator of getting the next, given the current state and action
        self.state_action_func = self.mdp_rep.state_action_func

    def get_init_policy_func(self):
        return Policy({s: {a: 1. / len(v) for a in v} for s, v in self.state_action_dict.items()})

    def get_value_func_fa(self, polf):
        qv_func = self.vf_fa # the function approximation of the reward in each state
        # return the expectation of current states
        return sum(polf(s)[a] * qv_func(s)(a) for a in self.state_action_func(s))


    def get_value_func(self, pol_func):
        return self.get_value_func_fa(lambda s, pol_func=pol_func: get_pdf_from_samples(pol_func(s)(len(self.state_action_func(s)) * RLFuncApproxBase.NUM_SAMPLES_PER_ACTION)))


    def get_act_value_func(self, pol_func):
        # a weighted averagex
        return self.get_qv_func_fa(
            lambda s, pol_func=pol_func: get_pdf_from_samples(
                pol_func(s)(len(self.state_action_func(s)) *
                            RLFuncApproxBase.NUM_SAMPLES_PER_ACTION)
            )
        )

    def get_optimal_det_policy_func(self):
        qv_func = self.get_qv_func_fa(None)

        # noinspection PyShadowingNames
        def detp_func(s: S, qv_func=qv_func) -> A:
            return max(
                [(a, qv_func(s)(a)) for a in self.state_action_func(s)],
                key=itemgetter(1)
            )[0]

        return detp_func


In [None]:
# the functon to train the neural network to converge to the value provided

def train(model, data, value, number_epochs, learning_rate = 1e-3):
    # the data passed in should be a torch tensor
    cross_entrophy_loss = nn.CrossEntropyLoss(reduce=True)
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
    # define the loss of the model
    num_of_samples, _ = data.size()
    for epc in range(number_epochs):
        # use batch size of 1 for brevity
        for i in range(num_of_samples):
            sample = data[i, :] # the current sample used to train the model
            y_hat = model(sampel)
            y = value[i]
            loss = cross_entrophy_loss(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    torch.save(model.state_dict(), os.path.join('best_model', 'model.pt'))

In terms of the function approximation, linear value function approximation could be used and the update is
$${\nabla_w\hat{v}(S,w) = x(S)}$$
$${\Delta w = \alpha(v_\pi(S) - \hat{v}(S, w))x(S)}$$

#### Question 2:
Implement any Monte-Carlo Prediction algorithm with Value Function approximation

In [17]:
# Return G_t is unbiased, noisy sample of true value v_pi(S_t)
def MCwithFA(state_list, value_list, hidden_size, alpha = 1e-3, gamma = 1, init_w = None):
    # hidden_size the feature representation of a state
    if len(state_list) != len(value_list):
        raise ValueError('The number of the states and the number of values should be the same')
    if init_w is not None:
        w = init_w
    else:
        w = np.random.rand(hidden_size)
    steps = len(state_list) # total steps in one episode
    for step in range(steps):
        current_state = state_list[step]
        current_value = value_list[step]
        v_hat = np.dot(w, current_state)
        dw = alpha * (current_value - v_hat)*current_state
        w = w - dw
    return w

In [21]:
# an example of a very simple classifier
# where the state is represented as a vector of features
a = np.array([1, 1, 1, 1])
b = np.array([0, 0, 0, 0])
state_list = [a, b]
value_list = [1, 0]

w = MCwithFA(state_list, value_list, 4)
print (w)

[0.42031383 0.32683631 0.02448432 0.90036345]


#### Question 3
Implement 1-step TD Prediction algorithm with Value Function approximation

In [22]:
# similarly we are using linear functio approximation to estimate the value of the state
def TD0withFA(state_list, value_list, hidden_size, alpha = 1e-3, gamma = 1, init_w = None):
    # hidden_size the feature representation of a state
    if len(state_list) != len(value_list):
        raise ValueError('The number of the states and the number of values should be the same')
    if init_w is not None:
        w = init_w
    else:
        w = np.random.rand(hidden_size)
    steps = len(state_list - 1) # total steps in one episode
    for step in range(steps):
        current_state = state_list[step]
        current_value = value_list[step] + np.dot(w, state_list[step + 1]) # R + one-step looking ahead
        v_hat = np.dot(w, current_state)
        dw = alpha * (current_value - v_hat)*current_state
        w = w - dw
    return w

In [24]:
# an example using TD(0) and linear function approximation to estimate the value function of each state
a = np.array([1, 1, 1, 1])
b = np.array([0, 0, 0, 0])
c = np.array([1, 1, 1, 0])
d = np.array([1, 1, 0, 0])
e = np.array([1, 0, 0, 0])
state_list = [a, b, c, d, e]
value_list = [1, 0, 0, 0, 0]

w = MCwithFA(state_list, value_list, 4, alpha = 1e-1)
print (w)

[0.82438798 0.97422517 1.32078388 0.96470432]


#### Question 4
Implement Eligibility-Traces-based TD(lambda) Prediction algorithm with Value Function approximation

In [25]:
# Backward view TD(lambda) algorithm with eligibility trace
# similarly we are using linear functio approximation to estimate the value of the state
def getGLambd(value_list, alpha, lambd, gamma, value = None):
    if value is None:
        value = np.zeros(len(value_list))
    # the lambd return combines all n-step returns
    # defining n step returns
    G = []
    for i in range(len(value_list)):
        # for each i, calculating the n step return separately as a list
        G_t = []
        for j in range(i, len(value_list)):
            # defining the edge cases
            if j == 0:
                g_t_j = value_list[j] + gamma * value[j + 1]
            if j == len(value_list) - 1:
                g_t_j = G_t[-1]
            g_t_j = G_t[-1] + gamma * value[j + 1]
            G_t.append(g_t_j)
        G.append(G_t)
    # G is the the value of each state, and each value of G is a list, having all the G_t_n values
    return G
    
# the lambda return is also a biased sample of true value
def TDlambdawithFA(state_list, value_list, hidden_size, alpha = 1e-3, lambd = 0.7, gamma = 1, init_w = None):
    # hidden_size the feature representation of a state
    
    # getting G_t_n
    G = getGLambd(value_list, alpha, lambd, gamma)
    
    # getting G_t_lambd
    
    G_t_lambd = []
    for step in range(len(G)):
        p = 1 # indicator variable to calculating the lambd to the power of n
        current_n_value = G[step]
        current_G_t_lambd = 0
        for g_value in current_n_value:
            current_G_t_lambd += p * g_value
            p *= lambd
        # the final result should have a factor of 1 - lambd
        G_t_lambd.append((1 - lambd) * current_G_t_lambd)
        
    # using the G_t_lambd as the value function to update the value of each state
    if len(state_list) != len(value_list):
        raise ValueError('The number of the states and the number of values should be the same')
    if init_w is not None:
        w = init_w
    else:
        w = np.random.rand(hidden_size)
    steps = len(state_list - 1) # total steps in one episode
    
    # constructing the E matrix with the number of states and the number of features
    E_t = np.zeros(steps, hidden_size)
    
    for step in range(steps):
        current_state = state_list[step]
        current_value = G_t_lambd[step] + np.dot(w, state_list[step + 1]) # R + one-step looking ahead
        v_hat = np.dot(w, current_state)
        delta_t = alpha * (current_value - v_hat)*current_state
        E = gamma * lambd * E
        E[step] += current_state
        dw = alpha * delta_t * E[step]
        w = w - dw
    return w

#### Question 5
Implement SARSA and SARSA(Lambda) with Value Function approximation

In [26]:
# use some code from last assignment


def random_generator_based_on_prob(prob_dict: Mapping[S, float]) -> Callable[[], S]:
    outcomes, probabilities = zip(*prob_dict.items())
    rvd = rv_discrete(values=(range(len(outcomes)), probabilities))

    return lambda rvd=rvd, outcomes=outcomes: outcomes[rvd.rvs(size=1)[0]]

class MDP:
    """
    The MDP is defined to initalize with a state_action_dict and state_reward_dict
    The MDP is conpatible with the requirement for Monte Carlo or TD learning algorithm
    """
    def __init__(self,state_action_dict: Mapping[S, Set[A]], terminal_states: Set[S], state_reward_gen_dict, gamma: float) -> None:
        
        # a mapping from a tuple of (state, action) to the reward
        self.state_action_dict: Mapping[S, Set[A]] = state_action_dict
        
        self.terminal_states: Set[S] = terminal_states
        self.state_reward_gen_dict: Type1 = state_reward_gen_dict
        self.state_action_func = lambda x: self.state_action_dict[x],
        self.gamma=gamma,
        self.terminal_state_func = lambda x: x in self.terminal_states,
        self.state_reward_gen_func = lambda x, y: self.state_reward_gen_dict[x][y](),
        
        # initialize the state generator with equal probability of each state 
        self.init_state_generator = random_generator_based_pn_prob(
                {s: 1. / len(self.state_action_dict) for s
                 in self.state_action_dict.keys()}
            ),
        
        # initialize the (state, action) generator with equal probability of each pair
        self.init_state_action_generator = random_generator_based_on_prob(
                {(s, a): 1. / sum(len(v) for v
                                  in self.state_action_dict.values())
                 for s, v1 in self.state_action_dict.items() for a in v1}
        )

from operator import itemgetter
from typing import Mapping, Set, Tuple, Sequence, Any, Callable, TypeVar, Dict

def epsilon_greedy(action_value_dict, epsilon: float) -> Mapping[A, float]:
    """
    Using Epsilon-Greedy method to select the action based on the value
    @return: the function returns the a dictionary of a action and its probability to be selected
    """
    max_act = max(action_value_dict.items(), key=itemgetter(1))[0]
    m = len(action_value_dict)
    if epsilon == 0:
        return {max_act: 1.}
    else:
        # with 1 / epsilon to select a random  
        return {action: epsilon / m + (1. - epsilon if a == max_act else 0.) for action in action_value_dict.keys()}




class Policy:

    def __init__(self, data: Dict[S, Mapping[A, float]]) -> None:
        self.policy_data = data

    def get_state_probabilities(self, state: S) -> Mapping[A, float]:
        return self.policy_data[state]
    
    def update_state_action_to_greedy(self, state, action_value_dict):
        self.policy_data[state] = {max(action_value_dict.item(), key=itemgetter(1))[0]:1.0}

    def get_state_action_probability(self, state: S, action: A) -> float:
        return self.get_state_probabilities(state).get(action, 0.)

    def edit_state_action_to_epsilon_greedy(
        self,
        state: S,
        action_value_dict: Mapping[A, float],
        epsilon: float
    ) -> None:
        self.policy_data[state] = self.epsilon_greedy(action_value_dict, epsilon)
    
    def __repr__(self):
        return self.policy_data.__repr__()

    def __str__(self):
        return self.policy_data.__str__()


class SARSA:

    def __init__(self, mdp: MDP, epsilon: float, learning_rate: float, learning_rate_decay: float, \
                 lambd: float, num_episodes: int, max_steps: int, initial_policy: Policy):

        self.mdp = mdp,
        self.lambd = lambd
        self.epsilon = epsilon,
        self.num_episodes = num_episodes,
        self.max_steps = max_steps
        self.learning_rate: float = learning_rate
        self.gamma_lambda = self.mdp.gamma * lambd
        
        self.policy = initial_policy
    
    def epsilon_greedy(self, policy, current_state, state_value_dict, epsilon):
        # the probability of selecting certain action based on the policy
        actions = policy.get_state_probabilities(current_state)
        
        # the action chosen based on greedy algorithm
        max_act = current_state
        for action in actions:
            if state_value_dict[current_state][action] > state_value_dict[current_state][max_act]:
                max_act = action
        
        m = len(action_value_dict)
        policy.edit_state_action_to_epsilon_greedy(current_state, self.mdp.action_value_dict)
        if epsilon == 0:
            return {max_act: 1.}
        else:
            # with 1 / epsilon to select a random  
            return {action: epsilon / m + (1. - epsilon if a == max_act else 0.) for action in action_value_dict.keys()}
        
        
    def update_one_episode(self, current_state, current_action, state_action_value_dict, policy):
        # before updating using SARSA
        curt_state = current_state
        online_state_value_dict = copy.deepcopy(state_action_value_dict)
        curt_action = current_action
        for step in range(self.max_steps):
            # generate next step based on epsilon-greedy
            actions = self.epsilon_greedy(policy, current_state, online_state_value_dict, self.epsilon)
            # here the next action to take is based on the probability generated by the epsilon greedy
            next_action_generator = random_generator_based_on_prob(actions)
            action_to_take = next_action_generator()
            
            next_state, reward = self.mdp.state_reward_gen_dict[current_state][curt_action]()
            
            # online update for Q(S,A)
            online_state_value_dict[curt_state][curt_action] = online_state_value_dict[curt_state][curt_action] + \
            self.learning_rate * (reward + self.gamma * online_state_value_dict[next_state][action_to_take] - online_state_value_dict[curt_state][curt_action])
            if curt_state in self.mdp.terminal_states:
                break
            else:
                curt_state = next_state
                curt_action = action_to_take
        # update the policy and the Q(S,A) accordingly
        return (policy, online_state_value_dict)
    
    def SARSA_update(self, pol:Policy, inital_state, intial_action):
        """
        Update the policy and Q(S,A) using SARSA online update
        """
        
        state_action_value_dict = self.mdp.state_action_dict
        

        for episode in range(self.max_episodes):
            current_state = initial_state
            current_action = initial_action
            (policy, state_action_value_dict) = self.update_one_episode(current_state, \
                                                                        current_action, \
                                                                        copy.deepcopy(state_action_value_dict), \
                                                                        copy.deepcopy(policy))
            

        return (policy, state_action_value_dict)
    
    
    # a helper function used to calculate the G_t_lambd value for current state
    def calculate_g_t_lambd(step, g_t_list, lambd):
        total_step = len(g_t_list)
        g_t_lambd = 0
        current_coeff = 1
        for i in range(step, total_step, 1):
            g_t_lambd += current_coeff * g_t_list[i]
            current_coeff *= lamba
        return (1.0 - lambd) * g_t_lambd
    
    def get_one_TDLambda_update(self, pol: Policy, state_value_dict, action_generator_for_each_state):
        # Update MDP by running one episode of TD-lambda learning
        
        
        # preserve the state value/ action information of the current MDP
        state_action_dict = self.mdp.state_action_dict
        state_value = copy.deepcopy(state_value_dict)
        
        # generate the trace of this episode
        state_list = []
        action_list = []
        value_list = []
        
        # use the initial state generate by MDP, it could also be initialized to a fixed state
        current_state = self.mdp.init_state_generator()
        
        for step in range(self.max_steps):
            action_to_take = action_generator_for_each_state[current_state]()
            next_state, reward = self.mdp.state_reward_gen_dict[current_state][action_to_take]()
            state_list.append(current_state)
            action_list.append(action_to_take)
            value_list.append(reward)
            if current_state in self.mdp.terminal_states:
                break
            else:
                # increment by one step
                current_state = next_state
        
        g_t_list = []
        # calculating G_t in a backward manner
        for i in range(len(state_list) - 1, -1. -1):
            if i == len(state_list) - 1:
                current_g_t = value_list[i]
            else:
                current_g_t = value_list[i] + g_t_list[-1] * self.lambd
            g_t_list.append(current_g_t)
        # reverse the g_t_list to match the time
        g_t_list = g_t_list[::-1]
        
        #  update the value function 
        for step in range(len(state_list)):
            # this time only consider the state visited in this episode
            g_t_lambd = calculate_g_t_lambd(step, g_t_list, lambd)
            state_value[state_list[step]] += self.learning_rate * (g_t_lambd - state_value[state_list[step]])
        
        return state_value
    
    # the update function running over all the episodes
    def update(self, pol:Policy):
        state_action_dict = self.mdp.state_action_dict
        # initate the value for each state as 0
        V_s = {s: 0. for s in sa_dict.keys()}
        
        # action generator for each state
        action_generator_for_each_state = {s: random_generator_based_on_prob(pol.get_state_probabilities(s))
                        for s in sa_dict.keys()}
        episodes = 0
        updates = 0

        for episode in range(self.max_episodes):
            et_dict = {s: 0. for s in sa_dict.keys()}
            state = self.mdp_rep.init_state_gen()
            steps = 0
            terminate = False

            V_s = self.get_one_TDLambda_update(pol, copy.deepcopy(V_s), action_generator_for_each_state)

        return V_s
    

In [27]:
# change state_value[state_list[step]] += self.learning_rate * (g_t_lambd - state_value[state_list[step]]) to
# update w instead of state probability

# the idea behind is very similar, but what is differnent is that we no longer change the state probability directly,
# and instead we are using w to approximate each value appreared

class SARSAwithFunctionApproximation:

    def __init__(self, mdp: MDP, epsilon: float, learning_rate: float, learning_rate_decay: float, \
                 lambd: float, num_episodes: int, max_steps: int, initial_policy: Policy, hidden_size: int, w = None):

        self.mdp = mdp,
        self.lambd = lambd
        self.epsilon = epsilon,
        self.num_episodes = num_episodes,
        self.max_steps = max_steps
        self.learning_rate: float = learning_rate
        self.gamma_lambda = self.mdp.gamma * lambd
        
        self.policy = initial_policy
        
        # initiate the w used to estimate the values
        if w is None:
            self.w = np.random.rand(hidden_size)
        else:
            self.w = w
            
        self.hidden_size = hidden_size
        
    def getGLambd(value_list, alpha, lambd, gamma, value = None):
        if value is None:
            value = np.zeros(len(value_list))
        # the lambd return combines all n-step returns
        # defining n step returns
        G = []
        for i in range(len(value_list)):
            # for each i, calculating the n step return separately as a list
            G_t = []
            for j in range(i, len(value_list)):
                # defining the edge cases
                if j == 0:
                    g_t_j = value_list[j] + gamma * value[j + 1]
                if j == len(value_list) - 1:
                    g_t_j = G_t[-1]
                g_t_j = G_t[-1] + gamma * value[j + 1]
                G_t.append(g_t_j)
            G.append(G_t)
        # G is the the value of each state, and each value of G is a list, having all the G_t_n values
        return G

    # the lambda return is also a biased sample of true value
    def TDlambdawithFA(state_list, value_list, hidden_size, alpha = 1e-3, lambd = 0.7, gamma = 1, init_w = None):
        # hidden_size the feature representation of a state

        # getting G_t_n
        G = getGLambd(value_list, alpha, lambd, gamma)

        # getting G_t_lambd

        G_t_lambd = []
        for step in range(len(G)):
            p = 1 # indicator variable to calculating the lambd to the power of n
            current_n_value = G[step]
            current_G_t_lambd = 0
            for g_value in current_n_value:
                current_G_t_lambd += p * g_value
                p *= lambd
            # the final result should have a factor of 1 - lambd
            G_t_lambd.append((1 - lambd) * current_G_t_lambd)

        # using the G_t_lambd as the value function to update the value of each state
        if len(state_list) != len(value_list):
            raise ValueError('The number of the states and the number of values should be the same')
        if init_w is not None:
            w = init_w
        else:
            w = np.random.rand(hidden_size)
        steps = len(state_list - 1) # total steps in one episode

        # constructing the E matrix with the number of states and the number of features
        E_t = np.zeros(hidden_size)

        for step in range(steps):
            current_state = state_list[step]
            current_value = G_t_lambd[step] + np.dot(w, state_list[step + 1]) # R + one-step looking ahead
            v_hat = np.dot(w, current_state)
            delta_t = alpha * (current_value - v_hat)*current_state
            E = gamma * lambd * E
            E += current_state
            dw = alpha * delta_t * E
            w = w - dw
        return w
    
    def epsilon_greedy(self, policy, current_state, state_value_dict, epsilon):
        # the probability of selecting certain action based on the policy
        actions = policy.get_state_probabilities(current_state)
        
        # the action chosen based on greedy algorithm
        max_act = current_state
        for action in actions:
            if state_value_dict[current_state][action] > state_value_dict[current_state][max_act]:
                max_act = action
        
        m = len(action_value_dict)
        policy.edit_state_action_to_epsilon_greedy(current_state, self.mdp.action_value_dict)
        if epsilon == 0:
            return {max_act: 1.}
        else:
            # with 1 / epsilon to select a random  
            return {action: epsilon / m + (1. - epsilon if a == max_act else 0.) for action in action_value_dict.keys()}
        
        
    def update_one_episode(self, current_state, current_action, state_action_value_dict, policy):
        # before updating using SARSA
        curt_state = current_state
        online_state_value_dict = copy.deepcopy(state_action_value_dict)
        curt_action = current_action
        for step in range(self.max_steps):
            # generate next step based on epsilon-greedy
            actions = self.epsilon_greedy(policy, current_state, online_state_value_dict, self.epsilon)
            # here the next action to take is based on the probability generated by the epsilon greedy
            next_action_generator = random_generator_based_on_prob(actions)
            action_to_take = next_action_generator()
            
            next_state, reward = self.mdp.state_reward_gen_dict[current_state][curt_action]()
            
            # online update for Q(S,A)
            online_state_value_dict[curt_state][curt_action] = online_state_value_dict[curt_state][curt_action] + \
            self.learning_rate * (reward + self.gamma * online_state_value_dict[next_state][action_to_take] - online_state_value_dict[curt_state][curt_action])
            if curt_state in self.mdp.terminal_states:
                break
            else:
                curt_state = next_state
                curt_action = action_to_take
        # update the policy and the Q(S,A) accordingly
        return (policy, online_state_value_dict)
    
    def SARSA_update(self, pol:Policy, inital_state, intial_action):
        """
        Update the policy and Q(S,A) using SARSA online update
        """
        
        state_action_value_dict = self.mdp.state_action_dict
        

        for episode in range(self.max_episodes):
            current_state = initial_state
            current_action = initial_action
            (policy, state_action_value_dict) = self.update_one_episode(current_state, \
                                                                        current_action, \
                                                                        copy.deepcopy(state_action_value_dict), \
                                                                        copy.deepcopy(policy))
            

        return (policy, state_action_value_dict)
    
    
    # a helper function used to calculate the G_t_lambd value for current state
    def calculate_g_t_lambd(step, g_t_list, lambd):
        total_step = len(g_t_list)
        g_t_lambd = 0
        current_coeff = 1
        for i in range(step, total_step, 1):
            g_t_lambd += current_coeff * g_t_list[i]
            current_coeff *= lamba
        return (1.0 - lambd) * g_t_lambd
    
    def get_one_TDLambda_update(self, pol: Policy, state_value_dict, action_generator_for_each_state):
        # Update MDP by running one episode of TD-lambda learning
        
        
        # preserve the state value/ action information of the current MDP
        state_action_dict = self.mdp.state_action_dict
        state_value = copy.deepcopy(state_value_dict)
        
        # generate the trace of this episode
        state_list = []
        action_list = []
        value_list = []
        
        # use the initial state generate by MDP, it could also be initialized to a fixed state
        current_state = self.mdp.init_state_generator()
        
        for step in range(self.max_steps):
            action_to_take = action_generator_for_each_state[current_state]()
            next_state, reward = self.mdp.state_reward_gen_dict[current_state][action_to_take]()
            state_list.append(current_state)
            action_list.append(action_to_take)
            value_list.append(reward)
            if current_state in self.mdp.terminal_states:
                break
            else:
                # increment by one step
                current_state = next_state
        
        g_t_list = []
        # calculating G_t in a backward manner
        for i in range(len(state_list) - 1, -1. -1):
            if i == len(state_list) - 1:
                current_g_t = value_list[i]
            else:
                current_g_t = value_list[i] + g_t_list[-1] * self.lambd
            g_t_list.append(current_g_t)
        # reverse the g_t_list to match the time
        g_t_list = g_t_list[::-1]
        
        num_of_steps = len(state_list)
        E = np.zeros((num_of_steps, hidden_size))
        #  update the value function and at the same time update the w
        for step in range(len(state_list)):
            # this time only consider the state visited in this episode
            g_t_lambd = calculate_g_t_lambd(step, g_t_list, lambd)
            state_value[state_list[step]] += self.learning_rate * (g_t_lambd - state_value[state_list[step]])
            
            # updating the w value at the same time
            current_state = state_list[step]
#             current_value = G_t_lambd[step] + np.dot(w, state_list[step + 1]) # R + one-step looking ahead
            
            v_hat = np.dot(self.w, current_state)
            delta_t = alpha * (current_value - v_hat)*current_state
            E = gamma * lambd * E
            E[step] += current_state
            dw = alpha * delta_t * E[step]
            w = self.w - dw
            self.w = w
        
        return state_value
    
    # the update function running over all the episodes
    def update(self, pol:Policy):
        state_action_dict = self.mdp.state_action_dict
        # initate the value for each state as 0
        V_s = {s: 0. for s in sa_dict.keys()}
        
        # action generator for each state
        action_generator_for_each_state = {s: random_generator_based_on_prob(pol.get_state_probabilities(s))
                        for s in sa_dict.keys()}
        episodes = 0
        updates = 0

        for episode in range(self.max_episodes):
            et_dict = {s: 0. for s in sa_dict.keys()}
            state = self.mdp_rep.init_state_gen()
            steps = 0
            terminate = False

            V_s = self.get_one_TDLambda_update(pol, copy.deepcopy(V_s), action_generator_for_each_state)

        return V_s
    



#### Question 6
Implement Q-Learning with Value Function approximation

In [4]:
# the idea is similar, by the time updating the value function, using function approximation
# rather than the tabular settings from the MDP
class Q_leaning:

    def __init__(self, mdp, epsilon: float, learning_rate: float, learning_rate_decay: float, \
                 lambd: float, num_episodes: int, max_steps: int, initial_policy, hidden_size, w = None):

        self.mdp = mdp,
        self.lambd = lambd
        self.epsilon = epsilon,
        self.num_episodes = num_episodes,
        self.max_steps = max_steps
        self.learning_rate: float = learning_rate
        self.gamma_lambda = self.mdp.gamma * lambd
        self.policy = initial_policy
        if w is None:
            self.w = np.random.rand(hidden_size)
        else:
            self.w = w
            
        self.hidden_size = hidden_size
    
    def epsilon_greedy(self, policy, current_state, state_value_dict, epsilon):
        # the probability of selecting certain action based on the policy
        actions = policy.get_state_probabilities(current_state)
        
        # the action chosen based on greedy algorithm
        max_act = current_state
        for action in actions:
            if state_value_dict[current_state][action] > state_value_dict[current_state][max_act]:
                max_act = action
        
        m = len(action_value_dict)
        policy.edit_state_action_to_epsilon_greedy(current_state, self.mdp.action_value_dict)
        if epsilon == 0:
            return {max_act: 1.}
        else:
            # with 1 / epsilon to select a random  
            return {action: epsilon / m + (1. - epsilon if a == max_act else 0.) for action in action_value_dict.keys()}
    
    # compared with SARSA, Q-learning has a separate method to choose next_action based on greedy policy
    def Q_learning_greedy(self, policy, current_state, state_value_dict):
        # the probability of selecting certain action based on the policy
        actions = policy.get_state_probabilities(current_state)
        
        # the action chosen based on greedy algorithm
        max_act = current_state
        for action in actions:
            if state_value_dict[current_state][action] > state_value_dict[current_state][max_act]:
                max_act = action
        
        policy = policy.update_state_action_to_greedy(current_state, self.mdp.action_value_dict)
        return {max_act: 1.}

        
    def update_one_episode(self, current_state, current_action, state_action_value_dict, policy):
        # Use Q leaning to update the state_action_dictiary and change the policy
        curt_state = current_state
        online_state_value_dict = copy.deepcopy(state_action_value_dict)
        curt_action = current_action
        for step in range(self.max_steps):
            # generate next step based on epsilon-greedy
            actions = self.Q_learning_greedy(policy, current_state, online_state_value_dict)
            # here the next action to take is based on the probability generated by the epsilon greedy
            next_action_generator = random_generator_based_on_prob(actions)
            action_to_take = next_action_generator()
            
            next_state, reward = self.mdp.state_reward_gen_dict[current_state][curt_action]()
            
            # online update for Q(S,A)
            online_state_value_dict[curt_state][curt_action] = online_state_value_dict[curt_state][curt_action] + \
            self.learning_rate * (reward + self.gamma * online_state_value_dict[next_state][action_to_take] - online_state_value_dict[curt_state][curt_action])
            if curt_state in self.mdp.terminal_states:
                break
            else:
                curt_state = next_state
                curt_action = action_to_take
        # update the policy and the Q(S,A) accordingly
        return (policy, online_state_value_dict)
    
    def Q_leaning_update(self, pol:Policy, inital_state, intial_action):
        """
        Update the policy and Q(S,A) using SARSA online update
        """
        
        state_action_value_dict = self.mdp.state_action_dict
        

        for episode in range(self.max_episodes):
            current_state = initial_state
            current_action = initial_action
            (policy, state_action_value_dict) = self.update_one_episode(current_state, \
                                                                        current_action, \
                                                                        copy.deepcopy(state_action_value_dict), \
                                                                        copy.deepcopy(policy))
            

        return (policy, state_action_value_dict)
    
    
    # a helper function used to calculate the G_t_lambd value for current state
    def calculate_g_t_lambd(step, g_t_list, lambd):
        total_step = len(g_t_list)
        g_t_lambd = 0
        current_coeff = 1
        for i in range(step, total_step, 1):
            g_t_lambd += current_coeff * g_t_list[i]
            current_coeff *= lamba
        return (1.0 - lambd) * g_t_lambd
    
    def get_one_update(self, pol: Policy, state_value_dict, action_generator_for_each_state):
        # Update MDP by running one episode of TD-lambda learning
        
        
        # preserve the state value/ action information of the current MDP
        state_action_dict = self.mdp.state_action_dict
        state_value = copy.deepcopy(state_value_dict)
        
        # generate the trace of this episode
        state_list = []
        action_list = []
        value_list = []
        
        # use the initial state generate by MDP, it could also be initialized to a fixed state
        current_state = self.mdp.init_state_generator()
        
        for step in range(self.max_steps):
            action_to_take = action_generator_for_each_state[current_state]()
            next_state, reward = self.mdp.state_reward_gen_dict[current_state][action_to_take]()
            state_list.append(current_state)
            action_list.append(action_to_take)
            value_list.append(reward)
            if current_state in self.mdp.terminal_states:
                break
            else:
                # increment by one step
                current_state = next_state
        
        g_t_list = []
        # calculating G_t in a backward manner
        for i in range(len(state_list) - 1, -1. -1):
            if i == len(state_list) - 1:
                current_g_t = value_list[i]
            else:
                current_g_t = value_list[i] + g_t_list[-1] * self.lambd
            g_t_list.append(current_g_t)
        # reverse the g_t_list to match the time
        g_t_list = g_t_list[::-1]
        E = np.zeros(hidden_size)
        #  update the value function 
        for step in range(len(state_list)):
            # this time only consider the state visited in this episode
            g_t_lambd = calculate_g_t_lambd(step, g_t_list, lambd)
            state_value[state_list[step]] += self.learning_rate * (g_t_lambd - state_value[state_list[step]])
            current_state = state_list[step]
            v_hat = np.dot(self.w, current_state)
            delta_t = alpha * (current_value - v_hat)*current_state
            E = gamma * lambd * E
            E += current_state
            dw = alpha * delta_t * E
            w = self.w - dw
            self.w = w
        return state_value
    
    # the update function running over all the episodes
    def update(self, pol:Policy):
        state_action_dict = self.mdp.state_action_dict
        # initate the value for each state as 0
        V_s = {s: 0. for s in sa_dict.keys()}
        
        # action generator for each state
        action_generator_for_each_state = {s: random_generator_based_on_prob(pol.get_state_probabilities(s))
                        for s in sa_dict.keys()}
        episodes = 0
        updates = 0

        for episode in range(self.max_episodes):
            et_dict = {s: 0. for s in sa_dict.keys()}
            state = self.mdp_rep.init_state_gen()
            steps = 0
            terminate = False

            V_s = self.get_one_TDLambda_update(pol, copy.deepcopy(V_s), action_generator_for_each_state)

        return V_s
    

hello world
