### Assignment 12

#### Question 1
Write code for the interface for RL algorithms with value function approximation. The core of this interface should be a function that maps a (state, action) pair to a sample of the (next state, reward) pair. It is important that this interface does not expose the state-transition probability model or the reward model.

In [1]:
import numpy as np
import pandas as pd
import scipy
from typing import TypeVar,Mapping, Set, Generic, Sequence, Callable, Tuple
import torch

In [2]:
from torch import nn

In [4]:
# Policy approximation for Pi function
class Pi_NN(nn.Module):
    """
    Feed-forward policy network that outputs a probability vector.

    Architecture: Linear(input_size, 2 * hidden_size) -> ReLU ->
    Linear(2 * hidden_size, hidden_size) -> ReLU ->
    Linear(hidden_size, state_size) -> Softmax over the last dimension.

    Args:
        input_size: Size of the last dimension of the feature tensor.
        state_size: Number of outputs of the final layer, i.e. the length of
            the probability vector (presumably one entry per action --
            TODO confirm against the caller).
        hidden_size: Width parameter of the two hidden layers.
    """

    def __init__(self, input_size: int, state_size: int, hidden_size: int = 50) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.approximator = nn.Sequential(
            nn.Linear(input_size, 2 * hidden_size, bias=True),
            nn.ReLU(),
            nn.Linear(2 * hidden_size, hidden_size, bias=True),
            nn.ReLU(),
            nn.Linear(hidden_size, state_size, bias=True),
        )
        # An explicit dim is required: without it Softmax falls back to a
        # deprecated implicit axis (axis 0 for 3-D inputs) and emits a warning.
        # dim=-1 normalises across the `state_size` outputs for inputs of any
        # rank, with or without leading batch dimensions.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, feature: torch.Tensor) -> torch.Tensor:
        """
        Return the probabilities produced for ``feature``.

        Args:
            feature: Tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``feature`` and a last
            dimension of size ``state_size`` whose entries sum to 1.
        """
        logits = self.approximator(feature)
        return self.softmax(logits)

# Function approximation for the Q function     
class Q_NN(nn.Module):
    """
    Feed-forward network that maps a feature tensor to one scalar per input.

    Architecture: Linear(input_size, 2 * hidden_size) -> ReLU ->
    Linear(2 * hidden_size, hidden_size) -> ReLU -> Linear(hidden_size, 1).

    The output width is fixed at 1, so the constructor takes no
    ``state_size`` argument and the instance has no such attribute.

    Args:
        input_size: Size of the last dimension of the feature tensor.
        hidden_size: Width parameter of the two hidden layers.
    """

    def __init__(self, input_size: int, hidden_size: int = 50) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.approximator = nn.Sequential(
            nn.Linear(input_size, 2 * hidden_size, bias=True),
            nn.ReLU(),
            nn.Linear(2 * hidden_size, hidden_size, bias=True),
            nn.ReLU(),
            nn.Linear(hidden_size, 1, bias=True),
        )

    def forward(self, feature: torch.Tensor) -> torch.Tensor:
        """
        Return the network output for ``feature``.

        Args:
            feature: Tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``feature`` and a last
            dimension of size 1.
        """
        return self.approximator(feature)

# Function approximation for V function    
class V_NN(nn.Module):
    """
    Feed-forward network that maps a feature tensor to one scalar per input.

    Architecture: Linear(input_size, 2 * hidden_size) -> ReLU ->
    Linear(2 * hidden_size, hidden_size) -> ReLU -> Linear(hidden_size, 1).

    The output width is fixed at 1, so the constructor takes no
    ``state_size`` argument and the instance has no such attribute.

    Args:
        input_size: Size of the last dimension of the feature tensor.
        hidden_size: Width parameter of the two hidden layers.
    """

    def __init__(self, input_size: int, hidden_size: int = 50) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.approximator = nn.Sequential(
            nn.Linear(input_size, 2 * hidden_size, bias=True),
            nn.ReLU(),
            nn.Linear(2 * hidden_size, hidden_size, bias=True),
            nn.ReLU(),
            nn.Linear(hidden_size, 1, bias=True),
        )

    def forward(self, feature: torch.Tensor) -> torch.Tensor:
        """
        Return the network output for ``feature``.

        Args:
            feature: Tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``feature`` and a last
            dimension of size 1.
        """
        return self.approximator(feature)
            

In [None]:
class Policy:
    """
    Tabular stochastic policy.

    The policy is stored as a table that maps each state to a mapping
    ``{action: probability}``, i.e. given a state it yields the probability
    of selecting each action.
    """

    # Annotations are written as strings so the class can be defined even when
    # the generic names (Dict, S, A) are not bound at definition time.
    def __init__(self, data: "Dict[S, Mapping[A, float]]") -> None:
        # The table is kept by reference (not copied);
        # update_state_action_to_epsilon_greedy writes into it in place.
        self.data = data

    @property
    def policy_data(self) -> "Dict[S, Mapping[A, float]]":
        """Alias of ``data`` so the table is reachable under either name."""
        return self.data

    def get_state_probabilities(self, state: "S") -> "Mapping[A, float]":
        """Return the ``{action: probability}`` mapping stored for ``state``.

        Raises:
            KeyError: if ``state`` is not in the table.
        """
        return self.data[state]

    def get_state_action_probability(self, state: "S", action: "A") -> float:
        """Return the probability of ``action`` in ``state`` (0.0 if the action is not listed)."""
        return self.get_state_probabilities(state).get(action, 0.)

    # update state/ action probability using epsilon greedy
    def update_state_action_to_epsilon_greedy(self, state: "S", action_value_dict: "Mapping[A, float]", epsilon: float):
        """
        Replace the distribution of ``state`` with an epsilon-greedy one.

        Every action in ``action_value_dict`` receives
        ``epsilon / len(action_value_dict)``; the action with the largest
        value additionally receives ``1 - epsilon``. With ``epsilon == 0``
        only the greedy action is stored, with probability 1.

        Args:
            state: State whose entry is overwritten.
            action_value_dict: Value of each candidate action.
            epsilon: Exploration weight.

        Raises:
            ValueError: if ``action_value_dict`` is empty.
        """
        # On ties, the first action in iteration order with the maximal value wins.
        greedy_action = max(action_value_dict, key=action_value_dict.get)
        if epsilon == 0:
            probabilities = {greedy_action: 1.}
        else:
            probabilities = {
                a: epsilon / len(action_value_dict) + (1. - epsilon if a == greedy_action else 0.)
                for a in action_value_dict
            }
        self.data[state] = probabilities

    def __repr__(self):
        return repr(self.data)

    def __str__(self):
        return str(self.data)

class RLFuncApproxBase(OptBase):
    """
    Base class for RL algorithms that use function approximation.

    Everything is driven through the MDP representation passed to the
    constructor: its ``states`` collection and its ``state_action_func``.

    Several methods call ``self.get_qv_func_fa(polf)``, which is not defined
    in this class. NOTE(review): it is presumably supplied by ``OptBase`` or
    by concrete subclasses and returns a function used as ``q(s)(a)`` --
    confirm.

    Signature annotations are written as strings so that defining the class
    does not require the type aliases to be bound at definition time.
    """

    # Multiplier applied to the number of actions of a state to decide how many
    # action samples are requested from a sampling policy.
    NUM_SAMPLES_PER_ACTION = 10

    def __init__(self, mdp_rep_for_rl, epsilon: float, num_episodes: int, max_steps: int, fa_spec) -> None:
        """
        Args:
            mdp_rep_for_rl: MDP representation; must provide ``states``
                (its ``len`` sizes the networks) and ``state_action_func``.
            epsilon: Exploration parameter, stored as-is.
            num_episodes: Stored as-is.
            max_steps: Stored as-is.
            fa_spec: Accepted for interface compatibility; not used here.
        """
        self.mdp_rep: MDPRepForRLFA = mdp_rep_for_rl

        # TODO: epsilon with decay should be allowed
        self.epsilon = epsilon
        self.num_episodes: int = num_episodes
        self.max_steps: int = max_steps
        # Both networks take an input whose width is the number of states.
        # NOTE(review): this suggests a one-hot state encoding, and the Q
        # network has no room for an action input -- confirm the encoding.
        self.vf_fa: FuncApproxBase = V_NN(len(self.mdp_rep.states))
        self.qvf_fa: FuncApproxBase = Q_NN(len(self.mdp_rep.states))
        # Called throughout this class as state -> collection of available
        # actions (the result is iterated and passed to len()).
        self.state_action_func = self.mdp_rep.state_action_func

    def get_init_policy_func(self) -> "PolicyActDictType":
        """
        Return a ``Policy`` that is uniform over the available actions of
        every state.

        The table is built from ``self.mdp_rep.states`` and
        ``self.state_action_func`` because this class never assigns a
        ``state_action_dict`` attribute.
        NOTE(review): assumes ``states`` is iterable and yields hashable
        states -- confirm.
        """
        uniform = {}
        for s in self.mdp_rep.states:
            actions = self.state_action_func(s)
            uniform[s] = {a: 1. / len(actions) for a in actions}
        return Policy(uniform)

    def get_value_func_fa(self, polf: "PolicyActDictType") -> "VFType":
        """
        Return a state-value function for the policy ``polf``.

        Args:
            polf: Function from a state to a mapping ``{action: probability}``.

        Returns:
            Function ``vf(s)`` giving the expectation, under ``polf(s)``, of
            the action values ``q(s)(a)`` over the actions available in ``s``.
        """
        # The q(s)(a) call pattern is what get_qv_func_fa provides elsewhere
        # in this class; the V network cannot be called that way.
        qv_func = self.get_qv_func_fa(polf)

        # noinspection PyShadowingNames
        def vf(s: "S", polf=polf, qv_func=qv_func) -> float:
            return sum(polf(s)[a] * qv_func(s)(a) for a in self.state_action_func(s))

        return vf

    def _sampled_pdf_func(self, pol_func: "PolicyType"):
        """Turn a sampling policy into a function state -> estimated action pdf."""

        # noinspection PyShadowingNames
        def polf(s, pol_func=pol_func):
            # pol_func(s) is called with a sample count and its result is
            # handed to get_pdf_from_samples (defined outside this class).
            sampler = pol_func(s)
            num_samples = len(self.state_action_func(s)) * RLFuncApproxBase.NUM_SAMPLES_PER_ACTION
            return get_pdf_from_samples(sampler(num_samples))

        return polf

    def get_value_func(self, pol_func: "PolicyType") -> "VFType":
        """Return the state-value function of the sampling policy ``pol_func``."""
        return self.get_value_func_fa(self._sampled_pdf_func(pol_func))

    def get_act_value_func(self, pol_func: "PolicyType") -> "QFType":
        """Return the action-value function of the sampling policy ``pol_func``."""
        return self.get_qv_func_fa(self._sampled_pdf_func(pol_func))

    def get_optimal_det_policy_func(self) -> "Callable[[S], A]":
        """
        Return a deterministic policy that is greedy with respect to
        ``self.get_qv_func_fa(None)``.

        The returned function maps a state to the available action with the
        largest action value; ties go to the first such action in iteration
        order. It raises ``ValueError`` for a state with no actions.
        """
        qv_func = self.get_qv_func_fa(None)

        # noinspection PyShadowingNames
        def detp_func(s: "S", qv_func=qv_func) -> "A":
            return max(self.state_action_func(s), key=lambda a: qv_func(s)(a))

        return detp_func
