In [4]:
from typing import Sequence, Callable, Tuple, TypeVar
from func_approx.func_approx_base import FuncApproxBase
from func_approx.eligibility_traces import get_decay_toeplitz_matrix
from scipy.stats import norm
import numpy as np

X = TypeVar('X')


class LinearApprox(FuncApproxBase):

    def __init__(
        self,
        feature_funcs: Sequence[Callable[[X], float]],
        reglr_coeff: float = 0.,
        learning_rate: float = 0.1,
        adam: bool = True,
        adam_decay1: float = 0.9,
        adam_decay2: float = 0.99,
        add_unit_feature: bool = True
    ):
        super().__init__(
            feature_funcs,
            reglr_coeff,
            learning_rate,
            adam,
            adam_decay1,
            adam_decay2,
            add_unit_feature
        )

    def init_params(self) -> Sequence[np.ndarray]:
        return [np.zeros(self.num_features)]

    def init_adam_caches(self)\
            -> Tuple[Sequence[np.ndarray], Sequence[np.ndarray]]:
        return [np.zeros(self.num_features)],\
               [np.zeros(self.num_features)]

    def get_func_eval(self, x_vals: X):
        """
        This must return a float but lint is not happy, so removed the
        return type annotation
        """
        return np.dot(self.get_feature_vals(x_vals), self.params[0])

    def get_func_eval_pts(self, x_vals_seq: Sequence[X]) -> np.ndarray:
        return np.dot(
            self.get_feature_vals_pts(x_vals_seq),
            self.params[0]
        )

    def get_sum_loss_gradient(
        self,
        x_vals_seq: Sequence[X],
        supervisory_seq: Sequence[float]
    ) -> Sequence[np.ndarray]:
        # return [np.dot(self.get_func_eval_pts(x_vals_seq) - supervisory_seq,
        #               self.get_feature_vals_pts(x_vals_seq))]
        return [np.sum((self.get_func_eval(x) - supervisory_seq[i]) * self.get_feature_vals(x)
                       for i, x in enumerate(x_vals_seq))]

    # noinspection PyPep8Naming
    def get_sum_objective_gradient(
        self,
        x_vals_seq: Sequence[X],
        dObj_dOL: np.ndarray
    ) -> Sequence[np.ndarray]:
        return [dObj_dOL.dot(self.get_feature_vals_pts(x_vals_seq))]

    def get_el_tr_sum_loss_gradient(
        self,
        x_vals_seq: Sequence[X],
        supervisory_seq: Sequence[float],
        gamma_lambda: float
    ) -> Sequence[np.ndarray]:
        toeplitz_mat = get_decay_toeplitz_matrix(len(x_vals_seq), gamma_lambda)
        errors = self.get_func_eval_pts(x_vals_seq) - supervisory_seq
        func_grad = self.get_feature_vals_pts(x_vals_seq)
        return [errors.dot(toeplitz_mat.dot(func_grad))]

    # noinspection PyPep8Naming
    def get_el_tr_sum_objective_gradient(
        self,
        x_vals_seq: Sequence[X],
        dObj_dOL: np.ndarray,
        factors: np.ndarray,
        gamma_lambda: float
    ) -> Sequence[np.ndarray]:
        toep = get_decay_toeplitz_matrix(len(x_vals_seq), gamma_lambda)
        features = self.get_feature_vals_pts(x_vals_seq)
        return [factors.dot(toep.dot(np.diag(dObj_dOL).dot(features)))]


if __name__ == '__main__':
    print(FuncApproxBase.get_identity_feature_funcs(3))
    la = LinearApprox(
        feature_funcs=FuncApproxBase.get_identity_feature_funcs(3),
        reglr_coeff=0.,
        learning_rate=0.1,
        adam=True,
        adam_decay1=0.9,
        adam_decay2=0.999,
        add_unit_feature=True
    )
    alpha = 2.0
    beta_1 = 10.0
    beta_2 = 4.0
    beta_3 = -6.0
    beta = (beta_1, beta_2, beta_3)
    x_pts = np.arange(-10.0, 10.0, 0.5)
    y_pts = np.arange(-10.0, 10.0, 0.5)
    z_pts = np.arange(-10.0, 10.0, 0.5)
    pts = [(x, y, z) for x in x_pts for y in y_pts for z in z_pts]

    # noinspection PyShadowingNames
    def superv_func(pt, alpha=alpha, beta=beta):
        return alpha + np.dot(beta, pt)

    n = norm(loc=0., scale=1.)
    superv_pts = [superv_func(r) + n.rvs(size=1)[0] for r in pts]
    # import matplotlib.pyplot as plt
    for _ in range(1000):
        print(la.params[0])
        la.update_params(pts, superv_pts)
        pred_pts = [la.get_func_eval(x) for x in pts]
        print(np.linalg.norm(np.array(pred_pts) - np.array(superv_pts)) /
              np.sqrt(len(superv_pts)))



[<function FuncApproxBase.get_identity_feature_funcs.<locals>.<listcomp>.<lambda> at 0x7f7cd7d15ea0>, <function FuncApproxBase.get_identity_feature_funcs.<locals>.<listcomp>.<lambda> at 0x7f7cd7d15f28>, <function FuncApproxBase.get_identity_feature_funcs.<locals>.<listcomp>.<lambda> at 0x7f7cd7d0c048>]
[ 0.  0.  0.  0.]
70.22884191
[-0.09855153  0.09999999  0.09999998 -0.09999998]
68.9752885033
[ -1.87655047e-04   2.34346614e-01   2.34283332e-01  -2.34320870e-01]
67.5208220492
[ 0.12234774  0.39075003  0.39044666 -0.39062384]
65.9301885001
[ 0.21086095  0.56251105  0.56163142 -0.56214302]
64.246333293
[ 0.23432442  0.74523803  0.74324672 -0.74440678]
62.5005610233
[ 0.22335852  0.93578979  0.93191747 -0.93418425]
60.7167739211
[ 0.21833592  1.13180451  1.12501895 -1.12901607]
58.9137034425
[ 0.24947707  1.33144958  1.32043234 -1.32696543]
57.1064178453
[ 0.31663125  1.53327292  1.51640135 -1.52647056]
55.3072458548
[ 0.40503119  1.73610728  1.71144114 -1.72625082]
53.5263001971
[ 0.495

1.66227521257
[ 1.94330486  9.77092173  4.00452355 -5.99253018]
1.62736906121
[ 1.94506272  9.77853312  4.00417627 -5.99247056]
1.59400750411
[ 1.94679972  9.78589926  4.00381402 -5.99244906]
1.5621410935
[ 1.94852665  9.79302782  4.00344152 -5.992462  ]
1.5317212296
[ 1.95022622  9.79992628  4.00306311 -5.99250591]
1.50270009906
[ 1.95186523  9.80660185  4.00268275 -5.9925775 ]
1.47503062719
[ 1.95341261  9.81306155  4.00230403 -5.99267365]
1.44866645271
[ 1.95485462  9.81931218  4.00193017 -5.99279146]
1.42356191631
[ 1.95620076  9.82536032  4.00156404 -5.99292818]
1.39967205173
[ 1.95747859  9.83121235  4.00120813 -5.99308126]
1.3769525815
[ 1.9587209   9.83687445  4.00086462 -5.99324832]
1.35535992601
[ 1.9599517   9.84235264  4.00053533 -5.99342713]
1.33485122475
[ 1.96117744  9.84765272  4.00022181 -5.99361563]
1.31538436039
[ 1.96238693  9.85278033  3.99992531 -5.99381192]
1.29691798158
[ 1.96355903  9.85774092  3.99964679 -5.99401424]
1.27941152934
[ 1.96467408  9.86253981  3.9

KeyboardInterrupt: 

In [1]:
from algorithms.dp.dp_base import DPBase
from processes.policy import Policy
from processes.det_policy import DetPolicy
from processes.mp_funcs import mdp_rep_to_mrp_rep1, mdp_rep_to_mrp_rep2
from processes.mdp import MDP
from utils.standard_typevars import VFDictType


class DPNumeric(DPBase):

    def __init__(self, mdp_obj: MDP, tol: float) -> None:
        super().__init__(mdp_obj, tol)

    def get_value_func_dict(self, pol: Policy) -> VFDictType:
        vf = {s: 0. for s in self.mdp_obj.all_states}
        epsilon = self.tol * 1e4
        mo = self.mdp_obj
        pd = pol.policy_data
        rew = mdp_rep_to_mrp_rep2(mo.rewards, pd)
        prob = mdp_rep_to_mrp_rep1(mo.transitions, pd)
        while epsilon >= self.tol:
            new_vf = {s: rew[s] + mo.gamma * sum(p * vf[s1]
                                                 for s1, p in prob[s].items())
                      for s in mo.all_states}
            
            epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
            vf = new_vf
        return vf

    def get_optimal_det_policy(self) -> DetPolicy:
        return self.get_optimal_policy_vi()

if __name__ == '__main__':
    from processes.mdp import MDP
    policy_data = {
        2: {'a': 0.7, 'c': 0.3},
        1: {'a': 0.4, 'b': 0.6},
        3: {'b': 1.0}
    }
    pol_obj = Policy(policy_data)
    mdp_data = {
        1: {
            'a': ({1: 0.2, 2: 0.6}, 11.0),
            'b': ({1: 0.6, 2: 0.3, 3: 0.1}, -2.0),
            'c': ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0)
        },
        2: {
            'a': ({1: 0.1, 2: 0.6, 3: 0.3}, 1.0),
            'c': ({1: 0.6, 2: 0.2, 3: 0.2}, -1.2)
        },
        3: {
            'b': ({3: 1.0}, 0.0)
        }
    }
    gamma_val = 0.9
    mdp1_obj = MDP(mdp_data, gamma_val)
    mrp1_obj = mdp1_obj.get_mrp(pol_obj)
    print("transition",mrp1_obj.transitions)
    print("reward",mrp1_obj.rewards)
    print("transition matrix",mrp1_obj.trans_matrix)
    print("reward",mrp1_obj.rewards_vec)
    print("value function vec", mrp1_obj.get_value_func_vec())
    tol_val = 1e-4
    opn = DPNumeric(mdp1_obj, tol_val)
    opt_policy_pi = opn.get_optimal_policy_pi()
    print("opt_policy func", opn.get_optimal_det_policy_func())
    print("opt_policy_pi", opt_policy_pi)
    opt_vf_dict_pi = opn.get_value_func_dict(opt_policy_pi)
    print("opt falue func pi",opt_vf_dict_pi)
    opt_policy_vi = opn.get_optimal_policy_vi()
    print("policy vi",opt_policy_vi)
    opt_vf_dict_vi = opn.get_value_func_dict(opt_policy_vi)
    print("opt value vi",opt_vf_dict_vi)

transition {1: {1: 0.44, 2: 0.42, 3: 0.06}, 2: {1: 0.25, 2: 0.48, 3: 0.27}, 3: {3: 1.0}}
reward {1: 3.2, 2: 0.33999999999999997, 3: 0.0}
transition matrix [[ 0.44  0.42]
 [ 0.25  0.48]]
reward [ 3.2   0.34]
value function vec [ 7.54245762  3.58636085]
opt_policy func <function TabularBase.get_optimal_det_policy_func.<locals>.<lambda> at 0x7f5f5b8b78c8>
opt_policy_pi {1: 'a', 2: 'c', 3: 'b'}
opt falue func pi {1: 21.985047318913182, 2: 13.014459083619075, 3: 0.0}
policy vi {1: 'a', 2: 'a', 3: 'b'}
opt value vi {1: 17.04185365184756, 2: 5.508093903827139, 3: 0.0}


In [58]:
from typing import Mapping, Callable, Sequence, Set
from algorithms.helper_funcs import get_uniform_policy_func
from algorithms.opt_base import OptBase
from algorithms.helper_funcs import get_epsilon_decay_func
from processes.mdp_rep_for_adp import MDPRepForADP
from algorithms.helper_funcs import get_soft_policy_func_from_qf
from algorithms.func_approx_spec import FuncApproxSpec
from func_approx.func_approx_base import FuncApproxBase
from processes.mp_funcs import mdp_func_to_mrp_func1, mdp_func_to_mrp_func2
from processes.mp_funcs import get_expected_action_value
from algorithms.helper_funcs import get_pdf_from_samples
from copy import deepcopy
from operator import itemgetter
import numpy as np
from utils.generic_typevars import S, A
from utils.standard_typevars import VFType, QFType
from utils.standard_typevars import PolicyType, PolicyActDictType


class ADP(OptBase):

    NUM_SAMPLES_PER_ACTION = 10

    def __init__(
        self,
        mdp_rep_for_adp: MDPRepForADP,
        num_samples: int,
        softmax: bool,
        epsilon: float,
        epsilon_half_life: float,
        tol: float,
        fa_spec: FuncApproxSpec
    ) -> None:
        self.mdp_rep: MDPRepForADP = mdp_rep_for_adp
        self.num_samples: int = num_samples
        self.softmax: bool = softmax
        self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func(
            epsilon,
            epsilon_half_life
        )
        self.tol: float = tol
        self.fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj()
        self.state_action_func: Callable[[S], Set[A]] =\
            self.mdp_rep.state_action_func

    @staticmethod
    def get_gradient_max(gradient: Sequence[np.ndarray]) -> float:
        return max(np.max(np.abs(g)) for g in gradient)

    def get_init_policy_func(self) -> PolicyActDictType:
        return get_uniform_policy_func(self.state_action_func)

    def get_value_func_fa(self, polf: PolicyActDictType) -> VFType:
        epsilon = self.tol * 1e4
        mo = self.mdp_rep
        rew_func = mdp_func_to_mrp_func2(mo.reward_func, polf)
        prob_func = mdp_func_to_mrp_func1(mo.transitions_func, polf)
        samples_func = mo.sample_states_gen_func
        # if update in the gradient of the function approximation is less than a threshold, stop.
        while epsilon >= self.tol:
            samples = samples_func(self.num_samples)
            values = [rew_func(s) + mo.gamma *
                      sum(p * self.fa.get_func_eval(s1) for s1, p in
                          prob_func(s).items())
                      for s in samples]
            avg_grad = [g / len(samples) for g in self.fa.get_sum_loss_gradient(
                samples,
                values
            )]
            self.fa.update_params_from_gradient(avg_grad)
            epsilon = ADP.get_gradient_max(avg_grad)

        return self.fa.get_func_eval

    def get_act_value_func_fa(self, polf: PolicyActDictType) -> QFType:
        v_func = self.get_value_func_fa(polf)
        mo = self.mdp_rep

        # noinspection PyShadowingNames
        def state_func(s: S, mo=mo, v_func=v_func) -> Callable[[A], float]:

            # noinspection PyShadowingNames
            def act_func(a: A, mo=mo, v_func=v_func) -> float:
                return mo.reward_func(s, a) + mo.gamma *\
                       sum(p * v_func(s1) for s1, p in
                           mo.transitions_func(s, a).items())

            return act_func

        return state_func

    def get_value_func(self, pol_func: PolicyType) -> VFType:
        return self.get_value_func_fa(
            lambda s: get_pdf_from_samples(
                pol_func(s)(len(self.state_action_func(s)) *
                            ADP.NUM_SAMPLES_PER_ACTION)
            )
        )

    def get_act_value_func(self, pol_func: PolicyType) -> QFType:
        return self.get_act_value_func_fa(
            lambda s: get_pdf_from_samples(
                pol_func(s)(len(self.state_action_func(s)) *
                            ADP.NUM_SAMPLES_PER_ACTION)
            )
        )

    # noinspection PyShadowingNames
    def get_optimal_policy_func_pi(self) -> Callable[[S], A]:
        this_polf = self.get_init_policy_func()
        eps = self.tol * 1e4
        iters = 0
        params = deepcopy(self.fa.params)
        while eps >= self.tol:
            qvf = self.get_act_value_func_fa(this_polf)
            this_polf = get_soft_policy_func_from_qf(
                qf=lambda sa, qvf=qvf: qvf(sa[0])(sa[1]),
                state_action_func=self.state_action_func,
                softmax=self.softmax,
                epsilon=self.epsilon_func(iters)
            )
            new_params = deepcopy(self.fa.params)
            eps = ADP.get_gradient_max(
                [new_params[i] - p for i, p in enumerate(params)]
            )
            params = new_params
            iters += 1

        # noinspection PyShadowingNames
        def det_pol(s: S, this_polf=this_polf) -> A:
            return max(this_polf(s).items(), key=itemgetter(1))[0]

        return det_pol

    def get_optimal_policy_func_vi(self) -> Callable[[S], A]:
        mo = self.mdp_rep
        samples_func = mo.sample_states_gen_func
        rew_func = mo.reward_func
        tr_func = mo.transitions_func
        eps = self.tol * 1e4
        iters = 0
        print("self.fa.params", self.fa.params)
        params = deepcopy(self.fa.params)
        while eps >= self.tol:
            samples = samples_func(self.num_samples)
            values = [get_expected_action_value(
                {a: rew_func(s, a) + mo.gamma *
                    sum(p * self.fa.get_func_eval(s1)
                        for s1, p in tr_func(s, a).items())
                 for a in self.state_action_func(s)},
                self.softmax,
                self.epsilon_func(iters)
            ) for s in samples]
            self.fa.update_params(samples, values)
            new_params = deepcopy(self.fa.params)
            eps = ADP.get_gradient_max(
                [new_params[i] - p for i, p in enumerate(params)]
            )
            params = new_params
            iters += 1

        # noinspection PyShadowingNames
        def deter_func(s: S, rew_func=rew_func, tr_func=tr_func) -> A:
            return max(
                [(a, rew_func(s, a) +
                  sum(p * self.fa.get_func_eval(s1) for s1, p in
                      tr_func(s, a).items()))
                 for a in self.state_action_func(s)],
                key=itemgetter(1)
            )[0]
        

        return deter_func, new_params

    def get_optimal_det_policy_func(self) -> Callable[[S], A]:
        return self.get_optimal_policy_func_vi()


if __name__ == '__main__':
    from processes.mdp_refined import MDPRefined
    from func_approx.dnn_spec import DNNSpec

    mdp_refined_data = {
        1: {
            'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)},
            'b': {2: (0.3, -0.5), 3: (0.7, 2.6)},
            'c': {1: (0.8, 14.8), 2: (0.1, -4.9), 3: (0.1, 0.0)}
        },
        2: {
            'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
            'b': {1: (0.3, 19.8), 2: (0.6, 16.7), 3: (0.1, 1.8)},
            'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
        },
        3: {
            'a': {3: (1.0, 0.0)},
            'b': {3: (1.0, 0.0)}
        }
    }
    gamma_val = 0.9
    mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
    mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp()

    num_samples_val = 100
    softmax_flag = False
    epsilon_val = 0.0
    epsilon_half_life_val = 30
    tol_val = 1e-4
    state_ff = [
        lambda s: 1. if s == 1 else 0.,
        lambda s: 1. if s == 2 else 0.,
        lambda s: 1. if s == 3 else 0.
    ]
    fa_spec_val = FuncApproxSpec(
        state_feature_funcs=state_ff,
        sa_feature_funcs=[(lambda x, f=f: f(x[0])) for f in state_ff],
        dnn_spec=None
    )
    adp_obj = ADP(
        mdp_rep_for_adp=mdp_rep_obj,
        num_samples=num_samples_val,
        softmax=softmax_flag,
        epsilon=epsilon_val,
        epsilon_half_life=epsilon_half_life_val,
        tol=tol_val,
        fa_spec=fa_spec_val
    )

    def policy_func(i: int) -> Mapping[str, float]:
        if i == 1:
            ret = {'a': 0.4, 'b': 0.6}
        elif i == 2:
            ret = {'a': 0.7, 'c': 0.3}
        elif i == 3:
            ret = {'b': 1.0}
        else:
            raise ValueError
        return ret

    this_qf = adp_obj.get_act_value_func_fa(policy_func)
    this_vf = adp_obj.get_value_func_fa(policy_func)
    print("Printing vf for a policy")
    print(this_vf(1))
    print(this_vf(2))
    print(this_vf(3))
    print("Printing DP vf for a policy")
    from processes.policy import Policy
    true_vf_for_pol = mdp_ref_obj1.get_value_func_dict(Policy(
        {s: policy_func(s) for s in {1, 2, 3}}
    ))
    print(true_vf_for_pol)

    opt_det_polf, params = adp_obj.get_optimal_policy_func_vi()
    print(params)
    # noinspection PyShadowingNames
    def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]:
        return {opt_det_polf(s): 1.0}

    print("Printing Opt Policy")
    print(opt_polf(1))
    print(opt_polf(2))
    print(opt_polf(3))

    opt_vf = adp_obj.get_value_func_fa(opt_polf)
    print("Printing Opt VF")
    print(opt_vf(1))
    print(opt_vf(2))
    print(opt_vf(3))
    true_opt = mdp_ref_obj1.get_optimal_policy(tol=tol_val)
    print("Printing DP Opt Policy")
    print(true_opt)
    true_vf = mdp_ref_obj1.get_value_func_dict(true_opt)
    print("Printing DP Opt VF")
    print(true_vf)


sa_feature_funcs [<function <listcomp>.<lambda> at 0x7f5f54573bf8>, <function <listcomp>.<lambda> at 0x7f5f54573e18>, <function <listcomp>.<lambda> at 0x7f5f54573840>]
Printing vf for a policy
10.3207716331
15.3963816114
9.16309503474e-06
Printing DP vf for a policy
{1: 10.320570378088227, 2: 15.396689886917194, 3: 0.0}
self.fa.params [array([ 7.06383824,  3.25693339,  8.33254337, -7.06382908])]
[array([ 35.00386533,  28.8604946 ,  37.56951052, -35.00389831])]
Printing Opt Policy
{'c': 1.0}
{'b': 1.0}
{'a': 1.0}
Printing Opt VF
63.8624619254
72.5719832823
-3.42995748781e-06
Printing DP Opt Policy
{1: 'c', 2: 'b', 3: 'a'}
Printing DP Opt VF
{1: 63.862200956937841, 2: 72.571291866028744, 3: 0.0}


In [40]:
state_ff = [
    lambda s: 1. if s == 1 else 0.,
    lambda s: 1. if s == 2 else 0.,
    lambda s: 1. if s == 3 else 0.
]
sa_feature_funcs=[(lambda x, f=f: f(x[0])) for f in state_ff]

In [55]:
for fa in state_ff:
    print(fa)
    sa_feature_funcs([2], fa)

<function <lambda> at 0x7f5f5468bc80>


TypeError: 'list' object is not callable

TypeError: 'list' object is not callable