In [262]:
%load_ext autoreload
%autoreload 2

from MarkovDecisionProcess import MarkovDecisionProcess
from PolicyNetwork import PolicyNetwork
from ValueNetwork import ValueNetwork
from ActorCritic import ActorCritic
from solution import solve 

import numpy as np
import torch

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


**States:** Wealth and income shock, $S_t = (W_{t-1}, Z_t)$

**Action:** Consumption, $A_t = C_t$

**Transitions:**

\begin{align*}
    W_t &= W_{t-1}R + Z_t - C_t \\
    Z_t &= \rho Z_{t-1} + \epsilon_t \\
    \epsilon_t &\sim \log N(\mu_\epsilon, \sigma_\epsilon^2) \\
    \mu_\epsilon &= -\tfrac{1}{2}\sigma_\epsilon \\
    \sigma_{\epsilon} &= 0.1 \\
    \rho &= 0.25 \\
    R &= 1.1 \\
    W_{-1} &= 10, \quad Z_0 = 0
\end{align*}

**Action space:** 

\begin{align*}
    \mathcal{A}(S_t) = \{C_t \geq 0 : W_t \geq W_{min} = 0\} = \{C_t : 0 \leq C_t \leq W_{t-1}R + Z_t\}
\end{align*}

**Reward:**

\begin{align*}
    U_t(C_t) &= \frac{C_t^{1-\sigma} - 1}{1 - \sigma} \\
    \sigma &= 0.9
\end{align*}

In [263]:
def action_bounds_func(state, R=1.1):
    """Return the feasible consumption interval ``(low, high)`` for ``state``.

    Parameters
    ----------
    state : indexable of two numbers
        ``state[0]`` is the wealth carried into the period (W_{t-1}) and
        ``state[1]`` is the current income shock (Z_t).
    R : float, optional
        Gross return applied to carried-in wealth. Defaults to 1.1, the value
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(0, state[0] * R + state[1])``.
    """
    low = 0
    # Upper bound is total cash on hand: consuming more than this would make
    # the wealth computed by transition_func negative.
    high = state[0] * R + state[1]

    return low, high

def transition_func(state, action, R=1.1, sd=0.1, rho=0.25, rng=None):
    """Draw the next state ``(W_t, Z_{t+1})`` given the current state and action.

    Parameters
    ----------
    state : numpy.ndarray
        ``state[0]`` is carried-in wealth (W_{t-1}), ``state[1]`` is current
        income (Z_t). The input is not modified; a copy is returned.
        An integer-dtype array would truncate the results on assignment, so
        pass a float array.
    action : indexable
        ``action[0]`` is consumption (C_t).
    R : float, optional
        Gross return on wealth. Defaults to 1.1.
    sd : float, optional
        Sigma of the underlying normal of the lognormal income innovation.
        Defaults to 0.1.
    rho : float, optional
        Persistence of the income process. Defaults to 0.25.
    rng : object with a ``lognormal(mean, sigma)`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)``. When ``None``
        the global ``np.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray
        Copy of ``state`` with wealth and income updated.
    """
    if rng is None:
        # Fall back to the legacy global generator so existing callers (and
        # any np.random.seed they rely on) see unchanged behaviour.
        rng = np.random

    # NOTE(review): a unit-mean lognormal needs mean = -0.5 * sd**2; this uses
    # -0.5 * sd. Kept as-is to preserve results -- confirm which is intended.
    mean = -0.5 * sd
    epsilon = rng.lognormal(mean, sd)

    new_state = state.copy()
    # Budget constraint: W_t = W_{t-1} * R + Z_t - C_t
    new_state[0] = state[0] * R + state[1] - action[0]
    # AR(1) income with a multiplicative-noise innovation added in levels.
    new_state[1] = state[1] * rho + epsilon

    return new_state

def reward_func(state, action, sigma=0.9):
    """CRRA utility of consumption, ``(C**(1 - sigma) - 1) / (1 - sigma)``.

    Parameters
    ----------
    state : any
        Unused; present so the signature matches ``reward_func(state, action)``.
    action : indexable
        ``action[0]`` is consumption (C_t).
    sigma : float, optional
        Curvature parameter. Defaults to 0.9. ``sigma == 1`` returns
        ``log(C)``, the limit of the formula, instead of dividing by zero.

    Returns
    -------
    float
        Utility of consuming ``action[0]``.
    """
    consumption = action[0]

    if sigma == 1:
        # The general formula is 0/0 here; its limit as sigma -> 1 is log(C).
        return np.log(consumption)

    return (consumption**(1 - sigma) - 1) / (1 - sigma)

In [264]:
# Build the consumption-savings MDP from the three functions defined in this
# notebook. The initial state is wealth 10 with zero income, and the initial
# action is zero consumption.
# NOTE(review): T is presumably the number of simulated steps -- confirm
# against MarkovDecisionProcess.
example = MarkovDecisionProcess(
    state=np.array([10.0, 0.0]),
    action=np.array([0.0]),
    action_bounds_func=action_bounds_func,
    transition_func=transition_func,
    reward_func=reward_func,
    T=500_000,
)

In [265]:
# Show the MDP's string summary (current state, action, reward and the
# transition/reward function signatures) as a sanity check before training.
print(example)

Current state: [10.  0.] 
Current action: [0.] 
Current reward: None 
Transition function: transition_func(state, action) 
Reward function: reward_func(state, action)


In [266]:
# Both networks are constructed from the MDP; the value network is created
# before the policy network.
value_network = ValueNetwork(example)
policy_network = PolicyNetwork(example)

# NOTE(review): alpha_* look like step sizes and lambda_* like trace-decay
# rates -- confirm against ActorCritic.
actor_critic = ActorCritic(
    mdp=example,
    value_network=value_network,
    policy_network=policy_network,
    alpha_value=0.01,
    alpha_policy=0.01,
    alpha_reward=0.01,
    lambda_value=0.2,
    lambda_policy=0.2,
)

In [267]:
# Convert the MDP's current state to a float32 tensor and evaluate the policy
# network on it before any training has run.
state = torch.tensor(example.get_state(), dtype=torch.float32)
# Displays the action distribution the policy assigns to this state (a Normal
# distribution in the recorded output).
policy_network(state)

Normal(loc: tensor([0.], grad_fn=<AddBackward0>), scale: tensor([1.], grad_fn=<ExpBackward0>))

In [268]:
# Run the solver on the actor-critic agent. In the recorded output it reports
# the average reward and the current state every 10,000 steps.
solve(actor_critic)

Average reward at t = 10000:  -0.031753151297569275
State:  [2.17408323 1.24762698]
Average reward at t = 20000:  -0.03846532571315765
State:  [1.77497768 1.23627714]
Average reward at t = 30000:  -0.06031827974319458
State:  [1.88844097 1.15725276]
Average reward at t = 40000:  -0.047780316472053525
State:  [2.5510633  1.51888097]
Average reward at t = 50000:  -0.04005705010890961
State:  [3.16992021 1.22337559]
Average reward at t = 60000:  -0.035222853183746336
State:  [1.78068233 1.25494501]
Average reward at t = 70000:  -0.023138280630111693
State:  [0.72403753 1.26429424]
Average reward at t = 80000:  -0.0724737604856491
State:  [0.51724124 1.25097717]
Average reward at t = 90000:  -0.018690505623817445
State:  [3.79608965 1.06374379]
Average reward at t = 100000:  -0.053693284153938294
State:  [0.08629894 1.27730246]
Average reward at t = 110000:  -0.05367028284072876
State:  [1.59760964 1.1534783 ]
Average reward at t = 120000:  -0.06267191481590272
State:  [3.75892615 1.382890