In [1]:
import numpy as np

Notation:

- $S_t = [S_1, S_2]$
- $\mathcal{A} \in \{a_1, a_2\}$
- $\pi_{\psi}(a_i\mid s_t) \coloneqq \pi_i(s_t)$

We're having problems computing the gradient of the policy loss. This notebook will serve as validation of analytical calculation results

We'll start by defining the variables for the $Q_{\theta}(s_t, a_t)$ function. As these aren't a function of parameters $\psi$, we'll create them as standalone variables.

In [2]:
Q = var('q1 q2')

We now define the variables that make up the policy, according to the ANN architecture

In [3]:
W = var('w11 w12 w21 w22') # weights
b = var('b1 b2')           # bias
S = var('s1 s2')           # state

# output layer
z = [S[0]*W[0] + S[1]*W[1] + b[0], 
     S[0]*W[2] + S[1]*W[3] + b[1]]

# softmax activation function
def softmax(z, i):
    return exp(z[i]) / sum(exp(z[j]) for j in range(len(z)))
    
# output layer after activation function
π = [softmax(z, i) for i in range(len(z))]

In [4]:
show(π, viwer='pdf') # render output in latex

And the value function:

In [5]:
W_v = var('w11_v w12_v w21_v w22_v') # weights
b_v = var('b1_v b2_v')               # bias

# output layer
z_v = [S[0]*W_v[0] + S[1]*W_v[1] + b_v[0], 
       S[0]*W_v[2] + S[1]*W_v[3] + b_v[1]]

The loss functions are defined as:

In [6]:
# -- Policy Loss --

# negative expected regularized reward
J_π = -(π[0]*(Q[0] - log(π[0])) + π[1]*(Q[1] - log(π[1])))
show(J_π, viwer='pdf')

In [7]:
# -- Value Loss --

# sampled state and sampled action
ss, sa = var('ss sa')

# we'll consider that the sampled action is 1
sa = 1

# here, Q_v should be Q(ss, sa)
Q_v = z_v[sa]

# traget (eq. between (9) and (10))
target = var('target') # is a constant 

# bellman residual  - 1/2 * (Q(ss, sa) - target)^2
J_q = 1/2*(Q_v - target)^2
show(J_q, viwer='pdf')

We now have everything ready to compute the policy loss!

# $\nabla_{\psi_{W}}J_{\pi}(\psi)$

In [8]:
dJ_dw11 = J_π.diff(w11)
dJ_dw12 = J_π.diff(w12)
dJ_dw21 = J_π.diff(w21)
dJ_dw22 = J_π.diff(w22)

In [9]:
show(dJ_dw11, viwer='pdf')

In [10]:
show(dJ_dw12, viwer='pdf')

In [11]:
show(dJ_dw21, viwer='pdf')

In [12]:
show(dJ_dw22, viwer='pdf')

# $\nabla_{\psi_{b}}J_{\pi}(\psi)$

In [13]:
dJ_db1 = J_π.diff(b1)
dJ_db2 = J_π.diff(b2)

In [14]:
show(dJ_db1, viwer='pdf')

In [15]:
show(dJ_db2, viwer='pdf')

And the value loss!

# $\nabla_{\theta_{W}}J_{Q}(\theta)$

In [16]:
dJq_dw11 = J_q.diff(w11_v)
dJq_dw12 = J_q.diff(w12_v)
dJq_dw21 = J_q.diff(w21_v)
dJq_dw22 = J_q.diff(w22_v)

In [17]:
show(dJq_dw11, viwer='pdf')

In [18]:
show(dJq_dw12, viwer='pdf')

In [19]:
show(dJq_dw21, viwer='pdf')

In [20]:
show(dJq_dw22, viwer='pdf')

# $\nabla_{\theta_{b}}J_{Q}(\theta)$

In [21]:
dJq_db1 = J_q.diff(b1_v)
dJq_db2 = J_q.diff(b2_v)

In [22]:
show(dJq_db1, viwer='pdf')

In [23]:
show(dJq_db2, viwer='pdf')

## Simplified Results for Policy

### Weights

In [24]:
dJ_dw11_simplified = s1*((q1-log(π[0])-1)*π[0]*(π[0]-1) + (q2-log(π[1])-1)*π[0]*π[1])
(dJ_dw11 - dJ_dw11_simplified).full_simplify() == 0 # To confirm results! 

0 == 0

In [25]:
dJ_dw12_simplified = s2*((q1-log(π[0])-1)*π[0]*(π[0]-1) + (q2-log(π[1])-1)*π[0]*π[1])
(dJ_dw12 - dJ_dw12_simplified).full_simplify() == 0

0 == 0

In [26]:
dJ_dw21_simplified = s1*((q2-log(π[1])-1)*π[1]*(π[1]-1) + (q1-log(π[0])-1)*π[0]*π[1])
(dJ_dw21 - dJ_dw21_simplified).full_simplify() == 0

0 == 0

In [27]:
dJ_dw22_simplified = s2*((q2-log(π[1])-1)*π[1]*(π[1]-1) + (q1-log(π[0])-1)*π[0]*π[1])
(dJ_dw21 - dJ_dw21_simplified).full_simplify() == 0

0 == 0

### Bias

In [28]:
dJ_db1_simplified = (q1-log(π[0])-1)*π[0]*(π[0]-1) + (q2-log(π[1])-1)*π[0]*π[1]
(dJ_db1 - dJ_db1_simplified).full_simplify() == 0 # To confirm results! 

0 == 0

In [29]:
dJ_db2_simplified = (q2-log(π[1])-1)*π[1]*(π[1]-1) + (q1-log(π[0])-1)*π[0]*π[1]
(dJ_db2 - dJ_db2_simplified).full_simplify() == 0 # To confirm results! 

0 == 0

## PyTorch Experiment

In [30]:
import numpy as np
from scipy.special import softmax

In [31]:
def load_weights(path_to_weights):
    # actor parameters
    actor_weights = {}
    for file in os.listdir(path_to_weights):
        if 'actor' in file:
            param_name = file.replace('actor_', '').replace('.txt', '')
            actor_weights[param_name] = np.loadtxt(os.path.join(path_to_weights, file))
    
    W_actor = actor_weights['simple_fc1.weight']
    b_actor = actor_weights['simple_fc1.bias']
    
    # critic parameters
    critic_weights = {}
    for file in os.listdir(path_to_weights):
        if 'critic' in file:
            param_name = file.replace('critic_', '').replace('.txt', '')
            critic_weights[param_name] = np.loadtxt(os.path.join(path_to_weights, file))
    
    W_critic = critic_weights['simple_fc1.weight']
    b_critic = critic_weights['simple_fc1.bias']

    return {'W_actor': W_actor,
            'b_actor': b_actor,
            'W_critic': W_critic,
            'b_critic': b_critic}

In [32]:
path_to_weights = 'weights'
weights = load_weights(path_to_weights)

In [33]:
weights

{'W_actor': array([[-0.67553341, -0.46830416],
        [-0.29148576,  0.02619374]]),
 'b_actor': array([0.27954417, 0.42428023]),
 'W_critic': array([[-0.00529397,  0.37932295],
        [-0.58198076, -0.52038747]]),
 'b_critic': array([-0.27234524,  0.18961591])}

### Policy Loss

In [34]:
state  = np.array([1., 2.])
s1, s2 = state

π = softmax(weights['W_actor'] @ state + weights['b_actor'])
Q = weights['W_critic'] @ state + weights['b_critic']
q1, q2 = Q

In [35]:
# weights
del_11 = s1*((q1-log(π[0])-1)*π[0]*(π[0]-1) + (q2-log(π[1])-1)*π[0]*π[1])
del_12 = s2*((q1-log(π[0])-1)*π[0]*(π[0]-1) + (q2-log(π[1])-1)*π[0]*π[1])
del_21 = s1*((q2-log(π[1])-1)*π[1]*(π[1]-1) + (q1-log(π[0])-1)*π[0]*π[1])
del_22 = s2*((q2-log(π[1])-1)*π[1]*(π[1]-1) + (q1-log(π[0])-1)*π[0]*π[1])

del_J  = np.array([[del_11, del_12], [del_21, del_22]])
del_J

array([[-0.50608809, -1.01217617],
       [ 0.50608809,  1.01217617]])

In [36]:
# bias
del_1 = ((q1-log(π[0])-1)*π[0]*(π[0]-1) + (q2-log(π[1])-1)*π[0]*π[1])
del_2 = ((q1-log(π[0])-1)*π[0]*(π[0]-1) + (q2-log(π[1])-1)*π[0]*π[1])

del_J  = np.array([del_1, del_2])
del_J

array([-0.50608809, -0.50608809])

### Value Loss

the sampled action (we used argmax) is:

In [37]:
np.argmax(π) # we used this value in the 'sampled action'

1

In [38]:
γ = 0.99
λ = 1

reward      = 2
next_state  = np.array([0.90929743, 0.14112001]) # sampled from the text environment
next_π      = softmax(weights['W_actor'] @ next_state + weights['b_actor'])
next_action = np.argmax(next_π)
next_q      = weights['W_critic'] @ next_state + weights['b_critic']

target = reward + λ*(next_q[next_action] - λ * log(next_π[next_action]))

In [39]:
# weights
delq_11 = 0
delq_12 = 0
delq_21 = s1*(Q[sa] - target)
delq_22 = s2*(Q[sa] - target)

delq_J  = np.array([[delq_11, delq_12], [delq_21, delq_22]])
delq_J

array([[ 0.        ,  0.        ],
       [-3.47061487, -6.94122974]])

In [40]:
delq_1 = 0
delq_2 = (Q[sa] - target)

delq_J  = np.array([delq_1, delq_2])
delq_J

array([ 0.        , -3.47061487])