In [1]:
import numpy as np
import torch
import os

from entmax import sparsemax

In [2]:
def load_weights(path_to_weights):
    """Load the actor and critic layer parameters stored as text files.

    Every entry of ``path_to_weights`` whose name contains ``actor`` (resp.
    ``critic``) is mapped to a parameter name by removing ``actor_`` (resp.
    ``critic_``) and ``.txt`` from the file name. Only the parameters that are
    actually returned (``simple_fc1.weight`` and ``simple_fc1.bias``) are parsed
    with ``np.loadtxt``; any other matching entry is ignored, so unrelated
    files or sub-directories in the folder cannot break the load.

    Args:
        path_to_weights: directory containing the parameter text files.

    Returns:
        dict with the keys ``'W_actor'``, ``'b_actor'``, ``'W_critic'`` and
        ``'b_critic'``, each holding the array returned by ``np.loadtxt``.

    Raises:
        KeyError: if a required parameter file is missing for a network.
    """
    required = ('simple_fc1.weight', 'simple_fc1.bias')

    def _load_network(network):
        # Collect the required parameters of one network ('actor' or 'critic').
        params = {}
        for file in os.listdir(path_to_weights):
            if network not in file:
                continue
            param_name = file.replace(network + '_', '').replace('.txt', '')
            # Only parse what is needed: other entries may not be numeric text.
            if param_name in required:
                params[param_name] = np.loadtxt(os.path.join(path_to_weights, file))

        missing = [name for name in required if name not in params]
        if missing:
            # KeyError is what a plain dict lookup would raise for a missing
            # parameter; keep the type and add a useful message.
            raise KeyError(
                f"missing {network} parameter file(s) for {missing} in {path_to_weights!r}"
            )
        return params

    actor_weights = _load_network('actor')
    critic_weights = _load_network('critic')

    return {'W_actor': actor_weights['simple_fc1.weight'],
            'b_actor': actor_weights['simple_fc1.bias'],
            'W_critic': critic_weights['simple_fc1.weight'],
            'b_critic': critic_weights['simple_fc1.bias']}

In [3]:
class Actor:
    """Single linear layer followed by a softmax, with trainable parameters."""

    def __init__(self, weights, bias):
        """Wrap `weights` and `bias` as tensors that track gradients."""
        self.W = torch.tensor(weights).requires_grad_(True)
        self.b = torch.tensor(bias).requires_grad_(True)

    def forward(self, state):
        """Return softmax(W @ state + b), normalised along dimension 0."""
        logits = torch.matmul(self.W, state) + self.b
        return torch.softmax(logits, dim=0)

In [4]:
# Read the stored actor/critic parameters from disk.
weights = load_weights('../weights')

# The length of the actor bias is used as the number of actions.
actor_bias = weights['b_actor']
n_actions = len(actor_bias)

actor = Actor(weights=weights['W_actor'], bias=actor_bias)

Since the $Q_{\theta}(S_{t}, \cdot)$ values are held constant with respect to $\psi$ when computing $\nabla_{\psi}J_{\pi}(\psi)$, any fixed values will do for this check, so we use a tensor of constant values.

In [5]:
# Q-values are fixed inputs here (no gradient is tracked for them); one per action.
qvals = torch.ones(n_actions, dtype=torch.float64)

# The state needs one entry per column of the actor's weight matrix, which is
# not necessarily the number of actions, so read the size from the matrix itself.
state_dim = actor.W.shape[1]
state = torch.tensor(np.linspace(1, state_dim, state_dim))

In [6]:
# Action probabilities for `state`, i.e. softmax(W @ state + b); the result stays
# connected to actor.W and actor.b so gradients can flow back to them.
policy = actor.forward(state)

In [7]:
# Per-action product of the policy probability and its Q-value.
linear_term = torch.mul(policy, qvals)
# Per-action Shannon entropy contribution: -p * log(p).
entropy_term = policy.neg() * policy.log()

# The loss is the negated sum of both terms over all actions.
loss = -(linear_term + entropy_term).sum()

In [8]:
# backward pass: back-propagate `loss` so that the tensors created with
# requires_grad=True in Actor (actor.W and actor.b) receive their .grad
loss.backward()

In [9]:
# Autograd gradient of the loss with respect to the actor weights, as a NumPy array
actor.W.grad.numpy()

array([[-0.03790628, -0.07581256, -0.11371884, -0.15162512],
       [-0.17575225, -0.35150449, -0.52725674, -0.70300898],
       [ 0.34409185,  0.6881837 ,  1.03227554,  1.37636739],
       [-0.13043332, -0.26086665, -0.39129997, -0.52173329]])

In [10]:
# analytical result
π = policy.detach().numpy()

grad_soft = np.diag(π) - np.outer(π, π)
-(qvals.numpy() - np.log(π) - 1) * grad_soft @ np.array([state for i in range(n_actions)])

array([[-0.03790628, -0.07581256, -0.11371884, -0.15162512],
       [-0.17575225, -0.35150449, -0.52725674, -0.70300898],
       [ 0.34409185,  0.6881837 ,  1.03227554,  1.37636739],
       [-0.13043332, -0.26086665, -0.39129997, -0.52173329]])