The weights used in this notebook are generated by the 'algorithm.py' script.
Run it with:

```
python train --num_iterations 1
```


In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import os

from entmax import sparsemax

In [2]:
# Global Variables

γ = 0.99 # discount factor
α = 1
λ = 1

In [3]:
def log_alpha(π, α=1):
    if α == 1:
        return torch.log(π)
    else:
        return (torch.pow(π, α-1)-1)/(α * (α-1))

def Tsallis_Entropy(π, α=1):
        return -π * log_alpha(π, α)

In [4]:
def load_weights(path_to_weights):
    # actor parameters
    actor_weights = {}
    for file in os.listdir(path_to_weights):
        if 'actor' in file:
            param_name = file.replace('actor_', '').replace('.txt', '')
            actor_weights[param_name] = np.loadtxt(os.path.join(path_to_weights, file))
    
    W_actor = actor_weights['simple_fc1.weight']
    b_actor = actor_weights['simple_fc1.bias']
    
    # critic parameters
    critic_weights = {}
    for file in os.listdir(path_to_weights):
        if 'critic' in file:
            param_name = file.replace('critic_', '').replace('.txt', '')
            critic_weights[param_name] = np.loadtxt(os.path.join(path_to_weights, file))
    
    W_critic = critic_weights['simple_fc1.weight']
    b_critic = critic_weights['simple_fc1.bias']

    return {'W_actor': W_actor,
            'b_actor': b_actor,
            'W_critic': W_critic,
            'b_critic': b_critic}

In [5]:
class Actor():
    def __init__(self, weights, bias):
        self.W = torch.tensor(weights, requires_grad=True)
        self.b = torch.tensor(bias, requires_grad=True)
        
    def forward(self, state):
        return torch.nn.functional.softmax(self.W @ state + self.b, dim=0)

class Critic():
    def __init__(self, weights, bias):
        self.W = torch.tensor(weights, requires_grad=True)
        self.b = torch.tensor(bias, requires_grad=True)
        
    def forward(self, state):
        return self.W @ state + self.b

In [6]:
weights = load_weights('../weights')

n_actions = len(weights['b_actor'])

actor  = Actor(weights['W_actor'], weights['b_actor'])
critic = Critic(weights['W_critic'], weights['b_actor'])

Since the $Q_{\theta}(S_{t}, \cdot)$ values don't matter when computing $\nabla_{\psi}J_{\pi}(\psi)$, we can choose a tensor of constant values

In [7]:
state = torch.tensor(np.linspace(1, n_actions, n_actions), requires_grad=False).double()
qvals = critic.forward(state).detach()

In [8]:
policy = actor.forward(state)

In [9]:
linear_term  = policy * qvals
entropy_term = Tsallis_Entropy(policy, α) # Shannon Entropy

loss = - torch.sum(linear_term + entropy_term)

In [10]:
# backward pass
loss.backward()

In [11]:
actor.W.grad.numpy()

array([[-0.02417415, -0.04834831, -0.07252246, -0.09669661],
       [-0.50155346, -1.00310692, -1.50466037, -2.00621383],
       [ 0.57248761,  1.14497523,  1.71746284,  2.28995045],
       [-0.04676   , -0.09352   , -0.14028001, -0.18704001]])

In [12]:
# analytical result
π = policy.detach().numpy()

grad_soft = np.diag(π) - np.outer(π, π)
-(qvals.numpy() - np.log(π) - 1) * grad_soft @ np.array([state for i in range(n_actions)])

array([[-0.02417415, -0.04834831, -0.07252246, -0.09669661],
       [-0.50155346, -1.00310692, -1.50466037, -2.00621383],
       [ 0.57248761,  1.14497523,  1.71746284,  2.28995045],
       [-0.04676   , -0.09352   , -0.14028001, -0.18704001]])

In [13]:
os.chdir(os.path.dirname('../'))
from algorithm import test_env

# 1 Step

Environment and Gradient steps

In [14]:
# restart nets
actor  = Actor(weights['W_actor'], weights['b_actor'])
critic = Critic(weights['W_critic'], weights['b_actor'])

In [15]:
env = test_env(n_actions)

# initialize environment
state = env.reset()

# Environment step
policy = actor.forward(torch.tensor(state).double())
action = torch.argmax(policy.detach(), dim=0)

next_state, reward, _, _ = env.step(action)

# Gradient step

# Critic
with torch.no_grad():
    next_qvals  = critic.forward(torch.tensor(next_state).double()) 
    next_policy = actor.forward(torch.tensor(next_state).double())
    next_action = torch.argmax(next_policy, dim=0)

    target = reward + γ * (next_qvals[next_action] + (λ/α) * log_alpha(next_policy[next_action]))

qvals  = critic.forward(torch.tensor(state).double())
v_loss = F.mse_loss(qvals[action], target)/2

# Actor
linear_term  = policy * qvals.detach()
entropy_term = Tsallis_Entropy(policy, α) # Shannon Entropy

p_loss = - torch.sum(linear_term + entropy_term)

# backward pass
v_loss.backward()
p_loss.backward()

### Critic Gradients

In [16]:
print("Critic weights' gradient:\n")
print(critic.W.grad.numpy())

print('\n')

print("Critic bias' gradient:\n")
print(critic.b.grad.numpy())

Critic weights' gradient:

[[  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [ -4.09226372  -8.18452744 -12.27679116 -16.36905488]
 [  0.           0.           0.           0.        ]]


Critic bias' gradient:

[ 0.          0.         -4.09226372  0.        ]


### Actor Gradients

In [17]:
print("Actor weights' gradient:\n")
print(actor.W.grad.numpy())

print('\n')

print("Actor bias' gradient:\n")
print(actor.b.grad.numpy())

Actor weights' gradient:

[[-0.02417415 -0.04834831 -0.07252246 -0.09669661]
 [-0.50155346 -1.00310692 -1.50466037 -2.00621383]
 [ 0.57248761  1.14497523  1.71746284  2.28995045]
 [-0.04676    -0.09352    -0.14028001 -0.18704001]]


Actor bias' gradient:

[-0.02417415 -0.50155346  0.57248761 -0.04676   ]


In [20]:
!open .