In [1]:
import numpy as np
from scipy.linalg import eig
import copy
import math

#### Question 1
Write Proof (with precise notation) of the Policy Gradient Theorem

$${\nabla_{\theta}J(\theta) = \int_{S}\rho^{\pi}(s)\int_{A}\nabla_{\theta}\pi(s, a, \theta)Q^{\pi}(s, a)da ds }$$
$${J(\theta) = \int_s p_0(s_0)V^{\pi}(s_0)ds_0 = \int_s p_0(s_0)\int_A\pi(s_0, a_0, \theta)Q^\pi(s_0, a_0)da_0ds_0}$$
$${\nabla_{\theta}J(\theta) = \int_s p_0(s_0)\int_A\nabla_\theta\pi(s_0, a_0, \theta)Q^\pi(s_0, a_0)da_0ds_0 + \int_s p_0(s_0)\int_A\pi(s_0, a_0, \theta)\nabla_\theta Q^\pi(s_0, a_0)da_0ds_0}$$
Now expand ${Q^\pi(s_0, a_0)}$ using Bellman Equation:
$${\nabla_{\theta}J(\theta) = \int_s p_0(s_0)\int_A\nabla_\theta\pi(s_0, a_0, \theta)Q^\pi(s_0, a_0)da_0ds_0 + \int_s p_0(s_0)\int_A\pi(s_0, a_0, \theta)\nabla_\theta (R_{s_0}^{a_0} + \int_s\gamma*P_{s_0, s_1}^{a_0}V^\pi(s_1)d(s_1))da_0ds_0}$$
Note ${\nabla_\theta R_{s_0}^{a_0} = 0}$, and remove it from the expression
$${\nabla_{\theta}J(\theta) = \int_s p_0(s_0)\int_A\nabla_\theta\pi(s_0, a_0, \theta)Q^\pi(s_0, a_0)da_0ds_0 + \int_s p_0(s_0)\int_A\pi(s_0, a_0, \theta)\nabla_\theta (\int_s\gamma*P_{s_0, s_1}^{a_0}V^\pi(s_1)d(s_1))da_0ds_0}$$
Now bring the ${\nabla_\theta}$ inside the integral and apply only on ${V^\pi(s_1)}$
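$${\nabla_{\theta}J(\theta) = \int_s p_0(s_0)\int_A\nabla_\theta\pi(s_0, a_0, \theta)Q^\pi(s_0, a_0)da_0ds_0 + \int_s p_0(s_0)\int_A\pi(s_0, a_0, \theta)(\int_s\gamma*P_{s_0, s_1}^{a_0}\nabla_\theta V^\pi(s_1)d(s_1))da_0ds_0}$$
The inner term ${\nabla_\theta V^\pi(s_1)}$ has exactly the form of ${\nabla_\theta V^\pi(s_0)}$ that was expanded above (recall ${V^\pi(s_0) = \int_A\pi(s_0, a_0, \theta)Q^\pi(s_0, a_0)da_0}$), so the same two steps can be applied recursively to ${s_1, s_2, \ldots}$ Let ${p(s_0 \rightarrow s, t, \pi)}$ denote the probability density of being in state ${s}$ after ${t}$ steps when starting from ${s_0}$ and following ${\pi}$. Unrolling the recursion gives
$${\nabla_{\theta}J(\theta) = \int_S \int_S \sum_{t=0}^{\infty} \gamma^t p_0(s_0) p(s_0 \rightarrow s, t, \pi) ds_0 \int_A \nabla_\theta\pi(s, a, \theta)Q^\pi(s, a) da ds}$$
Defining the discounted state visitation measure ${\rho^\pi(s) = \int_S \sum_{t=0}^{\infty} \gamma^t p_0(s_0) p(s_0 \rightarrow s, t, \pi) ds_0}$ yields the theorem:
$${\nabla_{\theta}J(\theta) = \int_S \rho^\pi(s) \int_A \nabla_\theta\pi(s, a, \theta)Q^\pi(s, a) da ds}$$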

#### Question 2
Derive the score function for softmax policy (for finite set of actions)

$$\pi(s,a|\theta) = \frac{\exp(\theta^T \phi(s,a))}{\sum_b \exp(\theta^T \phi(s,b))}$$
$$\nabla_{\theta} \log \pi(s,a|\theta) = \frac{\nabla_{\theta} \pi(s,a|\theta)}{\pi(s,a|\theta)} = \frac{\sum_b \exp(\theta^T \phi(s,b))}{\exp(\theta^T \phi(s,a))} \cdot \frac{\exp(\theta^T \phi(s,a))\phi(s,a)\sum_b \exp(\theta^T \phi(s,b)) - \exp(\theta^T \phi(s,a))\sum_b \exp(\theta^T \phi(s,b))\phi(s,b)}{\left(\sum_b \exp(\theta^T \phi(s,b))\right)^2}$$
$$\nabla_{\theta} \log \pi(s,a|\theta) = \phi(s,a) - \sum_b \pi(s,b|\theta)\phi(s,b) = \phi(s,a) - E_{\pi}[\phi(s,\cdot)]$$

#### Question 3
Derive the score function for gaussian policy (for continuous actions)

$$\pi(s,a|\theta) = \frac{1}{\sqrt{2\pi \sigma^2}}\exp\left(-\frac{(a - \theta^T \phi(s))^2}{2\sigma^2}\right)$$
$$\nabla_{\theta} \log \pi(s,a|\theta) =\frac{(a - \theta^T \phi(s))\phi(s)}{\sigma^2}$$

#### Question 4
Write code for the REINFORCE Algorithm (Monte-Carlo Policy Gradient Algorithm, i.e., no Critic)

Update $\theta$ by stochastic gradient ascent using the Policy Gradient Theorem (PGT),  
using $G_t = \sum_{k = t}^{T}\gamma^{k-t}r_k$ as an unbiased sample of $Q^\pi(s_t, a_t)$

In [1]:
class Reinforce:
    """REINFORCE (Monte-Carlo policy gradient, no critic) with a tabular softmax policy.

    The policy for a state ``s`` is a softmax over the action preferences
    ``theta[s] * features[a, s]``: every state owns one entry of ``theta`` and
    ``features[a, s]`` is the scalar feature of the pair ``(s, a)``.

    ``episode`` is a list of steps in time order.  Each step is either a plain
    ``(state, action, reward)`` tuple or an object exposing ``.state``,
    ``.action`` and ``.reward`` (e.g. a namedtuple).  States and actions must
    be integer-coded: states in ``range(hidden_size)`` and actions in
    ``range(actions)``.

    Args:
        episode: the recorded trajectory described above.
        hidden_size: length of ``theta`` (one parameter per state index).
        gamma: discount factor.
        learning_rate: gradient-ascent step size (stored as ``alpha``).
        actions: number of discrete actions.
    """

    def __init__(self, episode, hidden_size = 5, gamma = 1, learning_rate = 1e-3, actions = 5):
        self.episode = episode
        self.hidden_size = hidden_size # representation of the length of theta
        self.gamma = gamma
        self.alpha = learning_rate
        self.actions = actions # the number of actions
        self.theta = np.random.random_sample((self.hidden_size, 1))
        # random_sample takes the shape as ONE tuple argument.
        self.features = np.random.random_sample((self.actions, self.hidden_size))

    @staticmethod
    def _unpack(transition):
        """Return ``(state, action, reward)`` for an attribute-style or plain-tuple step."""
        try:
            return transition.state, transition.action, transition.reward
        except AttributeError:
            state, action, reward = transition
            return state, action, reward

    def get_number_action(self, action):
        """Map ``action`` to its row index in ``features``.

        Raises:
            IndexError: if the index is outside ``range(self.actions)``.
        """
        index = int(action)
        if not 0 <= index < self.actions:
            raise IndexError("action index %d out of range [0, %d)" % (index, self.actions))
        return index

    def get_number_state(self, state):
        """Map ``state`` to its column index in ``features`` (and row of ``theta``).

        Raises:
            IndexError: if the index is outside ``range(self.hidden_size)``.
        """
        index = int(state)
        if not 0 <= index < self.hidden_size:
            raise IndexError("state index %d out of range [0, %d)" % (index, self.hidden_size))
        return index

    def feature(self, state, action):
        """Return the scalar feature ``features[action, state]`` of the pair."""
        return self.features[self.get_number_action(action), self.get_number_state(state)]

    def _action_features(self, state):
        """Features of every action in ``state`` as a 1-D float array."""
        return np.array([self.feature(state, a) for a in range(self.actions)], dtype=float)

    def policy(self, state):
        """Return the softmax action probabilities for ``state`` under the current ``theta``."""
        preferences = self.theta[self.get_number_state(state), 0] * self._action_features(state)
        # Shift by the maximum before exponentiating to avoid overflow;
        # the softmax is invariant to a constant shift.
        weights = np.exp(preferences - preferences.max())
        return weights / weights.sum()

    def expectation(self, state):
        """Return the expected feature in ``state`` under the current policy.

        The softmax score function is ``phi(s, a) - E_pi[phi(s, .)]``, so the
        mean must be weighted by the policy probabilities; a uniform average
        is only correct when all action preferences are equal.
        """
        return float(np.dot(self.policy(state), self._action_features(state)))

    def score(self, state, action):
        """Return d/d theta[state] of ``log pi(state, action)``."""
        return self.feature(state, action) - self.expectation(state)

    def get_G(self, step):
        """Return the discounted return ``G_t = sum_{k >= t} gamma^(k - t) * r_k`` for ``t = step``.

        Returns 0 when ``step`` is at or past the end of the episode.

        Raises:
            ValueError: if ``step`` is negative.
        """
        if step < 0:
            raise ValueError("step must be non-negative")
        total = 0.0
        discount = 1.0  # gamma^(k - step); the first reward is undiscounted
        for k in range(step, len(self.episode)):
            _, _, reward = self._unpack(self.episode[k])
            total += discount * reward
            discount *= self.gamma
        return total

    def forward(self):
        """Run one REINFORCE pass over the episode and return the updated ``theta``.

        For each time step ``t`` the parameter of the visited state is moved by
        ``alpha * gamma^t * G_t * score(s_t, a_t)`` (stochastic gradient ascent).
        ``theta`` is updated in place and the same array is returned.
        """
        for t in range(len(self.episode)):
            state, action, _ = self._unpack(self.episode[t])
            g = self.get_G(t)
            # ``**`` rather than ``np.pow``: the latter only exists in NumPy >= 2.0.
            increment = self.alpha * (self.gamma ** t) * g * self.score(state, action)
            # Accumulate the step; only the visited state's parameter has a
            # non-zero gradient in this tabular parameterisation.
            self.theta[self.get_number_state(state), 0] += increment
        return self.theta

#### Question 5
Write Proof (with proper notation) of the Compatible Function Approximation Theorem

If the following two conditions are satisfied:  
critic gradient is compatible with the Actor score function:
$$\nabla_{\theta} \log \pi(s,a|\theta) = \nabla_{w} Q(s,a,w)$$  
critic parameters w minimize the following mean-squared error:  
$$ \int_s \rho_{\pi}(s) \int_a \pi(s,a,\theta) (Q_{\pi}(s,a) - Q(s,a,w))^2 da ds$$
Then the Policy Gradient using critic $Q(s,a,w)$ is exact:  
$$\nabla_{\theta} J(\theta) = \int_s \rho_{\pi}(s) \int_a \nabla_{\theta} \pi(s,a,\theta) Q(s,a,w) da ds$$
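Proof: since $w$ minimizes the mean-squared error, the gradient of that error with respect to $w$ is zero:
$$ \int_s \rho_{\pi}(s) \int_a \pi(s,a,\theta) (Q_{\pi}(s,a) - Q(s,a,w)) \nabla_{w} Q(s,a,w) da ds = 0$$
Substituting the compatibility condition $\nabla_{w} Q(s,a,w) = \nabla_{\theta} \log \pi(s,a;\theta)$:
$$ \int_s \rho_{\pi}(s) \int_a \pi(s,a,\theta) (Q_{\pi}(s,a) - Q(s,a,w)) \nabla_{\theta} \log \pi(s,a;\theta) da ds = 0$$
Rearranging:
$$ \int_s \rho_{\pi}(s) \int_a \pi(s,a,\theta) Q_{\pi}(s,a) \nabla_{\theta} \log \pi(s,a;\theta) da ds = \int_s \rho_{\pi}(s) \int_a \pi(s,a,\theta) Q(s,a,w) \nabla_{\theta} \log \pi(s,a;\theta) da ds$$
By the Policy Gradient Theorem together with the identity $\nabla_{\theta} \pi(s,a;\theta) = \pi(s,a;\theta) \nabla_{\theta} \log \pi(s,a;\theta)$:
$$ \nabla_{\theta}J(\theta) = \int_s \rho_{\pi}(s) \int_a \pi(s,a,\theta) Q_{\pi}(s,a) \nabla_{\theta} \log \pi(s,a;\theta) da ds $$
Combining the last two equations and applying the same identity once more, the true $Q_{\pi}$ can be replaced by the critic:
$$ \nabla_{\theta}J(\theta) = \int_s \rho_{\pi}(s) \int_a \nabla_{\theta} \pi(s,a;\theta) Q(s,a,w)da ds$$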