# Policy Gradient

GOLAI Policy Gradient attempt 🙃.

A policy-gradient method directly optimizes a parameterized policy:

$$\pi_\theta(s,a)$$

Where:

- $\theta$ is the parameter vector,
- $s$ is a particular state,
- $a$ is an action.

In [None]:
import torch
import torch.nn
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
from game.GOLAI.arena import Arena

In [None]:
# Step size handed to the optimizer.
learning_rate = 1e-2
# Discount factor applied to future rewards.
gamma = 0.99

In [None]:
class Policy(nn.Module):
    """Two-layer fully connected network mapping a state to action probabilities.

    Besides the layers, an instance carries the discount factor and two
    per-episode buffers (``saved_log_probs`` and ``rewards``) that start empty.
    """

    def __init__(self):
        super().__init__()
        # 16 state features in, 128 hidden units.
        self.affine1 = nn.Linear(in_features=16, out_features=128)
        # 128 hidden units in, one score per action (2 actions) out.
        self.affine2 = nn.Linear(in_features=128, out_features=2)

        # Episode buffers, filled while acting.
        self.rewards = []
        self.saved_log_probs = []

        # Discount factor, read from the module-level setting.
        self.gamma = gamma

    def forward(self, x):
        """Return action probabilities for ``x``, normalized along dimension 1."""
        hidden = self.affine1(x)
        hidden = F.relu(hidden)
        logits = self.affine2(hidden)
        return F.softmax(logits, dim=1)

In [None]:
# Build the policy network and an Adam optimizer over its parameters.
policy = Policy()
optimizer = optim.Adam(params=policy.parameters(), lr=learning_rate)

In [None]:
def select_action(state):
    """Sample an action for ``state`` from the current policy.

    The NumPy ``state`` is converted to a float tensor with a leading axis of
    size 1, run through ``policy``, and an action is drawn from the resulting
    categorical distribution. The log-probability of the drawn action is
    appended to ``policy.saved_log_probs``.

    Returns:
        The sampled action, extracted from its tensor with ``.item()``.
    """
    # Tensor of shape (1, ...) so the network sees a batch of one.
    batch = torch.from_numpy(state).float().unsqueeze(0)

    # Draw the action according to the probabilities the policy assigns.
    dist = Categorical(policy(batch))
    chosen = dist.sample()

    # Remember the log-probability of the choice.
    policy.saved_log_probs.append(dist.log_prob(chosen))
    return chosen.item()

In [None]:
def finish_episode(gamma):
    """Run one REINFORCE update from the episode stored on ``policy``.

    Computes the discounted return for every reward in ``policy.rewards``,
    normalizes the returns, weights each entry of ``policy.saved_log_probs``
    by its return, and takes one ``optimizer`` step on the summed loss. Both
    episode buffers on ``policy`` are cleared afterwards.

    Args:
        gamma: Discount factor applied to future rewards.
    """
    # Nothing was recorded, so there is nothing to learn from.
    if not policy.rewards:
        return

    # Accumulate discounted returns back-to-front, then restore time order.
    discounted = 0
    returns = []
    for r in reversed(policy.rewards):
        discounted = r + gamma * discounted
        returns.append(discounted)
    returns.reverse()

    # Force float so mean()/std() also work when the rewards are integers.
    returns = torch.tensor(returns, dtype=torch.float32)

    # std() of a single element is NaN, which would poison the parameters on
    # the optimizer step, so only normalize multi-step episodes.
    if returns.numel() > 1:
        # Keeps the division finite when all returns are equal.
        eps = torch.finfo(torch.float32).eps
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = [
        -log_prob * ret
        for log_prob, ret in zip(policy.saved_log_probs, returns)
    ]

    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

    # Reset the buffers for the next episode.
    del policy.rewards[:]
    del policy.saved_log_probs[:]

In [None]:
def main(log_interval):
    """Train the policy episode by episode until the running reward passes the threshold.

    Each episode plays up to 10000 steps, recording one reward per step on
    ``policy.rewards``; after the episode ends, ``finish_episode`` performs
    the policy update.

    Args:
        log_interval: Print a progress line every ``log_interval`` episodes.
    """
    from itertools import count

    # NOTE(review): ``state``, ``state_new``, ``play_round``,
    # ``calculate_score``, ``horizon``, ``D`` and ``reward_threshold`` are not
    # defined in this function; they must exist at module level before
    # ``main`` is called.
    running_reward = 10
    for i_episode in count(1):
        for t in range(10000):
            action = select_action(state)
            # Play the game
            end_round = play_round(state_new, None)
            score = calculate_score(end_round)
            done = (t == horizon - 1)  # When all tiles were placed, we are done
            D.append((state, action, score, state_new, done))
            # NOTE(review): the per-step reward is taken to be the round's
            # score (the original read an undefined ``reward``); confirm.
            policy.rewards.append(score)
            if done:
                break

        # Per-episode bookkeeping runs once the step loop has ended, so the
        # update sees the whole episode and ``break`` below stops training.
        running_reward = running_reward * 0.99 + t * 0.01
        finish_episode(gamma)
        if i_episode % log_interval == 0:
            print("Episode {}\tLast length: {:5d}\tAverage length: {:.2f}".format(i_episode, t, running_reward))
        # NOTE(review): ``running_reward`` tracks episode length (``t``), not
        # score; confirm that ``reward_threshold`` is expressed in steps.
        if running_reward > reward_threshold:
            print("Solved! Running reward is now {} and the last episode runs to {} time steps!".format(running_reward, t))
            break

- [Policy gradient RL in Pytorch](https://medium.com/@ts1829/policy-gradient-reinforcement-learning-in-pytorch-df1383ea0baf)
- [Reinforce.py](https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py)