### This code implements the Batch Actor Critic Algorithm for continuous action spaces
![image.png](images/A2C.png) 
![image.png](images/Anatomy.png)
This implementation is based on the structure shown above: <br>
1. Generate data by running the most recent policy; this step should return the states, rewards, and actions for each batch of training episodes
2. Return estimation by any method, in this case n-step TD 
3. In this step, gradient ascent is performed on the policy: the gradient is estimated from the sampled trajectories by taking the <br>
average over the samples <br>
$$\theta \leftarrow \theta + \alpha\nabla_{\theta}J(\theta)$$ where $J$ is the RL objective<br>
Entropy maximization was used to improve exploration and robustness

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as opt
import torch.nn.functional as F
import numpy as np
import numpy.random as random
from torch.utils.tensorboard import SummaryWriter

In [None]:
# create Network
class Actor_Network(nn.Module):
    """Gaussian policy network with a shared trunk and two output heads.

    The trunk is two ReLU layers (64 and 32 units). The ``mu`` head outputs the
    action mean and the ``var`` head outputs the action variance, which is kept
    strictly positive with a softplus.

    Args:
        state_dim: Number of input features per state (default 2, the value
            that was previously hard-coded).
        action_dim: Number of action dimensions produced by each head
            (default 1, the value that was previously hard-coded).
    """

    def __init__(self, state_dim=2, action_dim=1):
        super(Actor_Network, self).__init__()
        # Attribute names are kept unchanged so existing state_dict files load.
        # layer 1
        self.fc1 = nn.Linear(in_features=state_dim, out_features=64)
        # layer 2
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        # mean head
        self.mu = nn.Linear(in_features=32, out_features=action_dim)
        # variance head
        self.var = nn.Linear(in_features=32, out_features=action_dim)

    def forward(self, x):
        """Return ``(mean, variance)`` of the action distribution for ``x``.

        Args:
            x: Float tensor whose last dimension has ``state_dim`` entries.

        Returns:
            Tuple of two tensors with ``action_dim`` entries in the last
            dimension: the mean and the softplus-transformed variance.
        """
        # shared layers
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        # softplus maps the raw head output to a positive variance
        return self.mu(hidden), F.softplus(self.var(hidden))


In [None]:
# create Network
class Critic_Network(nn.Module):
    """State-value network: two ReLU layers (64 and 32 units) and a scalar output.

    Args:
        state_dim: Number of input features per state (default 2, the value
            that was previously hard-coded).
    """

    def __init__(self, state_dim=2):
        super(Critic_Network, self).__init__()
        # Attribute names are kept unchanged so existing state_dict files load.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.out = nn.Linear(in_features=32, out_features=1)

    def forward(self, x):
        """Return the network output for ``x`` (one value per input row).

        Args:
            x: Float tensor whose last dimension has ``state_dim`` entries.
        """
        # layer 1
        x = F.relu(self.fc1(x))
        # layer 2
        x = F.relu(self.fc2(x))
        # output layer (no activation)
        return self.out(x)

    def softmax(self, logits):
        """Apply a softmax over dimension 1 of ``logits``."""
        return F.softmax(logits, dim=1)

In [None]:
# cartpole problem with discrete actions
# Network architecture

class SampleGeneration():
    """Roll out a policy in a Gym environment and collect training samples.

    Both public methods return ``(states, rewards, actions)`` where

    * ``states`` is a stacked array of every state an action was taken in
      (the state returned by the terminating step is not stored, so each
      episode contributes exactly one state per reward),
    * ``rewards`` is a list with one reward array per episode,
    * ``actions`` is an array of all actions, in the same order as ``states``.

    NOTE(review): the code uses the classic Gym API (``reset()`` returns the
    observation, ``step()`` returns a 4-tuple) -- confirm the installed version.
    """

    @staticmethod
    def _rollout(env_name, N, select_action):
        """Run ``N`` episodes and gather states, rewards and actions.

        Args:
            env_name: Gym environment id passed to ``gym.make``.
            N: Number of episodes to run.
            select_action: Callable taking the current state as a float32
                tensor with a leading batch dimension of 1 and returning
                ``(stored_action, env_action)``: the value recorded in the
                batch and the value handed to ``env.step``.

        Returns:
            Tuple ``(state_stack, rewards_N, actions)`` as described on the
            class.

        Raises:
            ValueError: From ``np.stack`` when no state was collected
                (``N`` is 0).
        """
        environment = gym.make(env_name)
        states_N = []
        actions_N = []
        rewards_N = []
        try:
            for _ in range(N):
                state = environment.reset()
                states_N.append(state)
                rewards = []
                done = False
                while not done:
                    state_t = torch.tensor(state.astype(np.float32)).unsqueeze(0)
                    action, env_action = select_action(state_t)
                    actions_N.append(action)
                    state, reward, done, _ = environment.step(env_action)
                    rewards.append(reward)
                    if not done:
                        states_N.append(state)
                rewards_N.append(np.array(rewards))
        finally:
            # Release the environment even if the rollout fails part-way.
            environment.close()
        # Stack once, after all episodes, instead of after every episode.
        state_stack = np.stack(states_N)
        return (state_stack, rewards_N, np.array(actions_N))

    @staticmethod
    @torch.no_grad()
    def generate_samples(network, env='CartPole-v0', N=4):
        """Collect ``N`` episodes with a discrete-action policy.

        ``network(state)`` must return logits and ``network.softmax`` must turn
        them into probabilities; one action index is sampled per step with
        ``torch.multinomial``.
        """
        def select_action(state_t):
            action_logits = network(state_t)
            actions_prob = network.softmax(action_logits)
            action = torch.multinomial(actions_prob, 1).item()
            return action, action

        return SampleGeneration._rollout(env, N, select_action)

    @staticmethod
    @torch.no_grad()
    def generate_samples_continuous(network, env='MountainCarContinuous-v0', N=4):
        """Collect ``N`` episodes with a Gaussian continuous-action policy.

        ``network(state)`` must return ``(mean, variance)``. Each action is
        drawn from ``Normal(mean, sqrt(variance))`` and clipped to ``[-1, 1]``
        before it is stored and sent to the environment.
        """
        def select_action(state_t):
            # forward pass to generate mu and var
            mu_t, var_t = network(state_t)
            mu = mu_t.detach().item()
            std = np.sqrt(var_t.detach().item())
            # `random` is numpy.random here. The clipped value must be kept:
            # np.clip returns a new value and does not modify its argument.
            action = np.clip(random.normal(mu, std), -1, 1)
            return action, np.array([action])

        return SampleGeneration._rollout(env, N, select_action)

In [None]:
def list_to_torch_tensor(List):
    """Flatten ``List`` by one level and return the items as a float32 tensor.

    Args:
        List: Iterable of iterables, e.g. one sequence of values per episode.

    Returns:
        ``torch.float32`` tensor built from the concatenated inner items.
    """
    flattened = [item for group in List for item in group]
    return torch.tensor(flattened, dtype=torch.float32)

In [None]:
@torch.no_grad()
def n_step_TD(net, states, rewards, n, gamma=0.99):
    """Compute the n-step TD target for every step of one episode.

    For step ``t`` of an episode with ``T`` rewards the target is::

        G_t = sum_{k=0}^{min(n, T - t) - 1} gamma**k * rewards[t + k]
              + gamma**n * V(states[t + n])        (only when t + n < T)

    When the look-ahead runs past the end of the episode no bootstrap value is
    added, i.e. the state after the last reward is treated as having value 0.

    The previous implementation returned raw rewards for the first ``n``
    positions, shifted the remaining targets by ``n`` steps and reversed the
    list, so targets were not aligned with ``states``; the result is now in
    time order with ``result[t]`` belonging to ``states[t]``.

    Args:
        net: Callable value function; called with one state as a float32
            tensor and expected to return a one-element tensor.
        states: Indexable sequence of the episode's states, one per reward.
        rewards: Sequence of the episode's rewards.
        n: Number of reward steps to look ahead before bootstrapping.
        gamma: Discount factor (default 0.99, the previously hard-coded value).

    Returns:
        List of ``len(rewards)`` floats, one target per step.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    T = len(rewards)
    targets = []
    for t in range(T):
        horizon = min(t + n, T)
        target = sum(gamma ** (k - t) * rewards[k] for k in range(t, horizon))
        if t + n < T:
            # Bootstrap from the critic's estimate of the state n steps ahead.
            state_t = torch.as_tensor(states[t + n], dtype=torch.float32)
            target += gamma ** n * net(state_t).item()
        targets.append(float(target))
    return targets

In [None]:
class ReturnEstimator():
    """Static helpers that turn sampled rewards into critic targets and advantages.

    Conventions shared by the methods: ``rewards`` / ``rewards_n`` is a list
    with one reward sequence per episode, and ``states`` is the concatenation
    of all episodes' states in the same order, with one state per reward.
    """

    @staticmethod
    def _stack_episodes(per_episode):
        """Pack per-episode lists into an array without failing on ragged input.

        Equal-length episodes give a regular array (as ``np.array`` would);
        episodes of different lengths give a 1-D object array holding one list
        per episode, because recent NumPy refuses to build ragged arrays
        implicitly.
        """
        if len({len(episode) for episode in per_episode}) <= 1:
            return np.array(per_episode)
        packed = np.empty(len(per_episode), dtype=object)
        for index, episode in enumerate(per_episode):
            packed[index] = episode
        return packed

    @staticmethod
    def _episode_bounds(rewards):
        """Return ``(start, end)`` offsets of each episode inside the flat states.

        Offsets are cumulative over all previous episodes; using only the
        previous episode's length is wrong from the third episode onwards.
        """
        bounds = []
        start = 0
        for episode in rewards:
            end = start + len(episode)
            bounds.append((start, end))
            start = end
        return bounds

    # reward_to_go
    @staticmethod
    def estimate_return(rewards, gamma=0.99):
        """Compute the discounted reward-to-go for every step of every episode.

        ``result[i][t] = sum_{k >= t} gamma**(k - t) * rewards[i][k]``. The sum
        is accumulated backwards from the last reward so that later rewards are
        the ones being discounted.

        Args:
            rewards: List of per-episode reward sequences.
            gamma: Discount factor (default 0.99).

        Returns:
            Array with one row per episode (object array of lists when the
            episodes have different lengths).
        """
        res = []
        for episode in rewards:
            running = 0.0
            returns = []
            for reward in episode[::-1]:
                running = reward + gamma * running
                returns.append(running)
            returns.reverse()
            res.append(returns)
        return ReturnEstimator._stack_episodes(res)

    @staticmethod
    def n_step_batach(net, states, rewards_n, n):
        """Compute n-step TD targets for a batch of episodes.

        Slices ``states`` into episodes using the reward lengths and calls
        ``n_step_TD`` on each episode.

        Returns:
            Array with one row of targets per episode (object array of lists
            when the episodes have different lengths).
        """
        res = []
        bounds = ReturnEstimator._episode_bounds(rewards_n)
        for (start, end), episode_rewards in zip(bounds, rewards_n):
            res.append(n_step_TD(net, states[start:end], episode_rewards, n))
        return ReturnEstimator._stack_episodes(res)

    @staticmethod
    def fit_v(net, states, targets, opt, tb, step):
        """Take one optimizer step fitting ``net`` to ``targets`` with MSE loss.

        Args:
            net: Value network; its output is squeezed on dimension 1.
            states: Flat batch of states.
            targets: Per-episode targets, flattened with ``list_to_torch_tensor``.
            opt: Optimizer over ``net``'s parameters.
            tb: Writer with an ``add_scalar`` method; receives ``'val_loss'``.
            step: Global step used for logging.
        """
        states_t = torch.tensor(states, dtype=torch.float32)
        targets_t = list_to_torch_tensor(targets)
        opt.zero_grad()
        preds = net(states_t)
        loss = F.mse_loss(preds.squeeze(1), targets_t)
        tb.add_scalar('val_loss', loss, step)
        loss.backward()
        opt.step()

    @staticmethod
    @torch.no_grad()
    def calc_adv(net, targets, states):
        """Return the advantage ``target - V(state)`` for every sample.

        The flattened targets are reshaped to the critic output's shape before
        subtracting; subtracting a ``(B,)`` tensor from a ``(B, 1)`` tensor
        would broadcast to a ``(B, B)`` matrix instead of one value per sample.

        Returns:
            Tensor with the same shape as ``net(states)``.
        """
        states_t = torch.as_tensor(states, dtype=torch.float32)
        targets_t = list_to_torch_tensor(targets)
        values = net(states_t)
        return targets_t.reshape(values.shape) - values

    @staticmethod
    @torch.no_grad()
    def calc_adv_from_rewards(value_net, rewards, states, gamma=0.99):
        """Compute one-step TD advantages ``r + gamma * V(s') - V(s)`` per episode.

        The state following the last reward of an episode is treated as having
        value 0, so the final advantage is ``r_last - V(s_last)``.

        Args:
            value_net: Callable returning one value per input state row.
            rewards: List of per-episode reward sequences.
            states: Flat batch of states, one per reward.
            gamma: Discount factor (default 0.99).

        Returns:
            Array with one entry per episode; each entry is a list of
            one-element lists ``[advantage]`` (object array when the episodes
            have different lengths).
        """
        advs = []
        bounds = ReturnEstimator._episode_bounds(rewards)
        for (start, end), episode_rewards in zip(bounds, rewards):
            states_t = torch.as_tensor(np.asarray(states[start:end]), dtype=torch.float32)
            rewards_t = torch.as_tensor(np.asarray(episode_rewards), dtype=torch.float32)
            values = value_net(states_t).reshape(-1)
            # V(s_{t+1}) for every step; zero after the episode's last reward.
            next_values = torch.cat([values[1:], values.new_zeros(1)])[:len(values)]
            adv_t = rewards_t + gamma * next_values - values
            advs.append(adv_t.reshape(-1, 1).tolist())
        return ReturnEstimator._stack_episodes(advs)

In [None]:
def improve_policy(network, states, adv, actions, optimizer, tb, step, entropy_beta=0.0001):
    """Take one policy-gradient step on a Gaussian policy with an entropy bonus.

    The loss is ``-mean(log pi(a|s) * adv) - entropy_beta * mean(entropy)``,
    where the per-sample entropy of the Gaussian is
    ``0.5 * (log(2 * pi * var) + 1)``. Subtracting the entropy term rewards
    higher entropy; the earlier sign (and missing factor 0.5) penalised it.

    Actions are reshaped to the layout of the network's mean output so that
    log-probabilities are computed per sample; a ``(B,)`` action vector against
    a ``(B, 1)`` mean would otherwise broadcast to a ``(B, B)`` matrix.

    Args:
        network: Policy returning ``(mean, variance)`` for a batch of states.
        states: Batch of states.
        adv: Advantages. When it holds exactly one value per log-probability
            it is reshaped to match them; any other shape is broadcast as is.
        actions: Actions taken, one per state.
        optimizer: Optimizer over ``network``'s parameters.
        tb: Writer with an ``add_scalar`` method; receives ``'adv'``,
            ``'loss'``, ``'entropy'``, ``'entropy loss'`` and ``'policy loss'``.
        step: Global step used for logging.
        entropy_beta: Weight of the entropy bonus (default 0.0001).
    """
    optimizer.zero_grad()
    states_t = torch.as_tensor(states, dtype=torch.float32)
    adv_t = torch.as_tensor(adv, dtype=torch.float32).detach()
    tb.add_scalar('adv', torch.mean(adv_t), step)

    mu_t, var_t = network(states_t)
    actions_t = torch.as_tensor(actions, dtype=torch.float32).reshape(mu_t.shape)

    # log N(a | mu, var) = -(a - mu)^2 / (2 var) - 0.5 * log(2 pi var)
    squared_error_term = -(actions_t - mu_t) ** 2 / (2 * var_t)
    normaliser_term = -torch.log(torch.sqrt(2 * np.pi * var_t))
    actions_log_probs = squared_error_term + normaliser_term

    if adv_t.numel() == actions_log_probs.numel():
        # Align one advantage with each sample's log-probability.
        adv_t = adv_t.reshape(actions_log_probs.shape)
    loss_policy = -(actions_log_probs * adv_t).mean()

    entropy = (0.5 * (torch.log(2 * np.pi * var_t) + 1)).mean()
    loss_entropy = -entropy_beta * entropy
    loss = loss_policy + loss_entropy

    tb.add_scalar('loss', loss, step)
    tb.add_scalar('entropy', entropy, step)
    tb.add_scalar('entropy loss', loss_entropy, step)
    tb.add_scalar('policy loss', loss_policy, step)
    loss.backward()
    optimizer.step()

In [None]:
@torch.no_grad()
def test_policy(network, mean, std, env="MountainCarContinuous-v0", render=False,
                runs=1, max_steps=1000):
    """Evaluate the policy by acting with the mean action and return the average reward.

    Each observation is standardised with ``normalize(state, mean, std)``
    before it is fed to ``network``; only the first network output (the mean)
    is used as the action. The environment is wrapped in ``gym.wrappers.Monitor``
    writing to the ``"recording"`` directory.

    Args:
        network: Policy returning ``(mean, variance)`` for a state.
        mean: Observation mean used for normalisation.
        std: Observation standard deviation used for normalisation.
        env: Gym environment id.
        render: Whether to call ``env.render()`` on every step.
        runs: Number of evaluation episodes (default 1).
        max_steps: Maximum number of steps per episode (default 1000).

    Returns:
        Total reward summed over all runs, divided by ``runs``.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    total_reward = 0.0
    environment = gym.make(env)
    environment = gym.wrappers.Monitor(environment, "recording")
    try:
        for _ in range(runs):
            state = environment.reset()
            for _ in range(max_steps):
                if render:
                    environment.render()
                state = normalize(state, mean, std)
                state_t = torch.tensor(state, dtype=torch.float32)
                action_t, _ = network(state_t)
                # Hand the environment a NumPy array rather than a torch
                # tensor, as the sampling code does.
                action = action_t.detach().cpu().numpy()
                state, reward, done, _ = environment.step(action)
                total_reward += reward
                if done:
                    break
    finally:
        # Close the monitor/environment even if an episode raises.
        environment.close()
    return total_reward / runs

In [None]:
# NOTE(review): scratch/debug cell. It reads `mean`, `std`, `crt_optimizer`,
# `tb` and `i`, none of which are assigned above this point in the file; they
# are only assigned in later cells, so those cells must have been run first.
policy = Actor_Network()
critic = Critic_Network()
# Roll out four episodes with the newly constructed policy.
states, rewards, actions = SampleGeneration.generate_samples_continuous(policy, N=4)
# NOTE(review): ReturnEstimator defines no __init__ in this file, so passing
# (mean, std) here looks like it would raise TypeError -- confirm and remove.
estomatore = ReturnEstimator(mean, std)
# tb = SummaryWriter(comment=f"-lr={lr}")
# tb.add_graph(policy, states)
# Reward-to-go targets, one critic update on them, then one-step advantages.
sum_rewards = ReturnEstimator.estimate_return(rewards)
ReturnEstimator.fit_v(critic, states, sum_rewards, crt_optimizer, tb, i)
ReturnEstimator.calc_adv_from_rewards(critic, rewards, states)
# Length of the first episode (notebook cell output).
len(rewards[0])

In [None]:
# input normalization
def normalization_params(env="MountainCarContinuous-v0", num_samples=1000):
    """Estimate per-feature mean and standard deviation of the observation space.

    Observations are drawn with ``env.observation_space.sample()``, i.e. from
    the declared observation space rather than from states visited by a policy.

    Args:
        env: Gym environment id.
        num_samples: Number of observations to draw (default 1000, the value
            that was previously hard-coded).

    Returns:
        Tuple ``(mean, std)`` computed over the samples along axis 0.
    """
    environment = gym.make(env)
    try:
        samples_np = np.array(
            [environment.observation_space.sample() for _ in range(num_samples)]
        )
    finally:
        # The environment is only needed for its observation space.
        environment.close()
    return (samples_np.mean(axis=0), samples_np.std(axis=0))

In [None]:
# Module-level normalisation statistics for the default environment; other
# cells read these globals when calling `normalize` / `test_policy`.
mean, std = normalization_params()

In [None]:
def normalize(state, mean, std):
    """Standardise ``state``: subtract ``mean`` and divide the result by ``std``."""
    centered = state - mean
    return centered / std

In [None]:
# policy improvement
# hyperparameters
# NOTE(review): `seeds` is not referenced again in the visible source.
seeds = [1,4,10,42]
episodes_num = 30000  # training iterations per learning rate
N = 8  # episodes sampled per training iteration
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
save_path = '/home/faisal/Documents/ML practice/Deep Learning/pytorch/Contunous_control/policy.pt'
#######
env = "MountainCarContinuous-v0"
rewards_100 = []  # evaluation rewards collected since the last 100-run average
lrs = [0.0003, 0.00001, 0.000001]
# One independent training run (new networks, writer and optimizers) per learning rate.
for lr in lrs:
    policy = Actor_Network()
    critic = Critic_Network()
    tb = SummaryWriter(comment=f"-lr={lr}")
    # A single raw (un-normalised) reset observation is used to trace the graph.
    states = torch.tensor(gym.make(env).reset(), dtype=torch.float32)
    tb.add_graph(policy, states)
    print(policy)
    act_optimizer = opt.Adam(policy.parameters(), lr=lr)
    crt_optimizer = opt.Adam(critic.parameters(), lr=lr)
    c = 0  # set to 1 once the learning rate has been switched in this run
    test_reward = 0
    for i in range(episodes_num):
        # run the policy
        # Once the latest evaluation reward exceeds 80, rebuild both optimizers
        # with lr = 5e-5; `c` ensures this happens at most once per run.
        if test_reward > 80 and c < 1:
            lr = 5e-5
            print('lr changed to 5e-5')
            act_optimizer = opt.Adam(policy.parameters(), lr=lr)
            crt_optimizer = opt.Adam(critic.parameters(), lr=lr)
            c = 1
        states, rewards, actions = SampleGeneration.generate_samples_continuous(policy, N=N, env=env)
        # The policy acted on raw observations above; states are standardised
        # only here, before the critic and policy updates.
        states = normalize(states, mean, std)
        # estimate the return
        td_targets = ReturnEstimator.n_step_batach(critic, states, rewards,6)
#         sum_rewards = ReturnEstimator.estimate_return(rewards)
        ReturnEstimator.fit_v(critic, states, td_targets, crt_optimizer, tb, i)
#         adv = ReturnEstimator.calc_adv(critic, rewards, states)
#         adv = list_to_torch_tensor(adv)
        # Advantages are computed with the critic after its update above.
        adv = ReturnEstimator.calc_adv(critic, td_targets, states)
        # improve the policy
        improve_policy(policy, states, adv, actions, act_optimizer, tb, i )
        # test the policy
        test_reward = test_policy(policy, mean, std, env=env)
        rewards_100.append(test_reward)
        tb.add_scalar('reward', test_reward, i)

        # Every 100 evaluations: log their average, save the policy weights if
        # the average exceeds 90, then start a new window.
        if len(rewards_100) >= 100:
            reward_100 = sum(rewards_100) / 100.0
            tb.add_scalar('reward_100', reward_100, i)
            if reward_100 > 90:
                torch.save(policy.state_dict(), save_path)
            rewards_100 = []
        # Log every policy parameter and its gradient as histograms.
        for name, param in policy.named_parameters():
            tb.add_histogram(f'{name}', param, i)
            tb.add_histogram(f'{name}.grad', param.grad, i)

## Training Dynamics
<img  src="images/adv_loss.png" > 
<img  src="images/reward_val.png" > 

## Agent Performance 
<video  controls width="320" height="240" src="recording/test.mp4" type="video/mp4"/> 

## References
1. CS 285 at UC Berkeley
Deep Reinforcement Learning
http://rail.eecs.berkeley.edu/deeprlcourse/static/slides/lec-6.pdf
2. Reinforcement Learning: An Introduction by Richard S. Sutton
and Andrew G. Barto, chapter 13 http://incompleteideas.net/book/the-book-2nd.html