In [399]:
import torch
from torch import cuda, device, distributions
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gymnasium as gym
import os, random
from collections import deque

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from IPython.display import clear_output

In [400]:
env_id = "CartPole-v1"
env = gym.make(env_id)#,render_mode="human")

s_size = env.observation_space.shape[0]
a_size = env.action_space.n

print("_____OBSERVATION SPACE_____ \n")
print(f'The State Space is {s_size}')
print(f'The Action Space is {a_size}')
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is 4
The Action Space is 2
Sample observation [-1.6941745e+00  3.1993532e+38  3.8102701e-01  2.3138998e+38]


In [401]:
device = device("cuda:0" if cuda.is_available() else "cpu")

In [402]:
class ActorCritic(nn.Module):
    def __init__(self, state_size, action_size, hidden_size):
        super(ActorCritic, self).__init__()
        self.actor_layer = nn.Linear(state_size, hidden_size)
        self.actor_output = nn.Linear(hidden_size, action_size)
        
        self.critic_layer = nn.Linear(state_size, hidden_size)
        self.critic_output = nn.Linear(hidden_size, 1)
    
    def forward(self, obs):
        
        obs = torch.clamp(obs,-1.1,1.1)
        pi = F.relu(self.actor_layer(obs))
        value = F.relu(self.critic_layer(obs))
        
        return F.softmax(self.actor_output(pi),dim = 1), self.critic_output(value)

In [403]:
hidden_layer = 16
gamma = 0.99
ac_lr = 0.001
episodes = 100_000
avg_win_size = 50
epi_results = deque(maxlen=avg_win_size)

agent = ActorCritic(s_size, a_size, hidden_layer).to(device)
optimizer = optim.Adam(agent.parameters(), lr = ac_lr)

In [404]:
s = env.reset()[0]
obs = torch.FloatTensor(np.expand_dims(s,0)).to(device)

p_vals, values = agent(obs)

In [405]:
s

array([ 0.03030152,  0.00010701,  0.01556824, -0.01421367], dtype=float32)

In [406]:
for epi in range(10):

    s = env.reset()[0]
    done , trunc = False, False
    states, rewards, nxt_states , actions, dones  = [], [], [], [], []
    win = 0

    while not any([done, trunc]):

        states.append(s)
        obs = torch.FloatTensor(np.expand_dims(s,0)).to(device)

        with torch.no_grad():
            p_vals, _ = agent(obs)
            #value = value.detach().cpu().numpy()[0,0]
            p_vals = torch.squeeze(p_vals)

        p_vals = p_vals.detach().cpu().numpy()
        a = np.random.choice(a_size, p=p_vals)

        s_, r, term ,trunc, _  = env.step(a)
        actions.append(a)
        rewards.append(r)
        dones.append(int(done))
        new_states.append(s_)
        s = s_
    
    epi_results.append(np.sum(rewards))
    
    nxt_states_t = torch.FloatTensor(new_states).to(device)
    states_t = torch.FloatTensor(states).to(device)
    return_t = torch.FloatTensor(rewards).to(device).view(-1,1)
    action_t = torch.LongTensor(actions).to(device).view(-1,1)
    
    '''
    p_vals , values = agent(states_t)
    action_prob = p_vals.gather(1, action_t)

    _, qval = agent(nxt_states_t)
    qval = qval[-1].item()

    #q_vals = np.zeros(values.shape[0])
    q_vals = torch.zeros_like(values, device=device )

    for t in reversed(range(values.shape[0])):

        qval = rewards[t] + gamma * qval
        q_vals[t] = qval

    adv = q_vals - values
    action_prob = p_vals.gather(1, action_t)

    actor_loss = torch.mean(-torch.log(action_prob) * adv)
    critic_loss = adv.pow(2).mean()

    loss = actor_loss + critic_loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epi%100==0:
        clear_output()
    if epi%10==0:
        print(f'epi:{epi:05d} reward:{np.sum(rewards):8.2f} mean_rewards:{np.mean(epi_results):8.2f}')
    if np.mean(np.mean(epi_results))>=500:
        break
    '''

In [409]:
trunc

True

In [385]:
_, qval = agent(torch.FloatTensor(np.expand_dims(s_,0)).to(device))
q_val = qval.item()

In [345]:
p_vals , values = agent(states_t)
action_prob = p_vals.gather(1, action_t)

_, qval = agent(nxt_states_t)
qval = qval[-1].item()

#q_vals = np.zeros(values.shape[0])
q_vals = torch.zeros_like(values, device=device )

for t in reversed(range(values.shape[0])):
    
    qval = rewards[t] + gamma * qval
    q_vals[t] = qval

adv = q_vals - values
action_prob = p_vals.gather(1, action_t)

actor_loss = torch.mean(-torch.log(action_prob) * adv)
critic_loss = adv.pow(2).mean()

In [348]:
loss = actor_loss + critic_loss
loss

tensor(304427.0312, device='cuda:0', grad_fn=<AddBackward0>)

In [308]:
q_vals = q_vals.to(device).view(-1,1)

AttributeError: 'numpy.ndarray' object has no attribute 'to'

In [329]:
k = torch.zeros_like(values, dtype=torch.float64, device=device )

In [330]:
k

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], device='cuda:0', dtype=torch.float64)

In [321]:
k

tensor([[1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]], dtype=torch.int32)

In [80]:
new_states_t = torch.FloatTensor(new_states).to(device)
states_t = torch.FloatTensor(states).to(device)
return_t = torch.FloatTensor(rewards).to(device).view(-1,1)
action_t = torch.LongTensor(actions).to(device).view(-1,1)

p_vals , values = agent(states_t)
q_vals = return_t + gamma*agent(new_states_t)[1]
adv = q_vals - values
action_prob = p_vals.gather(1, action_t)

actor_loss = torch.mean(-torch.log(action_prob) * adv)
critic_loss = adv.pow(2).mean()

loss = actor_loss + critic_loss

In [91]:
critic_loss

tensor(1.0140, device='cuda:0', grad_fn=<MeanBackward0>)

In [87]:
-torch.log(action_prob)

tensor([[0.5817],
        [0.5641],
        [0.8634],
        [0.5633],
        [0.5472],
        [0.5158],
        [0.9343],
        [0.5121],
        [0.4978],
        [0.9407],
        [0.4961],
        [0.9419],
        [0.4950]], device='cuda:0', grad_fn=<NegBackward0>)

[Chris Yoon](https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f)<br>
[dilithjay](https://dilithjay.com/blog/actor-critic-methods)

In [349]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

# hyperparameters
hidden_size = 256
learning_rate = 3e-4

# Constants
GAMMA = 0.99
num_steps = 500
max_episodes = 100_000
epi_results = deque(maxlen=avg_win_size)

In [350]:
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(ActorCritic, self).__init__()

        self.num_actions = num_actions
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        state = Variable(torch.from_numpy(state).float().unsqueeze(0))
        value = F.relu(self.critic_linear1(state))
        value = self.critic_linear2(value)
        
        policy_dist = F.relu(self.actor_linear1(state))
        policy_dist = F.softmax(self.actor_linear2(policy_dist), dim=1)

        return value, policy_dist

In [354]:
env = gym.make("CartPole-v1")
num_inputs = env.observation_space.shape[0]
num_outputs = env.action_space.n

actor_critic = ActorCritic(num_inputs, num_outputs, hidden_size)
ac_optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate)

all_lengths = []
average_lengths = []
all_rewards = []
entropy_term = 0

for episode in range(10):
    log_probs = []
    values = []
    rewards = []

    state = env.reset()
    for steps in range(num_steps):
        value, policy_dist = actor_critic.forward(state)
        value = value.detach().numpy()[0,0]
        dist = policy_dist.detach().numpy() 

        action = np.random.choice(num_outputs, p=np.squeeze(dist))
        log_prob = torch.log(policy_dist.squeeze(0)[action])
        entropy = -np.sum(np.mean(dist) * np.log(dist))
        new_state, reward, done, _ = env.step(action)

        rewards.append(reward)
        values.append(value)
        log_probs.append(log_prob)
        entropy_term += entropy
        state = new_state

        if done or steps == num_steps-1:
            Qval, _ = actor_critic.forward(new_state)
            Qval = Qval.detach().numpy()[0,0]
            all_rewards.append(np.sum(rewards))
            all_lengths.append(steps)
            average_lengths.append(np.mean(all_lengths[-10:]))
            if episode % 10 == 0:                    
                sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1]))
            if episode % 100 == 0:
                clear_output()
            break
    
    epi_results.append(np.sum(rewards))
    '''
    # compute Q values
    Qvals = np.zeros_like(values)
    for t in reversed(range(len(rewards))):
        Qval = rewards[t] + GAMMA * Qval
        Qvals[t] = Qval

    #update actor critic
    values = torch.FloatTensor(values)
    Qvals = torch.FloatTensor(Qvals)
    log_probs = torch.stack(log_probs)
    
    advantage = Qvals - values
    actor_loss = (-log_probs * advantage).mean()
    critic_loss = 0.5 * advantage.pow(2).mean()
    ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

    ac_optimizer.zero_grad()
    ac_loss.backward()
    ac_optimizer.step()
    
    if np.mean(np.mean(epi_results))>=500:
        break

    '''

In [357]:
Qvals = np.zeros_like(values)
for t in reversed(range(len(rewards))):
    Qval = rewards[t] + GAMMA * Qval
    Qvals[t] = Qval

#update actor critic
values = torch.FloatTensor(values)
Qvals = torch.FloatTensor(Qvals)
log_probs = torch.stack(log_probs)

In [360]:
advantage = Qvals - values
actor_loss = (-log_probs * advantage).mean()

In [363]:
critic_loss = 0.5 * advantage.pow(2).mean()

In [365]:
actor_loss

tensor(6.9053, grad_fn=<MeanBackward0>)

In [367]:
critic_loss + actor_loss

tensor(97.0633, grad_fn=<AddBackward0>)