# Actor Critic

In [6]:
from copy import deepcopy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [7]:
class CliffWalkingEnv:

    def __init__(self):
        self.num_of_rows = 6
        self.num_of_cols = 10
        self.start_state = (5, 0)
        self.terminal_state = (5, 9)
        self.cliff_states = [(5, i) for i in range(1, 8)]
        self.current_state = None
        self.directions = [
            ((+0, +1), 'up', '⬆'),
            ((+1, +0), 'down', '⬇'),
            ((+0, -1), 'left', '⬅'),
            ((-1, +0), 'right', '➡')
        ]

    def step(self, action):
        x, y = self.current_state
        dx, dy = self.directions[action][0]
        nx, ny = x + dx, y + dy
        next_state = (nx, ny)
        if 0 <= nx < self.num_of_rows and 0 <= ny < self.num_of_cols:
            self.current_state = next_state

        reward = -5.0
        is_terminal = False
        if self.current_state in self.cliff_states:
            reward = -100.0
            self.current_state = deepcopy(self.start_state)
        elif self.current_state == self.terminal_state:
            is_terminal = True
        next_state_id = self.cords_to_state_id(self.current_state)
        return next_state_id, reward, is_terminal

    def reset_state(self):
        self.current_state = self.start_state
        return self.cords_to_state_id(self.current_state)

    def cords_to_state_id(self, state):
        r, c = state
        return r * self.num_of_cols + c

    def print_policy_table(self, actor_model):
        print(f'{f"State/Action":^12}', end='\t')
        for curr_action in self.directions:
            print(f'{curr_action[2]:^10}', end='\t')
        print()

        for r in range(0, self.num_of_rows):
            for c in range(0, self.num_of_cols):
                curr_state_cords = (r, c)
                curr_state = self.cords_to_state_id(curr_state_cords)
                probs = actor_model(curr_state)
                print(f"{f'({r}, {c})':^10}: ", end='\t')
                for prob in probs:
                    print(f"{prob:^10.3f}", end='\t')
                print()

In [8]:
class NeuralNetwork(nn.Module):
    def __init__(self, alpha, in_dims, hl1_dims, hl2_dims, hl3_dims, out_dims):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_dims, hl1_dims),
            nn.ReLU(),
            nn.Linear(hl1_dims, hl2_dims),
            nn.ReLU(),
            nn.Linear(hl2_dims, hl3_dims),
            nn.ReLU(),
            nn.Linear(hl3_dims, out_dims),
            nn.Softmax(dim=0)
        )
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)

    def forward(self, x):
        x = torch.FloatTensor(np.array([x]))
        return self.layers(x)

In [9]:
class Agent(object):
    def __init__(self, alpha1, alpha2, in_dims=1, hl1_dims=256, hl2_dims=512, hl3_dims=128, out_dims=2, gamma=0.7):
        self.gamma = gamma
        self.actor = NeuralNetwork(alpha1, in_dims, hl1_dims, hl2_dims, hl3_dims, out_dims)
        self.critic = NeuralNetwork(alpha2, in_dims, hl1_dims, hl2_dims, hl3_dims, 1)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.log_probs = None

    def next_action(self, curr_state, random=False):
        probs = self.actor(curr_state)
        if random:
            probs = torch.FloatTensor([0.25] * 4)
        action_cat_probs = torch.distributions.Categorical(probs)
        action = action_cat_probs.sample()
        self.log_probs = action_cat_probs.log_prob(action)
        return action.item()

    def train(self, curr_state, reward, next_state, done):
        self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()

        critic_curr = self.critic.forward(curr_state)
        critic_next = self.critic.forward(next_state)

        reward = torch.tensor(reward, dtype=torch.float).to(self.device)
        delta = reward + self.gamma * critic_next * (1 - int(done)) - critic_curr

        critic_loss = delta ** 2
        actor_loss = -self.log_probs * delta

        (actor_loss + critic_loss).backward()
        self.actor.optimizer.step()
        self.critic.optimizer.step()

        return actor_loss.item(), critic_loss.item()

In [10]:
env = CliffWalkingEnv()
num_episodes = 25
agent = Agent(alpha1=0.01, alpha2=0.01, in_dims=1, hl1_dims=256, hl2_dims=512, hl3_dims=128, out_dims=4, gamma=0.7)
loss_per_episode = []
eps_init = 0.9
epsilon = 1.0
reward_per_episode = []

print(f"Initial Policy Table")
env.print_policy_table(agent.actor)

Initial Policy Table
State/Action	    ⬆     	    ⬇     	    ⬅     	    ➡     	
  (0, 0)  : 	  0.241   	  0.214   	  0.283   	  0.262   	
  (0, 1)  : 	  0.249   	  0.209   	  0.269   	  0.273   	
  (0, 2)  : 	  0.256   	  0.206   	  0.253   	  0.285   	
  (0, 3)  : 	  0.262   	  0.204   	  0.239   	  0.295   	
  (0, 4)  : 	  0.266   	  0.204   	  0.228   	  0.302   	
  (0, 5)  : 	  0.270   	  0.204   	  0.218   	  0.309   	
  (0, 6)  : 	  0.273   	  0.204   	  0.207   	  0.316   	
  (0, 7)  : 	  0.276   	  0.203   	  0.198   	  0.324   	
  (0, 8)  : 	  0.279   	  0.201   	  0.188   	  0.331   	
  (0, 9)  : 	  0.282   	  0.200   	  0.179   	  0.339   	
  (1, 0)  : 	  0.285   	  0.197   	  0.170   	  0.348   	
  (1, 1)  : 	  0.288   	  0.194   	  0.162   	  0.356   	
  (1, 2)  : 	  0.291   	  0.191   	  0.153   	  0.365   	
  (1, 3)  : 	  0.293   	  0.188   	  0.146   	  0.373   	
  (1, 4)  : 	  0.296   	  0.185   	  0.138   	  0.381   	
  (1, 5)  : 	  0.298   	  0.182   	  0.131   	  0.3

In [11]:
for i in range(num_episodes):
    print(f"Iteration: {i}")
    episode_reward = 0
    terminal = False
    curr_state = env.reset_state()
    actor_loss, critic_loss = 0, 0

    if epsilon > 0.1:
        epsilon = eps_init ** (i + 1)
    while not terminal:
        rand_num = np.random.random(1)[0]
        random = True
        # if rand_num > epsilon:
        #     random = False
        action = agent.next_action(curr_state, random=random)

        next_state, reward, terminal = env.step(action)
        a_loss, c_loss = agent.train(curr_state, reward, next_state, terminal)
        actor_loss += a_loss
        critic_loss += c_loss
        episode_reward += reward
        curr_state = next_state

    reward_per_episode.append(episode_reward)
    loss_per_episode.append((actor_loss, critic_loss))

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24


In [12]:
print(f"Final Policy Table")
env.print_policy_table(agent.actor)

Final Policy Table
State/Action	    ⬆     	    ⬇     	    ⬅     	    ➡     	
  (0, 0)  : 	  0.241   	  0.214   	  0.283   	  0.262   	
  (0, 1)  : 	  0.249   	  0.209   	  0.269   	  0.273   	
  (0, 2)  : 	  0.256   	  0.206   	  0.253   	  0.285   	
  (0, 3)  : 	  0.262   	  0.204   	  0.239   	  0.295   	
  (0, 4)  : 	  0.266   	  0.204   	  0.228   	  0.302   	
  (0, 5)  : 	  0.270   	  0.204   	  0.218   	  0.309   	
  (0, 6)  : 	  0.273   	  0.204   	  0.207   	  0.316   	
  (0, 7)  : 	  0.276   	  0.203   	  0.198   	  0.324   	
  (0, 8)  : 	  0.279   	  0.201   	  0.188   	  0.331   	
  (0, 9)  : 	  0.282   	  0.200   	  0.179   	  0.339   	
  (1, 0)  : 	  0.285   	  0.197   	  0.170   	  0.348   	
  (1, 1)  : 	  0.288   	  0.194   	  0.162   	  0.356   	
  (1, 2)  : 	  0.291   	  0.191   	  0.153   	  0.365   	
  (1, 3)  : 	  0.293   	  0.188   	  0.146   	  0.373   	
  (1, 4)  : 	  0.296   	  0.185   	  0.138   	  0.381   	
  (1, 5)  : 	  0.298   	  0.182   	  0.131   	  0.389