In [7]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

class GridWorld:
    """Deterministic 4x4 grid with a single goal cell.

    The agent's position is stored in ``self.state`` as a ``(row, col)``
    tuple. Every episode starts at ``(0, 0)``; the goal is ``(3, 3)``.
    """

    def __init__(self):
        self.rows, self.cols = 4, 4
        self.state = (0, 0)
        self.goal = (3, 3)
        self.actions = [0, 1, 2, 3]  # Up, Right, Down, Left

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.state = (0, 0)
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Moves that would leave the grid are clamped to the border, and an
        action outside 0..3 leaves the position unchanged. The reward is
        10 (with ``done=True``) when the resulting cell is the goal and
        -1 (with ``done=False``) otherwise.
        """
        row, col = self.state

        if action == 0:      # up
            row = max(0, row - 1)
        elif action == 1:    # right
            col = min(self.cols - 1, col + 1)
        elif action == 2:    # down
            row = min(self.rows - 1, row + 1)
        elif action == 3:    # left
            col = max(0, col - 1)

        self.state = (row, col)
        reached_goal = self.state == self.goal
        reward = 10 if reached_goal else -1
        return self.state, reward, reached_goal

# --- HELPER FUNCTION FOR ONE-HOT ENCODING ---
def state_to_dqn_input(state, rows=4, cols=4):
    """One-hot encode a grid cell as a float32 tensor.

    Args:
        state: ``(row, col)`` position; only ``state[0]`` and ``state[1]``
            are read.
        rows: Number of grid rows (defaults to the 4x4 grid used here).
        cols: Number of grid columns.

    Returns:
        A 1-D ``torch.float32`` tensor of length ``rows * cols`` that is 1
        at index ``row * cols + col`` and 0 elsewhere,
        e.g. ``(0, 0) -> [1, 0, 0, ..., 0]``.

    Raises:
        IndexError: If the cell lies outside the grid. Without this check
            a position such as ``(0, cols)`` or ``(-1, 0)`` would silently
            be encoded as a different, valid cell.
    """
    row, col = state[0], state[1]
    if not (0 <= row < rows and 0 <= col < cols):
        raise IndexError(
            f"state {(row, col)} is outside the {rows}x{cols} grid"
        )

    one_hot = torch.zeros(rows * cols, dtype=torch.float32)
    one_hot[row * cols + col] = 1.0
    return one_hot

def run_a2c(episodes=1000, gamma=0.99, lr=0.003, max_steps=100, seed=None):
    """Train a one-step actor-critic agent on GridWorld, then test it greedily.

    The actor and critic are separate two-layer MLPs fed with the one-hot
    encoding produced by ``state_to_dqn_input``. After every environment
    step the TD error ``r + gamma * V(s') - V(s)`` is used both as the
    advantage for the policy-gradient update and as the critic's
    regression error.

    Args:
        episodes: Number of training episodes.
        gamma: Discount factor.
        lr: Adam learning rate used for both networks.
        max_steps: Maximum number of steps per training episode; guards
            against episodes that never reach the goal.
        seed: Optional seed passed to ``torch.manual_seed`` so that weight
            initialisation and action sampling are reproducible.

    Returns:
        ``(actor, critic)`` -- the trained networks.
    """
    if seed is not None:
        torch.manual_seed(seed)

    env = GridWorld()

    # Layer sizes follow the environment instead of repeating literals.
    # The one-hot encoder must produce vectors of length ``n_states``.
    n_states = env.rows * env.cols
    n_actions = len(env.actions)
    hidden = 128
    eval_max_steps = 20

    actor = nn.Sequential(
        nn.Linear(n_states, hidden),
        nn.ReLU(),
        nn.Linear(hidden, n_actions),
        nn.Softmax(dim=-1)
    )

    critic = nn.Sequential(
        nn.Linear(n_states, hidden),
        nn.ReLU(),
        nn.Linear(hidden, 1)
    )

    opt_a = optim.Adam(actor.parameters(), lr=lr)
    opt_c = optim.Adam(critic.parameters(), lr=lr)

    print("-----------------------------------")
    print(f"Training Actor-Critic (A2C) for {episodes} episodes...")
    print("-----------------------------------")

    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        steps = 0

        while not done and steps < max_steps:
            state_t = state_to_dqn_input(state)

            # Actor samples an action from its categorical policy.
            dist = torch.distributions.Categorical(actor(state_t))
            action = dist.sample()

            next_state, reward, done = env.step(action.item())

            # Critic evaluates the state the action was taken from.
            value = critic(state_t).squeeze(-1)

            # TD target. The bootstrap term is a constant for both losses,
            # so it is computed without building an autograd graph, and it
            # is dropped entirely on the terminal transition.
            if done:
                target = float(reward)
            else:
                with torch.no_grad():
                    next_value = critic(state_to_dqn_input(next_state)).squeeze(-1)
                target = reward + gamma * next_value

            # TD error doubles as the advantage estimate.
            td_error = target - value

            a_loss = -dist.log_prob(action) * td_error.detach()
            c_loss = td_error.pow(2)

            opt_a.zero_grad()
            a_loss.backward()
            opt_a.step()

            opt_c.zero_grad()
            c_loss.backward()
            opt_c.step()

            state = next_state
            total_reward += reward
            steps += 1

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode+1}: Total Reward = {total_reward}")

    print("\nTraining Finished.")

    # --- TESTING PHASE ---
    print("\n-----------------------------------")
    print("Testing Learned Policy (Greedy Mode)")
    print("-----------------------------------")

    state = env.reset()
    path = [state]
    done = False
    steps = 0

    while not done and steps < eval_max_steps:
        state_t = state_to_dqn_input(state)
        with torch.no_grad():
            probs = actor(state_t)
            action = torch.argmax(probs).item()

        state, _, done = env.step(action)
        path.append(state)
        steps += 1

    print("Final Path Taken:", path)
    # Compare against the environment's goal rather than a repeated literal.
    if path[-1] == env.goal:
        print("RESULT: SUCCESS - Goal Reached!")
    else:
        print("RESULT: FAILED - Did not reach goal.")

    return actor, critic

# Entry point: train the GridWorld agent and run the greedy evaluation
# when this cell/script is executed directly.
if __name__ == "__main__":
    run_a2c()

-----------------------------------
Training Actor-Critic (A2C) for 1000 episodes...
-----------------------------------
Episode 100: Total Reward = 5
Episode 200: Total Reward = 5
Episode 300: Total Reward = 5
Episode 400: Total Reward = 5
Episode 500: Total Reward = 5
Episode 600: Total Reward = 5
Episode 700: Total Reward = 5
Episode 800: Total Reward = 5
Episode 900: Total Reward = 5
Episode 1000: Total Reward = 5

Training Finished.

-----------------------------------
Testing Learned Policy (Greedy Mode)
-----------------------------------
Final Path Taken: [(0, 0), (1, 0), (2, 0), (2, 1), (2, 2), (3, 2), (3, 3)]
RESULT: SUCCESS - Goal Reached!


In [5]:
# exp12_a2c_continuous.py

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym

# Module-level environment used by train_a2c() below.
env = gym.make("Pendulum-v1")
# Network input/output sizes are read from the environment's spaces
# instead of being hard-coded.
obs_dim = env.observation_space.shape[0]  # length of the observation vector
act_dim = env.action_space.shape[0]  # length of the action vector

class Actor(nn.Module):
    """Gaussian policy: an MLP produces the mean, a free parameter the spread.

    ``log_std`` is a learnable vector of length ``act_dim`` that does not
    depend on the observation; it is initialised to zeros, i.e. a standard
    deviation of 1.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(obs_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, act_dim),
        ]
        self.net = nn.Sequential(*layers)
        self.log_std = nn.Parameter(torch.zeros(act_dim))

    def forward(self, x):
        """Return ``(mu, std)`` of the action distribution for observation ``x``."""
        return self.net(x), self.log_std.exp()

class Critic(nn.Module):
    """State-value network: an MLP mapping an observation to a single value."""

    def __init__(self, obs_dim):
        super().__init__()
        hidden_units = 64
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        )

    def forward(self, x):
        """Return the value estimate with the trailing size-1 axis removed."""
        value = self.net(x)
        return value.squeeze(-1)

def select_action(actor, state):
    """Sample an action from the actor's Gaussian policy.

    Args:
        actor: Callable returning ``(mu, std)`` for a float32 observation
            tensor.
        state: Observation, converted to a float32 tensor.

    Returns:
        ``(action, log_prob)``: the sampled action as a NumPy array, and
        the log-probability of that sample summed over action dimensions
        (a tensor that stays attached to the actor's graph).
    """
    obs = torch.tensor(state, dtype=torch.float32)
    policy = torch.distributions.Normal(*actor(obs))
    sampled = policy.sample()
    log_prob = policy.log_prob(sampled).sum()
    return sampled.detach().numpy(), log_prob

def train_a2c(episodes=400, gamma=0.99, lr=3e-4):
    """Train an episodic advantage actor-critic on the module-level ``env``.

    One full episode is collected per update. The discounted returns-to-go
    serve as critic targets, and ``target - V(s)`` is the advantage that
    weights the policy gradient.

    When an episode is cut off by a time limit (``truncated`` without
    ``terminated``) the state after the last step is not a real terminal
    state, so the targets are bootstrapped with the critic's value of that
    final observation instead of assuming zero future reward.

    Args:
        episodes: Number of training episodes.
        gamma: Discount factor.
        lr: Adam learning rate used for both networks.

    Returns:
        ``(actor, critic)`` -- the trained networks.
    """
    actor = Actor(obs_dim, act_dim)
    critic = Critic(obs_dim)
    a_opt = optim.Adam(actor.parameters(), lr=lr)
    c_opt = optim.Adam(critic.parameters(), lr=lr)

    for ep in range(episodes):
        state, _ = env.reset()
        log_probs = []
        values = []
        rewards = []
        terminated = truncated = False

        while not (terminated or truncated):
            action, logp = select_action(actor, state)
            value = critic(torch.tensor(state, dtype=torch.float32))
            state, reward, terminated, truncated, _ = env.step(action)

            log_probs.append(logp)
            values.append(value)
            rewards.append(reward)

        # Discounted returns-to-go, accumulated back-to-front. Appending and
        # reversing once avoids the quadratic cost of insert(0, ...).
        R = 0.0
        returns = []
        for r in reversed(rewards):
            R = r + gamma * R
            returns.append(R)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)
        values = torch.stack(values)

        # Time-limit truncation: add gamma^(T - t) * V(s_T) to the target of
        # step t, where s_T is the observation after the final step.
        targets = returns
        if truncated and not terminated:
            with torch.no_grad():
                tail_value = critic(torch.tensor(state, dtype=torch.float32))
            steps_to_end = torch.arange(len(rewards), 0, -1, dtype=torch.float32)
            targets = returns + tail_value * gamma ** steps_to_end

        advantages = targets - values.detach()

        # actor update
        a_opt.zero_grad()
        log_probs_t = torch.stack(log_probs)
        actor_loss = -(log_probs_t * advantages).mean()
        actor_loss.backward()
        a_opt.step()

        # critic update
        c_opt.zero_grad()
        critic_loss = nn.MSELoss()(values, targets)
        critic_loss.backward()
        c_opt.step()

        if (ep + 1) % 50 == 0:
            # Logged value is the mean of the per-step discounted
            # returns-to-go (without the bootstrap term), not the
            # undiscounted episode return.
            print(f"Episode {ep+1}, mean return {returns.mean().item():.2f}")

    return actor, critic

# Entry point: run training with the default hyperparameters and keep the
# trained networks in module scope.
if __name__ == "__main__":
    actor, critic = train_a2c()
    print("A2C training completed for Pendulum-v1.")


Episode 50, mean return -516.81
Episode 100, mean return -290.54
Episode 150, mean return -248.91
Episode 200, mean return -336.02
Episode 250, mean return -286.13
Episode 300, mean return -436.40
Episode 350, mean return -223.14
Episode 400, mean return -359.05
A2C training completed for Pendulum-v1.
