# In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import matplotlib.pyplot as plt
from collections import deque

# Actor-Critic Network Definition
class ActorCritic(nn.Module):
    """Two-headed network: one shared hidden layer feeding a policy head and a value head.

    Args:
        state_dim: Number of input features of the first linear layer.
        action_dim: Number of outputs of the policy (actor) head.
        hidden_size: Width of the shared hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_size=128):
        super().__init__()
        # Shared trunk used by both heads.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=hidden_size)

        # Actor head: linear scores normalised over the last dimension.
        self.actor_fc = nn.Linear(in_features=hidden_size, out_features=action_dim)
        self.actor_softmax = nn.Softmax(dim=-1)

        # Critic head: a single scalar per input.
        self.critic_fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return ``(action_probs, state_value)`` computed from input ``x``."""
        hidden = self.fc1(x).relu()
        logits = self.actor_fc(hidden)
        return self.actor_softmax(logits), self.critic_fc(hidden)

# A2C Algorithm Implementation
class A2C:
    """Advantage actor-critic agent updated once per episode from Monte Carlo returns.

    Args:
        state_dim: Observation size passed to ``ActorCritic``.
        action_dim: Number of discrete actions passed to ``ActorCritic``.
        learning_rate: Step size for the Adam optimizer.
        gamma: Discount factor applied when accumulating returns.
    """

    def __init__(self, state_dim, action_dim, learning_rate=0.01, gamma=0.99):
        self.gamma = gamma
        self.actor_critic = ActorCritic(state_dim, action_dim)
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=learning_rate)

    def select_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: Observation convertible to a float32 tensor.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action as a Python int
            and its log-probability as a tensor attached to the policy graph.
        """
        state = torch.tensor(state, dtype=torch.float32)
        # The critic output is discarded here; only the policy is needed to act.
        action_probs, _ = self.actor_critic(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        return action.item(), action_dist.log_prob(action)

    def update_policy(self, rewards, log_probs, state_values):
        """Run one optimizer step on the combined actor and critic loss.

        The three sequences are walked from the last step backwards so the
        discounted return can be accumulated incrementally. If any of them is
        empty there is nothing to learn from and the call is a no-op.

        Args:
            rewards: Per-step rewards of one trajectory.
            log_probs: Per-step log-probability tensors of the taken actions.
            state_values: Per-step critic outputs (single-element tensors).
        """
        discounted_return = 0.0
        policy_terms = []
        value_terms = []
        steps = zip(reversed(log_probs), reversed(state_values), reversed(rewards))
        for log_prob, value, reward in steps:
            discounted_return = reward + self.gamma * discounted_return
            # .item() detaches the baseline so the actor term does not
            # back-propagate into the critic head.
            advantage = discounted_return - value.item()
            policy_terms.append(-log_prob * advantage)
            # Build the target with the critic output's own shape, dtype and
            # device so mse_loss never has to broadcast.
            target = torch.full_like(value, discounted_return)
            value_terms.append(nn.functional.mse_loss(value, target))

        if not policy_terms:
            # torch.stack() raises on an empty list; skip the update instead.
            return

        self.optimizer.zero_grad()
        loss = torch.stack(policy_terms).sum() + torch.stack(value_terms).sum()
        loss.backward()
        self.optimizer.step()

# Train A2C on CartPole-v1
def train_a2c(env_name='CartPole-v1', episodes=1000, max_steps=200):
    """Train an ``A2C`` agent on a Gym environment, one policy update per episode.

    Args:
        env_name: Environment id passed to ``gym.make``.
        episodes: Number of episodes to run.
        max_steps: Maximum number of environment steps per episode; the
            episode is cut off after this many steps even if it is not done.

    Returns:
        List with the summed reward of each episode, in episode order.
    """
    env = gym.make(env_name)
    try:
        agent = A2C(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n)
        reward_history = []

        for episode in range(episodes):
            # Gym >= 0.26 returns (observation, info); older versions return
            # only the observation.
            reset_result = env.reset()
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

            episode_reward = 0
            rewards = []
            log_probs = []
            state_values = []

            for _ in range(max_steps):
                action, log_prob = agent.select_action(state)

                # Gym >= 0.26 splits `done` into (terminated, truncated).
                step_result = env.step(action)
                if len(step_result) == 5:
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_result

                # select_action() does not return the critic output, so the
                # value of the state that was acted on is evaluated separately.
                _, state_value = agent.actor_critic(torch.tensor(state, dtype=torch.float32))

                rewards.append(reward)
                log_probs.append(log_prob)
                state_values.append(state_value)

                state = next_state
                episode_reward += reward
                if done:
                    break

            agent.update_policy(rewards, log_probs, state_values)
            reward_history.append(episode_reward)
            if episode % 50 == 0:
                print(f"Episode {episode}, Reward: {episode_reward}")
    finally:
        # Release the environment even if training raises part-way through.
        env.close()

    return reward_history

# Run training and plot results
# Guarded so that merely importing this module does not start training.
# The A3C section of this file selects the 'spawn' start method, under which
# every worker process re-imports the main module; without the guard each
# worker would re-run the A2C training and plotting below.
if __name__ == "__main__":
    reward_history = train_a2c()
    plt.plot(reward_history)
    plt.xlabel("Episodes")
    plt.ylabel("Total Reward")
    plt.title("A2C Training Performance")
    plt.show()


# In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import matplotlib.pyplot as plt
import torch.multiprocessing as mp
from collections import deque

# Actor-Critic Network Definition
class ActorCritic(nn.Module):
    """Policy and value estimator sharing a single hidden layer.

    Args:
        state_dim: Number of input features.
        action_dim: Number of entries in the policy output.
        hidden_size: Number of units in the shared hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)  # shared feature layer

        # Actor: scores per action, turned into probabilities along the last axis.
        self.actor_fc = nn.Linear(hidden_size, action_dim)
        self.actor_softmax = nn.Softmax(dim=-1)

        # Critic: one value per input.
        self.critic_fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Compute the action probabilities and the state value for ``x``.

        Returns:
            Tuple ``(action_probs, state_value)``.
        """
        features = nn.functional.relu(self.fc1(x))
        policy = self.actor_softmax(self.actor_fc(features))
        value = self.critic_fc(features)
        return policy, value

# Worker process for A3C
def worker(worker_id, global_model, optimizer, env_name, gamma, episodes, results, max_steps=200):
    """Run one A3C worker: collect episodes locally and push gradients to the global model.

    For every episode the worker rolls out its local copy of the network,
    computes the actor and critic losses from discounted returns, hands the
    resulting gradients to ``global_model``'s parameters, steps ``optimizer``,
    and then re-synchronises the local copy from the global weights. No lock
    is taken around the update.

    Args:
        worker_id: Index of this worker; selects the slot in ``results``.
        global_model: Model whose parameters receive the gradients.
        optimizer: Optimizer that steps ``global_model``'s parameters.
        env_name: Environment id passed to ``gym.make``.
        gamma: Discount factor used when accumulating returns.
        episodes: Number of episodes this worker runs.
        results: Indexable collection; ``results[worker_id]`` gets one
            summed-reward entry appended per episode.
        max_steps: Maximum number of environment steps per episode.
    """
    env = gym.make(env_name)
    try:
        local_model = ActorCritic(env.observation_space.shape[0], env.action_space.n)
        local_model.load_state_dict(global_model.state_dict())

        for _episode in range(episodes):
            # Gym >= 0.26 returns (observation, info); older versions return
            # only the observation.
            reset_result = env.reset()
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

            episode_reward = 0
            rewards = []
            log_probs = []
            state_values = []

            for _step in range(max_steps):
                state_tensor = torch.tensor(state, dtype=torch.float32)
                action_probs, state_value = local_model(state_tensor)
                action_dist = torch.distributions.Categorical(action_probs)
                action = action_dist.sample()

                # Gym >= 0.26 splits `done` into (terminated, truncated).
                step_result = env.step(action.item())
                if len(step_result) == 5:
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_result

                rewards.append(reward)
                log_probs.append(action_dist.log_prob(action))
                state_values.append(state_value)

                state = next_state
                episode_reward += reward
                if done:
                    break

            # Compute returns (walking backwards from the last step) and losses.
            G = 0.0
            policy_loss = []
            value_loss = []
            for log_prob, value, reward in zip(reversed(log_probs), reversed(state_values), reversed(rewards)):
                G = reward + gamma * G
                # .item() detaches the baseline from the actor term.
                advantage = G - value.item()
                policy_loss.append(-log_prob * advantage)
                # Target mirrors the critic output's shape, dtype and device.
                value_loss.append(nn.functional.mse_loss(value, torch.full_like(value, G)))

            if policy_loss:  # torch.stack() raises on an empty trajectory
                # Clear the LOCAL gradients explicitly. optimizer.zero_grad()
                # only touches the global parameters; when it resets them to
                # None rather than zeroing in place, the local .grad tensors
                # would otherwise keep accumulating across episodes.
                local_model.zero_grad()
                optimizer.zero_grad()
                loss = torch.stack(policy_loss).sum() + torch.stack(value_loss).sum()
                loss.backward()
                for global_param, local_param in zip(global_model.parameters(), local_model.parameters()):
                    global_param.grad = local_param.grad
                optimizer.step()

                # Pick up the weights just written (by this or other workers).
                local_model.load_state_dict(global_model.state_dict())

            results[worker_id].append(episode_reward)
    finally:
        # Release the environment even if the worker raises part-way through.
        env.close()

# Train A3C on CartPole-v1
def train_a3c(env_name='CartPole-v1', episodes=1000, num_workers=4):
    """Train a shared ``ActorCritic`` model with several ``worker`` processes.

    Args:
        env_name: Environment id passed to ``gym.make``.
        episodes: Total number of episodes, split across the workers. Any
            remainder of ``episodes / num_workers`` is given to the
            lowest-numbered workers so that no episode is dropped.
        num_workers: Number of worker processes to start.

    Returns:
        Flat list of per-episode summed rewards, grouped by worker (all of
        worker 0's episodes first, then worker 1's, ...), not interleaved
        in time.
    """
    # A throwaway environment is used only to read the space dimensions.
    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
    finally:
        env.close()

    global_model = ActorCritic(state_dim, action_dim)
    global_model.share_memory()
    optimizer = optim.Adam(global_model.parameters(), lr=0.001)
    gamma = 0.99

    # One manager serves every result list (instead of one server process per
    # list) and is shut down when the block exits.
    with mp.Manager() as manager:
        results = [manager.list() for _ in range(num_workers)]
        processes = []
        for worker_id in range(num_workers):
            # Spread the remainder over the first `episodes % num_workers` workers.
            worker_episodes = episodes // num_workers + (1 if worker_id < episodes % num_workers else 0)
            p = mp.Process(
                target=worker,
                args=(worker_id, global_model, optimizer, env_name, gamma, worker_episodes, results),
            )
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

        # Copy into plain Python values before the manager goes away;
        # slicing fetches each worker's list in a single round-trip.
        reward_history = [reward for worker_rewards in results for reward in worker_rewards[:]]

    return reward_history

# Run training and plot results
if __name__ == "__main__":
    # force=True keeps this idempotent: without it, set_start_method() raises
    # RuntimeError when the start method has already been fixed (for example
    # when this section is executed a second time in the same interpreter).
    mp.set_start_method('spawn', force=True)
    reward_history = train_a3c()
    plt.plot(reward_history)
    plt.xlabel("Episodes")
    plt.ylabel("Total Reward")
    plt.title("A3C Training Performance")
    plt.show()
