In [None]:
import numpy as np
import pandas as pd
from collections import deque
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import gymnasium as gym
import csv

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
# Every network and tensor created below is moved to this device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class PolicyNetwork(nn.Module):
    """Two-layer MLP policy mapping a state to a categorical action distribution.

    Args:
        s_size: Number of input features (state dimension).
        a_size: Number of discrete actions (output dimension).
        h_size: Width of the hidden layer.
    """

    def __init__(self, s_size, a_size, h_size):
        super(PolicyNetwork, self).__init__()
        self.layer1 = nn.Linear(s_size, h_size)
        self.layer2 = nn.Linear(h_size, a_size)

    def forward(self, state):
        """Return action probabilities for ``state``.

        The softmax is taken over the last dimension, so both a batched
        ``(batch, s_size)`` input and a single unbatched ``(s_size,)`` input
        are accepted.
        """
        hidden = F.relu(self.layer1(state))
        # dim=-1 rather than dim=1: identical for 2-D input, but also valid for
        # 1-D input (where dim=1 does not exist) and always normalises over actions.
        action_probs = F.softmax(self.layer2(hidden), dim=-1)
        return action_probs

    def act(self, state):
        """Sample one action for ``state``.

        ``state`` must describe a single environment state, because the sampled
        action is converted with ``.item()``.

        Returns:
            tuple: ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is the log-probability tensor of that action (still
            attached to the autograd graph).
        """
        probabilities = self.forward(state)
        distribution = Categorical(probabilities)
        action = distribution.sample()
        return action.item(), distribution.log_prob(action)



In [None]:
class ValueNetwork(nn.Module):
    """State-value estimator: one hidden ReLU layer followed by a scalar output.

    Args:
        s_size: Number of input features (state dimension).
        h_size: Width of the hidden layer.
    """

    def __init__(self, s_size, h_size):
        super().__init__()
        self.input_layer = nn.Linear(s_size, h_size)
        self.output_layer = nn.Linear(h_size, 1)

    def forward(self, state):
        """Return the value estimate for ``state``; the last dimension has size 1."""
        return self.output_layer(F.relu(self.input_layer(state)))



In [None]:
def generate_trajectory(policy, value_function, max_steps, env):
    """Roll out one episode and record what the update step needs.

    Args:
        policy: Object exposing ``act(state_tensor) -> (action, log_prob)``.
        value_function: Callable returning a value estimate for a state tensor.
        max_steps: Upper bound on the number of environment steps.
        env: Environment with ``reset()`` and a five-tuple ``step(action)``.

    Returns:
        tuple: ``(log_probs, rewards, state_values)``, three lists with one
        entry per step taken.
    """
    trajectory_log_probs, trajectory_rewards, trajectory_values = [], [], []
    observation, _ = env.reset()

    for _ in range(max_steps):
        # Add a leading batch dimension of 1 and move the observation to the device.
        state_tensor = torch.from_numpy(observation).float().unsqueeze(0).to(device)

        action, action_log_prob = policy.act(state_tensor)
        trajectory_values.append(value_function(state_tensor))
        trajectory_log_probs.append(action_log_prob)

        observation, reward, terminated, truncated, _ = env.step(action)
        trajectory_rewards.append(reward)

        episode_over = terminated or truncated
        if episode_over:
            break

    return trajectory_log_probs, trajectory_rewards, trajectory_values

def calculate_discounted_returns(rewards, max_steps, gamma):
    """Compute the discounted return G_t = r_t + gamma * G_{t+1} for each step.

    Args:
        rewards: Per-step rewards in chronological order.
        max_steps: ``maxlen`` of the returned deque.
        gamma: Discount factor.

    Returns:
        collections.deque: Returns in chronological order, bounded to
        ``max_steps`` entries.
    """
    discounted = deque(maxlen=max_steps)
    running_return = 0

    # Walk backwards so each return is built from the one that follows it.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        discounted.appendleft(running_return)

    return discounted

def standardise_returns(returns):
    """Standardise episode returns to zero mean and unit standard deviation.

    Args:
        returns: Sequence of per-step returns for one episode.

    Returns:
        torch.Tensor: float32 tensor on ``device`` with the same number of
        elements as ``returns``. For fewer than two elements the result is all
        zeros, because the sample standard deviation is undefined there.
    """
    eps = np.finfo(np.float32).eps.item()
    returns = torch.tensor(returns, dtype=torch.float32).to(device)

    # Tensor.std() uses Bessel's correction, so a single-element input yields NaN.
    # A NaN here would propagate into both losses and corrupt the networks, so
    # return the centred value (zero) instead of dividing by NaN.
    if returns.numel() < 2:
        return torch.zeros_like(returns)

    # eps keeps the division finite when every return in the episode is equal.
    returns = (returns - returns.mean()) / (returns.std() + eps)
    return returns

def optimise_policy(policy_optimizer, log_probs, returns, state_values):
    """Take one policy-gradient step weighted by the advantage estimates.

    The loss is ``-sum_t log_prob_t * (return_t - value_t)``; the value
    estimates are treated as constants, so no gradient reaches the critic.

    Args:
        policy_optimizer: Optimizer over the policy parameters.
        log_probs: List of per-step log-probability tensors (graph attached).
        returns: Tensor of per-step return targets, one per step.
        state_values: List of per-step value-estimate tensors.
    """
    if len(log_probs) == 0:
        # Empty trajectory: there is nothing to learn from.
        return

    # reshape(-1) rather than squeeze(): a one-step episode must stay 1-D
    # instead of collapsing to a 0-d tensor that cannot be lined up per step.
    baseline = torch.stack(state_values).reshape(-1).detach()
    # detach() keeps the advantages constant with respect to every network.
    advantages = (returns.reshape(-1) - baseline).detach()

    # Flatten each entry so both (1,)-shaped and 0-d log-probs concatenate.
    stacked_log_probs = torch.cat([log_prob.reshape(-1) for log_prob in log_probs])
    policy_loss = -(stacked_log_probs * advantages).sum()

    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

def optimise_value_function(value_optimizer, returns, state_values):
    """Take one regression step fitting the value estimates to the returns.

    Args:
        value_optimizer: Optimizer over the value-network parameters.
        returns: Tensor of per-step return targets, one per step.
        state_values: List of per-step value-estimate tensors (graph attached).
    """
    if len(state_values) == 0:
        # Empty trajectory: there is nothing to fit.
        return

    # reshape(-1) rather than squeeze(): a one-step episode must stay shape (1,)
    # so prediction and target shapes match and mse_loss does not broadcast.
    predictions = torch.stack(state_values).reshape(-1)
    targets = returns.reshape(-1)

    value_loss = F.mse_loss(predictions, targets)
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()

In [None]:
def evaluate(policy, env, num_episodes=100, max_steps=1000,
             csv_path='a2c_evaluation_results.csv'):
    """Run the policy for several episodes and report reward statistics.

    Actions are sampled from the policy (via ``policy.act``), not chosen greedily.
    Per-episode results are written to ``csv_path`` and a summary is printed.

    Args:
        policy: Object exposing ``act(state_tensor) -> (action, log_prob)``.
        env: Environment with ``reset()`` and a five-tuple ``step(action)``.
        num_episodes: Number of evaluation episodes.
        max_steps: Upper bound on steps per episode.
        csv_path: Destination of the per-episode CSV log (overwritten).

    Returns:
        tuple: ``(avg_reward, avg_steps)`` over all episodes; both are NaN when
        ``num_episodes`` is 0.
    """
    all_rewards = []
    all_steps = []

    # Open the results file once instead of reopening it for every episode.
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Episode', 'Reward', 'Steps'])

        for i_episode in range(num_episodes):
            state, _ = env.reset()
            episode_reward = 0
            # Explicit counter so the step count is defined even if max_steps is 0.
            steps_taken = 0

            for _ in range(max_steps):
                state = torch.from_numpy(state).float().unsqueeze(0).to(device)
                # Evaluation never backpropagates, so skip building the autograd graph.
                with torch.no_grad():
                    action, _log_prob = policy.act(state)
                state, reward, terminated, truncated, _ = env.step(action)
                episode_reward += reward
                steps_taken += 1

                if terminated or truncated:
                    break

            all_rewards.append(episode_reward)
            all_steps.append(steps_taken)

            # Save episode results; flush so progress survives an interruption.
            writer.writerow([i_episode + 1, episode_reward, steps_taken])
            csvfile.flush()

    if all_rewards:
        avg_reward = np.mean(all_rewards)
        avg_steps = np.mean(all_steps)
        std_reward = np.std(all_rewards)
    else:
        # No episodes were run: report NaN without numpy's empty-slice warnings.
        avg_reward = avg_steps = std_reward = float('nan')

    print(f"\nEvaluation over {num_episodes} episodes:")
    print(f"Average Reward: {avg_reward:.2f} ± {std_reward:.2f}")
    print(f"Average Episode Length: {avg_steps:.2f}")

    return avg_reward, avg_steps

In [None]:
def train(num_episodes=3000, max_steps=1000, gamma=0.99, lr=1e-3, h_size=256,
          env_id='LunarLander-v3'):
    """Train the actor-critic agent, checkpoint it, and evaluate the best model.

    Side effects: writes ``a2c_training_rewards.csv`` (one row per episode),
    ``best_a2c_model.pt`` (whenever the 100-episode mean improves) and
    ``final_a2c_model.pt``, and prints progress every 10 episodes.

    Args:
        num_episodes: Number of training episodes.
        max_steps: Upper bound on steps per episode.
        gamma: Discount factor used for the returns.
        lr: Learning rate for both Adam optimizers.
        h_size: Hidden-layer width of both networks.
        env_id: Gymnasium environment id.

    Returns:
        tuple: ``(policy, value_function)``. They hold the best checkpoint's
        weights if one was saved during this run, otherwise the final weights.
    """
    # Environment setup
    env = gym.make(env_id)
    try:
        s_size = env.observation_space.shape[0]
        a_size = env.action_space.n

        # Initialize networks
        policy = PolicyNetwork(s_size, a_size, h_size).to(device)
        value_function = ValueNetwork(s_size, h_size).to(device)
        policy_optimizer = optim.Adam(policy.parameters(), lr=lr)
        value_optimizer = optim.Adam(value_function.parameters(), lr=lr)

        # Training tracking
        recent_scores = deque(maxlen=100)
        best_mean_reward = float('-inf')
        # Tracks whether THIS run wrote a best checkpoint, so a stale file left
        # by an earlier run is never loaded by mistake.
        best_model_saved = False

        # Open the progress CSV once for the whole run.
        with open('a2c_training_rewards.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Episode', 'Reward', 'Mean_100'])

            for episode in range(num_episodes):
                # Generate trajectory
                log_probs, rewards, state_values = generate_trajectory(
                    policy, value_function, max_steps, env)
                episode_score = sum(rewards)
                recent_scores.append(episode_score)

                # Calculate returns and update networks
                returns = calculate_discounted_returns(rewards, max_steps, gamma)
                standardised_returns = standardise_returns(returns)
                optimise_value_function(value_optimizer, standardised_returns, state_values)
                optimise_policy(policy_optimizer, log_probs, standardised_returns, state_values)

                # Mean over the (up to) 100 most recent episodes
                mean_reward = np.mean(list(recent_scores))

                # Save episode results; flush so progress survives an interruption.
                writer.writerow([episode + 1, episode_score, mean_reward])
                csvfile.flush()

                # Save best model (only once a full 100-episode window exists)
                if mean_reward > best_mean_reward and len(recent_scores) == 100:
                    best_mean_reward = mean_reward
                    torch.save({
                        'policy_state_dict': policy.state_dict(),
                        'value_state_dict': value_function.state_dict()
                    }, 'best_a2c_model.pt')
                    best_model_saved = True
                    print(f"\nNew best model saved with mean reward: {best_mean_reward:.2f}")

                # Print progress
                if (episode + 1) % 10 == 0:
                    print(f"Episode {episode + 1}/{num_episodes}, Reward: {episode_score:.2f}, Mean (100): {mean_reward:.2f}")

        # Save final model
        torch.save({
            'policy_state_dict': policy.state_dict(),
            'value_state_dict': value_function.state_dict()
        }, 'final_a2c_model.pt')
        print("\nTraining completed. Final model saved.")

        if best_model_saved:
            # Load best model and evaluate
            print("\nEvaluating best model...")
            # map_location keeps the load valid whichever device the tensors were saved from.
            checkpoint = torch.load('best_a2c_model.pt', map_location=device)
            policy.load_state_dict(checkpoint['policy_state_dict'])
            value_function.load_state_dict(checkpoint['value_state_dict'])
        else:
            # Fewer than 100 episodes were run, so no best checkpoint exists.
            print("\nNo best model was saved during this run; evaluating final model...")
        evaluate(policy, env)
    finally:
        # Release the environment even if training or evaluation raised.
        env.close()

    return policy, value_function

In [None]:
# Entry point: run the full training and evaluation pipeline when this
# cell/script is executed directly.
if __name__ == "__main__":
    train()