In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from collections import deque
import csv

class ActorCritic(nn.Module):
    """Actor-critic network with a shared trunk and separate policy/value branches.

    The trunk is two ``Linear -> LayerNorm -> ReLU`` stages of width 512. Each
    branch adds one more such stage of width 256. The policy branch ends in a
    softmax over ``action_dim`` actions; the value branch ends in one scalar.

    Args:
        state_dim: Size of the last dimension of the observation tensor.
        action_dim: Number of discrete actions.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        # Attribute names and layer order are kept exactly as-is: they define
        # the state_dict keys used by saved checkpoints.
        self.features = nn.Sequential(
            *self._dense_block(state_dim, 512),
            *self._dense_block(512, 512),
        )
        self.actor_features = nn.Sequential(*self._dense_block(512, 256))
        self.critic_features = nn.Sequential(*self._dense_block(512, 256))
        self.actor = nn.Sequential(
            nn.Linear(256, action_dim),
            nn.Softmax(dim=-1),
        )
        self.critic = nn.Linear(256, 1)

        # Re-initialise every Linear layer (orthogonal weights, zero bias).
        self.apply(self._init_weights)

    @staticmethod
    def _dense_block(in_dim, out_dim):
        """Return the layers of one ``Linear -> LayerNorm -> ReLU`` stage."""
        return [nn.Linear(in_dim, out_dim), nn.LayerNorm(out_dim), nn.ReLU()]

    def _init_weights(self, module):
        """Orthogonal init (gain sqrt(2)) with zero bias; non-Linear modules are skipped."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.orthogonal_(module.weight, gain=np.sqrt(2))
        nn.init.zeros_(module.bias)

    def forward(self, state):
        """Return ``(action_probs, value)`` for a batch of states."""
        trunk_out = self.features(state)
        policy_hidden = self.actor_features(trunk_out)
        value_hidden = self.critic_features(trunk_out)
        return self.actor(policy_hidden), self.critic(value_hidden)

    def get_action(self, state, deterministic=False):
        """Choose an action for each state in the batch.

        Returns:
            With ``deterministic=True``: only the tensor of argmax actions.
            Otherwise: a tuple ``(action, log_prob, value)`` where ``action``
            is sampled from the categorical policy.
        """
        probs, value = self.forward(state)

        if not deterministic:
            policy = Categorical(probs)
            sampled = policy.sample()
            return sampled, policy.log_prob(sampled), value

        return torch.argmax(probs, dim=-1)

    def evaluate_actions(self, state, action):
        """Score given actions under the current policy.

        Returns:
            ``(log_prob, entropy, value)``; ``entropy`` is the mean over the
            batch, the other two are per-sample.
        """
        probs, value = self.forward(state)
        policy = Categorical(probs)
        return policy.log_prob(action), policy.entropy().mean(), value

In [None]:
class PPO:
    """Proximal Policy Optimization agent using the clipped surrogate objective.

    Wraps an ``ActorCritic`` network and an Adam optimizer, and exposes action
    selection, the mini-batch policy/value update, and checkpoint save/load.

    Args:
        state_dim: Observation size passed to ``ActorCritic``.
        action_dim: Number of discrete actions passed to ``ActorCritic``.
        lr: Adam learning rate.
        gamma: Discount factor (stored for the rollout/GAE code to read).
        gae_lambda: GAE lambda (stored for the rollout/GAE code to read).
        clip_range: Clipping range for the probability ratio.
        epochs: Number of passes over each rollout in ``update``.
        batch_size: Mini-batch size used in ``update``.
        vf_coef: Weight of the value-function loss.
        ent_coef: Weight of the entropy bonus.
        max_grad_norm: Global gradient-norm clipping threshold.
    """

    def __init__(self, state_dim, action_dim, lr=1e-4, gamma=0.99, gae_lambda=0.95,
                 clip_range=0.2, epochs=4, batch_size=128, vf_coef=0.5,
                 ent_coef=0.01, max_grad_norm=0.5):
        self.actor_critic = ActorCritic(state_dim, action_dim)
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=lr, eps=1e-5)

        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_range = clip_range
        self.epochs = epochs
        self.batch_size = batch_size

        self.vf_coef = vf_coef
        self.ent_coef = ent_coef
        self.max_grad_norm = max_grad_norm

    def act(self, state, deterministic=False):
        """Return a single action (Python int) for one un-batched observation.

        Args:
            state: One observation; a batch dimension is added internally.
            deterministic: If True, take the argmax action instead of sampling.
        """
        state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            result = self.actor_critic.get_action(state_tensor, deterministic=deterministic)
        # get_action returns a bare action tensor in deterministic mode and an
        # (action, log_prob, value) tuple otherwise; handle both explicitly
        # instead of indexing [0] into whichever one came back.
        action = result[0] if isinstance(result, tuple) else result
        return action.item()

    def update(self, rollouts):
        """Run ``self.epochs`` passes of mini-batch PPO updates over one rollout.

        Args:
            rollouts: Mapping with array-like entries ``'states'``,
                ``'actions'``, ``'returns'``, ``'advantages'`` and
                ``'log_probs'``, all with the same leading length.
        """
        observations = torch.as_tensor(rollouts['states'], dtype=torch.float32)
        actions = torch.as_tensor(rollouts['actions'], dtype=torch.long)
        returns = torch.as_tensor(rollouts['returns'], dtype=torch.float32)
        advantages = torch.as_tensor(rollouts['advantages'], dtype=torch.float32)
        old_log_probs = torch.as_tensor(rollouts['log_probs'], dtype=torch.float32)

        num_samples = len(observations)
        if num_samples == 0:
            return

        # The sample standard deviation of a single element is NaN, which would
        # propagate through the loss into every parameter. Only normalise when
        # there is more than one sample.
        if num_samples > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        for _ in range(self.epochs):
            indices = np.random.permutation(num_samples)

            for start in range(0, num_samples, self.batch_size):
                batch_indices = indices[start:start + self.batch_size]

                obs_batch = observations[batch_indices]
                actions_batch = actions[batch_indices]
                returns_batch = returns[batch_indices]
                advantages_batch = advantages[batch_indices]
                old_log_probs_batch = old_log_probs[batch_indices]

                new_log_probs, entropy, values = self.actor_critic.evaluate_actions(
                    obs_batch, actions_batch
                )

                # Clipped surrogate objective.
                ratio = torch.exp(new_log_probs - old_log_probs_batch)
                surr1 = ratio * advantages_batch
                surr2 = torch.clamp(ratio, 1.0 - self.clip_range, 1.0 + self.clip_range) * advantages_batch

                policy_loss = -torch.min(surr1, surr2).mean()
                # squeeze(-1) removes only the trailing value dimension, so a
                # mini-batch of size one keeps its batch dimension.
                value_loss = self.vf_coef * ((values.squeeze(-1) - returns_batch) ** 2).mean()
                entropy_loss = -self.ent_coef * entropy

                total_loss = policy_loss + value_loss + entropy_loss

                self.optimizer.zero_grad()
                total_loss.backward()
                nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm)
                self.optimizer.step()

    def save(self, path):
        """Write the network's state_dict to ``path``."""
        torch.save(self.actor_critic.state_dict(), path)

    def load(self, path):
        """Load a state_dict previously written by ``save`` from ``path``."""
        # weights_only=True restricts unpickling to tensors/primitive containers,
        # which is all a state_dict needs, and avoids torch.load's FutureWarning.
        self.actor_critic.load_state_dict(torch.load(path, weights_only=True))

In [None]:
def compute_returns_and_advantages(rewards, values, next_value, dones, gamma=0.99, gae_lambda=0.95):
    """Compute GAE advantages and the matching value targets for one rollout.

    Walks the rollout backwards, accumulating
    ``delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t`` and
    ``A_t = delta_t + gamma * gae_lambda * (1 - done_t) * A_{t+1}``.

    Args:
        rewards: Per-step rewards.
        values: Per-step value estimates, same length as ``rewards``.
        next_value: Value estimate for the state after the last step; it is
            ignored when the last ``done`` flag is set.
        dones: Per-step flags; a true flag stops both bootstrapping and
            advantage accumulation across that step.
        gamma: Discount factor.
        gae_lambda: GAE smoothing parameter.

    Returns:
        ``(returns, advantages)`` as NumPy arrays, with
        ``returns = advantages + values``.

    Raises:
        ValueError: If ``rewards``, ``values`` and ``dones`` differ in length.
    """
    if not (len(rewards) == len(values) == len(dones)):
        raise ValueError(
            "rewards, values and dones must have the same length, got "
            f"{len(rewards)}, {len(values)} and {len(dones)}"
        )

    # Collect in reverse time order and flip once at the end; inserting at the
    # front of a list on every step would make the loop quadratic.
    reversed_advantages = []
    last_advantage = 0
    last_value = next_value

    for r, v, done in zip(reversed(rewards), reversed(values), reversed(dones)):
        delta = r + gamma * last_value * (1 - done) - v
        advantage = delta + gamma * gae_lambda * (1 - done) * last_advantage
        reversed_advantages.append(advantage)
        last_advantage = advantage
        last_value = v

    advantages = np.array(reversed_advantages[::-1])
    returns = advantages + np.array(values)

    return returns, advantages

def evaluate(agent, env, num_episodes=100, max_steps=1000, results_path='evaluation_results.csv'):
    """Run greedy evaluation episodes and log per-episode results to a CSV file.

    Args:
        agent: Object with ``act(state, deterministic=True)`` returning an action.
        env: Environment where ``reset()`` returns ``(state, info)`` and
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run.
        max_steps: Upper bound on steps per episode.
        results_path: CSV file to (over)write with one row per episode.

    Returns:
        ``(avg_reward, avg_steps)`` over all episodes; both are NaN when
        ``num_episodes`` is 0.
    """
    all_rewards = []
    all_steps = []

    # One handle for the whole run; flushing after each row keeps results on
    # disk if evaluation is interrupted, without reopening the file per episode.
    with open(results_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Episode', 'Reward', 'Steps'])

        for i_episode in range(num_episodes):
            state, _ = env.reset()
            episode_reward = 0
            # Counted explicitly so the value is defined even when max_steps
            # is 0 and the step loop body never runs.
            steps_taken = 0

            for _ in range(max_steps):
                action = agent.act(state, deterministic=True)  # Use deterministic actions for evaluation
                state, reward, terminated, truncated, _ = env.step(action)
                episode_reward += reward
                steps_taken += 1

                if terminated or truncated:
                    break

            all_rewards.append(episode_reward)
            all_steps.append(steps_taken)

            writer.writerow([i_episode + 1, episode_reward, steps_taken])
            csvfile.flush()

    if all_rewards:
        avg_reward = np.mean(all_rewards)
        avg_steps = np.mean(all_steps)
        std_reward = np.std(all_rewards)
    else:
        # No episodes were run; report NaN without NumPy's empty-mean warning.
        avg_reward = avg_steps = std_reward = float('nan')

    print(f"\nEvaluation over {num_episodes} episodes:")
    print(f"Average Reward: {avg_reward:.2f} ± {std_reward:.2f}")
    print(f"Average Episode Length: {avg_steps:.2f}")

    return avg_reward, avg_steps

In [None]:
def train(num_episodes=3000, timesteps_per_batch=4096, seed=None):
    """Train a PPO agent on LunarLander-v3, then evaluate the best checkpoint.

    Rollouts of up to ``timesteps_per_batch`` steps are collected and handed to
    ``PPO.update``. The environment state carries over from one rollout to the
    next, so an episode that straddles a rollout boundary is continued rather
    than abandoned, and only episodes that actually ended are counted and logged.

    Files written: ``lunar_lander_rewards.csv`` (one row per finished episode),
    ``best_lunar_model.pt`` (whenever the 100-episode mean improves) and
    ``final_lunar_model.pt``.

    Args:
        num_episodes: Number of completed episodes to train for.
        timesteps_per_batch: Maximum number of environment steps per rollout.
        seed: Optional seed for NumPy, PyTorch and the first environment reset.

    Raises:
        ValueError: If ``timesteps_per_batch`` is smaller than 1.
    """
    if timesteps_per_batch < 1:
        raise ValueError("timesteps_per_batch must be at least 1")

    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    env = gym.make('LunarLander-v3')
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        ppo = PPO(state_dim, action_dim)

        best_mean_reward = -float('inf')
        # Tracks whether THIS run wrote a best checkpoint, so a stale file left
        # over from an earlier run is never evaluated by mistake.
        best_model_saved = False

        recent_rewards = deque(maxlen=100)
        episode_num = 0

        state, _ = env.reset(seed=seed)
        episode_reward = 0.0

        # Create CSV file for training progress; kept open for the whole run
        # and flushed after every row.
        with open('lunar_lander_rewards.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Episode', 'Reward', 'Mean_100'])
            csvfile.flush()

            while episode_num < num_episodes:
                states, actions, rewards, dones, log_probs, values = [], [], [], [], [], []

                while len(states) < timesteps_per_batch and episode_num < num_episodes:
                    state_tensor = torch.FloatTensor(state).unsqueeze(0)

                    with torch.no_grad():
                        action, log_prob, value = ppo.actor_critic.get_action(state_tensor)

                    next_state, reward, terminated, truncated, _ = env.step(action.item())
                    # Truncation is treated like termination: no bootstrapping
                    # across this step in the advantage computation.
                    done = terminated or truncated

                    states.append(state)
                    actions.append(action.item())
                    rewards.append(reward)
                    dones.append(done)
                    log_probs.append(log_prob.item())
                    values.append(value.item())

                    state = next_state
                    episode_reward += reward

                    if not done:
                        continue

                    # The episode really ended: record it and start a new one.
                    episode_num += 1
                    recent_rewards.append(episode_reward)
                    mean_reward = np.mean(recent_rewards)

                    # Print and save episode results
                    print(f"Episode {episode_num}: Reward = {episode_reward:.2f}, Mean (100) = {mean_reward:.2f}")
                    writer.writerow([episode_num, episode_reward, mean_reward])
                    csvfile.flush()

                    if mean_reward > best_mean_reward and len(recent_rewards) == 100:
                        best_mean_reward = mean_reward
                        ppo.save('best_lunar_model.pt')
                        best_model_saved = True
                        print(f"New best model saved with mean reward: {best_mean_reward:.2f}")

                    state, _ = env.reset()
                    episode_reward = 0.0

                # Bootstrap value for the state following the last collected
                # step. If that step ended an episode, `state` is a fresh reset
                # state and the done flag masks this value out.
                with torch.no_grad():
                    next_value = ppo.actor_critic(
                        torch.FloatTensor(state).unsqueeze(0)
                    )[1].item()

                returns, advantages = compute_returns_and_advantages(
                    rewards, values, next_value, dones,
                    gamma=ppo.gamma, gae_lambda=ppo.gae_lambda
                )

                rollouts = {
                    'states': np.array(states),
                    'actions': np.array(actions),
                    'returns': returns,
                    'advantages': advantages,
                    'log_probs': np.array(log_probs)
                }

                ppo.update(rollouts)

        # Save final model
        ppo.save('final_lunar_model.pt')
        print("Training completed. Final model saved.")

        # Load best model (if this run produced one) and evaluate
        if best_model_saved:
            print("\nEvaluating best model...")
            ppo.load('best_lunar_model.pt')
        else:
            print("\nNo best model was saved during this run; evaluating final model...")
        evaluate(ppo, env)
    finally:
        # Release the environment even if training or evaluation raised.
        env.close()

In [None]:
# Entry point: launch the full training run (followed by evaluation) when this
# code is executed as the main module; importing it elsewhere does not train.
if __name__ == "__main__":
    train()

Episode 1: Reward = -304.75, Mean (100) = -304.75
Episode 2: Reward = -208.65, Mean (100) = -256.70
Episode 3: Reward = -375.62, Mean (100) = -296.34
Episode 4: Reward = -464.66, Mean (100) = -338.42
Episode 5: Reward = -309.05, Mean (100) = -332.55
Episode 6: Reward = -318.25, Mean (100) = -330.16
Episode 7: Reward = -329.54, Mean (100) = -330.07
Episode 8: Reward = -292.66, Mean (100) = -325.40
Episode 9: Reward = -316.92, Mean (100) = -324.45
Episode 10: Reward = -457.69, Mean (100) = -337.78
Episode 11: Reward = -424.70, Mean (100) = -345.68
Episode 12: Reward = -559.22, Mean (100) = -363.48
Episode 13: Reward = -216.38, Mean (100) = -352.16
Episode 14: Reward = -459.50, Mean (100) = -359.83
Episode 15: Reward = -265.16, Mean (100) = -353.52
Episode 16: Reward = -569.20, Mean (100) = -367.00
Episode 17: Reward = -515.33, Mean (100) = -375.72
Episode 18: Reward = -452.66, Mean (100) = -380.00
Episode 19: Reward = -206.64, Mean (100) = -370.87
Episode 20: Reward = -547.07, Mean (100)

  self.actor_critic.load_state_dict(torch.load(path))



Evaluation over 100 episodes:
Average Reward: 281.88 ± 20.30
Average Episode Length: 206.43
