In [None]:
import os
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import imageio
from PIL import Image
import PIL.ImageDraw as ImageDraw
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import gymnasium as gym
from collections import deque
import csv

class Policy(nn.Module):
    """Feed-forward network that maps an input vector to action probabilities.

    Two hidden layers of 128 ReLU units feed a linear output layer, whose
    activations are normalised with a softmax over the last dimension.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim: Number of outputs (one probability per output).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
            nn.Softmax(dim=-1),
        ]
        # The attribute name and layer order determine the state_dict keys
        # ("network.0.weight", ...), so they must stay as they are for
        # previously saved checkpoints to remain loadable.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the softmax-normalised outputs of the network for ``x``."""
        probabilities = self.network(x)
        return probabilities

class REINFORCE:
    """Monte-Carlo policy-gradient (REINFORCE) agent for discrete actions.

    The agent owns a ``Policy`` network and an Adam optimizer. After each
    episode, ``update_policy`` performs one gradient step on the sum of
    ``-log_prob * return`` over the episode's steps.

    Args:
        env: Environment exposing ``observation_space.shape`` and
            ``action_space.n``; used to size the network.
        learning_rate: Step size passed to Adam.
        gamma: Discount factor applied when accumulating returns.
    """

    def __init__(self, env, learning_rate=1e-3, gamma=0.99):
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.gamma = gamma
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Move the network to its device before handing its parameters to
        # the optimizer.
        self.policy = Policy(self.input_dim, self.output_dim).to(self.device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)

    def select_action(self, state, deterministic=False):
        """Choose an action for a single observation.

        Args:
            state: One observation (array-like of length ``input_dim``).
            deterministic: If true, pick the most probable action.

        Returns:
            The action index alone (``int``) when ``deterministic`` is true;
            otherwise a tuple ``(action, log_prob)`` where ``log_prob`` is a
            tensor of shape ``(1,)`` still attached to the autograd graph.
        """
        # torch.tensor always copies, so the tensor saved for backward cannot
        # be altered if the caller later mutates ``state`` in place.
        state = torch.tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)

        if deterministic:
            # Greedy inference needs no gradient bookkeeping.
            with torch.no_grad():
                return torch.argmax(self.policy(state)).item()

        dist = Categorical(self.policy(state))
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def update_policy(self, rewards, log_probs):
        """Run one policy-gradient step from a finished episode.

        Args:
            rewards: Per-step rewards, in the order they were received.
            log_probs: Per-step log-probability tensors of shape ``(1,)`` as
                returned by ``select_action``; same length as ``rewards``.

        Returns:
            The scalar loss as a Python float, or ``0.0`` (with no update)
            when the episode is empty.

        Raises:
            ValueError: If ``rewards`` and ``log_probs`` differ in length.
        """
        num_steps = len(rewards)
        if num_steps != len(log_probs):
            raise ValueError(
                f"rewards ({num_steps}) and log_probs ({len(log_probs)}) "
                "must have the same length"
            )
        if num_steps == 0:
            return 0.0

        # Accumulate discounted returns back-to-front, then flip once;
        # appending avoids the quadratic cost of inserting at index 0.
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32, device=self.device)
        # The sample standard deviation of a single value is NaN, which would
        # poison the loss and every weight; a one-step episode therefore
        # keeps its raw return instead of being standardised.
        if num_steps > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        policy_loss = -(torch.cat(log_probs) * returns).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        return policy_loss.item()

    def save(self, path):
        """Write the policy network's ``state_dict`` to ``path``."""
        torch.save(self.policy.state_dict(), path)

    def load(self, path):
        """Load a ``state_dict`` written by ``save`` into the policy network."""
        # map_location lets a checkpoint saved on GPU load on a CPU-only
        # machine; weights_only restricts unpickling to tensor data.
        state_dict = torch.load(path, map_location=self.device, weights_only=True)
        self.policy.load_state_dict(state_dict)
        self.policy.to(self.device)

def evaluate(agent, env, num_episodes=100, max_steps=1000,
             results_path='reinforce_evaluation_results.csv'):
    """Run greedy evaluation episodes and log the per-episode results.

    Each episode resets ``env`` and follows ``agent.select_action(state,
    deterministic=True)`` until the environment reports termination or
    truncation, or ``max_steps`` steps have been taken. One CSV row is
    written per episode and a summary is printed at the end.

    Args:
        agent: Object with ``select_action(state, deterministic=True)``
            returning an action.
        env: Environment with ``reset()`` returning ``(state, info)`` and
            ``step(action)`` returning a 5-tuple
            ``(state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run.
        max_steps: Upper bound on steps per episode.
        results_path: CSV file to (over)write with columns
            ``Episode, Reward, Steps``.

    Returns:
        Tuple ``(avg_reward, avg_steps)`` averaged over all episodes.
    """
    all_rewards = []
    all_steps = []

    # Open the results file once instead of re-opening it for every episode.
    with open(results_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Episode', 'Reward', 'Steps'])

        for i_episode in range(num_episodes):
            state, _ = env.reset()
            episode_reward = 0
            # Count steps explicitly so the value is defined even when
            # max_steps is 0 and the loop body never runs.
            steps_taken = 0

            while steps_taken < max_steps:
                action = agent.select_action(state, deterministic=True)
                state, reward, terminated, truncated, _ = env.step(action)
                episode_reward += reward
                steps_taken += 1

                if terminated or truncated:
                    break

            all_rewards.append(episode_reward)
            all_steps.append(steps_taken)

            writer.writerow([i_episode + 1, episode_reward, steps_taken])
            # Flush per episode so partial results are on disk if a later
            # episode fails.
            csvfile.flush()

    avg_reward = np.mean(all_rewards)
    avg_steps = np.mean(all_steps)
    std_reward = np.std(all_rewards)

    print(f"\nEvaluation over {num_episodes} episodes:")
    print(f"Average Reward: {avg_reward:.2f} ± {std_reward:.2f}")
    print(f"Average Episode Length: {avg_steps:.2f}")

    return avg_reward, avg_steps

def _run_training_episode(agent, env):
    """Roll out one episode with sampled actions; return (rewards, log_probs)."""
    state, _ = env.reset()
    rewards = []
    log_probs = []
    done = False

    while not done:
        action, log_prob = agent.select_action(state)
        state, reward, terminated, truncated, _ = env.step(action)
        rewards.append(reward)
        log_probs.append(log_prob)
        done = terminated or truncated

    return rewards, log_probs


def train(num_episodes=3000, learning_rate=1e-3, env_id="LunarLander-v3"):
    """Train a REINFORCE agent, checkpoint it, and evaluate the result.

    Per-episode rewards and their running mean over the last 100 episodes
    are written to ``reinforce_training_rewards.csv``. Once 100 episodes
    have been seen, every new best running mean saves the policy to
    ``best_reinforce_model.pt``. The final policy is always saved to
    ``final_reinforce_model.pt``.

    Args:
        num_episodes: Number of training episodes.
        learning_rate: Learning rate passed to the agent.
        env_id: Gymnasium environment id passed to ``gym.make``.

    Returns:
        The trained agent. It holds the best checkpoint's weights if one was
        saved during this run, otherwise the final weights.
    """
    window = 100
    env = gym.make(env_id)

    # Close the environment even if training or evaluation raises.
    try:
        agent = REINFORCE(env, learning_rate=learning_rate)

        recent_rewards = deque(maxlen=window)
        best_mean_reward = -float('inf')
        best_model_saved = False

        # Open the log once instead of re-opening it for every episode.
        with open('reinforce_training_rewards.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Episode', 'Reward', 'Mean_100'])

            for episode in range(num_episodes):
                rewards, log_probs = _run_training_episode(agent, env)
                agent.update_policy(rewards, log_probs)

                total_reward = sum(rewards)
                recent_rewards.append(total_reward)
                mean_reward = np.mean(recent_rewards)

                writer.writerow([episode + 1, total_reward, mean_reward])
                # Flush per episode so the log survives an interrupted run.
                csvfile.flush()

                # Only compare means once the window is full; earlier means
                # cover fewer episodes and are not comparable.
                if len(recent_rewards) == window and mean_reward > best_mean_reward:
                    best_mean_reward = mean_reward
                    agent.save('best_reinforce_model.pt')
                    best_model_saved = True
                    print(f"New best model saved with mean reward: {best_mean_reward:.2f}")

                if (episode + 1) % 10 == 0:
                    print(f"Episode {episode + 1}, Reward: {total_reward:.2f}, Mean (100): {mean_reward:.2f}")

        agent.save('final_reinforce_model.pt')
        print("Training completed. Final model saved.")

        if best_model_saved:
            print("\nEvaluating best model...")
            agent.load('best_reinforce_model.pt')
        else:
            # Runs shorter than the window never save a best model. Loading
            # the file anyway would fail or pick up a stale checkpoint left
            # by an earlier run.
            print("\nNo best model saved during this run; evaluating final model...")
        evaluate(agent, env)
    finally:
        env.close()

    return agent

# Entry point: run training (which also evaluates the resulting agent) only
# when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    train()

Episode 10, Reward: -204.61, Mean (100): -198.02
Episode 20, Reward: 35.20, Mean (100): -208.29
Episode 30, Reward: -281.48, Mean (100): -200.35
Episode 40, Reward: -380.58, Mean (100): -186.87
Episode 50, Reward: -125.44, Mean (100): -173.40
Episode 60, Reward: -158.63, Mean (100): -172.91
Episode 70, Reward: -213.29, Mean (100): -182.46
Episode 80, Reward: -279.97, Mean (100): -181.46
Episode 90, Reward: -430.32, Mean (100): -186.25
New best model saved with mean reward: -187.16
Episode 100, Reward: -56.29, Mean (100): -187.16
New best model saved with mean reward: -186.68
New best model saved with mean reward: -185.80
Episode 110, Reward: -226.77, Mean (100): -186.37
New best model saved with mean reward: -182.52
New best model saved with mean reward: -180.17
New best model saved with mean reward: -180.03
New best model saved with mean reward: -178.21
New best model saved with mean reward: -176.96
New best model saved with mean reward: -176.38
Episode 120, Reward: -105.52, Mean (100

  self.policy.load_state_dict(torch.load(path))



Evaluation over 100 episodes:
Average Reward: 105.22 ± 79.38
Average Episode Length: 795.34
