In [None]:
import numpy as np

class ContinuousWindyGridworld:
    """Windy gridworld with continuous positions and polar-coordinate actions.

    The world is the rectangle [0, 10] x [0, 8]. The agent starts at
    (0.5, 3.5) and the episode ends once it is closer than 0.5 to the goal
    at (7.5, 3.5). After every move, a wind value selected by the agent's
    post-move x-coordinate is added to its y-coordinate.
    """

    def __init__(self):
        # Rectangle limits of the world.
        self.x_min, self.x_max = 0, 10
        self.y_min, self.y_max = 0, 8

        # Where episodes begin and where they end.
        self.start = np.array([0.5, 3.5])
        self.goal = np.array([7.5, 3.5])

        # One wind value per unit-wide column along x; added to y on each step.
        self.wind_strength = [0, 0, 0, 0, 1, 1, 2, 2, 1, 0]

        # Nominal (low, high) ranges of the two action components.
        # These are informational only: step() does not enforce them.
        self.action_space = {
            'radius': (0.0, 5.0),
            'angle': (0.0, 2 * np.pi),
        }

        # Current agent position.
        self.state = self.start.copy()

    def reset(self):
        """Put the agent back on the start position and return that position."""
        self.state = np.array(self.start)
        return self.state

    def step(self, action):
        """
        Advance the environment by one action.

        action: pair (radius, angle) -- travel distance and heading in radians.

        Returns (next_state, reward, done), where next_state is the new
        position, reward is 100 on reaching the goal and otherwise
        -0.1 * (distance to goal), and done tells whether the goal was reached.
        """
        step_length, heading = action
        x_now, y_now = self.state

        # Polar displacement converted to Cartesian coordinates.
        x_new = x_now + step_length * np.cos(heading)
        y_new = y_now + step_length * np.sin(heading)

        # The wind column is chosen from the post-move x, limited to a valid index.
        wind_column = int(np.clip(x_new, self.x_min, self.x_max - 1))
        y_new = y_new + self.wind_strength[wind_column]

        # Keep the agent inside the rectangle.
        self.state = np.array([
            np.clip(x_new, self.x_min, self.x_max),
            np.clip(y_new, self.y_min, self.y_max),
        ])

        # Goal test and distance-shaped reward.
        gap = np.linalg.norm(self.state - self.goal)
        done = bool(gap < 0.5)
        reward = 100 if done else -0.1 * gap

        return self.state, reward, done


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as D

class SGDPolicy(nn.Module):
    """Gaussian REINFORCE policy over (radius, angle) actions.

    A small MLP maps a state to the means of two independent Normal
    distributions (one for the step radius, one for the step angle). The
    standard deviations are state-independent learnable parameters.
    Log-probabilities of sampled actions are buffered in `saved_actions`
    and rewards in `rewards`; `update()` consumes both buffers.
    """

    def __init__(self, state_dim=2, hidden_dim=16, lr=0.001):
        """
        Initializes the policy network, its std parameters and the optimizer.

        state_dim: The dimensionality of the state input (default is 2 for (x, y) positions).
        hidden_dim: The number of units in each hidden layer (default is 16).
        lr: Learning rate of the Adam optimizer (default is 0.001).
        """
        super().__init__()

        # Define the architecture of the neural network
        self.network = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),  # Input layer (state_dim -> hidden_dim)
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),  # Hidden layer
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)  # Output layer: raw means for radius and angle
        )

        # Learnable *log* standard deviations (forward() applies exp()).
        # They start at 0.0, i.e. std = 1.0, which lies strictly inside the
        # clamp range used in forward(). A start value of 1.0 would give
        # exp(1.0) ~= 2.718, which the clamp pins at 2.0; a saturated clamp
        # has zero gradient, so the std could never be learned.
        self.radius_std = nn.Parameter(torch.tensor(0.0))
        self.angle_std = nn.Parameter(torch.tensor(0.0))

        # The optimizer must be created AFTER every parameter is registered,
        # otherwise the std parameters above are silently left out of training.
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        # Per-episode buffers consumed by update()
        self.saved_actions = []  # (radius_log_prob, angle_log_prob) per step
        self.rewards = []        # scalar reward per step

    def forward(self, state):
        """
        Builds the action distributions for a state.

        state: The current state (sequence, numpy array or tensor of length state_dim).

        Returns:
        radius_dist: A Normal distribution for the radius (distance to move).
        angle_dist: A Normal distribution for the angle (direction of movement).
        """
        # as_tensor accepts lists, numpy arrays and tensors and casts to float32
        state = torch.as_tensor(state, dtype=torch.float32)

        output = self.network(state)

        # Squash the raw outputs into the nominal action ranges.
        # Indexing the last axis keeps this valid for a single state.
        radius_mean = torch.sigmoid(output[..., 0]) * 5.0 + 1e-6  # (0, 5)
        angle_mean = torch.sigmoid(output[..., 1]) * 2 * torch.pi + 1e-6  # (0, 2*pi)

        # exp() keeps the std positive; the clamp bounds it to [1e-6, 2.0]
        radius_std = torch.clamp(torch.exp(self.radius_std), min=1e-6, max=2.0)
        angle_std = torch.clamp(torch.exp(self.angle_std), min=1e-6, max=2.0)

        return D.Normal(radius_mean, radius_std), D.Normal(angle_mean, angle_std)

    def select_action(self, state):
        """
        Samples an action for the given state and records its log-probability.

        state: The current state input to the policy network.

        Returns:
        radius: The sampled radius (distance to move) as a Python float.
        angle: The sampled angle (direction of movement) as a Python float.

        Note: samples are drawn from unbounded Normals, so they can fall
        outside the nominal (0, 5) / (0, 2*pi) ranges.
        """
        radius_dist, angle_dist = self.forward(state)

        radius = radius_dist.sample()
        angle = angle_dist.sample()

        # Keep the graph-attached log-probabilities for the policy-gradient update
        self.saved_actions.append((radius_dist.log_prob(radius), angle_dist.log_prob(angle)))

        return radius.item(), angle.item()

    def update(self, gamma=0.99):
        """
        Performs one REINFORCE update from the buffered episode and clears the buffers.

        gamma: Discount factor for future rewards (default is 0.99).

        Does nothing (besides clearing the buffers) when no actions or no
        rewards have been recorded.
        """
        if not self.saved_actions or not self.rewards:
            self.saved_actions = []
            self.rewards = []
            return

        # Discounted returns, accumulated back-to-front and reversed once
        # (avoids the quadratic cost of repeated list.insert(0, ...)).
        running = 0.0
        discounted = []
        for reward in reversed(self.rewards):
            running = float(reward) + gamma * running
            discounted.append(running)
        discounted.reverse()
        returns = torch.tensor(discounted, dtype=torch.float32)

        # Normalize the returns to zero mean / unit variance for stability.
        # std() of a single element is NaN, so one-step episodes are only centered.
        if returns.numel() > 1 and returns.std() > 1e-8:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        else:
            returns = returns - returns.mean()

        # Joint log-probability of each action (radius and angle are independent)
        log_probs = torch.stack([r_lp + a_lp for r_lp, a_lp in self.saved_actions])

        # Pair actions and returns positionally; extra entries on either side are ignored
        n = min(log_probs.shape[0], returns.shape[0])
        policy_loss = -(log_probs[:n] * returns[:n]).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        # Clear stored actions and rewards after the update
        self.saved_actions = []
        self.rewards = []


In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Training parameters
num_episodes = 1000  # Number of episodes to train the policy for each trial
num_trials = 50  # Number of independent trials (fresh environment and policy each)
gamma = 0.99  # Discount factor for future rewards
# Hard cap on episode length. Without it, an episode in which the policy never
# reaches the goal would loop forever and stall the whole notebook.
max_steps_per_episode = 1000

# Steps taken in every episode of every trial (capped episodes record the cap)
all_steps = np.zeros((num_trials, num_episodes))

# Perform multiple trials
for trial in range(num_trials):
    env = ContinuousWindyGridworld()  # Fresh environment for this trial
    policy = SGDPolicy()  # Fresh, untrained policy for this trial

    # Iterate over episodes for the current trial
    for episode in range(num_episodes):
        state = env.reset()  # Reset the environment to the start state
        done = False  # Flag to indicate if the goal was reached
        episode_steps = 0  # Counter for steps in this episode
        # Start every episode with empty buffers so actions and rewards stay aligned
        policy.rewards = []
        policy.saved_actions = []

        # Run the episode until the goal is reached or the step cap is hit
        while not done and episode_steps < max_steps_per_episode:
            action = policy.select_action(state)  # Sample an action from the policy
            next_state, reward, done = env.step(action)  # Take the action in the environment
            policy.rewards.append(reward)  # Store the reward for the current step
            state = next_state  # Update state
            episode_steps += 1  # Increment step counter

        # Update after every episode, including capped ones, so the buffers
        # are always consumed and the policy also learns from failures.
        policy.update(gamma)

        all_steps[trial, episode] = episode_steps  # Store the number of steps taken for this episode

        # Print progress for the first trial every 100 episodes
        if (trial == 0) and ((episode + 1) % 100 == 0):
            print(f"Trial {trial + 1}, Episode {episode + 1}/{num_episodes}, Steps: {episode_steps}")

# Compute average steps over all trials
avg_steps_per_episode = all_steps.mean(axis=0)  # Mean steps per episode across trials

# Plot the results showing the average steps over episodes
plt.figure(figsize=(12, 6))
plt.plot(avg_steps_per_episode, label=f"Average Steps ({num_trials} Runs)")
plt.xlabel("Episode")
plt.ylabel("Average Steps to Goal")
plt.title("Policy Improvement Over Multiple Trials")
plt.legend()
plt.grid()
plt.show()

# Visualize one sampled trajectory of the policy from the LAST trial.
# Actions are still sampled stochastically, so this is a single rollout,
# not a deterministic "greedy" path.
state = env.reset()  # Reset the environment
path = [state.copy()]  # Positions visited, starting with the start state
done = False
rollout_steps = 0

# Follow the policy until the goal is reached or the step cap is hit
while not done and rollout_steps < max_steps_per_episode:
    action = policy.select_action(state)  # Select action using the trained policy
    next_state, _, done = env.step(action)  # Take action in the environment
    path.append(next_state)  # Store the state in the path
    state = next_state  # Update state
    rollout_steps += 1

# select_action() buffered log-probabilities that will never be used for an
# update; drop them so they do not keep their autograd graphs alive.
policy.saved_actions = []

path = np.array(path)  # Convert path list to a numpy array
# Plot the trajectory of the final policy
plt.figure(figsize=(8, 8))
plt.plot(path[:, 0], path[:, 1], marker='o')  # Plot the path of the agent
plt.scatter(env.goal[0], env.goal[1], c='red', label='Goal')  # Plot the goal position
plt.xlim(env.x_min, env.x_max)  # Set X-axis limits
plt.ylim(env.y_min, env.y_max)  # Set Y-axis limits
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Final Policy Path")
plt.legend()
plt.grid()
plt.show()


Trial 1, Episode 100/1000, Steps: 408
Trial 1, Episode 200/1000, Steps: 868
Trial 1, Episode 300/1000, Steps: 184
Trial 1, Episode 400/1000, Steps: 58
Trial 1, Episode 500/1000, Steps: 14
Trial 1, Episode 600/1000, Steps: 92
Trial 1, Episode 700/1000, Steps: 77
Trial 1, Episode 800/1000, Steps: 287
Trial 1, Episode 900/1000, Steps: 929
Trial 1, Episode 1000/1000, Steps: 593


KeyboardInterrupt: 