In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
%pip install gymnasium[mujoco]

Collecting mujoco>=2.3.3 (from gymnasium[mujoco])
  Downloading mujoco-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting pyopengl (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)
Downloading mujoco-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl (243 kB)
[2K   [90m

In [3]:
import numpy as np
import gymnasium as gym
import mujoco
import torch
import torch.nn as nn
import torch.optim as optim
# Initialize the Walker2d environment
env = gym.make("Hopper-v4")



In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Simplified Value Network
class ValueNetwork(nn.Module):
    def __init__(self, state_dim, hidden_sizes=(64, 64), activation=nn.Tanh):
        super(ValueNetwork, self).__init__()
        self.activation = activation()
        self.layers = nn.ModuleList()
        
        # Hidden layers
        input_size = state_dim
        for hidden_size in hidden_sizes:
            self.layers.append(nn.Linear(input_size, hidden_size))
            input_size = hidden_size

        # Output layer
        self.output_layer = nn.Linear(input_size, 1)

    def forward(self, state):
        x = state
        for layer in self.layers:
            x = self.activation(layer(x))
        value = self.output_layer(x)
        return value


# Simplified Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, discrete=True, hidden_sizes=(64, 64), activation=nn.Tanh):
        super(PolicyNetwork, self).__init__()
        self.discrete = discrete
        self.activation = activation()
        self.layers = nn.ModuleList()
        
        # Hidden layers
        input_size = state_dim
        for hidden_size in hidden_sizes:
            self.layers.append(nn.Linear(input_size, hidden_size))
            input_size = hidden_size

        # Output layer
        if self.discrete:
            # Discrete actions: output probabilities for each action
            self.output_layer = nn.Linear(input_size, action_dim)
        else:
            # Continuous actions: output mean and log_std for each action
            self.mean_layer = nn.Linear(input_size, action_dim)
            self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        x = state
        for layer in self.layers:
            x = self.activation(layer(x))
        
        if self.discrete:
            # Discrete actions: apply softmax for probabilities
            logits = self.output_layer(x)
            action_probs = F.softmax(logits, dim=-1)
            return action_probs
        else:
            # Continuous actions: return mean and std
            mean = self.mean_layer(x)
            std = torch.exp(self.log_std)
            return mean, std

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym

class PPOAgent:
    def __init__(self, state_dim, action_dim, discrete, gamma=0.99, lam=0.95, eps_clip=0.2, lr=3e-4, k_epochs=4):
        self.gamma = gamma  # Discount factor
        self.lam = lam  # GAE lambda
        self.eps_clip = eps_clip  # Clipping epsilon
        self.k_epochs = k_epochs  # Number of PPO epochs
        
        self.policy_net = PolicyNetwork(state_dim, action_dim, discrete).to("cuda" if torch.cuda.is_available() else "cpu")
        self.value_net = ValueNetwork(state_dim).to("cuda" if torch.cuda.is_available() else "cpu")
        
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=lr)
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).to(self.device)
        if isinstance(self.policy_net, PolicyNetwork) and self.policy_net.discrete:
            action_probs = self.policy_net(state)
            action_dist = torch.distributions.Categorical(action_probs)
            action = action_dist.sample()
            return action.item(), action_dist.log_prob(action)
        else:
            mean, std = self.policy_net(state)
            action_dist = torch.distributions.Normal(mean, std)
            action = action_dist.sample()
            return action.cpu().numpy(), action_dist.log_prob(action).sum()

    def compute_advantages(self, rewards, values, dones):
        advantages = []
        advantage = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * values[t + 1] * (1 - dones[t]) - values[t]
            advantage = delta + self.gamma * self.lam * (1 - dones[t]) * advantage
            advantages.insert(0, advantage)
        return torch.tensor(advantages, dtype=torch.float32, device=self.device)

    def train(self, env, max_episodes=1000, rollout_steps=2048, batch_size=64):
        for episode in range(max_episodes):
            # Initialize trajectory variables
            states, actions, rewards, dones, log_probs, values = [], [], [], [], [], []
            
            # Reset environment and get the initial state
            state, _ = env.reset()
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            
            # Collect trajectories
            for _ in range(rollout_steps):
                with torch.no_grad():
                    value = self.value_net(state).squeeze(0)
                    action, log_prob = self.select_action(state.cpu().numpy())
                
                # Interact with the environment
                next_state, reward, done, truncated, _ = env.step(action)
                
                # Store trajectory data
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                dones.append(done or truncated)
                log_probs.append(log_prob)
                values.append(value)
                
                # Update the state
                state = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                if done or truncated:
                    state, _ = env.reset()
                    state = torch.tensor(state, dtype=torch.float32).to(self.device)
            
            # Ensure valid trajectory data
            if len(states) == 0:
                print("No valid states collected; skipping this episode.")
                continue
            
            # Compute advantages and returns
            values.append(torch.tensor([0], device=self.device))  # Bootstrap value
            advantages = self.compute_advantages(rewards, values, dones)
            returns = advantages + torch.tensor(values[:-1], device=self.device)
            
            # Optimize policy and value networks
            states = torch.stack(states).to(self.device)
            actions = torch.tensor(np.array(actions), dtype=torch.float32 if not self.policy_net.discrete else torch.long).to(self.device)
            log_probs = torch.stack(log_probs).to(self.device)
        
            for _ in range(self.k_epochs):
                for i in range(0, rollout_steps, batch_size):
                    batch_states = states[i:i+batch_size]
                    batch_actions = actions[i:i+batch_size]
                    batch_log_probs = log_probs[i:i+batch_size]
                    batch_advantages = advantages[i:i+batch_size]
                    batch_returns = returns[i:i+batch_size]
    
                    # Policy update
                    if self.policy_net.discrete:
                        # Discrete action space
                        action_probs = self.policy_net(batch_states)
                        dist = torch.distributions.Categorical(action_probs)
                        new_log_probs = dist.log_prob(batch_actions)
                    else:
                        # Continuous action space
                        mean, std = self.policy_net(batch_states)
                        dist = torch.distributions.Normal(mean, std)
                        new_log_probs = dist.log_prob(batch_actions).sum(dim=-1)
    
                    # PPO objective
                    ratio = torch.exp(new_log_probs - batch_log_probs)
                    surr1 = ratio * batch_advantages
                    surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                    policy_loss = -torch.min(surr1, surr2).mean()

                    # Value update
                    value_preds = self.value_net(batch_states).squeeze(-1)
                    value_loss = nn.MSELoss()(value_preds, batch_returns)
    
                    # Backpropagation
                    self.policy_optimizer.zero_grad()
                    self.value_optimizer.zero_grad()
                    (policy_loss + 0.5 * value_loss).backward()
                    self.policy_optimizer.step()
                    self.value_optimizer.step()
    
            print(f"Episode {episode + 1}: Policy Loss = {policy_loss.item()}, Value Loss = {value_loss.item()}")

        
       


In [6]:
if __name__ == "__main__":
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    discrete = False  # Set to True if action space is discrete
    
    VanillaAgent = PPOAgent(state_dim, action_dim, discrete)
    VanillaAgent.train(env, max_episodes=100)


Episode 1: Policy Loss = -3.828803062438965, Value Loss = 16.592451095581055
Episode 2: Policy Loss = -1.9623545408248901, Value Loss = 9.722156524658203
Episode 3: Policy Loss = -23.57123565673828, Value Loss = 429.6669921875
Episode 4: Policy Loss = -29.276615142822266, Value Loss = 732.6014404296875
Episode 5: Policy Loss = -8.263506889343262, Value Loss = 99.85221862792969
Episode 6: Policy Loss = -8.829984664916992, Value Loss = 132.50885009765625
Episode 7: Policy Loss = -12.414569854736328, Value Loss = 257.19964599609375
Episode 8: Policy Loss = -6.319770812988281, Value Loss = 138.45005798339844
Episode 9: Policy Loss = -19.753684997558594, Value Loss = 469.4028625488281
Episode 10: Policy Loss = -5.664707183837891, Value Loss = 213.6244354248047
Episode 11: Policy Loss = -6.490495681762695, Value Loss = 338.0513000488281
Episode 12: Policy Loss = -12.929885864257812, Value Loss = 609.2012939453125
Episode 13: Policy Loss = -17.888696670532227, Value Loss = 550.2659912109375
E

In [7]:
def evaluate_agent(env, policy_net, num_episodes=200, render=False):
    """
    Evaluates the trained policy network on the environment.

    Args:
    - env: The Gym environment.
    - policy_net: The trained policy network.
    - num_episodes: Number of episodes to evaluate.
    - render: Whether to render the environment during evaluation.

    Returns:
    - Average reward over the evaluated episodes.
    """
    device = next(policy_net.parameters()).device
    total_rewards = []

    for episode in range(num_episodes):
        reset_result = env.reset()
        if isinstance(reset_result, tuple):
            state = reset_result[0]
        else:
            state = reset_result

        if not isinstance(state, np.ndarray):
            state = np.array(state, dtype=np.float32)

        episode_reward = 0
        terminated, truncated = False, False

        while not (terminated or truncated):
            if render:
                env.render()

            # Move the state tensor to the same device as the policy network
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

            with torch.no_grad():
                policy_output = policy_net(state_tensor)

                if isinstance(policy_output, tuple):
                    # Assuming (mean, std) for continuous action spaces
                    action_mean, action_std = policy_output
                    action = torch.normal(action_mean, action_std).cpu().numpy()
                else:
                    # Assuming logits for discrete action spaces
                    action_prob = torch.softmax(policy_output, dim=-1)
                    action = torch.argmax(action_prob, dim=-1).cpu().numpy()

            # Squeeze the action to ensure proper shape
            action = action.squeeze()

            # Take a step in the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, terminated, truncated, _ = next_step_result
            else:
                next_state, reward, terminated, truncated = next_step_result[:4]

            episode_reward += reward
            state = next_state

        total_rewards.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

    average_reward = sum(total_rewards) / num_episodes
    print(f"Average Reward over {num_episodes} Episodes: {average_reward}")
    return average_reward


In [8]:
# Initialize the environment
env = gym.make("Hopper-v4")

# Evaluate the agent using the trained policy network
average_reward = evaluate_agent(env, VanillaAgent.policy_net, num_episodes=200)


Episode 1: Reward = 600.2398146535725
Episode 2: Reward = 341.52519992374107
Episode 3: Reward = 409.0947660692718
Episode 4: Reward = 551.5519040029926
Episode 5: Reward = 552.576896863094
Episode 6: Reward = 588.5949767764788
Episode 7: Reward = 381.61276213358894
Episode 8: Reward = 407.5234209859817
Episode 9: Reward = 372.27349274294085
Episode 10: Reward = 536.6862008014074
Episode 11: Reward = 425.61479161241124
Episode 12: Reward = 415.3001253288772
Episode 13: Reward = 566.4374495535066
Episode 14: Reward = 671.0655144897112
Episode 15: Reward = 558.2680156118614
Episode 16: Reward = 370.2221960029768
Episode 17: Reward = 476.19752972554863
Episode 18: Reward = 481.17654825362126
Episode 19: Reward = 437.986067454201
Episode 20: Reward = 410.22141879605755
Episode 21: Reward = 531.8176816101375
Episode 22: Reward = 577.5455386843926
Episode 23: Reward = 667.7627800232236
Episode 24: Reward = 521.9555006199785
Episode 25: Reward = 550.2605808833861
Episode 26: Reward = 394.1979

In [9]:
def apply_perturbation(attack_method, state, params, policy_model=None, sarsa_model=None):
    """
    Apply perturbation to the state based on the attack method.

    Args:
        attack_method (str): The type of attack ('robust_sarsa', 'mad', 'random').
        state (torch.Tensor): The current state tensor.
        params (object): Parameters for the attack (e.g., epsilon, steps, etc.).
        policy_model (nn.Module): The policy model (for MAD and Sarsa+MAD).
        sarsa_model (nn.Module): The Sarsa model (for Robust Sarsa).

    Returns:
        torch.Tensor: The perturbed state.
    """
    eps = params.get("epsilon", 0.1)
    steps = params.get("steps", 10)
    step_eps = eps / steps
    clamp_min = state - eps
    clamp_max = state + eps

    if attack_method == "robust_sarsa":
        assert sarsa_model is not None, "Sarsa model is required for Robust Sarsa attack."
        perturbed_state = state.clone().detach().requires_grad_()
        for _ in range(steps):
            actions = policy_model(perturbed_state)[0]  # Assuming policy returns action logits
            value = sarsa_model(torch.cat((state, actions), dim=1)).mean(dim=1)
            value.backward()
            update = perturbed_state.grad.sign() * step_eps
            perturbed_state.data = torch.min(torch.max(perturbed_state - update, clamp_min), clamp_max)
            sarsa_model.zero_grad()
        return perturbed_state.detach()

    elif attack_method == "mad":
        assert policy_model is not None, "Policy model is required for MAD attack."
        original_action = policy_model(state)[0].detach()
        perturbed_state = state.clone().detach().requires_grad_()
        for _ in range(steps):
            new_action = policy_model(perturbed_state)[0]
            action_diff = ((new_action - original_action) ** 2).sum(dim=1)
            action_diff.backward()
            update = perturbed_state.grad.sign() * step_eps
            perturbed_state.data = torch.min(torch.max(perturbed_state + update, clamp_min), clamp_max)
            policy_model.zero_grad()
        return perturbed_state.detach()

    elif attack_method == "random":
        noise = torch.empty_like(state).uniform_(-eps, eps)
        return (state + noise).detach()

    else:
        raise ValueError(f"Unknown attack method: {attack_method}")

In [10]:
attack_params = {
    "epsilon": 0.1,  # Maximum perturbation magnitude
    "steps": 5,      # Number of iterative steps
}

In [11]:
import torch
import numpy as np
import gymnasium as gym

def random_perturbation(state, epsilon):
    """
    Apply random perturbation to the state.
    Args:
        state: The original state.
        epsilon: The maximum magnitude of random noise.
    Returns:
        Perturbed state.
    """
    noise = np.random.uniform(-epsilon, epsilon, size=state.shape)
    perturbed_state = state + noise
    return perturbed_state

def evaluate_agent_with_random_attack(env, policy_net, epsilon=5, num_episodes=200):
    """
    Evaluate the agent with random perturbation applied to states during testing.
    Args:
        env: The environment to test the agent.
        policy_net: The trained policy network.
        epsilon: Maximum magnitude of random noise for perturbation.
        num_episodes: Number of episodes for evaluation.
    Returns:
        Average reward over the episodes.
    """
    # Ensure policy network is on the same device as input tensors
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    policy_net.to(device)
    policy_net.eval()  # Set the network to evaluation mode

    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):  # Handle Gymnasium's (observation, info) format
            state = state[0]
        episode_reward = 0
        done = False

        while not done:
            # Apply random perturbation to the state
            perturbed_state = random_perturbation(state, epsilon)

            # Convert perturbed state to tensor and send to the same device as the policy network
            state_tensor = torch.tensor(perturbed_state, dtype=torch.float32, device=device).unsqueeze(0)

            # Get action from the policy network
            with torch.no_grad():
                policy_output = policy_net(state_tensor)
                # If the policy network outputs a tuple, extract the action vector
                if isinstance(policy_output, tuple):
                    action = policy_output[0].cpu().numpy().squeeze()
                else:
                    action = policy_output.cpu().numpy().squeeze()

            # Take the action in the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            episode_reward += reward
            state = next_state

        total_reward += episode_reward
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward over {num_episodes} episodes: {average_reward}")
    return average_reward

# Example usage
env = gym.make("Hopper-v4")
policy_net = VanillaAgent.policy_net  # Use your trained policy network here

epsilon = 0.1  # Maximum perturbation magnitude
evaluate_agent_with_random_attack(env, policy_net, epsilon)


Episode 1: Reward = 643.2179596799366
Episode 2: Reward = 618.1933928958904
Episode 3: Reward = 841.4080132820873
Episode 4: Reward = 803.9976001779669
Episode 5: Reward = 832.6988657782032
Episode 6: Reward = 614.9587093459965
Episode 7: Reward = 780.4425837788779
Episode 8: Reward = 681.2243236692553
Episode 9: Reward = 816.6033682256289
Episode 10: Reward = 782.6961444127606
Episode 11: Reward = 622.0367075702169
Episode 12: Reward = 652.2318490121122
Episode 13: Reward = 673.8185128354237
Episode 14: Reward = 808.3609056900214
Episode 15: Reward = 809.9699393608144
Episode 16: Reward = 676.4921687786939
Episode 17: Reward = 616.2496717034784
Episode 18: Reward = 621.1462726134849
Episode 19: Reward = 660.9665110276775
Episode 20: Reward = 615.5766437813744
Episode 21: Reward = 794.578690604192
Episode 22: Reward = 694.1522397845266
Episode 23: Reward = 686.5307736542325
Episode 24: Reward = 806.076180772443
Episode 25: Reward = 689.3570309546707
Episode 26: Reward = 787.30166171688

732.4191947937323

In [12]:
import torch
import numpy as np


def evaluate_agent_with_mad_attack(env, policy_net, epsilon, num_episodes=200, attack_steps=10, step_epsilon=0.01, beta=1.0):
    """
    Evaluate the agent under a MAD (Maximizing Action Discrepancy) attack for continuous action spaces.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        epsilon (float): Maximum perturbation magnitude.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of gradient steps for the attack.
        step_epsilon (float): Step size for each gradient step.
        beta (float): Inverse temperature parameter for SGLD noise.

    Returns:
        float: Average reward over the episodes under MAD attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            # Start with the original state
            perturbed_state = state.clone().detach().requires_grad_(True)

            for _ in range(attack_steps):
                # Compute the policy outputs for original and perturbed states
                original_mean, original_std = policy_net(state.unsqueeze(0))
                perturbed_mean, perturbed_std = policy_net(perturbed_state.unsqueeze(0))

                # Compute KL divergence between original and perturbed distributions
                loss = -torch.distributions.kl.kl_divergence(
                    torch.distributions.Normal(original_mean, original_std),
                    torch.distributions.Normal(perturbed_mean, perturbed_std)
                ).mean()
                loss.backward()

                # Compute gradient and add noise for SGLD
                grad = perturbed_state.grad
                noise = torch.randn_like(perturbed_state) * torch.sqrt(torch.tensor(2 / (beta * step_epsilon), device=perturbed_state.device))
                perturbation = step_epsilon * grad + noise

                # Update the perturbed state
                perturbed_state = (perturbed_state + perturbation).detach().requires_grad_(True)

                # Clamp the perturbed state to within the epsilon-ball
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)

            # Use the perturbed state to select the action
            with torch.no_grad():
                perturbed_mean, perturbed_std = policy_net(perturbed_state.unsqueeze(0))
                action_dist = torch.distributions.Normal(perturbed_mean, perturbed_std)
                action = action_dist.sample().squeeze().cpu().numpy()  # Match expected shape (e.g., (3,) for continuous action)

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            # Accumulate the reward
            episode_reward += reward

            # Update the state
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under MAD attack: {average_reward}")
    return average_reward


In [13]:
# Example usage
env = gym.make("Hopper-v4")

# Initialize the trained policy network
policy_net = VanillaAgent.policy_net  # Use your trained policy network here

# Parameters for MAD attack
epsilon = 0.1  # Maximum perturbation magnitude
attack_steps = 10  # Number of gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Evaluate the policy under MAD attack
average_reward = evaluate_agent_with_mad_attack(env, policy_net, epsilon, num_episodes=200, attack_steps=attack_steps, step_epsilon=step_epsilon)
print(f"Final Average Reward under MAD Attack: {average_reward}")


Episode 1/200: Reward = 490.76059256410656
Episode 2/200: Reward = 496.86761995695855
Episode 3/200: Reward = 429.37915979316773
Episode 4/200: Reward = 459.09612971450804
Episode 5/200: Reward = 367.54676269930314
Episode 6/200: Reward = 388.07699477480634
Episode 7/200: Reward = 338.32899818937847
Episode 8/200: Reward = 589.12682051278
Episode 9/200: Reward = 411.90132819399577
Episode 10/200: Reward = 373.74116294645074
Episode 11/200: Reward = 284.2926478503728
Episode 12/200: Reward = 504.4106117489463
Episode 13/200: Reward = 432.28539027170746
Episode 14/200: Reward = 497.0653769091538
Episode 15/200: Reward = 467.2525191339566
Episode 16/200: Reward = 358.23119336553907
Episode 17/200: Reward = 343.82899772499235
Episode 18/200: Reward = 358.57689000938353
Episode 19/200: Reward = 568.3745288485321
Episode 20/200: Reward = 511.613030794185
Episode 21/200: Reward = 529.1848236726398
Episode 22/200: Reward = 323.27726996684015
Episode 23/200: Reward = 365.1999875101244
Episode 2

In [14]:
import random

def robust_sarsa_attack(env, policy_net, epsilon_schedule, num_steps=10000, lambda_rs=0.1, batch_size=64, gamma=0.99):
    """
    Train a robust value function for a policy under attack using Robust Sarsa.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        epsilon_schedule (list): Schedule for perturbation magnitudes.
        num_steps (int): Number of training steps.
        lambda_rs (float): Regularization parameter for the robust objective.
        batch_size (int): Number of transitions sampled per update.
        gamma (float): Discount factor.

    Returns:
        torch.nn.Module: The robust Q-value network.
    """
    device = next(policy_net.parameters()).device

    # Detect action space type
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_dim = env.action_space.n  # Discrete action space
    elif isinstance(env.action_space, gym.spaces.Box):
        action_dim = env.action_space.shape[0]  # Continuous action space
    else:
        raise ValueError("Unsupported action space type. Only Discrete and Box spaces are supported.")

    # Initialize Q-function (robust critic) as a neural network
    q_net = torch.nn.Sequential(
        torch.nn.Linear(env.observation_space.shape[0] + action_dim, 128),
        torch.nn.ReLU(),
        torch.nn.Linear(128, 1)  # Single Q-value output
    ).to(device)

    optimizer = torch.optim.Adam(q_net.parameters(), lr=1e-3)

    # Replay buffer
    replay_buffer = []

    def collect_trajectory():
        """Collect one trajectory and add to the replay buffer."""
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if necessary
        state = torch.tensor(state, dtype=torch.float32).to(device)
    
        done = False
        while not done:
            with torch.no_grad():
                if isinstance(env.action_space, gym.spaces.Discrete):
                    # For discrete actions, pass only the state to Q-network
                    q_values = torch.cat([q_net(torch.cat([state, torch.eye(action_dim)[a].to(device)], dim=0))
                                          for a in range(action_dim)])
                    action = torch.argmax(q_values).item()
                else:
                    # For continuous actions, extract mean from policy network
                    policy_output = policy_net(state.unsqueeze(0))
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output  # Extract mean and ignore std
                    else:
                        mean = policy_output  # If single output, it's the mean
                    action = mean.squeeze().cpu().numpy()  # Convert to NumPy
    
            # Step the environment
            next_state, reward, done, truncated, _ = env.step(action)
            done = done or truncated  # Combine termination conditions
            next_state = torch.tensor(next_state, dtype=torch.float32).to(device)
    
            replay_buffer.append((state, action, reward, next_state, done))
    
            if len(replay_buffer) > 10000:
                replay_buffer.pop(0)
    
            state = next_state

    for step in range(num_steps):
        # Collect new trajectories periodically
        if len(replay_buffer) < batch_size or step % 10 == 0:
            collect_trajectory()

        # Ensure the buffer has enough samples for a batch
        if len(replay_buffer) < batch_size:
            continue  # Skip training step until buffer has enough data

        # Sample batch
        batch = random.sample(replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.stack(states).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.stack(next_states).to(device)
        dones = torch.tensor(dones, dtype=torch.float32).to(device)

        # Prepare inputs for Q-network
        if isinstance(env.action_space, gym.spaces.Discrete):
            actions = torch.tensor(actions, dtype=torch.int64).to(device)  # Discrete actions
            state_action_pairs = torch.cat([states, torch.eye(action_dim).to(device)[actions]], dim=1)
            next_state_action_pairs = torch.cat([next_states, torch.eye(action_dim).to(device)], dim=1)
        else:
            actions = torch.tensor(actions, dtype=torch.float32).to(device)  # Continuous actions
            state_action_pairs = torch.cat([states, actions], dim=1)
            next_state_action_pairs = torch.cat([next_states, actions], dim=1)

        # Temporal Difference Loss
        q_values = q_net(state_action_pairs).squeeze()
        q_values_next = q_net(next_state_action_pairs).squeeze()
        td_loss = (rewards + gamma * (1 - dones) * q_values_next - q_values).pow(2).mean()

        # Robustness Loss
        epsilon = epsilon_schedule[min(step, len(epsilon_schedule) - 1)]
        robust_loss = 0
        for i in range(batch_size):
            perturbation = (torch.rand_like(states[i]) * 2 - 1) * epsilon
            perturbed_state = states[i] + perturbation
            perturbed_state_action = torch.cat([perturbed_state, actions[i]], dim=0)
            robust_loss += (q_net(perturbed_state_action.unsqueeze(0)) - q_values[i]).pow(2).mean()
        robust_loss /= batch_size

        # Total Loss
        total_loss = td_loss + lambda_rs * robust_loss

        # Optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Print progress
        if step % 100 == 0:
            print(f"Step {step}/{num_steps}, TD Loss: {td_loss.item():.4f}, Robust Loss: {robust_loss.item():.4f}")

    return q_net


In [15]:
def evaluate_agent_with_robust_sarsa_attack(env, policy_net, robust_q_net, epsilon, step_size, num_episodes=100, attack_steps=10):
    """
    Evaluate the agent under a Robust Sarsa Critic-based attack.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        robust_q_net (torch.nn.Module): The robust Q-value network trained with Robust Sarsa.
        epsilon (float): Maximum perturbation magnitude for the attack.
        step_size (float): Step size for the gradient update.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of attack steps (K in the pseudocode).

    Returns:
        float: Average reward over the episodes under Robust Sarsa Critic-based attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            # Initialize the perturbed state
            perturbed_state = state.clone().detach().requires_grad_(True)

            # Perform the attack as per Algorithm 2
            for _ in range(attack_steps):
                # Forward pass through the policy to get the action
                with torch.no_grad():
                    if isinstance(env.action_space, gym.spaces.Discrete):
                        action_probs = policy_net(perturbed_state.unsqueeze(0))
                        action = torch.argmax(action_probs, dim=-1)
                    else:
                        policy_output = policy_net(perturbed_state.unsqueeze(0))
                        if isinstance(policy_output, tuple):
                            mean, _ = policy_output  # Extract mean and ignore std
                        else:
                            mean = policy_output
                        action = mean.squeeze()

                # Compute Q(s, a) for the critic
                state_action = torch.cat([perturbed_state, action.float().to(device)]) if isinstance(env.action_space, gym.spaces.Box) else \
                               torch.cat([perturbed_state, torch.eye(env.action_space.n)[action].to(device)], dim=0)
                q_value = robust_q_net(state_action.unsqueeze(0))

                # Backpropagate the gradient
                q_value.backward()
                grad = perturbed_state.grad

                # Update the perturbed state based on the gradient and step size
                perturbed_state = perturbed_state - step_size * grad.sign()
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)  # Clamp to the epsilon-ball

            # Use the adversarially perturbed state to select the final action
            with torch.no_grad():
                if isinstance(env.action_space, gym.spaces.Discrete):
                    action_probs = policy_net(perturbed_state.unsqueeze(0))
                    action = torch.argmax(action_probs, dim=-1).item()
                else:
                    policy_output = policy_net(perturbed_state.unsqueeze(0))
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output
                    else:
                        mean = policy_output
                    action = mean.squeeze().cpu().numpy()

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            # Accumulate the reward
            episode_reward += reward

            # Update the state
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under Robust Sarsa Critic-based attack: {average_reward}")
    return average_reward


In [16]:
# Example usage
env = gym.make("Hopper-v4")

# Initialize the trained policy network
policy_net = VanillaAgent.policy_net  # Use your trained policy network here

# Parameters for MAD attack
epsilon = 0.1  # Maximum perturbation magnitude
attack_steps = 10  # Number of gradient steps
step_epsilon = 0.01  # Step size for each gradient step
epsilon_schedule = [0.01 * i for i in range(1, 101)]
# Evaluate the policy under MAD attack

robust_q_net=robust_sarsa_attack(
    env=env,
    policy_net=policy_net,
    epsilon_schedule=epsilon_schedule,
    num_steps=5000,        # Number of training steps
    lambda_rs=0.1,         # Regularization parameter for robust loss
    batch_size=64,         # Batch size for training
    gamma=0.99             # Discount factor
)


  actions = torch.tensor(actions, dtype=torch.float32).to(device)  # Continuous actions


Step 0/5000, TD Loss: 10.2370, Robust Loss: 0.0000
Step 100/5000, TD Loss: 9.6587, Robust Loss: 0.1440
Step 200/5000, TD Loss: 11.1174, Robust Loss: 0.5529
Step 300/5000, TD Loss: 10.4458, Robust Loss: 0.9164
Step 400/5000, TD Loss: 8.8024, Robust Loss: 1.7482
Step 500/5000, TD Loss: 7.7610, Robust Loss: 2.3427
Step 600/5000, TD Loss: 8.8608, Robust Loss: 2.0869
Step 700/5000, TD Loss: 8.5722, Robust Loss: 3.4817
Step 800/5000, TD Loss: 6.7933, Robust Loss: 3.5360
Step 900/5000, TD Loss: 7.3825, Robust Loss: 5.0513
Step 1000/5000, TD Loss: 5.7984, Robust Loss: 5.9390
Step 1100/5000, TD Loss: 6.9415, Robust Loss: 6.2464
Step 1200/5000, TD Loss: 5.1660, Robust Loss: 7.2570
Step 1300/5000, TD Loss: 6.2220, Robust Loss: 10.2141
Step 1400/5000, TD Loss: 5.3497, Robust Loss: 9.2060
Step 1500/5000, TD Loss: 5.4343, Robust Loss: 8.2461
Step 1600/5000, TD Loss: 6.5990, Robust Loss: 10.2942
Step 1700/5000, TD Loss: 4.8554, Robust Loss: 9.1940
Step 1800/5000, TD Loss: 4.1388, Robust Loss: 13.4062

In [17]:
average_reward = average_reward = evaluate_agent_with_robust_sarsa_attack(
    env=env,
    policy_net=policy_net,
    robust_q_net=robust_q_net,
    epsilon=0.05,
    num_episodes=200,
    step_size=0.01
)
print(f"Final Average Reward under Robust Sarsa Attack: {average_reward}")


Episode 1/200: Reward = 707.6355391862724
Episode 2/200: Reward = 782.3928646176172
Episode 3/200: Reward = 730.4273015490278
Episode 4/200: Reward = 792.0566380946177
Episode 5/200: Reward = 795.3140327485272
Episode 6/200: Reward = 757.1318450993338
Episode 7/200: Reward = 732.0321066416775
Episode 8/200: Reward = 751.6878242035779
Episode 9/200: Reward = 762.0241656887515
Episode 10/200: Reward = 725.4236209713097
Episode 11/200: Reward = 723.6604742676035
Episode 12/200: Reward = 789.0231857055218
Episode 13/200: Reward = 765.1176826890908
Episode 14/200: Reward = 757.3359642031894
Episode 15/200: Reward = 769.0141742093816
Episode 16/200: Reward = 799.6117915357756
Episode 17/200: Reward = 777.4660001811559
Episode 18/200: Reward = 792.4818356604893
Episode 19/200: Reward = 792.3570785940767
Episode 20/200: Reward = 786.5296506815274
Episode 21/200: Reward = 778.4934676414505
Episode 22/200: Reward = 797.7870627998079
Episode 23/200: Reward = 778.0045903774476
Episode 24/200: Rewa

In [19]:
if __name__ == "__main__":
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    discrete = False  # Set to True if action space is discrete
    
    RobustAgent = SAPPOAgent(state_dim, action_dim, discrete)
    RobustAgent.train(env, max_episodes=100)


Episode 1: Policy Loss = -8.539791107177734, Value Loss = 91.50924682617188, KL Reg = 1.0124143955181353e-05
Episode 2: Policy Loss = -14.921133041381836, Value Loss = 221.65118408203125, KL Reg = 1.2055054867232684e-05
Episode 3: Policy Loss = -4.782430648803711, Value Loss = 120.10716247558594, KL Reg = 1.5467019693460315e-05
Episode 4: Policy Loss = -6.200763702392578, Value Loss = 64.98123168945312, KL Reg = 2.2710195480613038e-05
Episode 5: Policy Loss = -9.923398971557617, Value Loss = 157.38397216796875, KL Reg = 3.756040314328857e-05
Episode 6: Policy Loss = -10.554612159729004, Value Loss = 216.71499633789062, KL Reg = 5.482680717250332e-05
Episode 7: Policy Loss = -10.816160202026367, Value Loss = 239.9801025390625, KL Reg = 7.194499630713835e-05
Episode 8: Policy Loss = -8.576520919799805, Value Loss = 209.21029663085938, KL Reg = 9.797522943699732e-05
Episode 9: Policy Loss = -5.71106481552124, Value Loss = 173.89205932617188, KL Reg = 0.0001289803913095966
Episode 10: Poli