In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
%pip install gymnasium[mujoco]

Collecting mujoco>=2.3.3 (from gymnasium[mujoco])
  Downloading mujoco-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting pyopengl (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)
Downloading mujoco-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl (243 kB)
[2K   [90m

In [3]:
import numpy as np
import gymnasium as gym
import mujoco
import torch
import torch.nn as nn
import torch.optim as optim
# Initialize the Walker2d environment
env = gym.make("Hopper-v4")



In [7]:
def evaluate_agent(env, policy_net, num_episodes=200, render=False):
    """
    Evaluates the trained policy network on the environment.

    Args:
    - env: The Gym environment.
    - policy_net: The trained policy network.
    - num_episodes: Number of episodes to evaluate.
    - render: Whether to render the environment during evaluation.

    Returns:
    - Average reward over the evaluated episodes.
    """
    device = next(policy_net.parameters()).device
    total_rewards = []

    for episode in range(num_episodes):
        reset_result = env.reset()
        if isinstance(reset_result, tuple):
            state = reset_result[0]
        else:
            state = reset_result

        if not isinstance(state, np.ndarray):
            state = np.array(state, dtype=np.float32)

        episode_reward = 0
        terminated, truncated = False, False

        while not (terminated or truncated):
            if render:
                env.render()

            # Move the state tensor to the same device as the policy network
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

            with torch.no_grad():
                policy_output = policy_net(state_tensor)

                if isinstance(policy_output, tuple):
                    # Assuming (mean, std) for continuous action spaces
                    action_mean, action_std = policy_output
                    action = torch.normal(action_mean, action_std).cpu().numpy()
                else:
                    # Assuming logits for discrete action spaces
                    action_prob = torch.softmax(policy_output, dim=-1)
                    action = torch.argmax(action_prob, dim=-1).cpu().numpy()

            # Squeeze the action to ensure proper shape
            action = action.squeeze()

            # Take a step in the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, terminated, truncated, _ = next_step_result
            else:
                next_state, reward, terminated, truncated = next_step_result[:4]

            episode_reward += reward
            state = next_state

        total_rewards.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

    average_reward = sum(total_rewards) / num_episodes
    print(f"Average Reward over {num_episodes} Episodes: {average_reward}")
    return average_reward


In [9]:
def apply_perturbation(attack_method, state, params, policy_model=None, sarsa_model=None):
    """
    Apply perturbation to the state based on the attack method.

    Args:
        attack_method (str): The type of attack ('robust_sarsa', 'mad', 'random').
        state (torch.Tensor): The current state tensor.
        params (object): Parameters for the attack (e.g., epsilon, steps, etc.).
        policy_model (nn.Module): The policy model (for MAD and Sarsa+MAD).
        sarsa_model (nn.Module): The Sarsa model (for Robust Sarsa).

    Returns:
        torch.Tensor: The perturbed state.
    """
    eps = params.get("epsilon", 0.1)
    steps = params.get("steps", 10)
    step_eps = eps / steps
    clamp_min = state - eps
    clamp_max = state + eps

    if attack_method == "robust_sarsa":
        assert sarsa_model is not None, "Sarsa model is required for Robust Sarsa attack."
        perturbed_state = state.clone().detach().requires_grad_()
        for _ in range(steps):
            actions = policy_model(perturbed_state)[0]  # Assuming policy returns action logits
            value = sarsa_model(torch.cat((state, actions), dim=1)).mean(dim=1)
            value.backward()
            update = perturbed_state.grad.sign() * step_eps
            perturbed_state.data = torch.min(torch.max(perturbed_state - update, clamp_min), clamp_max)
            sarsa_model.zero_grad()
        return perturbed_state.detach()

    elif attack_method == "mad":
        assert policy_model is not None, "Policy model is required for MAD attack."
        original_action = policy_model(state)[0].detach()
        perturbed_state = state.clone().detach().requires_grad_()
        for _ in range(steps):
            new_action = policy_model(perturbed_state)[0]
            action_diff = ((new_action - original_action) ** 2).sum(dim=1)
            action_diff.backward()
            update = perturbed_state.grad.sign() * step_eps
            perturbed_state.data = torch.min(torch.max(perturbed_state + update, clamp_min), clamp_max)
            policy_model.zero_grad()
        return perturbed_state.detach()

    elif attack_method == "random":
        noise = torch.empty_like(state).uniform_(-eps, eps)
        return (state + noise).detach()

    else:
        raise ValueError(f"Unknown attack method: {attack_method}")

In [10]:
attack_params = {
    "epsilon": 0.1,  # Maximum perturbation magnitude
    "steps": 5,      # Number of iterative steps
}

In [12]:
import torch
import numpy as np


def evaluate_agent_with_mad_attack(env, policy_net, epsilon, num_episodes=200, attack_steps=10, step_epsilon=0.01, beta=1.0):
    """
    Evaluate the agent under a MAD (Maximizing Action Discrepancy) attack for continuous action spaces.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        epsilon (float): Maximum perturbation magnitude.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of gradient steps for the attack.
        step_epsilon (float): Step size for each gradient step.
        beta (float): Inverse temperature parameter for SGLD noise.

    Returns:
        float: Average reward over the episodes under MAD attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            # Start with the original state
            perturbed_state = state.clone().detach().requires_grad_(True)

            for _ in range(attack_steps):
                # Compute the policy outputs for original and perturbed states
                original_mean, original_std = policy_net(state.unsqueeze(0))
                perturbed_mean, perturbed_std = policy_net(perturbed_state.unsqueeze(0))

                # Compute KL divergence between original and perturbed distributions
                loss = -torch.distributions.kl.kl_divergence(
                    torch.distributions.Normal(original_mean, original_std),
                    torch.distributions.Normal(perturbed_mean, perturbed_std)
                ).mean()
                loss.backward()

                # Compute gradient and add noise for SGLD
                grad = perturbed_state.grad
                noise = torch.randn_like(perturbed_state) * torch.sqrt(torch.tensor(2 / (beta * step_epsilon), device=perturbed_state.device))
                perturbation = step_epsilon * grad + noise

                # Update the perturbed state
                perturbed_state = (perturbed_state + perturbation).detach().requires_grad_(True)

                # Clamp the perturbed state to within the epsilon-ball
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)

            # Use the perturbed state to select the action
            with torch.no_grad():
                perturbed_mean, perturbed_std = policy_net(perturbed_state.unsqueeze(0))
                action_dist = torch.distributions.Normal(perturbed_mean, perturbed_std)
                action = action_dist.sample().squeeze().cpu().numpy()  # Match expected shape (e.g., (3,) for continuous action)

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            # Accumulate the reward
            episode_reward += reward

            # Update the state
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under MAD attack: {average_reward}")
    return average_reward


In [14]:
import random

def robust_sarsa_attack(env, policy_net, epsilon_schedule, num_steps=10000, lambda_rs=0.1, batch_size=64, gamma=0.99):
    """
    Train a robust value function for a policy under attack using Robust Sarsa.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        epsilon_schedule (list): Schedule for perturbation magnitudes.
        num_steps (int): Number of training steps.
        lambda_rs (float): Regularization parameter for the robust objective.
        batch_size (int): Number of transitions sampled per update.
        gamma (float): Discount factor.

    Returns:
        torch.nn.Module: The robust Q-value network.
    """
    device = next(policy_net.parameters()).device

    # Detect action space type
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_dim = env.action_space.n  # Discrete action space
    elif isinstance(env.action_space, gym.spaces.Box):
        action_dim = env.action_space.shape[0]  # Continuous action space
    else:
        raise ValueError("Unsupported action space type. Only Discrete and Box spaces are supported.")

    # Initialize Q-function (robust critic) as a neural network
    q_net = torch.nn.Sequential(
        torch.nn.Linear(env.observation_space.shape[0] + action_dim, 128),
        torch.nn.ReLU(),
        torch.nn.Linear(128, 1)  # Single Q-value output
    ).to(device)

    optimizer = torch.optim.Adam(q_net.parameters(), lr=1e-3)

    # Replay buffer
    replay_buffer = []

    def collect_trajectory():
        """Collect one trajectory and add to the replay buffer."""
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if necessary
        state = torch.tensor(state, dtype=torch.float32).to(device)
    
        done = False
        while not done:
            with torch.no_grad():
                if isinstance(env.action_space, gym.spaces.Discrete):
                    # For discrete actions, pass only the state to Q-network
                    q_values = torch.cat([q_net(torch.cat([state, torch.eye(action_dim)[a].to(device)], dim=0))
                                          for a in range(action_dim)])
                    action = torch.argmax(q_values).item()
                else:
                    # For continuous actions, extract mean from policy network
                    policy_output = policy_net(state.unsqueeze(0))
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output  # Extract mean and ignore std
                    else:
                        mean = policy_output  # If single output, it's the mean
                    action = mean.squeeze().cpu().numpy()  # Convert to NumPy
    
            # Step the environment
            next_state, reward, done, truncated, _ = env.step(action)
            done = done or truncated  # Combine termination conditions
            next_state = torch.tensor(next_state, dtype=torch.float32).to(device)
    
            replay_buffer.append((state, action, reward, next_state, done))
    
            if len(replay_buffer) > 10000:
                replay_buffer.pop(0)
    
            state = next_state

    for step in range(num_steps):
        # Collect new trajectories periodically
        if len(replay_buffer) < batch_size or step % 10 == 0:
            collect_trajectory()

        # Ensure the buffer has enough samples for a batch
        if len(replay_buffer) < batch_size:
            continue  # Skip training step until buffer has enough data

        # Sample batch
        batch = random.sample(replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.stack(states).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.stack(next_states).to(device)
        dones = torch.tensor(dones, dtype=torch.float32).to(device)

        # Prepare inputs for Q-network
        if isinstance(env.action_space, gym.spaces.Discrete):
            actions = torch.tensor(actions, dtype=torch.int64).to(device)  # Discrete actions
            state_action_pairs = torch.cat([states, torch.eye(action_dim).to(device)[actions]], dim=1)
            next_state_action_pairs = torch.cat([next_states, torch.eye(action_dim).to(device)], dim=1)
        else:
            actions = torch.tensor(actions, dtype=torch.float32).to(device)  # Continuous actions
            state_action_pairs = torch.cat([states, actions], dim=1)
            next_state_action_pairs = torch.cat([next_states, actions], dim=1)

        # Temporal Difference Loss
        q_values = q_net(state_action_pairs).squeeze()
        q_values_next = q_net(next_state_action_pairs).squeeze()
        td_loss = (rewards + gamma * (1 - dones) * q_values_next - q_values).pow(2).mean()

        # Robustness Loss
        epsilon = epsilon_schedule[min(step, len(epsilon_schedule) - 1)]
        robust_loss = 0
        for i in range(batch_size):
            perturbation = (torch.rand_like(states[i]) * 2 - 1) * epsilon
            perturbed_state = states[i] + perturbation
            perturbed_state_action = torch.cat([perturbed_state, actions[i]], dim=0)
            robust_loss += (q_net(perturbed_state_action.unsqueeze(0)) - q_values[i]).pow(2).mean()
        robust_loss /= batch_size

        # Total Loss
        total_loss = td_loss + lambda_rs * robust_loss

        # Optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Print progress
        if step % 100 == 0:
            print(f"Step {step}/{num_steps}, TD Loss: {td_loss.item():.4f}, Robust Loss: {robust_loss.item():.4f}")

    return q_net


In [15]:
def evaluate_agent_with_robust_sarsa_attack(env, policy_net, robust_q_net, epsilon, step_size, num_episodes=100, attack_steps=10):
    """
    Evaluate the agent under a Robust Sarsa Critic-based attack.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        robust_q_net (torch.nn.Module): The robust Q-value network trained with Robust Sarsa.
        epsilon (float): Maximum perturbation magnitude for the attack.
        step_size (float): Step size for the gradient update.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of attack steps (K in the pseudocode).

    Returns:
        float: Average reward over the episodes under Robust Sarsa Critic-based attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            # Initialize the perturbed state
            perturbed_state = state.clone().detach().requires_grad_(True)

            # Perform the attack as per Algorithm 2
            for _ in range(attack_steps):
                # Forward pass through the policy to get the action
                with torch.no_grad():
                    if isinstance(env.action_space, gym.spaces.Discrete):
                        action_probs = policy_net(perturbed_state.unsqueeze(0))
                        action = torch.argmax(action_probs, dim=-1)
                    else:
                        policy_output = policy_net(perturbed_state.unsqueeze(0))
                        if isinstance(policy_output, tuple):
                            mean, _ = policy_output  # Extract mean and ignore std
                        else:
                            mean = policy_output
                        action = mean.squeeze()

                # Compute Q(s, a) for the critic
                state_action = torch.cat([perturbed_state, action.float().to(device)]) if isinstance(env.action_space, gym.spaces.Box) else \
                               torch.cat([perturbed_state, torch.eye(env.action_space.n)[action].to(device)], dim=0)
                q_value = robust_q_net(state_action.unsqueeze(0))

                # Backpropagate the gradient
                q_value.backward()
                grad = perturbed_state.grad

                # Update the perturbed state based on the gradient and step size
                perturbed_state = perturbed_state - step_size * grad.sign()
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)  # Clamp to the epsilon-ball

            # Use the adversarially perturbed state to select the final action
            with torch.no_grad():
                if isinstance(env.action_space, gym.spaces.Discrete):
                    action_probs = policy_net(perturbed_state.unsqueeze(0))
                    action = torch.argmax(action_probs, dim=-1).item()
                else:
                    policy_output = policy_net(perturbed_state.unsqueeze(0))
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output
                    else:
                        mean = policy_output
                    action = mean.squeeze().cpu().numpy()

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            # Accumulate the reward
            episode_reward += reward

            # Update the state
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under Robust Sarsa Critic-based attack: {average_reward}")
    return average_reward


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as dist
import numpy as np
import gymnasium as gym


class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, discrete=True, hidden_sizes=(64, 64)):
        super(PolicyNetwork, self).__init__()
        self.discrete = discrete
        self.layers = nn.ModuleList()
        input_dim = state_dim

        for hidden_dim in hidden_sizes:
            self.layers.append(nn.Linear(input_dim, hidden_dim))
            input_dim = hidden_dim

        if self.discrete:
            self.output = nn.Linear(input_dim, action_dim)
        else:
            self.mean = nn.Linear(input_dim, action_dim)
            self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        x = state
        for layer in self.layers:
            x = torch.tanh(layer(x))
        if self.discrete:
            logits = self.output(x)
            return torch.softmax(logits, dim=-1)
        else:
            mean = self.mean(x)
            std = torch.exp(self.log_std)
            return mean, std


class ValueNetwork(nn.Module):
    def __init__(self, state_dim, hidden_sizes=(64, 64)):
        super(ValueNetwork, self).__init__()
        self.layers = nn.ModuleList()
        input_dim = state_dim

        for hidden_dim in hidden_sizes:
            self.layers.append(nn.Linear(input_dim, hidden_dim))
            input_dim = hidden_dim

        self.output = nn.Linear(input_dim, 1)

    def forward(self, state):
        x = state
        for layer in self.layers:
            x = torch.tanh(layer(x))
        return self.output(x)


class SAPPOAgent:
    def __init__(self, state_dim, action_dim, discrete=True, lr=3e-4, gamma=0.99, lam=0.95, eps_clip=0.2, k_epochs=4, sgld_steps=10, sgld_lr=0.01):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Actor and critic networks
        self.policy_net = PolicyNetwork(state_dim, action_dim, discrete).to(self.device)
        self.value_net = ValueNetwork(state_dim).to(self.device)

        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=lr)

        self.gamma = gamma
        self.lam = lam
        self.eps_clip = eps_clip
        self.k_epochs = k_epochs

        self.sgld_steps = sgld_steps
        self.sgld_lr = sgld_lr

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).to(self.device)
        if self.policy_net.discrete:
            probs = self.policy_net(state)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()
            return action.item(), dist.log_prob(action)
        else:
            mean, std = self.policy_net(state)
            dist = torch.distributions.Normal(mean, std)
            action = dist.sample()
            return action.cpu().numpy(), dist.log_prob(action).sum()

    def compute_gae(self, rewards, values, dones):
        advantages = []
        advantage = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * values[t + 1] * (1 - dones[t]) - values[t]
            advantage = delta + self.gamma * self.lam * (1 - dones[t]) * advantage
            advantages.insert(0, advantage)
        return torch.tensor(advantages, device=self.device, dtype=torch.float32)

    def sgld_step(self, state, epsilon):
        """Perform Stochastic Gradient Langevin Dynamics (SGLD) to generate perturbed states."""
        perturbed_state = state.clone().detach().to(self.device).requires_grad_(True)
    
        for _ in range(self.sgld_steps):
            if perturbed_state.grad is not None:
                perturbed_state.grad.zero_()
    
            # Compute KL divergence between original and perturbed policies
            with torch.no_grad():
                original_logits = self.policy_net(state)
            perturbed_logits = self.policy_net(perturbed_state)
    
            if self.policy_net.discrete:
                original_policy = dist.Categorical(original_logits)
                perturbed_policy = dist.Categorical(perturbed_logits)
            else:
                original_mean, original_std = original_logits
                perturbed_mean, perturbed_std = perturbed_logits
                original_policy = dist.Normal(original_mean, original_std)
                perturbed_policy = dist.Normal(perturbed_mean, perturbed_std)
    
            kl_div = dist.kl.kl_divergence(original_policy, perturbed_policy).mean()
    
            # Backpropagate KL divergence
            kl_div.backward()
    
            # Update perturbed state using gradient and noise
            perturbed_state = perturbed_state + epsilon * perturbed_state.grad + torch.randn_like(perturbed_state) * epsilon
            perturbed_state = perturbed_state.detach().clone().requires_grad_(True)
    
        return perturbed_state.detach()

    def compute_kl_regularization(self, states, actions):
        """Compute the KL divergence regularization across all states."""
        if len(states) == 0:
            return torch.tensor(0.0, device=self.device)
    
        kl_div_total = 0
        for state in states:
            perturbed_state = self.sgld_step(state, self.sgld_lr)
    
            with torch.no_grad():
                original_logits = self.policy_net(state)
            perturbed_logits = self.policy_net(perturbed_state)
    
            if self.policy_net.discrete:
                original_policy = dist.Categorical(original_logits)
                perturbed_policy = dist.Categorical(perturbed_logits)
            else:
                original_mean, original_std = original_logits
                perturbed_mean, perturbed_std = perturbed_logits
                original_policy = dist.Normal(original_mean, original_std)
                perturbed_policy = dist.Normal(perturbed_mean, perturbed_std)
    
            kl_div = dist.kl.kl_divergence(original_policy, perturbed_policy).mean()
            kl_div_total += kl_div
    
        return kl_div_total / len(states)
    

    def train(self, env, max_episodes=1000, rollout_steps=2048, batch_size=64):
        for episode in range(max_episodes):
            states, actions, rewards, dones, log_probs, values = [], [], [], [], [], []
    
            # Reset the environment
            state, _ = env.reset()
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
    
            # Rollout phase: Collect trajectories
            for _ in range(rollout_steps):
                value = self.value_net(state).squeeze(0).detach()  # Detach the value tensor
                action, log_prob = self.select_action(state.cpu().numpy())
    
                next_state, reward, done, truncated, _ = env.step(action)
                
                # Append data to lists
                states.append(state.clone().detach())
                actions.append(action)
                rewards.append(reward)
                dones.append(done or truncated)
                log_probs.append(log_prob.clone().detach())
                values.append(value)
    
                # Update state
                state = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                if done or truncated:
                    state, _ = env.reset()
                    state = torch.tensor(state, dtype=torch.float32).to(self.device)
    
            # Add a final value estimate
            values.append(torch.tensor([0], device=self.device).detach())
    
            # Compute advantages and returns
            advantages = self.compute_gae(rewards, values, dones)
            returns = advantages + torch.tensor(values[:-1], device=self.device)

            # Convert lists to tensors
            states = torch.stack(states).to(self.device)
            actions = torch.tensor(
                np.array(actions),
                dtype=torch.float32 if not self.policy_net.discrete else torch.long
            ).to(self.device)
            log_probs = torch.stack(log_probs).to(self.device)
    
            # Optimization phase
            for _ in range(self.k_epochs):
                kl_reg = self.compute_kl_regularization(states, actions)
    
                for i in range(0, rollout_steps, batch_size):
                    batch_states = states[i:i + batch_size]
                    batch_actions = actions[i:i + batch_size]
                    batch_log_probs = log_probs[i:i + batch_size]
                    batch_advantages = advantages[i:i + batch_size]
                    batch_returns = returns[i:i + batch_size]

                    if self.policy_net.discrete:
                        action_probs = self.policy_net(batch_states)
                        dist = torch.distributions.Categorical(action_probs)
                        new_log_probs = dist.log_prob(batch_actions)
                    else:
                        mean, std = self.policy_net(batch_states)
                        dist = torch.distributions.Normal(mean, std)
                        new_log_probs = dist.log_prob(batch_actions).sum(dim=-1)
    
                    ratio = torch.exp(new_log_probs - batch_log_probs)
                    surr1 = ratio * batch_advantages
                    surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                    policy_loss = -torch.min(surr1, surr2).mean()
    
                    value_preds = self.value_net(batch_states).squeeze(-1)
                    value_loss = nn.MSELoss()(value_preds, batch_returns)

                    # Detach kl_reg to prevent graph accumulation
                    kl_reg = kl_reg.detach()
    
                    total_loss = policy_loss + 0.5 * value_loss + 0.01 * kl_reg
    
                    self.policy_optimizer.zero_grad()
                    self.value_optimizer.zero_grad()
                    total_loss.backward(retain_graph=False)  # No need to retain the graph here
                    self.policy_optimizer.step()
                    self.value_optimizer.step()
    
            print(f"Episode {episode + 1}: Policy Loss = {policy_loss.item()}, Value Loss = {value_loss.item()}, KL Reg = {kl_reg.item()}")
    
                            


In [19]:
if __name__ == "__main__":
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    discrete = False  # Set to True if action space is discrete
    
    RobustAgent = SAPPOAgent(state_dim, action_dim, discrete)
    RobustAgent.train(env, max_episodes=100)


Episode 1: Policy Loss = -8.539791107177734, Value Loss = 91.50924682617188, KL Reg = 1.0124143955181353e-05
Episode 2: Policy Loss = -14.921133041381836, Value Loss = 221.65118408203125, KL Reg = 1.2055054867232684e-05
Episode 3: Policy Loss = -4.782430648803711, Value Loss = 120.10716247558594, KL Reg = 1.5467019693460315e-05
Episode 4: Policy Loss = -6.200763702392578, Value Loss = 64.98123168945312, KL Reg = 2.2710195480613038e-05
Episode 5: Policy Loss = -9.923398971557617, Value Loss = 157.38397216796875, KL Reg = 3.756040314328857e-05
Episode 6: Policy Loss = -10.554612159729004, Value Loss = 216.71499633789062, KL Reg = 5.482680717250332e-05
Episode 7: Policy Loss = -10.816160202026367, Value Loss = 239.9801025390625, KL Reg = 7.194499630713835e-05
Episode 8: Policy Loss = -8.576520919799805, Value Loss = 209.21029663085938, KL Reg = 9.797522943699732e-05
Episode 9: Policy Loss = -5.71106481552124, Value Loss = 173.89205932617188, KL Reg = 0.0001289803913095966
Episode 10: Poli

In [20]:
# Initialize the environment
env = gym.make("Hopper-v4")

# Evaluate the agent using the trained policy network
average_reward = evaluate_agent(env, RobustAgent.policy_net, num_episodes=200)


Episode 1: Reward = 578.1060141282485
Episode 2: Reward = 191.48752509190723
Episode 3: Reward = 516.8291325792161
Episode 4: Reward = 518.6161120277076
Episode 5: Reward = 559.901385298283
Episode 6: Reward = 319.5635004596396
Episode 7: Reward = 140.9922717474108
Episode 8: Reward = 182.01958497695213
Episode 9: Reward = 674.3009771136836
Episode 10: Reward = 263.18558627778674
Episode 11: Reward = 374.87454495694556
Episode 12: Reward = 318.9757919315372
Episode 13: Reward = 126.44828207642213
Episode 14: Reward = 518.5822160722652
Episode 15: Reward = 644.7057108556683
Episode 16: Reward = 632.5270357631548
Episode 17: Reward = 564.3636714239185
Episode 18: Reward = 664.2489390736816
Episode 19: Reward = 573.6683301090893
Episode 20: Reward = 305.52238955232593
Episode 21: Reward = 611.2088911674298
Episode 22: Reward = 354.85071686288865
Episode 23: Reward = 330.75717447749935
Episode 24: Reward = 511.43509659663096
Episode 25: Reward = 348.41043835437176
Episode 26: Reward = 643.

In [21]:
import torch
import numpy as np
import gymnasium as gym

def random_perturbation(state, epsilon):
    """
    Apply random perturbation to the state.
    Args:
        state: The original state.
        epsilon: The maximum magnitude of random noise.
    Returns:
        Perturbed state.
    """
    noise = np.random.uniform(-epsilon, epsilon, size=state.shape)
    perturbed_state = state + noise
    return perturbed_state

def evaluate_agent_with_random_attack(env, policy_net, epsilon=5, num_episodes=200):
    """
    Evaluate the agent with random perturbation applied to states during testing.
    Args:
        env: The environment to test the agent.
        policy_net: The trained policy network.
        epsilon: Maximum magnitude of random noise for perturbation.
        num_episodes: Number of episodes for evaluation.
    Returns:
        Average reward over the episodes.
    """
    # Ensure policy network is on the same device as input tensors
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    policy_net.to(device)
    policy_net.eval()  # Set the network to evaluation mode

    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):  # Handle Gymnasium's (observation, info) format
            state = state[0]
        episode_reward = 0
        done = False

        while not done:
            # Apply random perturbation to the state
            perturbed_state = random_perturbation(state, epsilon)

            # Convert perturbed state to tensor and send to the same device as the policy network
            state_tensor = torch.tensor(perturbed_state, dtype=torch.float32, device=device).unsqueeze(0)

            # Get action from the policy network
            with torch.no_grad():
                policy_output = policy_net(state_tensor)
                # If the policy network outputs a tuple, extract the action vector
                if isinstance(policy_output, tuple):
                    action = policy_output[0].cpu().numpy().squeeze()
                else:
                    action = policy_output.cpu().numpy().squeeze()

            # Take the action in the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            episode_reward += reward
            state = next_state

        total_reward += episode_reward
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward over {num_episodes} episodes: {average_reward}")
    return average_reward

# Example usage
env = gym.make("Hopper-v4")
policy_net = RobustAgent.policy_net  # Use your trained policy network here

epsilon = 0.1  # Maximum perturbation magnitude
evaluate_agent_with_random_attack(env, policy_net, epsilon)


Episode 1: Reward = 496.5445431706002
Episode 2: Reward = 627.907378253641
Episode 3: Reward = 631.8893375377107
Episode 4: Reward = 634.9780713895062
Episode 5: Reward = 637.997521321534
Episode 6: Reward = 612.4174651703072
Episode 7: Reward = 638.3541925099875
Episode 8: Reward = 630.1991384244017
Episode 9: Reward = 631.4799199351878
Episode 10: Reward = 651.6785464018815
Episode 11: Reward = 632.7606364388131
Episode 12: Reward = 632.957587240793
Episode 13: Reward = 621.6552098302561
Episode 14: Reward = 643.6775720853248
Episode 15: Reward = 647.0049766545009
Episode 16: Reward = 636.8676021049968
Episode 17: Reward = 617.4209921512115
Episode 18: Reward = 630.6379954478923
Episode 19: Reward = 630.217643606485
Episode 20: Reward = 632.8507157833255
Episode 21: Reward = 644.7251352426781
Episode 22: Reward = 640.5816812446527
Episode 23: Reward = 632.7348806005328
Episode 24: Reward = 627.3222975359677
Episode 25: Reward = 640.9199106153292
Episode 26: Reward = 580.0525721600941

625.8473071704427

In [22]:
# Example usage
env = gym.make("Hopper-v4")

# Initialize the trained policy network
policy_net = RobustAgent.policy_net  # Use your trained policy network here

# Parameters for MAD attack
epsilon = 0.1  # Maximum perturbation magnitude
attack_steps = 10  # Number of gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Evaluate the policy under MAD attack
average_reward = evaluate_agent_with_mad_attack(env, policy_net, epsilon, num_episodes=200, attack_steps=attack_steps, step_epsilon=step_epsilon)
print(f"Final Average Reward under MAD Attack: {average_reward}")


Episode 1/200: Reward = 318.07371935394053
Episode 2/200: Reward = 367.1858283355137
Episode 3/200: Reward = 327.863413739362
Episode 4/200: Reward = 299.5880341788426
Episode 5/200: Reward = 461.847489339165
Episode 6/200: Reward = 323.335270603237
Episode 7/200: Reward = 452.83587682220315
Episode 8/200: Reward = 637.105368294214
Episode 9/200: Reward = 199.4958548565722
Episode 10/200: Reward = 514.0955506078654
Episode 11/200: Reward = 230.7348997691168
Episode 12/200: Reward = 629.7321137042236
Episode 13/200: Reward = 516.6775970784363
Episode 14/200: Reward = 328.4048491459208
Episode 15/200: Reward = 960.4494275438652
Episode 16/200: Reward = 308.38990097317725
Episode 17/200: Reward = 483.1248303359048
Episode 18/200: Reward = 579.5756659889504
Episode 19/200: Reward = 527.8560888025761
Episode 20/200: Reward = 376.3116510315786
Episode 21/200: Reward = 340.7283630567716
Episode 22/200: Reward = 591.6572040286092
Episode 23/200: Reward = 331.5584265904711
Episode 24/200: Rewar

In [23]:
# Example usage
env = gym.make("Hopper-v4")

# Initialize the trained policy network
policy_net = RobustAgent.policy_net  # Use your trained policy network here

# Parameters for MAD attack
epsilon = 0.1  # Maximum perturbation magnitude
attack_steps = 10  # Number of gradient steps
step_epsilon = 0.01  # Step size for each gradient step
epsilon_schedule = [0.01 * i for i in range(1, 101)]
# Evaluate the policy under MAD attack

robust_q_net=robust_sarsa_attack(
    env=env,
    policy_net=policy_net,
    epsilon_schedule=epsilon_schedule,
    num_steps=5000,        # Number of training steps
    lambda_rs=0.1,         # Regularization parameter for robust loss
    batch_size=64,         # Batch size for training
    gamma=0.99             # Discount factor
)


Step 0/5000, TD Loss: 10.3163, Robust Loss: 0.0000
Step 100/5000, TD Loss: 11.3099, Robust Loss: 0.1245
Step 200/5000, TD Loss: 12.5339, Robust Loss: 0.4999
Step 300/5000, TD Loss: 11.3194, Robust Loss: 1.4924
Step 400/5000, TD Loss: 9.2221, Robust Loss: 1.7934
Step 500/5000, TD Loss: 10.8289, Robust Loss: 2.2487
Step 600/5000, TD Loss: 8.1078, Robust Loss: 2.4350
Step 700/5000, TD Loss: 8.9582, Robust Loss: 3.9284
Step 800/5000, TD Loss: 7.0487, Robust Loss: 6.0340
Step 900/5000, TD Loss: 7.7164, Robust Loss: 6.7105
Step 1000/5000, TD Loss: 5.0359, Robust Loss: 7.2438
Step 1100/5000, TD Loss: 6.2030, Robust Loss: 12.0028
Step 1200/5000, TD Loss: 5.2381, Robust Loss: 8.9190
Step 1300/5000, TD Loss: 4.9883, Robust Loss: 9.8233
Step 1400/5000, TD Loss: 5.3557, Robust Loss: 13.8114
Step 1500/5000, TD Loss: 4.9896, Robust Loss: 14.0494
Step 1600/5000, TD Loss: 5.3772, Robust Loss: 11.3728
Step 1700/5000, TD Loss: 5.4830, Robust Loss: 7.4291
Step 1800/5000, TD Loss: 4.2205, Robust Loss: 17.

In [24]:
average_reward = average_reward = evaluate_agent_with_robust_sarsa_attack(
    env=env,
    policy_net=policy_net,
    robust_q_net=robust_q_net,
    epsilon=0.05,
    num_episodes=200,
    step_size=0.01
)
print(f"Final Average Reward under Robust Sarsa Attack: {average_reward}")


Episode 1/200: Reward = 588.9364825010562
Episode 2/200: Reward = 639.4635806012512
Episode 3/200: Reward = 640.0339782348864
Episode 4/200: Reward = 636.8171960698895
Episode 5/200: Reward = 617.270314288993
Episode 6/200: Reward = 595.5929024452387
Episode 7/200: Reward = 628.8025170898864
Episode 8/200: Reward = 641.3431688009284
Episode 9/200: Reward = 603.1333039306372
Episode 10/200: Reward = 634.992165146953
Episode 11/200: Reward = 608.2545209168555
Episode 12/200: Reward = 527.6472692233533
Episode 13/200: Reward = 631.2413129989108
Episode 14/200: Reward = 652.1245097791393
Episode 15/200: Reward = 633.0383636754088
Episode 16/200: Reward = 651.2701865732529
Episode 17/200: Reward = 616.7933325307457
Episode 18/200: Reward = 637.517894604106
Episode 19/200: Reward = 641.2725114977931
Episode 20/200: Reward = 646.8306950266438
Episode 21/200: Reward = 636.332520118184
Episode 22/200: Reward = 636.4219753484267
Episode 23/200: Reward = 585.2761016767313
Episode 24/200: Reward =

In [25]:
def evaluate_agent_with_state_value_attack(env, policy_net, value_net, epsilon, num_episodes=200, attack_steps=10, step_epsilon=0.01):
    """
    Evaluate the agent under a State Value Attack using a value network.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        value_net (torch.nn.Module): The trained value network.
        epsilon (float): Maximum perturbation magnitude.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of gradient steps for the attack.
        step_epsilon (float): Step size for each gradient step.

    Returns:
        float: Average reward over the episodes under the state value attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            # Start with the original state
            perturbed_state = state.clone().detach().requires_grad_(True)

            for _ in range(attack_steps):
                # Compute value for the perturbed state
                value = value_net(perturbed_state.unsqueeze(0))

                # Minimize or maximize the value
                loss = -value.mean()  # Gradient ascent to maximize adversarial effect
                loss.backward()

                # Apply gradient-based perturbation
                grad = perturbed_state.grad
                perturbation = step_epsilon * grad.sign()
                perturbed_state = (perturbed_state + perturbation).detach().requires_grad_(True)

                # Clamp the perturbed state to within the epsilon-ball
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)

            # Use the perturbed state to select the action
            with torch.no_grad():
                action_output = policy_net(perturbed_state.unsqueeze(0))
                if isinstance(action_output, tuple):
                    action = action_output[0]  # Extract mean for continuous actions
                else:
                    action = action_output

                action = action.squeeze().cpu().numpy()  # Ensure the action is in NumPy format

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            # Accumulate the reward
            episode_reward += reward

            # Update the state
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under State Value Attack: {average_reward}")
    return average_reward


In [26]:
def evaluate_agent_with_target_policy_attack(env, policy_net, target_action, epsilon, num_episodes=200, attack_steps=10, step_epsilon=0.01):
    """
    Evaluate the agent under a Target Policy Misclassification attack.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        target_action (torch.Tensor): The target action to force the policy to output.
        epsilon (float): Maximum perturbation magnitude.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of gradient steps for the attack.
        step_epsilon (float): Step size for each gradient step.

    Returns:
        float: Average reward over the episodes under Target Policy Misclassification attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            perturbed_state = state.clone().detach().requires_grad_(True)

            for _ in range(attack_steps):
                # Get policy output for the perturbed state
                policy_output = policy_net(perturbed_state.unsqueeze(0))
                if isinstance(env.action_space, gym.spaces.Discrete):
                    logits = policy_output  # For discrete actions
                    loss = torch.nn.functional.cross_entropy(logits, target_action)  # Cross-entropy loss
                elif isinstance(env.action_space, gym.spaces.Box):
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output  # Mean and std
                    else:
                        mean = policy_output
                    loss = torch.nn.functional.mse_loss(mean, target_action)  # MSE loss for continuous actions
                else:
                    raise ValueError("Unsupported action space type.")

                # Backpropagate to compute gradients
                loss.backward()

                # Apply gradient-based perturbation
                grad = perturbed_state.grad
                perturbation = step_epsilon * grad.sign()
                perturbed_state = (perturbed_state + perturbation).detach().requires_grad_(True)

                # Clamp the perturbed state
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)

            # Use the perturbed state to select the action
            with torch.no_grad():
                policy_output = policy_net(perturbed_state.unsqueeze(0))
                if isinstance(env.action_space, gym.spaces.Discrete):
                    action = torch.argmax(policy_output, dim=1).item()  # Discrete action
                else:
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output
                    else:
                        mean = policy_output
                    action = mean.squeeze().cpu().numpy()  # Continuous action

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            episode_reward += reward
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under Target Policy Misclassification attack: {average_reward}")
    return average_reward


In [27]:
# Assuming `policy_net` and `q_net` are already defined and trained
# Example environment
import gymnasium as gym
env = gym.make("Hopper-v4")

# Parameters
epsilon = 0.1  # Maximum perturbation magnitude
num_episodes = 200  # Number of episodes to evaluate
attack_steps = 10  # Number of attack gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Call the attack evaluation function
average_reward_sav = evaluate_agent_with_state_value_attack(
    env=env,
    policy_net=RobustAgent.policy_net,  # Trained policy network
    value_net=RobustAgent.value_net,  # Trained Q-value network (critic)
    epsilon=epsilon,
    num_episodes=num_episodes,
    attack_steps=attack_steps,
    step_epsilon=step_epsilon
)

print(f"Average Reward under State Action Value Attack: {average_reward_sav}")


Episode 1/200: Reward = 528.5545755389755
Episode 2/200: Reward = 347.06878834792644
Episode 3/200: Reward = 449.841541716543
Episode 4/200: Reward = 334.852160727421
Episode 5/200: Reward = 449.82095804270386
Episode 6/200: Reward = 477.6854111164778
Episode 7/200: Reward = 543.3728646666646
Episode 8/200: Reward = 508.9143774322205
Episode 9/200: Reward = 434.0940030937651
Episode 10/200: Reward = 478.15810736111825
Episode 11/200: Reward = 528.4901685068223
Episode 12/200: Reward = 286.3512783842519
Episode 13/200: Reward = 352.2279821936611
Episode 14/200: Reward = 465.85148986936485
Episode 15/200: Reward = 337.39294680939804
Episode 16/200: Reward = 539.416360488241
Episode 17/200: Reward = 444.51221159340434
Episode 18/200: Reward = 501.67749701286044
Episode 19/200: Reward = 306.227921734185
Episode 20/200: Reward = 483.16695168524853
Episode 21/200: Reward = 516.9780774655194
Episode 22/200: Reward = 268.4419432589775
Episode 23/200: Reward = 452.4459223901904
Episode 24/200: 

In [28]:
# Assuming `policy_net` and `q_net` are already defined and trained
# Example environment
import gymnasium as gym
env = gym.make("Hopper-v4")

# Parameters
epsilon = 0.1  # Maximum perturbation magnitude
num_episodes = 200  # Number of episodes to evaluate
attack_steps = 10  # Number of attack gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Call the attack evaluation function
average_reward_sav = evaluate_agent_with_state_value_attack(
    env=env,
    policy_net=VanillaAgent.policy_net,  # Trained policy network
    value_net=VanillaAgent.value_net,  # Trained Q-value network (critic)
    epsilon=epsilon,
    num_episodes=num_episodes,
    attack_steps=attack_steps,
    step_epsilon=step_epsilon
)

print(f"Average Reward under State Action Value Attack: {average_reward_sav}")


Episode 1/200: Reward = 693.1901192295966
Episode 2/200: Reward = 943.3643621807067
Episode 3/200: Reward = 815.7333813447087
Episode 4/200: Reward = 1445.3845959036246
Episode 5/200: Reward = 1465.9494087367648
Episode 6/200: Reward = 999.6279886334008
Episode 7/200: Reward = 1032.3364137018925
Episode 8/200: Reward = 1875.0666188422372
Episode 9/200: Reward = 3529.7762874686036
Episode 10/200: Reward = 1412.862719511698
Episode 11/200: Reward = 1138.8154669744006
Episode 12/200: Reward = 788.2967195801293
Episode 13/200: Reward = 808.8026768203479
Episode 14/200: Reward = 1010.8265830504154
Episode 15/200: Reward = 811.7728874268169
Episode 16/200: Reward = 1415.5361181574804
Episode 17/200: Reward = 1086.909832051973
Episode 18/200: Reward = 803.7502712284085
Episode 19/200: Reward = 790.1769677103415
Episode 20/200: Reward = 759.5326387876972
Episode 21/200: Reward = 1258.6111460251423
Episode 22/200: Reward = 790.5083134785166
Episode 23/200: Reward = 789.743623411393
Episode 24/2

In [29]:
# Assuming `policy_net` is already defined and trained
# Example environment
env = gym.make("Hopper-v4")

# Parameters
epsilon = 0.1  # Maximum perturbation magnitude
num_episodes = 200  # Number of episodes to evaluate
attack_steps = 10  # Number of attack gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Define the target action for the misclassification attack
# For discrete actions: target_action is the action index
if isinstance(env.action_space, gym.spaces.Discrete):
    target_action = torch.tensor([1], dtype=torch.long).to(next(VanillaAgent.policy_net.parameters()).device)
# For continuous actions: target_action is a vector of desired action values
elif isinstance(env.action_space, gym.spaces.Box):
    target_action = torch.tensor([0.5] * env.action_space.shape[0], dtype=torch.float32).to(next(VanillaAgent.policy_net.parameters()).device)
else:
    raise ValueError("Unsupported action space type.")

# Call the attack evaluation function
average_reward_tpm = evaluate_agent_with_target_policy_attack(
    env=env,
    policy_net=RobustAgent.policy_net,  # Trained policy network
    target_action=target_action,
    epsilon=epsilon,
    num_episodes=num_episodes,
    attack_steps=attack_steps,
    step_epsilon=step_epsilon
)

print(f"Average Reward under Target Policy Misclassification Attack: {average_reward_tpm}")


  loss = torch.nn.functional.mse_loss(mean, target_action)  # MSE loss for continuous actions


Episode 1/200: Reward = 540.8979549206526
Episode 2/200: Reward = 291.45180702343197
Episode 3/200: Reward = 572.5371309498485
Episode 4/200: Reward = 530.4878440667834
Episode 5/200: Reward = 330.52730376023527
Episode 6/200: Reward = 275.5857559463875
Episode 7/200: Reward = 362.19963430002804
Episode 8/200: Reward = 552.1260354189199
Episode 9/200: Reward = 528.6464012847537
Episode 10/200: Reward = 471.75512259478666
Episode 11/200: Reward = 339.9922232510131
Episode 12/200: Reward = 284.2048776719928
Episode 13/200: Reward = 339.89770776161885
Episode 14/200: Reward = 342.9773512187943
Episode 15/200: Reward = 582.8833178046004
Episode 16/200: Reward = 522.3506287805708
Episode 17/200: Reward = 405.1830865614185
Episode 18/200: Reward = 538.3877261843035
Episode 19/200: Reward = 465.85547333297325
Episode 20/200: Reward = 539.268505708799
Episode 21/200: Reward = 561.9085265823605
Episode 22/200: Reward = 572.0256215262482
Episode 23/200: Reward = 529.0603065444302
Episode 24/200:

In [30]:
# Assuming `policy_net` is already defined and trained
# Example environment
env = gym.make("Hopper-v4")

# Parameters
epsilon = 0.1  # Maximum perturbation magnitude
num_episodes = 200  # Number of episodes to evaluate
attack_steps = 10  # Number of attack gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Define the target action for the misclassification attack
# For discrete actions: target_action is the action index
if isinstance(env.action_space, gym.spaces.Discrete):
    target_action = torch.tensor([1], dtype=torch.long).to(next(VanillaAgent.policy_net.parameters()).device)
# For continuous actions: target_action is a vector of desired action values
elif isinstance(env.action_space, gym.spaces.Box):
    target_action = torch.tensor([0.5] * env.action_space.shape[0], dtype=torch.float32).to(next(VanillaAgent.policy_net.parameters()).device)
else:
    raise ValueError("Unsupported action space type.")

# Call the attack evaluation function
average_reward_tpm = evaluate_agent_with_target_policy_attack(
    env=env,
    policy_net=VanillaAgent.policy_net,  # Trained policy network
    target_action=target_action,
    epsilon=epsilon,
    num_episodes=num_episodes,
    attack_steps=attack_steps,
    step_epsilon=step_epsilon
)

print(f"Average Reward under Target Policy Misclassification Attack: {average_reward_tpm}")


  loss = torch.nn.functional.mse_loss(mean, target_action)  # MSE loss for continuous actions


Episode 1/200: Reward = 851.2155009095103
Episode 2/200: Reward = 446.6654249788073
Episode 3/200: Reward = 894.1109610415114
Episode 4/200: Reward = 709.7281598350978
Episode 5/200: Reward = 447.6573526450699
Episode 6/200: Reward = 750.4678470816741
Episode 7/200: Reward = 759.936914581194
Episode 8/200: Reward = 445.7425048354033
Episode 9/200: Reward = 448.50549179633003
Episode 10/200: Reward = 812.0598545110121
Episode 11/200: Reward = 893.5769316810646
Episode 12/200: Reward = 448.49509921332367
Episode 13/200: Reward = 850.2912963774294
Episode 14/200: Reward = 447.80719252713385
Episode 15/200: Reward = 882.3931046516011
Episode 16/200: Reward = 444.147921032004
Episode 17/200: Reward = 451.4708515160012
Episode 18/200: Reward = 449.5205657391279
Episode 19/200: Reward = 448.6740457840399
Episode 20/200: Reward = 449.0255019355837
Episode 21/200: Reward = 459.19684671072906
Episode 22/200: Reward = 447.3894553482221
Episode 23/200: Reward = 455.83682885436747
Episode 24/200: R