In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
%pip install gymnasium[mujoco]

Collecting mujoco>=2.3.3 (from gymnasium[mujoco])
  Downloading mujoco-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting pyopengl (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)
Downloading mujoco-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl (243 kB)
[2K   [90m

In [2]:
import numpy as np
import gymnasium as gym
import mujoco
import torch
import torch.nn as nn
import torch.optim as optim

import random
# Initialize the Walker2d environment
env = gym.make("Walker2d-v4")



In [4]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# # Simplified Value Network
# class ValueNetwork(nn.Module):
#     def __init__(self, state_dim, hidden_sizes=(64, 64), activation=nn.Tanh):
#         super(ValueNetwork, self).__init__()
#         self.activation = activation()
#         self.layers = nn.ModuleList()
        
#         # Hidden layers
#         input_size = state_dim
#         for hidden_size in hidden_sizes:
#             self.layers.append(nn.Linear(input_size, hidden_size))
#             input_size = hidden_size

#         # Output layer
#         self.output_layer = nn.Linear(input_size, 1)

#     def forward(self, state):
#         x = state
#         for layer in self.layers:
#             x = self.activation(layer(x))
#         value = self.output_layer(x)
#         return value


# # Simplified Policy Network
# class PolicyNetwork(nn.Module):
#     def __init__(self, state_dim, action_dim, discrete=True, hidden_sizes=(64, 64), activation=nn.Tanh):
#         super(PolicyNetwork, self).__init__()
#         self.discrete = discrete
#         self.activation = activation()
#         self.layers = nn.ModuleList()
        
#         # Hidden layers
#         input_size = state_dim
#         for hidden_size in hidden_sizes:
#             self.layers.append(nn.Linear(input_size, hidden_size))
#             input_size = hidden_size

#         # Output layer
#         if self.discrete:
#             # Discrete actions: output probabilities for each action
#             self.output_layer = nn.Linear(input_size, action_dim)
#         else:
#             # Continuous actions: output mean and log_std for each action
#             self.mean_layer = nn.Linear(input_size, action_dim)
#             self.log_std = nn.Parameter(torch.zeros(action_dim))

#     def forward(self, state):
#         x = state
#         for layer in self.layers:
#             x = self.activation(layer(x))
        
#         if self.discrete:
#             # Discrete actions: apply softmax for probabilities
#             logits = self.output_layer(x)
#             action_probs = F.softmax(logits, dim=-1)
#             return action_probs
#         else:
#             # Continuous actions: return mean and std
#             mean = self.mean_layer(x)
#             std = torch.exp(self.log_std)
#             return mean, std


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Simplified Value Network
class ValueNetwork(nn.Module):
    def __init__(self, state_dim, hidden_sizes=(64, 64), activation=nn.Tanh):
        super(ValueNetwork, self).__init__()
        self.activation = activation()
        self.layers = nn.ModuleList()
        
        # Hidden layers
        input_size = state_dim
        for hidden_size in hidden_sizes:
            self.layers.append(nn.Linear(input_size, hidden_size))
            input_size = hidden_size

        # Output layer
        self.output_layer = nn.Linear(input_size, 1)

    def forward(self, state):
        x = state
        for layer in self.layers:
            x = self.activation(layer(x))
        value = self.output_layer(x)
        return value


# Simplified Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, discrete=True, hidden_sizes=(64, 64), activation=nn.Tanh):
        super(PolicyNetwork, self).__init__()
        self.discrete = discrete
        self.activation = activation()
        self.layers = nn.ModuleList()
        
        # Hidden layers
        input_size = state_dim
        for hidden_size in hidden_sizes:
            self.layers.append(nn.Linear(input_size, hidden_size))
            input_size = hidden_size

        # Output layer
        if self.discrete:
            # Discrete actions: output probabilities for each action
            self.output_layer = nn.Linear(input_size, action_dim)
        else:
            # Continuous actions: output mean and log_std for each action
            self.mean_layer = nn.Linear(input_size, action_dim)
            self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        x = state
        for layer in self.layers:
            x = self.activation(layer(x))
        
        if self.discrete:
            # Discrete actions: apply softmax for probabilities
            logits = self.output_layer(x)
            action_probs = F.softmax(logits, dim=-1)
            return action_probs
        else:
            # Continuous actions: return mean and std
            mean = self.mean_layer(x)
            std = torch.exp(self.log_std)
            return mean, std

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym

class PPOAgent:
    def __init__(self, state_dim, action_dim, discrete, gamma=0.99, lam=0.95, eps_clip=0.2, lr=4e-4, k_epochs=4):
        self.gamma = gamma  # Discount factor
        self.lam = lam  # GAE lambda
        self.eps_clip = eps_clip  # Clipping epsilon
        self.k_epochs = k_epochs  # Number of PPO epochs
        
        self.policy_net = PolicyNetwork(state_dim, action_dim, discrete).to("cuda" if torch.cuda.is_available() else "cpu")
        self.value_net = ValueNetwork(state_dim).to("cuda" if torch.cuda.is_available() else "cpu")
        
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=lr)
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).to(self.device)
        if isinstance(self.policy_net, PolicyNetwork) and self.policy_net.discrete:
            action_probs = self.policy_net(state)
            action_dist = torch.distributions.Categorical(action_probs)
            action = action_dist.sample()
            return action.item(), action_dist.log_prob(action)
        else:
            mean, std = self.policy_net(state)
            action_dist = torch.distributions.Normal(mean, std)
            action = action_dist.sample()
            return action.cpu().numpy(), action_dist.log_prob(action).sum()

    def compute_advantages(self, rewards, values, dones):
        advantages = []
        advantage = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * values[t + 1] * (1 - dones[t]) - values[t]
            advantage = delta + self.gamma * self.lam * (1 - dones[t]) * advantage
            advantages.insert(0, advantage)
        return torch.tensor(advantages, dtype=torch.float32, device=self.device)

    def train(self, env, max_episodes=1000, rollout_steps=2048, batch_size=64):
        for episode in range(max_episodes):
            # Initialize trajectory variables
            states, actions, rewards, dones, log_probs, values = [], [], [], [], [], []
            
            # Reset environment and get the initial state
            state, _ = env.reset()
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            
            # Collect trajectories
            for _ in range(rollout_steps):
                with torch.no_grad():
                    value = self.value_net(state).squeeze(0)
                    action, log_prob = self.select_action(state.cpu().numpy())
                
                # Interact with the environment
                next_state, reward, done, truncated, _ = env.step(action)
                
                # Store trajectory data
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                dones.append(done or truncated)
                log_probs.append(log_prob)
                values.append(value)
                
                # Update the state
                state = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                if done or truncated:
                    state, _ = env.reset()
                    state = torch.tensor(state, dtype=torch.float32).to(self.device)
            
            # Ensure valid trajectory data
            if len(states) == 0:
                print("No valid states collected; skipping this episode.")
                continue
            
            # Compute advantages and returns
            values.append(torch.tensor([0], device=self.device))  # Bootstrap value
            advantages = self.compute_advantages(rewards, values, dones)
            returns = advantages + torch.tensor(values[:-1], device=self.device)
            
            # Optimize policy and value networks
            states = torch.stack(states).to(self.device)
            actions = torch.tensor(np.array(actions), dtype=torch.float32 if not self.policy_net.discrete else torch.long).to(self.device)
            log_probs = torch.stack(log_probs).to(self.device)
        
            for _ in range(self.k_epochs):
                for i in range(0, rollout_steps, batch_size):
                    batch_states = states[i:i+batch_size]
                    batch_actions = actions[i:i+batch_size]
                    batch_log_probs = log_probs[i:i+batch_size]
                    batch_advantages = advantages[i:i+batch_size]
                    batch_returns = returns[i:i+batch_size]
    
                    # Policy update
                    if self.policy_net.discrete:
                        # Discrete action space
                        action_probs = self.policy_net(batch_states)
                        dist = torch.distributions.Categorical(action_probs)
                        new_log_probs = dist.log_prob(batch_actions)
                    else:
                        # Continuous action space
                        mean, std = self.policy_net(batch_states)
                        dist = torch.distributions.Normal(mean, std)
                        new_log_probs = dist.log_prob(batch_actions).sum(dim=-1)
    
                    # PPO objective
                    ratio = torch.exp(new_log_probs - batch_log_probs)
                    surr1 = ratio * batch_advantages
                    surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                    policy_loss = -torch.min(surr1, surr2).mean()

                    # Value update
                    value_preds = self.value_net(batch_states).squeeze(-1)
                    value_loss = nn.MSELoss()(value_preds, batch_returns)
    
                    # Backpropagation
                    self.policy_optimizer.zero_grad()
                    self.value_optimizer.zero_grad()
                    (policy_loss + 0.5 * value_loss).backward()
                    self.policy_optimizer.step()
                    self.value_optimizer.step()
    
            print(f"Episode {episode + 1}: Policy Loss = {policy_loss.item()}, Value Loss = {value_loss.item()}")

        
       


In [6]:
if __name__ == "__main__":
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    discrete = False  # Set to True if action space is discrete
    
    VanillaAgent = PPOAgent(state_dim, action_dim, discrete)
    VanillaAgent.train(env, max_episodes=100)


Episode 1: Policy Loss = -0.711169958114624, Value Loss = 3.713970184326172
Episode 2: Policy Loss = 0.16402876377105713, Value Loss = 18.072729110717773
Episode 3: Policy Loss = -4.6198930740356445, Value Loss = 26.06911277770996
Episode 4: Policy Loss = -0.7623547315597534, Value Loss = 31.41605567932129
Episode 5: Policy Loss = -5.892380237579346, Value Loss = 105.7132797241211
Episode 6: Policy Loss = -7.026723861694336, Value Loss = 64.5262451171875
Episode 7: Policy Loss = -8.109270095825195, Value Loss = 125.22000122070312
Episode 8: Policy Loss = 7.952657222747803, Value Loss = 62.905513763427734
Episode 9: Policy Loss = 3.5639991760253906, Value Loss = 96.91584014892578
Episode 10: Policy Loss = -18.898263931274414, Value Loss = 430.29791259765625
Episode 11: Policy Loss = 3.8940212726593018, Value Loss = 111.63114929199219
Episode 12: Policy Loss = 2.5102245807647705, Value Loss = 84.9892578125
Episode 13: Policy Loss = 5.120128631591797, Value Loss = 148.0777587890625
Episod

In [6]:
def evaluate_agent_with_state_value_attack(env, policy_net, value_net, epsilon, num_episodes=200, attack_steps=10, step_epsilon=0.01):
    """
    Evaluate the agent under a State Value Attack using a value network.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        value_net (torch.nn.Module): The trained value network.
        epsilon (float): Maximum perturbation magnitude.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of gradient steps for the attack.
        step_epsilon (float): Step size for each gradient step.

    Returns:
        float: Average reward over the episodes under the state value attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            # Start with the original state
            perturbed_state = state.clone().detach().requires_grad_(True)

            for _ in range(attack_steps):
                # Compute value for the perturbed state
                value = value_net(perturbed_state.unsqueeze(0))

                # Minimize or maximize the value
                loss = -value.mean()  # Gradient ascent to maximize adversarial effect
                loss.backward()

                # Apply gradient-based perturbation
                grad = perturbed_state.grad
                perturbation = step_epsilon * grad.sign()
                perturbed_state = (perturbed_state + perturbation).detach().requires_grad_(True)

                # Clamp the perturbed state to within the epsilon-ball
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)

            # Use the perturbed state to select the action
            with torch.no_grad():
                action_output = policy_net(perturbed_state.unsqueeze(0))
                if isinstance(action_output, tuple):
                    action = action_output[0]  # Extract mean for continuous actions
                else:
                    action = action_output

                action = action.squeeze().cpu().numpy()  # Ensure the action is in NumPy format

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            # Accumulate the reward
            episode_reward += reward

            # Update the state
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under State Value Attack: {average_reward}")
    return average_reward


In [8]:
# Assuming `policy_net` and `q_net` are already defined and trained
# Example environment
import gymnasium as gym
env = gym.make("Walker2d-v4")

# Parameters
epsilon = 0.1  # Maximum perturbation magnitude
num_episodes = 200  # Number of episodes to evaluate
attack_steps = 10  # Number of attack gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Call the attack evaluation function
average_reward_sav = evaluate_agent_with_state_value_attack(
    env=env,
    policy_net=VanillaAgent.policy_net,  # Trained policy network
    value_net=VanillaAgent.value_net,  # Trained Q-value network (critic)
    epsilon=epsilon,
    num_episodes=num_episodes,
    attack_steps=attack_steps,
    step_epsilon=step_epsilon
)

print(f"Average Reward under State Action Value Attack: {average_reward_sav}")


Episode 1/200: Reward = 277.5040888230642
Episode 2/200: Reward = 275.21869895097734
Episode 3/200: Reward = 280.26230589007884
Episode 4/200: Reward = 267.99155310376904
Episode 5/200: Reward = 274.5132213680195
Episode 6/200: Reward = 273.1316942692741
Episode 7/200: Reward = 275.98681348046676
Episode 8/200: Reward = 277.20342597783485
Episode 9/200: Reward = 277.8727785435308
Episode 10/200: Reward = 271.5660568502503
Episode 11/200: Reward = 267.644295070306
Episode 12/200: Reward = 272.9572465155501
Episode 13/200: Reward = 274.47898012237863
Episode 14/200: Reward = 271.8896562712158
Episode 15/200: Reward = 267.07459428541245
Episode 16/200: Reward = 272.6806804096167
Episode 17/200: Reward = 275.87171640103696
Episode 18/200: Reward = 271.30452303080966
Episode 19/200: Reward = 268.70303329237987
Episode 20/200: Reward = 282.21880840957255
Episode 21/200: Reward = 273.4949409441501
Episode 22/200: Reward = 276.93942956535017
Episode 23/200: Reward = 267.8565779169697
Episode 2

In [7]:
def evaluate_agent_with_target_policy_attack(env, policy_net, target_action, epsilon, num_episodes=200, attack_steps=10, step_epsilon=0.01):
    """
    Evaluate the agent under a Target Policy Misclassification attack.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        target_action (torch.Tensor): The target action to force the policy to output.
        epsilon (float): Maximum perturbation magnitude.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of gradient steps for the attack.
        step_epsilon (float): Step size for each gradient step.

    Returns:
        float: Average reward over the episodes under Target Policy Misclassification attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            perturbed_state = state.clone().detach().requires_grad_(True)

            for _ in range(attack_steps):
                # Get policy output for the perturbed state
                policy_output = policy_net(perturbed_state.unsqueeze(0))
                if isinstance(env.action_space, gym.spaces.Discrete):
                    logits = policy_output  # For discrete actions
                    loss = torch.nn.functional.cross_entropy(logits, target_action)  # Cross-entropy loss
                elif isinstance(env.action_space, gym.spaces.Box):
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output  # Mean and std
                    else:
                        mean = policy_output
                    loss = torch.nn.functional.mse_loss(mean, target_action)  # MSE loss for continuous actions
                else:
                    raise ValueError("Unsupported action space type.")

                # Backpropagate to compute gradients
                loss.backward()

                # Apply gradient-based perturbation
                grad = perturbed_state.grad
                perturbation = step_epsilon * grad.sign()
                perturbed_state = (perturbed_state + perturbation).detach().requires_grad_(True)

                # Clamp the perturbed state
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)

            # Use the perturbed state to select the action
            with torch.no_grad():
                policy_output = policy_net(perturbed_state.unsqueeze(0))
                if isinstance(env.action_space, gym.spaces.Discrete):
                    action = torch.argmax(policy_output, dim=1).item()  # Discrete action
                else:
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output
                    else:
                        mean = policy_output
                    action = mean.squeeze().cpu().numpy()  # Continuous action

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            episode_reward += reward
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under Target Policy Misclassification attack: {average_reward}")
    return average_reward


In [10]:
# Assuming `policy_net` is already defined and trained
# Example environment
env = gym.make("Walker2d-v4")

# Parameters
epsilon = 0.1  # Maximum perturbation magnitude
num_episodes = 200  # Number of episodes to evaluate
attack_steps = 10  # Number of attack gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Define the target action for the misclassification attack
# For discrete actions: target_action is the action index
if isinstance(env.action_space, gym.spaces.Discrete):
    target_action = torch.tensor([1], dtype=torch.long).to(next(VanillaAgent.policy_net.parameters()).device)
# For continuous actions: target_action is a vector of desired action values
elif isinstance(env.action_space, gym.spaces.Box):
    target_action = torch.tensor([0.5] * env.action_space.shape[0], dtype=torch.float32).to(next(VanillaAgent.policy_net.parameters()).device)
else:
    raise ValueError("Unsupported action space type.")

# Call the attack evaluation function
average_reward_tpm = evaluate_agent_with_target_policy_attack(
    env=env,
    policy_net=VanillaAgent.policy_net,  # Trained policy network
    target_action=target_action,
    epsilon=epsilon,
    num_episodes=num_episodes,
    attack_steps=attack_steps,
    step_epsilon=step_epsilon
)

print(f"Average Reward under Target Policy Misclassification Attack: {average_reward_tpm}")


  loss = torch.nn.functional.mse_loss(mean, target_action)  # MSE loss for continuous actions


Episode 1/200: Reward = 294.04799967616464
Episode 2/200: Reward = 302.19924315959685
Episode 3/200: Reward = 295.3919874894358
Episode 4/200: Reward = 297.3427071901092
Episode 5/200: Reward = 305.0504361685314
Episode 6/200: Reward = 301.88588423142096
Episode 7/200: Reward = 292.3706116292908
Episode 8/200: Reward = 286.34070526822785
Episode 9/200: Reward = 291.3673676993464
Episode 10/200: Reward = 293.11689825145464
Episode 11/200: Reward = 301.3684701442525
Episode 12/200: Reward = 294.7610087724757
Episode 13/200: Reward = 292.1902766792224
Episode 14/200: Reward = 293.4538750586081
Episode 15/200: Reward = 294.98247649596715
Episode 16/200: Reward = 286.99936446585286
Episode 17/200: Reward = 298.28207171112064
Episode 18/200: Reward = 290.7818941724389
Episode 19/200: Reward = 299.02443986407354
Episode 20/200: Reward = 290.8173754706158
Episode 21/200: Reward = 301.5903577852143
Episode 22/200: Reward = 301.01717421047283
Episode 23/200: Reward = 289.82356399374964
Episode 2

In [8]:
def evaluate_agent(env, policy_net, num_episodes=200, render=False):
    """
    Evaluates the trained policy network on the environment.

    Args:
    - env: The Gym environment.
    - policy_net: The trained policy network.
    - num_episodes: Number of episodes to evaluate.
    - render: Whether to render the environment during evaluation.

    Returns:
    - Average reward over the evaluated episodes.
    """
    device = next(policy_net.parameters()).device
    total_rewards = []

    for episode in range(num_episodes):
        reset_result = env.reset()
        if isinstance(reset_result, tuple):
            state = reset_result[0]
        else:
            state = reset_result

        if not isinstance(state, np.ndarray):
            state = np.array(state, dtype=np.float32)

        episode_reward = 0
        terminated, truncated = False, False

        while not (terminated or truncated):
            if render:
                env.render()

            # Move the state tensor to the same device as the policy network
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

            with torch.no_grad():
                policy_output = policy_net(state_tensor)

                if isinstance(policy_output, tuple):
                    # Assuming (mean, std) for continuous action spaces
                    action_mean, action_std = policy_output
                    action = torch.normal(action_mean, action_std).cpu().numpy()
                else:
                    # Assuming logits for discrete action spaces
                    action_prob = torch.softmax(policy_output, dim=-1)
                    action = torch.argmax(action_prob, dim=-1).cpu().numpy()

            # Squeeze the action to ensure proper shape
            action = action.squeeze()

            # Take a step in the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, terminated, truncated, _ = next_step_result
            else:
                next_state, reward, terminated, truncated = next_step_result[:4]

            episode_reward += reward
            state = next_state

        total_rewards.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

    average_reward = sum(total_rewards) / num_episodes
    print(f"Average Reward over {num_episodes} Episodes: {average_reward}")
    return average_reward


In [12]:
# Initialize the environment
env = gym.make("Walker2d-v4")

# Evaluate the agent using the trained policy network
average_reward = evaluate_agent(env, VanillaAgent.policy_net, num_episodes=200)


Episode 1: Reward = 254.40977962990627
Episode 2: Reward = 264.13689669908604
Episode 3: Reward = 299.4865645926969
Episode 4: Reward = 355.0286549288328
Episode 5: Reward = 179.81828760811976
Episode 6: Reward = 236.80741369409384
Episode 7: Reward = 248.10659302611415
Episode 8: Reward = 226.10548908501383
Episode 9: Reward = 227.53613913390663
Episode 10: Reward = 244.3685996700178
Episode 11: Reward = 213.96220880049148
Episode 12: Reward = 236.0869379636241
Episode 13: Reward = 337.2372850092061
Episode 14: Reward = 492.0983238620915
Episode 15: Reward = 246.44593457797646
Episode 16: Reward = 309.2160013779429
Episode 17: Reward = 465.0841666927874
Episode 18: Reward = 429.7256147411774
Episode 19: Reward = 297.4300886102363
Episode 20: Reward = 246.20035126983012
Episode 21: Reward = 271.0104967796694
Episode 22: Reward = 276.7822899219639
Episode 23: Reward = 280.8108561592708
Episode 24: Reward = 253.31546730127414
Episode 25: Reward = 338.21880187422687
Episode 26: Reward = 3

In [9]:
def apply_perturbation(attack_method, state, params, policy_model=None, sarsa_model=None):
    """
    Apply perturbation to the state based on the attack method.

    Args:
        attack_method (str): The type of attack ('robust_sarsa', 'mad', 'random').
        state (torch.Tensor): The current state tensor.
        params (object): Parameters for the attack (e.g., epsilon, steps, etc.).
        policy_model (nn.Module): The policy model (for MAD and Sarsa+MAD).
        sarsa_model (nn.Module): The Sarsa model (for Robust Sarsa).

    Returns:
        torch.Tensor: The perturbed state.
    """
    eps = params.get("epsilon", 0.1)
    steps = params.get("steps", 10)
    step_eps = eps / steps
    clamp_min = state - eps
    clamp_max = state + eps

    if attack_method == "robust_sarsa":
        assert sarsa_model is not None, "Sarsa model is required for Robust Sarsa attack."
        perturbed_state = state.clone().detach().requires_grad_()
        for _ in range(steps):
            actions = policy_model(perturbed_state)[0]  # Assuming policy returns action logits
            value = sarsa_model(torch.cat((state, actions), dim=1)).mean(dim=1)
            value.backward()
            update = perturbed_state.grad.sign() * step_eps
            perturbed_state.data = torch.min(torch.max(perturbed_state - update, clamp_min), clamp_max)
            sarsa_model.zero_grad()
        return perturbed_state.detach()

    elif attack_method == "mad":
        assert policy_model is not None, "Policy model is required for MAD attack."
        original_action = policy_model(state)[0].detach()
        perturbed_state = state.clone().detach().requires_grad_()
        for _ in range(steps):
            new_action = policy_model(perturbed_state)[0]
            action_diff = ((new_action - original_action) ** 2).sum(dim=1)
            action_diff.backward()
            update = perturbed_state.grad.sign() * step_eps
            perturbed_state.data = torch.min(torch.max(perturbed_state + update, clamp_min), clamp_max)
            policy_model.zero_grad()
        return perturbed_state.detach()

    elif attack_method == "random":
        noise = torch.empty_like(state).uniform_(-eps, eps)
        return (state + noise).detach()

    else:
        raise ValueError(f"Unknown attack method: {attack_method}")

In [7]:
attack_params = {
    "epsilon": 0.1,  # Maximum perturbation magnitude
    "steps": 5,      # Number of iterative steps
}

In [11]:
import torch
import numpy as np
import gymnasium as gym

def random_perturbation(state, epsilon):
    """
    Apply random perturbation to the state.
    Args:
        state: The original state.
        epsilon: The maximum magnitude of random noise.
    Returns:
        Perturbed state.
    """
    noise = np.random.uniform(-epsilon, epsilon, size=state.shape)
    perturbed_state = state + noise
    return perturbed_state

def evaluate_agent_with_random_attack(env, policy_net, epsilon=5, num_episodes=200):
    """
    Evaluate the agent with random perturbation applied to states during testing.
    Args:
        env: The environment to test the agent.
        policy_net: The trained policy network.
        epsilon: Maximum magnitude of random noise for perturbation.
        num_episodes: Number of episodes for evaluation.
    Returns:
        Average reward over the episodes.
    """
    # Ensure policy network is on the same device as input tensors
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    policy_net.to(device)
    policy_net.eval()  # Set the network to evaluation mode

    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):  # Handle Gymnasium's (observation, info) format
            state = state[0]
        episode_reward = 0
        done = False

        while not done:
            # Apply random perturbation to the state
            perturbed_state = random_perturbation(state, epsilon)

            # Convert perturbed state to tensor and send to the same device as the policy network
            state_tensor = torch.tensor(perturbed_state, dtype=torch.float32, device=device).unsqueeze(0)

            # Get action from the policy network
            with torch.no_grad():
                policy_output = policy_net(state_tensor)
                # If the policy network outputs a tuple, extract the action vector
                if isinstance(policy_output, tuple):
                    action = policy_output[0].cpu().numpy().squeeze()
                else:
                    action = policy_output.cpu().numpy().squeeze()

            # Take the action in the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            episode_reward += reward
            state = next_state

        total_reward += episode_reward
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward over {num_episodes} episodes: {average_reward}")
    return average_reward

# Example usage
env = gym.make("Walker2d-v4")
policy_net = VanillaAgent.policy_net  # Use your trained policy network here

epsilon = 0.1  # Maximum perturbation magnitude
evaluate_agent_with_random_attack(env, policy_net, epsilon)


In [12]:
import torch
import numpy as np


def evaluate_agent_with_mad_attack(env, policy_net, epsilon, num_episodes=200, attack_steps=10, step_epsilon=0.01, beta=1.0):
    """
    Evaluate the agent under a MAD (Maximizing Action Discrepancy) attack for continuous action spaces.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        epsilon (float): Maximum perturbation magnitude.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of gradient steps for the attack.
        step_epsilon (float): Step size for each gradient step.
        beta (float): Inverse temperature parameter for SGLD noise.

    Returns:
        float: Average reward over the episodes under MAD attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            # Start with the original state
            perturbed_state = state.clone().detach().requires_grad_(True)

            for _ in range(attack_steps):
                # Compute the policy outputs for original and perturbed states
                original_mean, original_std = policy_net(state.unsqueeze(0))
                perturbed_mean, perturbed_std = policy_net(perturbed_state.unsqueeze(0))

                # Compute KL divergence between original and perturbed distributions
                loss = -torch.distributions.kl.kl_divergence(
                    torch.distributions.Normal(original_mean, original_std),
                    torch.distributions.Normal(perturbed_mean, perturbed_std)
                ).mean()
                loss.backward()

                # Compute gradient and add noise for SGLD
                grad = perturbed_state.grad
                noise = torch.randn_like(perturbed_state) * torch.sqrt(torch.tensor(2 / (beta * step_epsilon), device=perturbed_state.device))
                perturbation = step_epsilon * grad + noise

                # Update the perturbed state
                perturbed_state = (perturbed_state + perturbation).detach().requires_grad_(True)

                # Clamp the perturbed state to within the epsilon-ball
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)

            # Use the perturbed state to select the action
            with torch.no_grad():
                perturbed_mean, perturbed_std = policy_net(perturbed_state.unsqueeze(0))
                action_dist = torch.distributions.Normal(perturbed_mean, perturbed_std)
                action = action_dist.sample().squeeze().cpu().numpy()  # Match expected shape (e.g., (3,) for continuous action)

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            # Accumulate the reward
            episode_reward += reward

            # Update the state
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under MAD attack: {average_reward}")
    return average_reward


In [17]:
# Example usage
env = gym.make("Walker2d-v4")

# Initialize the trained policy network
policy_net = VanillaAgent.policy_net  # Use your trained policy network here

# Parameters for MAD attack
epsilon = 0.1  # Maximum perturbation magnitude
attack_steps = 10  # Number of gradient steps
step_epsilon = 0.01  # Step size for each gradient step

# Evaluate the policy under MAD attack
average_reward = evaluate_agent_with_mad_attack(env, policy_net, epsilon, num_episodes=200, attack_steps=attack_steps, step_epsilon=step_epsilon)
print(f"Final Average Reward under MAD Attack: {average_reward}")


Episode 1/200: Reward = 280.819949675743
Episode 2/200: Reward = 277.97796180130507
Episode 3/200: Reward = 227.64199456681746
Episode 4/200: Reward = 318.30939854718036
Episode 5/200: Reward = 235.62580962604096
Episode 6/200: Reward = 265.9183547088665
Episode 7/200: Reward = 269.74162427665283
Episode 8/200: Reward = 233.25828530955715
Episode 9/200: Reward = 292.9691218725707
Episode 10/200: Reward = 272.23674098395117
Episode 11/200: Reward = 288.72159161028
Episode 12/200: Reward = 238.34586970047505
Episode 13/200: Reward = 276.33567558040255
Episode 14/200: Reward = 256.87207741480796
Episode 15/200: Reward = 220.26945101761643
Episode 16/200: Reward = 344.5049774545304
Episode 17/200: Reward = 198.0653803210857
Episode 18/200: Reward = 347.1588328691325
Episode 19/200: Reward = 222.55184856514973
Episode 20/200: Reward = 271.9213460186277
Episode 21/200: Reward = 301.0110247558971
Episode 22/200: Reward = 212.71376774473038
Episode 23/200: Reward = 204.17656403671307
Episode 2

In [13]:
import random

def robust_sarsa_attack(env, policy_net, epsilon_schedule, num_steps=10000, lambda_rs=0.1, batch_size=64, gamma=0.99):
    """
    Train a robust value function for a policy under attack using Robust Sarsa.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        epsilon_schedule (list): Schedule for perturbation magnitudes.
        num_steps (int): Number of training steps.
        lambda_rs (float): Regularization parameter for the robust objective.
        batch_size (int): Number of transitions sampled per update.
        gamma (float): Discount factor.

    Returns:
        torch.nn.Module: The robust Q-value network.
    """
    device = next(policy_net.parameters()).device

    # Detect action space type
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_dim = env.action_space.n  # Discrete action space
    elif isinstance(env.action_space, gym.spaces.Box):
        action_dim = env.action_space.shape[0]  # Continuous action space
    else:
        raise ValueError("Unsupported action space type. Only Discrete and Box spaces are supported.")

    # Initialize Q-function (robust critic) as a neural network
    q_net = torch.nn.Sequential(
        torch.nn.Linear(env.observation_space.shape[0] + action_dim, 128),
        torch.nn.ReLU(),
        torch.nn.Linear(128, 1)  # Single Q-value output
    ).to(device)

    optimizer = torch.optim.Adam(q_net.parameters(), lr=1e-3)

    # Replay buffer
    replay_buffer = []

    def collect_trajectory():
        """Collect one trajectory and add to the replay buffer."""
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if necessary
        state = torch.tensor(state, dtype=torch.float32).to(device)
    
        done = False
        while not done:
            with torch.no_grad():
                if isinstance(env.action_space, gym.spaces.Discrete):
                    # For discrete actions, pass only the state to Q-network
                    q_values = torch.cat([q_net(torch.cat([state, torch.eye(action_dim)[a].to(device)], dim=0))
                                          for a in range(action_dim)])
                    action = torch.argmax(q_values).item()
                else:
                    # For continuous actions, extract mean from policy network
                    policy_output = policy_net(state.unsqueeze(0))
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output  # Extract mean and ignore std
                    else:
                        mean = policy_output  # If single output, it's the mean
                    action = mean.squeeze().cpu().numpy()  # Convert to NumPy
    
            # Step the environment
            next_state, reward, done, truncated, _ = env.step(action)
            done = done or truncated  # Combine termination conditions
            next_state = torch.tensor(next_state, dtype=torch.float32).to(device)
    
            replay_buffer.append((state, action, reward, next_state, done))
    
            if len(replay_buffer) > 10000:
                replay_buffer.pop(0)
    
            state = next_state

    for step in range(num_steps):
        # Collect new trajectories periodically
        if len(replay_buffer) < batch_size or step % 10 == 0:
            collect_trajectory()

        # Ensure the buffer has enough samples for a batch
        if len(replay_buffer) < batch_size:
            continue  # Skip training step until buffer has enough data

        # Sample batch
        batch = random.sample(replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.stack(states).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.stack(next_states).to(device)
        dones = torch.tensor(dones, dtype=torch.float32).to(device)

        # Prepare inputs for Q-network
        if isinstance(env.action_space, gym.spaces.Discrete):
            actions = torch.tensor(actions, dtype=torch.int64).to(device)  # Discrete actions
            state_action_pairs = torch.cat([states, torch.eye(action_dim).to(device)[actions]], dim=1)
            next_state_action_pairs = torch.cat([next_states, torch.eye(action_dim).to(device)], dim=1)
        else:
            actions = torch.tensor(actions, dtype=torch.float32).to(device)  # Continuous actions
            state_action_pairs = torch.cat([states, actions], dim=1)
            next_state_action_pairs = torch.cat([next_states, actions], dim=1)

        # Temporal Difference Loss
        q_values = q_net(state_action_pairs).squeeze()
        q_values_next = q_net(next_state_action_pairs).squeeze()
        td_loss = (rewards + gamma * (1 - dones) * q_values_next - q_values).pow(2).mean()

        # Robustness Loss
        epsilon = epsilon_schedule[min(step, len(epsilon_schedule) - 1)]
        robust_loss = 0
        for i in range(batch_size):
            perturbation = (torch.rand_like(states[i]) * 2 - 1) * epsilon
            perturbed_state = states[i] + perturbation
            perturbed_state_action = torch.cat([perturbed_state, actions[i]], dim=0)
            robust_loss += (q_net(perturbed_state_action.unsqueeze(0)) - q_values[i]).pow(2).mean()
        robust_loss /= batch_size

        # Total Loss
        total_loss = td_loss + lambda_rs * robust_loss

        # Optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Print progress
        if step % 100 == 0:
            print(f"Step {step}/{num_steps}, TD Loss: {td_loss.item():.4f}, Robust Loss: {robust_loss.item():.4f}")

    return q_net


In [14]:
def evaluate_agent_with_robust_sarsa_attack(env, policy_net, robust_q_net, epsilon, step_size, num_episodes=100, attack_steps=10):
    """
    Evaluate the agent under a Robust Sarsa Critic-based attack.

    Args:
        env (gym.Env): The environment.
        policy_net (torch.nn.Module): The trained policy network.
        robust_q_net (torch.nn.Module): The robust Q-value network trained with Robust Sarsa.
        epsilon (float): Maximum perturbation magnitude for the attack.
        step_size (float): Step size for the gradient update.
        num_episodes (int): Number of episodes for evaluation.
        attack_steps (int): Number of attack steps (K in the pseudocode).

    Returns:
        float: Average reward over the episodes under Robust Sarsa Critic-based attack.
    """
    device = next(policy_net.parameters()).device
    total_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Unpack if state is returned as (observation, info)
        state = torch.tensor(state, dtype=torch.float32).to(device)

        episode_reward = 0
        done = False

        while not done:
            # Initialize the perturbed state
            perturbed_state = state.clone().detach().requires_grad_(True)

            # Perform the attack as per Algorithm 2
            for _ in range(attack_steps):
                # Forward pass through the policy to get the action
                with torch.no_grad():
                    if isinstance(env.action_space, gym.spaces.Discrete):
                        action_probs = policy_net(perturbed_state.unsqueeze(0))
                        action = torch.argmax(action_probs, dim=-1)
                    else:
                        policy_output = policy_net(perturbed_state.unsqueeze(0))
                        if isinstance(policy_output, tuple):
                            mean, _ = policy_output  # Extract mean and ignore std
                        else:
                            mean = policy_output
                        action = mean.squeeze()

                # Compute Q(s, a) for the critic
                state_action = torch.cat([perturbed_state, action.float().to(device)]) if isinstance(env.action_space, gym.spaces.Box) else \
                               torch.cat([perturbed_state, torch.eye(env.action_space.n)[action].to(device)], dim=0)
                q_value = robust_q_net(state_action.unsqueeze(0))

                # Backpropagate the gradient
                q_value.backward()
                grad = perturbed_state.grad

                # Update the perturbed state based on the gradient and step size
                perturbed_state = perturbed_state - step_size * grad.sign()
                perturbed_state = torch.max(
                    torch.min(perturbed_state, state + epsilon), state - epsilon
                ).detach().requires_grad_(True)  # Clamp to the epsilon-ball

            # Use the adversarially perturbed state to select the final action
            with torch.no_grad():
                if isinstance(env.action_space, gym.spaces.Discrete):
                    action_probs = policy_net(perturbed_state.unsqueeze(0))
                    action = torch.argmax(action_probs, dim=-1).item()
                else:
                    policy_output = policy_net(perturbed_state.unsqueeze(0))
                    if isinstance(policy_output, tuple):
                        mean, _ = policy_output
                    else:
                        mean = policy_output
                    action = mean.squeeze().cpu().numpy()

            # Step the environment
            next_step_result = env.step(action)
            if isinstance(next_step_result, tuple):
                next_state, reward, done, _, _ = next_step_result
            else:
                next_state, reward, done = next_step_result[:3]

            # Accumulate the reward
            episode_reward += reward

            # Update the state
            state = torch.tensor(next_state, dtype=torch.float32).to(device)

        total_reward += episode_reward
        print(f"Episode {episode + 1}/{num_episodes}: Reward = {episode_reward}")

    average_reward = total_reward / num_episodes
    print(f"Average Reward under Robust Sarsa Critic-based attack: {average_reward}")
    return average_reward


In [20]:
# Example usage
env = gym.make("Walker2d-v4")

# Initialize the trained policy network
policy_net = VanillaAgent.policy_net  # Use your trained policy network here

# Parameters for MAD attack
epsilon = 0.1  # Maximum perturbation magnitude
attack_steps = 10  # Number of gradient steps
step_epsilon = 0.01  # Step size for each gradient step
epsilon_schedule = [0.01 * i for i in range(1, 101)]
# Evaluate the policy under MAD attack

robust_q_net=robust_sarsa_attack(
    env=env,
    policy_net=policy_net,
    epsilon_schedule=epsilon_schedule,
    num_steps=5000,        # Number of training steps
    lambda_rs=0.1,         # Regularization parameter for robust loss
    batch_size=64,         # Batch size for training
    gamma=0.99             # Discount factor
)


  actions = torch.tensor(actions, dtype=torch.float32).to(device)  # Continuous actions


Step 0/5000, TD Loss: 4.3310, Robust Loss: 0.0000
Step 100/5000, TD Loss: 3.3553, Robust Loss: 0.1050
Step 200/5000, TD Loss: 3.9885, Robust Loss: 0.3281
Step 300/5000, TD Loss: 3.7331, Robust Loss: 0.6373
Step 400/5000, TD Loss: 3.0604, Robust Loss: 0.7190
Step 500/5000, TD Loss: 3.2349, Robust Loss: 1.2653
Step 600/5000, TD Loss: 2.8970, Robust Loss: 1.5562
Step 700/5000, TD Loss: 3.3306, Robust Loss: 2.3945
Step 800/5000, TD Loss: 2.2871, Robust Loss: 2.0078
Step 900/5000, TD Loss: 1.6863, Robust Loss: 2.6127
Step 1000/5000, TD Loss: 2.1967, Robust Loss: 4.2389
Step 1100/5000, TD Loss: 2.3882, Robust Loss: 2.3108
Step 1200/5000, TD Loss: 1.8464, Robust Loss: 3.0430
Step 1300/5000, TD Loss: 1.8825, Robust Loss: 3.6038
Step 1400/5000, TD Loss: 1.3751, Robust Loss: 2.1003
Step 1500/5000, TD Loss: 1.3362, Robust Loss: 3.2616
Step 1600/5000, TD Loss: 1.3975, Robust Loss: 3.1535
Step 1700/5000, TD Loss: 1.7389, Robust Loss: 2.4229
Step 1800/5000, TD Loss: 1.4715, Robust Loss: 2.1467
Step 

In [21]:
average_reward = average_reward = evaluate_agent_with_robust_sarsa_attack(
    env=env,
    policy_net=policy_net,
    robust_q_net=robust_q_net,
    epsilon=0.05,
    num_episodes=200,
    step_size=0.01
)
print(f"Final Average Reward under Robust Sarsa Attack: {average_reward}")


Episode 1/200: Reward = 284.6235436201942
Episode 2/200: Reward = 278.7098898097919
Episode 3/200: Reward = 270.79058966267814
Episode 4/200: Reward = 288.45116032586617
Episode 5/200: Reward = 281.54298989280346
Episode 6/200: Reward = 276.0320334474719
Episode 7/200: Reward = 281.76446317156496
Episode 8/200: Reward = 283.25777296391726
Episode 9/200: Reward = 281.93904153187356
Episode 10/200: Reward = 281.4979143733671
Episode 11/200: Reward = 285.53657156020637
Episode 12/200: Reward = 270.6550284999157
Episode 13/200: Reward = 282.36337877680415
Episode 14/200: Reward = 285.3086459278199
Episode 15/200: Reward = 278.1046304589377
Episode 16/200: Reward = 280.58099244112935
Episode 17/200: Reward = 276.85128973894547
Episode 18/200: Reward = 282.6953145284702
Episode 19/200: Reward = 274.4318149693389
Episode 20/200: Reward = 291.04140163392174
Episode 21/200: Reward = 284.87258697260717
Episode 22/200: Reward = 283.6506600767076
Episode 23/200: Reward = 275.9215221010505
Episode 