In [1]:
import gym
import argparse
import numpy as np
np.bool8 = np.bool
import pandas as pd
import torch
import time
# import config
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torch.nn.utils import clip_grad_value_

from torch.autograd import grad as torch_grad

import h5py

# import replay_buffer, learning_rate
# from learning_rate import LearningRate

import os
os.environ['LD_LIBRARY_PATH'] = "/home/garyding/.mujoco/mujoco210/bin:/usr/lib/nvidia"
os.environ.get("LD_LIBRARY_PATH", "")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class LearningRate:
    """
    Attributes:
        lr (float)
        decay_factor (float)
        training_step (int)
    """
    __instance = None

    def __init__(self):
        if LearningRate.__instance is not None:
            raise Exception("Singleton instantiation called twice")
        else:
            LearningRate.__instance = self
            self.lr = None
            self.decay_factor = None
            self.training_step = 0

    @staticmethod
    def get_instance():
        """Get the singleton instance.

        Returns:
            (LearningRate)
        """
        if LearningRate.__instance is None:
            LearningRate()
        return LearningRate.__instance

    def set_learning_rate(self, lr):
        self.lr = lr

    def get_learning_rate(self):
        return self.lr

    def increment_step(self):
        self.training_step += 1

    def get_step(self):
        return self.training_step

    def set_decay(self, d):
        self.decay_factor = d

    def decay(self):
        if self.lr is None:
            raise ValueError("Learning rate has not been set.")
        self.lr = self.lr * self.decay_factor

In [3]:
# Simple replay buffer
class ReplayBuffer(object):
    def __init__(self, env):
        self.buffer = []  # No size limitation, similar to the paper
        self.zeroAction = np.zeros_like(env.action_space.sample(), dtype=np.float32)
        self.absorbingState = np.zeros((env.observation_space.shape[0]), dtype=np.float32)

    def __len__(self):
        return len(self.buffer)

    # data = (state, action, next_state)
    def add(self, data, done):
        if done:
            self.buffer.append((data[0], data[1], self.absorbingState, False))
        else:
            self.buffer.append((data[0], data[1], data[2], False))

    def addAbsorbing(self):
        self.buffer.append((self.absorbingState, self.zeroAction, self.absorbingState, False))

    def sample(self, batch_size=100):
        ind = np.random.randint(0, len(self.buffer), size=batch_size)
        s, a, ns, d = [], [], [], []

        for i in ind:
            S, A, nS, D = self.buffer[i]
            s.append(np.array(S))
            a.append(np.array(A))
            ns.append(np.array(nS))
            d.append(np.array(D))

        return np.array(s), np.array(a), np.array(ns), np.array(d)

In [4]:
# entropy_weight = 0.001 from openAI/imiation
class Discriminator(nn.Module):
    def __init__(self, num_inputs, hidden_size=100, lamb=10, entropy_weight=0.001):
        """

        Args:
            num_inputs:
            hidden_size:
            lamb:
            entropy_weight:
            
        """
        super(Discriminator, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)
        self.linear3.weight.data.mul_(0.1)
        self.linear3.bias.data.mul_(0.0)
        self.criterion = nn.BCEWithLogitsLoss()
        self.entropy_weight = entropy_weight
        self.optimizer = torch.optim.Adam(self.parameters())
        self.LAMBDA = lamb  # used in gradient penalty
        self.use_cuda = torch.cuda.is_available()
        
        self.loss = self.ce_loss

    def forward(self, x):
        # if not self.use_cuda: x = x.float()
        x = torch.tanh(self.linear1(x))
        x = torch.tanh(self.linear2(x))
        # prob = torch.sigmoid(self.linear3(x))
        # return prob
        out = self.linear3(x)
        return out

    def reward(self, x):
        out = self(x)
        probs = torch.sigmoid(out)
        return torch.log(probs + 1e-8) - torch.log(1 - probs + 1e-8)

    def adjust_adversary_learning_rate(self, lr):
        print("Setting adversary learning rate to: {}".format(lr))
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    def logit_bernoulli_entropy(self, logits):
        ent = (1. - torch.sigmoid(logits)) * logits - self.logsigmoid(logits)
        return ent

    def logsigmoid(self, a):
        return torch.log(torch.sigmoid(a))

    def logsigmoidminus(self, a):
        return torch.log(1 - torch.sigmoid(a))

    def ce_loss(self, pred_on_learner, pred_on_expert, expert_weights):
        """Binary cross entropy loss.
        We believe this is the loss function the authors to communicate.

        Args:
            pred_on_learner (torch.Tensor): The discriminator's prediction on the learner.
            pred_on_expert (torch.Tensor): The discriminator's prediction on the expert.
            expert_weights (torch.Tensor): The weighting to apply to the expert loss

        Returns:
            (torch.Tensor)
        """
        learner_loss = torch.log(1 - torch.sigmoid(pred_on_learner))
        expert_loss = torch.log(torch.sigmoid(pred_on_expert)) * expert_weights
        return -torch.sum(learner_loss + expert_loss)

    def learn(self, replay_buf, expert_buf, iterations, batch_size=100):
        self.adjust_adversary_learning_rate(LearningRate.get_instance().lr)

        for it in range(iterations):
            # Sample replay buffer
            x, y, u, d = replay_buf.sample(batch_size)
            state = torch.tensor(x, dtype=torch.float32, device=device)
            action = torch.tensor(y, dtype=torch.float32, device=device)
            next_state = torch.tensor(u, dtype=torch.float32, device=device)

            # Sample expert buffer
            expert_obs, expert_act, expert_weights = expert_buf.get_next_batch(batch_size)
            expert_obs = torch.tensor(expert_obs, dtype=torch.float32, device=device)
            expert_act = torch.tensor(expert_act, dtype=torch.float32, device=device)
            expert_weights = torch.tensor(expert_weights, dtype=torch.float32, device=device).view(-1, 1)

            # Predict
            state_action = torch.cat([state, action], 1).to(device)
            expert_state_action = torch.cat([expert_obs, expert_act], 1).to(device)

            fake = self(state_action)
            real = self(expert_state_action)

            # Gradient penalty for regularization.
            gradient_penalty = self._gradient_penalty(expert_state_action, state_action)

            # The main discriminator loss
            main_loss = self.loss(fake, real, expert_weights)

            self.optimizer.zero_grad()

            total_loss = main_loss + gradient_penalty

            if it == 0 or it == iterations - 1:
                print("Discr Iteration:  {:03} ---- Loss: {:.5f} | Learner Prob: {:.5f} | Expert Prob: {:.5f}".format(
                    it, total_loss.item(), torch.sigmoid(fake[0]).item(), torch.sigmoid(real[0]).item()
                ))
            total_loss.backward()
            self.optimizer.step()

    def _gradient_penalty(self, real_data, generated_data):
        """
        Compute the gradient penalty for the current update.
        """
        batch_size = real_data.size(0)
        device = real_data.device  # Get the device (CPU or GPU)

        # Calculate interpolation
        alpha = torch.rand(batch_size, 1, device=device)  # Move alpha to the same device as real_data
        interpolated = alpha * real_data + (1 - alpha) * generated_data

        # Ensure gradients are calculated for the interpolated data
        interpolated.requires_grad_(True)

        # Calculate probability of interpolated examples
        prob_interpolated = self(interpolated)  # Forward pass through the discriminator

        # Calculate gradients of probabilities with respect to examples
        gradients = torch_grad(outputs=prob_interpolated, inputs=interpolated,
                        grad_outputs=torch.ones_like(prob_interpolated),
                        create_graph=True, retain_graph=True)[0]

        # Flatten gradients to compute the L2 norm per example in the batch
        gradients = gradients.view(batch_size, -1)

        # Compute gradient penalty
        gradients_norm = gradients.norm(2, dim=1)  # L2 norm of the gradients
        gradient_penalty = ((gradients_norm - 1) ** 2).mean()  # (||grad||_2 - 1)^2

        return self.LAMBDA * gradient_penalty

In [5]:
class Actor(nn.Module):
    '''
    \pi(a|s): Given a sequence of states, return a sequence of actions
    '''
    def __init__(self, state_dim: int, action_dim: int, max_action: float):
        super().__init__()

        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.max_action = max_action

    def forward(self, x: Tensor) -> Tensor:
        device = x.device
        x = x.to(device).float()
        x = torch.relu(self.l1(x))
        x = torch.relu(self.l2(x))
        x = torch.tanh(self.l3(x)) * self.max_action
        return x

    def act(self, x: Tensor) -> Tensor:
        x = torch.tensor(x, dtype=torch.float32, device=device)
        return self(x)

In [6]:
class Critic(nn.Module):
    '''
    Given a squence of (state,action) pairs, return (Q1,Q2)
    '''
    def __init__(self, state_dim: int, action_dim: int):
        super().__init__()

        # Q1 network architecture
        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

        # Q2 network architecture
        self.l4 = nn.Linear(state_dim + action_dim, 400)
        self.l5 = nn.Linear(400, 300)
        self.l6 = nn.Linear(300, 1)

    def forward(self, x: Tensor, y: Tensor) -> Tensor:
        # Concatenate the state (x) and action (y)
        xy = torch.cat([x, y], dim=1)

        # Q1 computation
        x1 = torch.relu(self.l1(xy))
        x1 = torch.relu(self.l2(x1))
        x1 = self.l3(x1)

        # Q2 computation
        x2 = torch.relu(self.l4(xy))
        x2 = torch.relu(self.l5(x2))
        x2 = self.l6(x2)

        return x1, x2

    def Q1(self, x: Tensor, y: Tensor) -> Tensor:
        # Compute Q1 value alone (without Q2)
        xy = torch.cat([x, y], dim=1)
        x1 = torch.relu(self.l1(xy))
        x1 = torch.relu(self.l2(x1))
        x1 = self.l3(x1)
        return x1


In [7]:
class TD3(object):
    def __init__(self, state_dim, action_dim, max_action, actor_clipping, decay_steps):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.decay_steps = decay_steps
        self.actor_grad_clipping = actor_clipping
        self.max_action = max_action
        self.actor_steps = 0
        self.critic_steps = 0

    def select_action(self, state):
        state = torch.tensor(state.reshape(1, -1), dtype=torch.float32).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def adjust_critic_learning_rate(self, lr):
        print("Setting critic learning rate to: {}".format(lr))
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

    def adjust_actor_learning_rate(self, lr):
        print("Setting actor learning rate to: {}".format(lr))
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

    def reward(self, discriminator, states, actions):
        states_actions = torch.cat([states, actions], 1).to(device)
        return discriminator.reward(states_actions)

    def train(self, discriminator, replay_buf, iterations, batch_size=100, discount=0.8, tau=0.005, policy_noise=0.2,
              noise_clip=0.5, policy_freq=2):

        lr_tracker = LearningRate.get_instance()
        lr = lr_tracker.lr

        self.adjust_actor_learning_rate(lr)
        self.adjust_critic_learning_rate(lr)

        for iteration in range(iterations):
            # Sample replay buffer
            x, y, u, d = replay_buf.sample(batch_size)
            # print(x.shape, y.shape, u.shape, d.shape)
            state = torch.tensor(x, dtype=torch.float32).to(device)
            action = torch.tensor(y, dtype=torch.float32).to(device)
            next_state = torch.tensor(u, dtype=torch.float32).to(device)
            reward = self.reward(discriminator, state, action)
            
            # Select action according to policy and add clipped noise
            # Generate clipped noise
            noise = torch.randn_like(action) * policy_noise  
            noise = noise.clamp(-noise_clip, noise_clip)          

            # Add noise to the action selected by the target actor network
            # print('1: ',self.actor_target(next_state).shape)
            # print('2: ',noise.shape)
            # print('3: ',(self.actor_target(next_state)+noise).shape)
            next_action = self.actor_target(next_state) + noise
            # Clamp the action to the valid action space defined by the max action
            next_action = next_action.clamp(-self.max_action, self.max_action)

            # Compute the target Q value
            # print(next_state.shape, next_action.shape)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            # target_Q = reward + (done * discount * target_Q).detach()
            target_Q = reward + (discount * target_Q).detach()

            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
            if iteration == 0 or iteration == iterations - 1:
                print("Critic Iteration: {:3} ---- Loss: {:.5f}".format(iteration, critic_loss.item()))
            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed policy updates
            if iteration % policy_freq == 0:

                # Compute actor loss
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                if iteration == 0 or iteration == iterations - 1 or iteration == iterations - 2:
                    print("Actor Iteration:  {:3} ---- Loss: {:.5f}".format(iteration, actor_loss.item()))
                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()

                # Clip, like the paper
                clip_grad_value_(self.actor.parameters(), self.actor_grad_clipping)

                self.actor_optimizer.step()
                lr_tracker.training_step += 1
                step = lr_tracker.training_step

                if step != 0 and step % self.decay_steps == 0:
                    print("Decaying learning rate at step: {}".format(step))
                    lr_tracker.decay()

                    self.adjust_actor_learning_rate(lr_tracker.lr)
                    self.adjust_critic_learning_rate(lr_tracker.lr)

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))

In [8]:
def store_results(evaluations, number_of_timesteps):
    """Store the results of a run.

    Args:
        evaluations:
        number_of_timesteps (int):

    Returns:
        None
    """

    df = pd.DataFrame.from_records(evaluations)
    number_of_trajectories = len(evaluations[0]) - 1
    columns = ["reward_{}".format(i) for i in range(number_of_trajectories)]
    columns.append("timestep")
    df.columns = columns

    timestamp = time.time()
    results_fname = 'DAC_{}_tsteps_{}_results.csv'.format( number_of_timesteps, timestamp)
    df.to_csv(str(results_fname))

In [9]:
# Runs policy for X episodes and returns average reward
def evaluate_policy(env, policy, time_step, evaluation_trajectories=6):
    """

    Args:
        env: The environment being trained on.
        policy:	The policy being evaluated
        time_step (int): The number of time steps the policy has been trained for.
        evaluation_trajectories (int): The number of trajectories on which to evaluate.

    Returns:
        (list)	- The time_step, followed by all the rewards.
    """
    rewards = []
    for _ in range(evaluation_trajectories):
        r = 0
        obs = env.reset()[0]
        done, trun = False,False
        while (not done)&(not trun):
            action = policy.select_action(np.array(obs))
            obs, reward, done, trun, _ = env.step(action)
            r += reward
        rewards.append(r)
    print("Average reward at timestep {}: {}".format(time_step, np.mean(rewards)))

    rewards.append(time_step)
    return rewards

In [10]:
env = gym.make('Pendulum-v1')

In [11]:
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

In [12]:
trajectory_length = 1000
batch_size = 1000
num_steps = 1000000

In [13]:
lr = LearningRate.get_instance()
lr.lr = 10 ** (-3)
lr.decay_factor = 0.5

In [14]:
def load_dataset(path, limit_trajs, data_subsamp_freq=1):
    tmp = torch.load(path)
    full_dset_size = tmp['state'].size(0)
    dset_size = min(full_dset_size, limit_trajs) if limit_trajs is not None else full_dset_size

    states = tmp['state'].reshape(100, full_dset_size//100, 3).clone()[:dset_size]
    actions = tmp['action'].reshape(100, full_dset_size//100, 1).clone()[:dset_size]
    rewards = tmp['reward'].reshape(100, full_dset_size//100, 1).clone()[:dset_size]
    dones = tmp['done'].reshape(100, full_dset_size//100, 1).clone()[:dset_size]
    next_states = tmp['next_state'].clone()[:dset_size]
    return states, actions, rewards

class Dset(object):
    def __init__(self, obs, acs, num_traj, absorbing_state, absorbing_action):
        self.obs = obs
        self.acs = acs
        self.num_traj = num_traj
        assert len(self.obs) == len(self.acs)
        assert self.num_traj > 0
        self.steps_per_traj = int(len(self.obs) / num_traj)

        self.absorbing_state = absorbing_state
        self.absorbing_action = absorbing_action

    def get_next_batch(self, batch_size):
        assert batch_size <= len(self.obs)
        num_samples_per_traj = int(batch_size / self.num_traj)
        # print('mul', num_samples_per_traj * self.num_traj)
        assert num_samples_per_traj * self.num_traj == batch_size
        N = self.steps_per_traj / num_samples_per_traj  # This is the importance weight for
        j = num_samples_per_traj
        num_samples_per_traj = num_samples_per_traj - 1  # make room for absorbing

        obs = None
        acs = None
        weights = [1 for i in range(batch_size)]
        while j <= batch_size:
            weights[j - 1] = 1.0 / N
            j = j + num_samples_per_traj + 1
        
        # print(self.num_traj)
        for i in range(self.num_traj):
            indicies = np.sort(
                np.random.choice(range(self.steps_per_traj * i, self.steps_per_traj * (i + 1)), num_samples_per_traj,
                                 replace=False))
            if obs is None:
                obs = np.concatenate((self.obs[indicies, :], self.absorbing_state), axis=0)
                
            else:
                obs = np.concatenate((obs, self.obs[indicies, :], self.absorbing_state), axis=0)

            if acs is None:
                acs = np.concatenate((self.acs[indicies, :], self.absorbing_action), axis=0)
            else:
                acs = np.concatenate((acs, self.acs[indicies, :], self.absorbing_action), axis=0)

        return obs, acs, weights

class Mujoco_Dset(object):
    def __init__(self, env, expert_path, traj_limitation=-1):
        obs, acs, rets = load_dataset(expert_path, traj_limitation)
        # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length
        # and S is the environment observation/action space.
        # Flatten to (N * L, prod(S))
        self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
        # self.acs = acs
        self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])

        self.rets = rets.sum(axis=1)
        self.avg_ret = sum(self.rets) / len(self.rets)
        self.std_ret = np.std(np.array(self.rets))
        # if len(self.acs) > 2:
        #     self.acs = np.squeeze(self.acs)
        assert len(self.obs) == len(self.acs)
        self.num_traj = len(rets)
        self.num_transition = len(self.obs)

        absorbing_state = np.zeros((1, env.observation_space.shape[0]), dtype=np.float32)
        zero_action = np.zeros_like(env.action_space.sample(), dtype=np.float32).reshape(1, env.action_space.shape[0])
        self.dset = Dset(self.obs, self.acs, traj_limitation, absorbing_state, zero_action)
        self.log_info()

    def log_info(self):
        print("Total trajs: %d" % self.num_traj)
        print("Total transitions: %d" % self.num_transition)
        print("Average returns: %f" % self.avg_ret)
        print("Std for returns: %f" % self.std_ret)

    def get_next_batch(self, batch_size):
        return self.dset.get_next_batch(batch_size)

In [15]:
expert_buffer = Mujoco_Dset(env, 'size1000000_std0.01_prand0.0.pth', 100)
actor_replay_buffer = ReplayBuffer(env)

Total trajs: 100
Total transitions: 1000000
Average returns: -12447.127930
Std for returns: 1190.231812


  tmp = torch.load(path)


In [16]:
# TD3(state_dim, action_dim, max_action, actor_clipping, decay_steps*) *Not used yet;
td3_policy = TD3(state_dim, action_dim, max_action, 40, 10 ** 5)

# Input dim = state_dim + action_dim
discriminator = Discriminator(state_dim + action_dim).to(device)

# For storing temporary evaluations
evaluations = [evaluate_policy(env, td3_policy, 0)]
evaluate_every = 1000
steps_since_eval = 0

Average reward at timestep 0: -1610.6053088019205


In [17]:
env.reset()

(array([-0.94295955,  0.33290738,  0.7437855 ], dtype=float32), {})

In [18]:
while len(actor_replay_buffer) < num_steps:
    print("\nCurrent step: {}".format(len(actor_replay_buffer.buffer)))
    current_state = env.reset()[0]
    # Sample from policy; maybe we don't reset the environment -> since this may bias the policy toward initial observations
    for j in range(trajectory_length):
        action = td3_policy.select_action(np.array(current_state))
        next_state, reward, done, _, _ = env.step(action)

        if done:
            actor_replay_buffer.addAbsorbing()
            current_state = env.reset()
        else:
            actor_replay_buffer.add((current_state, action, next_state), done)
            # print((current_state.shape, action.shape, next_state.shape))
            current_state = next_state

    discriminator.learn(actor_replay_buffer, expert_buffer, trajectory_length, batch_size)

    td3_policy.train(discriminator, actor_replay_buffer, trajectory_length, batch_size)

    if steps_since_eval >= evaluate_every:
        steps_since_eval = 0

        evaluation = evaluate_policy(env, td3_policy, len(actor_replay_buffer))
        evaluations.append(evaluation)

    steps_since_eval += trajectory_length
    
last_evaluation = evaluate_policy(env, td3_policy, len(actor_replay_buffer))
evaluations.append(last_evaluation)

store_results(evaluations, len(actor_replay_buffer))


Current step: 0
Setting adversary learning rate to: 0.001


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Discr Iteration:  000 ---- Loss: 1325.66638 | Learner Prob: 0.49773 | Expert Prob: 0.49822
Discr Iteration:  999 ---- Loss: 159.31540 | Learner Prob: 0.40968 | Expert Prob: 0.97465
Setting actor learning rate to: 0.001
Setting critic learning rate to: 0.001
Critic Iteration:   0 ---- Loss: 24.54868
Actor Iteration:    0 ---- Loss: 0.52470
Actor Iteration:  998 ---- Loss: 3.50349
Critic Iteration: 999 ---- Loss: 0.02586

Current step: 1000
Setting adversary learning rate to: 0.001
Discr Iteration:  000 ---- Loss: 1604.98340 | Learner Prob: 0.07602 | Expert Prob: 0.97440
Discr Iteration:  999 ---- Loss: 122.26205 | Learner Prob: 0.01162 | Expert Prob: 0.98059
Setting actor learning rate to: 0.001
Setting critic learning rate to: 0.001
Critic Iteration:   0 ---- Loss: 10.65042
Actor Iteration:    0 ---- Loss: 4.52143
Actor Iteration:  998 ---- Loss: 11.95334
Critic Iteration: 999 ---- Loss: 0.00546
Average reward at timestep 2000: -1211.6257788527869

Current step: 2000
Setting adversary 

KeyboardInterrupt: 