In [1]:
import numpy as np
from stable_baselines import logger

class Dset(object):
  def __init__(self, inputs, labels, randomize):
    self.inputs = inputs
    self.labels = labels
    assert len(self.inputs) == len(self.labels)
    self.randomize = randomize
    self.num_pairs = len(inputs)
    self.init_pointer()

  def init_pointer(self):
    self.pointer = 0
    if self.randomize:
      idx = np.arange(self.num_pairs)
      np.random.shuffle(idx)
      self.inputs = self.inputs[idx, :]
      self.labels = self.labels[idx, :]

  def get_next_batch(self, batch_size):
    # if batch_size is negative -> return all
    if batch_size < 0:
      return self.inputs, self.labels
    if self.pointer + batch_size >= self.num_pairs:
      self.init_pointer()
    end = self.pointer + batch_size
    inputs = self.inputs[self.pointer:end, :]
    labels = self.labels[self.pointer:end, :]
    self.pointer = end
    return inputs, labels


class Custom_Dset(object):
  def __init__(self, expert_path, train_fraction=0.7, randomize=True):
    traj_data = np.load(expert_path)
    obs = traj_data['obs'][:]
    acs = traj_data['acs'][:]

    # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length
    # and S is the environment observation/action space.
    # Flatten to (N * L, prod(S))
    if len(obs.shape) > 2:
      self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
      self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])
    else:
      self.obs = np.vstack(obs)
      self.acs = np.vstack(acs)

    # self.rets = traj_data['ep_rets'][:traj_limitation]
    # self.avg_ret = sum(self.rets)/len(self.rets)
    # self.std_ret = np.std(np.array(self.rets))
    if len(self.acs) > 2:
      self.acs = np.squeeze(self.acs)
    assert len(self.obs) == len(self.acs)
    # self.num_traj = min(traj_limitation, len(traj_data['obs']))
    self.num_transition = len(self.obs)
    self.randomize = randomize
    self.dset = Dset(self.obs, self.acs, self.randomize)
    # for behavior cloning
    self.train_set = Dset(self.obs[:int(self.num_transition*train_fraction), :],
                          self.acs[:int(self.num_transition *
                                        train_fraction), :],
                          self.randomize)
    self.val_set = Dset(self.obs[int(self.num_transition*train_fraction):, :],
                        self.acs[int(self.num_transition*train_fraction):, :],
                        self.randomize)
    self.log_info()

  def log_info(self):
    # logger.log("Total trajectories: %d" % self.num_traj)
    logger.log("Total transitions: %d" % self.num_transition)
  #   logger.log("Average returns: %f" % self.avg_ret)
  #   logger.log("Std for returns: %f" % self.std_ret)

  def get_next_batch(self, batch_size, split=None):
    if split is None:
      return self.dset.get_next_batch(batch_size)
    elif split == 'train':
      return self.train_set.get_next_batch(batch_size)
    elif split == 'val':
      return self.val_set.get_next_batch(batch_size)
    else:
      raise NotImplementedError

  def plot(self):
    import matplotlib.pyplot as plt
    plt.hist(self.rets)
    plt.savefig("histogram_rets.png")
    plt.close()


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation."


In [2]:
import gym
from gym import spaces
import numpy as np

class CustomEnv(gym.Env):
  metadata = {"render_modes": [None, "human"], "render_fps": 10}

  def __init__(self):
    super().__init__()
    self.observation_space = spaces.Box(-np.inf, np.inf, shape=(45,), dtype=np.float32)
    self.action_space = spaces.Box(-np.inf, np.inf, shape=(7,), dtype=np.float32)

    self.render_mode = None

  # def reset(self):
  #   # We need the following line to seed self.np_random
  #   # super().reset()

  #   # Choose the agent's location uniformly at random
  #   self._state = np.random.rand(45).astype('f')

  #   return self._state

  # def _get_info(self):
  #   return None
  
  # def _get_obs(self):
  #   return self._state

  # def render(self):
  #   pass

  # def _render_frame(self):
  #   pass

  # def step(self, action):
  #   self._state[-6:] = self._state[-6:] + action
  #   # An episode is done iff the agent has reached the target

  #   return self._state, 1 - np.linalg.norm(action, ord=2), np.linalg.norm(action, ord=2) == 0, {}


In [3]:
import tempfile
import os.path as osp
import gym
import logging
from tqdm import tqdm

import tensorflow as tf

from stable_baselines.common.policies import MlpPolicy
from stable_baselines import logger
from stable_baselines.common import tf_util as U
from stable_baselines.common.mpi_adam import MpiAdam
# from baselines.gail.run_mujoco import runne

def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, task_name=None,
          verbose=False):

  val_per_iter = int(max_iters/10)
  ob_space = env.observation_space
  ac_space = env.action_space
  # Construct network for new policy
  pi = policy_func("pi", ob_space, ac_space)
  # placeholder
  # ob = U.get_placeholder_cached(name="ob")
  ob = tf.placeholder(tf.float32, (None, 45), name="pi/ob")
  ac = pi.pdtype.sample_placeholder([None])
  # stochastic = U.get_placeholder_cached(name="stochastic")
  stochastic = tf.placeholder(tf.bool, (), name="pi/stochastic")
  loss = tf.reduce_mean(tf.square(ac-pi.ac))
  var_list = pi.get_trainable_variables()
  adam = MpiAdam(var_list, epsilon=adam_epsilon)
  lossandgrad = U.function([ob, ac, stochastic], [
                           loss]+[U.flatgrad(loss, var_list)])

  U.initialize()
  adam.sync()
  logger.log("Pretraining with Behavior Cloning...")
  for iter_so_far in tqdm(range(int(max_iters))):
    ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
    train_loss, g = lossandgrad(ob_expert, ac_expert, True)
    adam.update(g, optim_stepsize)
    if verbose and iter_so_far % val_per_iter == 0:
      ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
      val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
      logger.log("Training loss: {}, Validation loss: {}".format(
          train_loss, val_loss))

  if ckpt_dir is None:
    savedir_fname = tempfile.TemporaryDirectory().name
  else:
    savedir_fname = osp.join(ckpt_dir, task_name)
  U.save_variables(savedir_fname, variables=pi.get_variables())
  return savedir_fname, pi


def get_task_name():
  task_name = 'BC'
  task_name += '.{}'.format('custom')
  task_name += '.traj_limitation_{}'.format(-1)
  task_name += ".seed_{}".format(0)
  return task_name


def make_session(expert_path):
  U.make_session(num_cpu=1).__enter__()
  # set_global_seeds(0)
  env = CustomEnv()

  def policy_fn(name, ob_space, ac_space, reuse=False):
    temp = MlpPolicy(tf.compat.v1.get_default_session(), ob_space=ob_space, ac_space=ac_space, n_env=1, n_steps=1e3, n_batch=128,
                                reuse=reuse)
    temp.action
    return temp
  # env = bench.Monitor(env, logger.get_dir() and
  #                     osp.join(logger.get_dir(), "monitor.json"))
  env.seed(0)

  gym.logger.setLevel(logging.ERROR)
  task_name = get_task_name()

  dataset = Custom_Dset(expert_path=expert_path)

  savedir_fname, pi = learn(env,
                        policy_fn,
                        dataset,
                        max_iters=1e3,
                        ckpt_dir='checkpoint',
                        # log_dir=osp.join('log', task_name),
                        task_name=task_name,
                        verbose=True)
  return savedir_fname, pi
  avg_len, avg_ret = runner(env,
                            policy_fn,
                            savedir_fname,
                            timesteps_per_batch=1024,
                            number_trajs=10,
                            stochastic_policy=False,
                            save=False,
                            reuse=True)

  return avg_len, avg_ret


In [4]:
# tf.constant(pi, shape=(None, 45), dtype=np.float32)
tf.placeholder(tf.float32, (None, 45), name="ob")




<tf.Tensor 'ob:0' shape=(?, 45) dtype=float32>

In [5]:
import pandas as pd
from ast import literal_eval
import pytz

data = pd.read_csv('data/data.csv')
data['datetime'] = pd.to_datetime(
    data['timestamp'], unit='ms', utc=True).dt.tz_convert(pytz.timezone('US/Mountain'))
data.drop(['timestamp', 'scene name', 'Unnamed: 19'], axis=1, inplace=True)
data.set_index(['datetime'], inplace=True)
temp = ['rIndex position', 'rIndex rotation', 'rIndex velocity',
        'rIndex angular velocity', 'lIndex position', 'lIndex rotation',
        'lIndex velocity', 'lIndex angular velocity',
        'gaze origin', 'gaze direction',
        'head movement direction', 'head velocity', 'target velocity',
        'target angular velocity', 'target position', 'target rotation']

data[temp] = data[temp].applymap(literal_eval, na_action='ignore')
dataset = pd.DataFrame()
dataset['Is Eye Tracking Enabled and Valid'] = data['Is Eye Tracking Enabled and Valid'].resample('0.1S').mean().interpolate('time', limit_direction='both', limit=len(data['Is Eye Tracking Enabled and Valid'].index))
for col in temp:
  col_data = data[col]
  dataset[f'{col}_x'] = col_data.apply(lambda x: x if not x == x else x[0]).resample('0.1S').mean().interpolate('time', limit_direction='both', limit=len(col_data.index))
  dataset[f'{col}_y'] = col_data.apply(lambda x: x if not x == x else x[1]).resample('0.1S').mean().interpolate('time', limit_direction='both', limit=len(col_data.index))
  dataset[f'{col}_z'] = col_data.apply(lambda x: x if not x == x else x[2]).resample('0.1S').mean().interpolate('time', limit_direction='both', limit=len(col_data.index))
  if col.endswith('rotation'):
    dataset[f'{col}_w'] = col_data.apply(lambda x: x if not x == x else x[3]).resample('0.1S').mean().interpolate('time', limit_direction='both', limit=len(col_data.index))


events = pd.read_csv('data/events.csv', usecols=['timestamp', 'event'])
events['datetime'] = pd.to_datetime(
    events['timestamp'], unit='ms', utc=True).dt.tz_convert(pytz.timezone('US/Mountain'))
events.set_index(['datetime'], inplace=True)
collision_events = events[events['event'] == 'Left IndexTip']
target_events = events[events['event'].str.match(r'^target')]
target_events.tail()
del events

target_found = target_events[target_events['event'] == 'target_found']
target_lost = target_events[target_events['event'] == 'target_lost']

final_res = []
for found, row in target_found.iterrows():
  lost = target_lost[target_lost.index > found].iloc[0].name
  mask = ((dataset.index >= found) & (dataset.index <= lost))
  obs = dataset[mask].iloc[:-1, 0:-7]
  acs = dataset[mask].iloc[1:, -7:]
  final_res.append({'obs':obs, 'acs':acs})


In [6]:
concat_obs = None
concat_acs = None
for i in final_res:
  if concat_obs is None:
    concat_obs = i['obs']
    concat_acs = i['acs']
    continue
  concat_obs = np.concatenate([concat_obs, i['obs']])
  concat_acs = np.concatenate([concat_acs, i['acs']])

np.savez('test.npz', obs=concat_obs, acs=concat_acs)

In [7]:
_, pi = make_session('test.npz')



Total transitions: 19

Instructions for updating:
Use keras.layers.flatten instead.





AttributeError: 'MlpPolicy' object has no attribute 'ac'

In [None]:
pi.get_variables()

In [None]:
from tf2onnx import tf_loader

tf_loader.from_checkpoint('checkpoint/BC.custom.traj_limitation_-1.seed_0', 'input:0', 'output:0')