<a href="https://colab.research.google.com/github/BingyuZhou/DRLBook/blob/master/ImitationLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import torch
import torch.nn as nn
import numpy as np
import gym
torch.__version__

'1.6.0+cu101'

## Imitation Learning (DAGGER)

Given an expert policy ($\pi_{exp}$) as a black box, train a new neural network ($\pi_\theta$) to imitate the expert's policy distribution.

DAGGER algorithm (Dataset aggregation):
---
while training:
- train $\pi_\theta$ from expert dataset $D= \{o_1,a_1,...,o_n,a_n \}$
- run $\pi_\theta$ to get dataset $D_{\pi} = \{o_1,...,o_n \}$
- relabel $D_{\pi}$ with the expert policy to get $D_{\pi}^{expert} = \{o_1,a_1,...,o_n,a_n \}$
- aggregate: $D \leftarrow D \cup D_{\pi}^{expert}$



In [2]:
class Model(nn.Module):
  """Two-layer perceptron: Linear -> ReLU -> Linear.

  Args:
    feature_dim: size of the last dimension of the input.
    output_dim: size of the last dimension of the output.
    hidden_dim: width of the hidden layer (defaults to 256).
  """

  def __init__(self, feature_dim, output_dim, hidden_dim=256):
    super(Model, self).__init__()
    self.dense1 = nn.Linear(feature_dim, hidden_dim)
    # The second layer has to consume exactly what the first one emits;
    # sharing `hidden_dim` keeps the two widths from drifting apart.
    self.dense2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Return the un-activated output of the second layer for input `x`."""
    hidden = nn.functional.relu(self.dense1(x))
    return self.dense2(hidden)


class Agent():
  """Gaussian policy fitted to expert actions with a mean-squared-error loss.

  Attributes:
    mu: network producing the mean action for an observation.
    log_std: per-action-dimension log standard deviation of the sampling noise.
    loss: the MSE criterion used in `train`.
    optimizer: Adam optimizer over the parameters of `mu`.
  """

  def __init__(self, feature_dim, output_dim):
    self.mu = Model(feature_dim, output_dim)
    log_std = -0.5*np.ones(output_dim, dtype=np.float32)
    # Stored as a Parameter, but it is NOT handed to the optimizer below, so
    # it is not trained: the noise scale stays at its initial value.
    self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))
    self.loss = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.mu.parameters())

  def train(self, iters, batch_size, dataset):
    """Run gradient steps that regress sampled actions onto expert actions.

    Args:
      iters: number of gradient steps (int), or any iterable whose length
        determines the number of steps.
      batch_size: number of samples requested from `dataset` per step.
      dataset: object whose `sample(batch_size)` returns a pair
        `(observations, expert_actions)`.
    """
    steps = range(iters) if isinstance(iters, int) else iters
    for _ in steps:
      obs, expert_act = dataset.sample(batch_size)
      # The buffer may hand back NumPy arrays; the loss needs tensors.
      expert_act = torch.as_tensor(expert_act, dtype=torch.float32)
      act = self.inference(obs)
      loss = self.loss(act, expert_act)

      self.optimizer.zero_grad()
      loss.backward()
      # Clip every gradient entry to [-1, 1] in place before the update.
      torch.nn.utils.clip_grad_value_(self.mu.parameters(), 1.0)
      self.optimizer.step()

  def inference(self, x):
    """Sample an action: mean(x) + exp(log_std) * standard normal noise.

    `x` may be a tensor or anything `torch.as_tensor` accepts (for example
    a NumPy array); it is converted to float32 first.
    """
    x = torch.as_tensor(x, dtype=torch.float32)
    mean = self.mu(x)
    noise = torch.randn_like(mean)
    return mean + torch.exp(self.log_std) * noise

class DataSet():
  """Fixed-capacity ring buffer of (observation, action) pairs.

  Once `size` pairs have been stored, each new pair overwrites the oldest
  one.

  Args:
    size: maximum number of pairs held at once.
    obs_dim: length of one observation vector.
    act_dim: length of one action vector.
  """

  def __init__(self, size, obs_dim, act_dim):
    self.observation = np.zeros((size, obs_dim), dtype=np.float32)
    self.action = np.zeros((size, act_dim), dtype=np.float32)

    self._pos = 0     # row the next `add` writes to
    self._size = size
    self._count = 0   # number of rows that hold real data

  def __len__(self):
    """Return how many pairs are currently stored."""
    return self._count

  def add(self, obs, act):
    """Store one (observation, action) pair at the current write position."""
    self.observation[self._pos,:] = obs
    self.action[self._pos,:] = act
    self._pos = (self._pos+1) % self._size
    self._count = min(self._count+1, self._size)

  def sample(self, size):
    """Return `size` distinct stored pairs chosen uniformly at random.

    Returns:
      A pair `(observations, actions)` of float32 arrays with `size` rows.

    Raises:
      ValueError: if the buffer is empty, or (raised by NumPy) if `size`
        exceeds the number of stored pairs.
    """
    if self._count == 0:
      raise ValueError("cannot sample from an empty DataSet")
    # Draw only from rows that were actually written; `_pos` alone is not
    # enough because it wraps back to 0 once the buffer is full.
    idxes = np.random.choice(self._count, size, replace=False)
    return self.observation[idxes], self.action[idxes]



In [3]:
def collect_training_trajs(env_fn, policy, batch_size, max_ep_len):
  """Roll out `policy` until at least `batch_size` steps have been recorded.

  Episodes always run to their end (termination or `max_ep_len` steps), so
  the result can hold more than `batch_size` entries.

  Args:
    env_fn: zero-argument callable returning an environment that follows
      the classic gym API (`reset() -> obs`,
      `step(act) -> (obs, reward, done, info)`, `close()`).
    policy: callable mapping one observation to one action.
    batch_size: minimum number of (observation, action) pairs to collect.
    max_ep_len: maximum number of steps per episode.

  Returns:
    A tuple `(observations, actions)` of two equally long lists.
  """
  traj_obs = []
  traj_act = []
  env = env_fn()
  try:
    while len(traj_obs) < batch_size:
      obs = env.reset()
      step = 0
      while True:
        traj_obs.append(obs)
        act = policy(obs)
        traj_act.append(act)

        # Classic gym returns four values; reward and info are not needed.
        obs, _rew, done, _info = env.step(act)
        step += 1

        if done or step >= max_ep_len:
          break
  finally:
    # Release the environment even if the policy or the env raises.
    env.close()
  return (traj_obs, traj_act)

def relabel_with_expert(trajs, expert):
  """Replace the recorded actions with the expert's actions.

  Args:
    trajs: pair `(observations, actions)`; only `trajs[0]` is read.
    expert: callable applied once to the whole `observations` collection,
      returning the expert actions for it.

  Returns:
    A new tuple `(observations, expert_actions)`. The input is left
    untouched, so both tuples and lists are accepted.
  """
  observations = trajs[0]
  return (observations, expert(observations))


In [4]:
def DAgger(iters, env_fn, agent, expert, dataset, batch_size, max_ep_len,
           train_iters=100):
  """Run the DAgger loop: train, roll out, relabel with the expert, aggregate.

  Each round trains first, so `dataset` must already contain enough
  expert-labelled samples for `agent.train` to draw a batch from.

  Args:
    iters: number of DAgger rounds (int), or an iterable whose length
      determines the number of rounds.
    env_fn: zero-argument callable creating the environment.
    agent: learner exposing `train(iters, batch_size, dataset)` and
      `inference(obs_tensor)`.
    expert: callable mapping a batch of observations to a batch of actions.
      It is called with a float32 tensor here — assumes the expert accepts
      that (TODO confirm for the expert in use).
    dataset: buffer exposing `add(obs, act)`; it is also passed to
      `agent.train`.
    batch_size: training batch size and minimum number of roll-out steps
      collected per round.
    max_ep_len: maximum episode length during roll-outs.
    train_iters: gradient steps performed per round.
  """
  def to_numpy(value):
    # Environments and the buffer work with arrays, the networks with tensors.
    if torch.is_tensor(value):
      return value.detach().cpu().numpy()
    return np.asarray(value)

  def learner_policy(obs):
    with torch.no_grad():
      act = agent.inference(torch.as_tensor(obs, dtype=torch.float32))
    return to_numpy(act)

  def expert_labels(observations):
    batch = torch.as_tensor(np.asarray(observations, dtype=np.float32))
    with torch.no_grad():
      return to_numpy(expert(batch))

  rounds = range(iters) if isinstance(iters, int) else iters
  for _ in rounds:
    agent.train(train_iters, batch_size, dataset)

    trajs = collect_training_trajs(env_fn, learner_policy, batch_size, max_ep_len)
    # Hand over a list so the helper is free to replace the action entry.
    observations, expert_actions = relabel_with_expert(list(trajs), expert_labels)
    # The buffer stores one pair per call, so aggregate sample by sample.
    for obs, act in zip(observations, expert_actions):
      dataset.add(obs, act)

In [None]:
# Path of the saved expert policy; must be filled in before running this cell.
expert_policy_path = ''
# NOTE: torch.load unpickles the file, which can execute arbitrary code.
# Only load expert checkpoints from a trusted source.
expert_policy = torch.load(expert_policy_path)
expert_policy.eval()


def env_fn():
  """Create a fresh environment instance."""
  return gym.make("BipedalWalker-v2")


def expert_action(obs):
  """Return the expert's action for a single observation as a NumPy array.

  Assumes the loaded expert maps a float32 observation tensor to an action
  tensor — TODO confirm against the saved model.
  """
  with torch.no_grad():
    return expert_policy(torch.as_tensor(obs, dtype=torch.float32)).numpy()


# Read the network and buffer sizes from the environment instead of
# hard-coding them, so they always match the chosen task.
probe_env = env_fn()
feature_dim = probe_env.observation_space.shape[0]
output_dim = probe_env.action_space.shape[0]
probe_env.close()

agent = Agent(feature_dim, output_dim)
dataset = DataSet(int(1e7), feature_dim, output_dim)
batch_size = 128
max_ep_len = 200

# DAgger trains before its first roll-out, so the buffer has to start out
# with expert demonstrations.
seed_obs, seed_act = collect_training_trajs(env_fn, expert_action, batch_size, max_ep_len)
for obs, act in zip(seed_obs, seed_act):
  dataset.add(obs, act)

DAgger(10, env_fn, agent, expert_policy, dataset, batch_size, max_ep_len)

