# Deep RL Assignment 1: Imitation Learning
## Prepare

In [1]:
# Import dependencies
import sys,os
sys.path.append(os.path.abspath('..'))
import tensorflow as tf
import numpy as np
import tf_util
import gym

# Absolute path of the parent of the current working directory; the expert
# policy pickles are read from <home_work_root>/experts/<env_name>.pkl.
home_work_root=os.path.abspath('..')


def collect_expert_data(env_name, max_timesteps=None, num_rollouts=20, render=False):
    """Roll out the stored expert policy for `env_name` and record its behaviour.

    The expert is loaded from ``<home_work_root>/experts/<env_name>.pkl`` and
    run inside a fresh TensorFlow session. Progress and the mean / standard
    deviation of the per-rollout returns are printed.

    Args:
        env_name: Gym environment id; also the basename of the expert pickle.
        max_timesteps: cap on steps per rollout. When falsy, the environment's
            ``spec.timestep_limit`` is used instead.
        num_rollouts: number of episodes to run.
        render: when true, render every step and close the viewer afterwards.

    Returns:
        dict with two parallel lists: ``'observation'`` (the observation the
        expert saw at each step) and ``'actions'`` (the raw policy output for
        that observation). The policy is fed ``obs[None, :]``, so each action
        presumably carries a leading batch axis of size 1 -- TODO confirm.
    """
    from load_policy import load_policy
    policy_fn = load_policy(os.path.join(home_work_root, 'experts', env_name+'.pkl'))

    with tf.Session():
        tf_util.initialize()
        env = gym.make(env_name)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                # Store the observation the action was computed from, not the successor.
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 10 == 0:
                    # `steps` already counts the step just taken; printing steps+1
                    # produced readings such as "1001/1000".
                    print("\rIter{0}/{1}: {2}/{3}".format(i+1, num_rollouts, steps, max_steps), end='')
                if steps >= max_steps:
                    break
            returns.append(totalr)
        if render:
            # Close the viewer window, matching what test_model does.
            env.render(close=True)
        print('. done')

        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
    return {'observation': observations, 'actions': actions}

def test_policy(env_name, policy, max_timesteps=None, num_rollouts=20, render=False):
    return None

In [11]:
import pickle

# Run the expert for 100 rollouts in each environment, then cache everything in
# a single pickle so later cells can reload instead of re-collecting.
env_names = ['Ant-v1', 'Reacher-v1', 'Hopper-v1']
expert_data = dict(
    (name, collect_expert_data(name, num_rollouts=100)) for name in env_names
)
with open('expert_data.pkl', 'wb') as f:
    pickle.dump(expert_data, f)

[2017-07-06 09:32:33,796] Making new env: Ant-v1


Iter100/100: 1001/1000. done
mean return 4783.75190287
std of return 288.136414554


[2017-07-06 09:34:40,411] Making new env: Reacher-v1


Iter100/100: 51/50. done
mean return -3.77229226691
std of return 1.78029449976


[2017-07-06 09:34:48,156] Making new env: Hopper-v1


Iter100/100: 1001/1000. done
mean return 3778.26273809
std of return 3.45802476193


In [12]:
import pickle

# Rebuild `expert_data` from the on-disk cache; an empty or falsy payload
# leaves the dictionary empty.
expert_data = {}
with open('expert_data.pkl', 'rb') as f:
    loaded = pickle.load(f)
expert_data.update(loaded or {})

In [55]:
from keras.models  import Sequential
from keras.layers import Flatten, Dense, Dropout
from keras.layers.normalization import BatchNormalization

def build_model(env_name):
    """Create the environment and an (uncompiled) feed-forward policy network.

    Layer widths are derived from the flattened observation size and the
    action dimension of the environment. The action space must be
    one-dimensional.

    Returns:
        (model, env): the Sequential model and the environment it was sized for.
    """
    from functools import reduce

    env = gym.make(env_name)
    assert len(env.action_space.shape) == 1

    obs_shape = env.observation_space.shape
    n_act = env.action_space.shape[0]
    # Total number of scalar observation features.
    n_obs = reduce(lambda acc, dim: acc * dim, obs_shape)

    model = Sequential()

    stack = []
    if len(obs_shape) > 1:
        # Multi-dimensional observations are flattened before normalisation.
        stack.append(Flatten(input_shape=obs_shape))
    stack.append(BatchNormalization(input_shape=(n_obs,)))
    stack.append(Dense(n_obs*8, activation='relu'))
    stack.append(Dropout(0.2))
    stack.append(Dense(n_obs*4, activation='relu'))
    stack.append(Dense(n_obs*4, activation='relu'))
    stack.append(Dense(n_obs, activation='relu'))
    stack.append(Dropout(0.25))
    stack.append(Dense(n_act*2, activation='relu'))
    # Linear head: one unbounded output per action component.
    stack.append(Dense(n_act, activation='linear'))

    for layer in stack:
        model.add(layer)

    return model, env

def train_model(model, env, x, y, batch_size=20, epochs=20):
    """Compile `model` for regression and fit it to (x, y), logging to TensorBoard.

    Args:
        model: model object exposing ``summary()``, ``compile()`` and ``fit()``.
        env: environment whose ``spec.id`` is used to name the TensorBoard run
            directory (``./logs/<id>_<timestamp>``).
        x: training inputs, passed straight to ``model.fit``.
        y: training targets, passed straight to ``model.fit``.
        batch_size: mini-batch size for ``model.fit``.
        epochs: number of passes over the data.

    Returns:
        Whatever ``model.fit`` returns (earlier versions discarded it).
    """
    from keras.callbacks import TensorBoard
    from datetime import datetime

    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted a stray "None" line.
    model.summary()
    # The targets are continuous actions, so classification accuracy carries no
    # information; track mean absolute error next to the MSE loss instead.
    model.compile(loss='mean_squared_error',
                  optimizer='sgd',
                  metrics=['mae'])
    tensorboard=TensorBoard(log_dir='./logs/'+env.spec.id+'_'+datetime.now().strftime("%Y%m%d%H%M%S"),
                            histogram_freq=0, write_graph=True, write_images=True)
    return model.fit(x, y, batch_size=batch_size, epochs=epochs, verbose=2,
                     callbacks=[tensorboard])

def imitation(env_name, batch_size=20, epochs=20):
    """Fit a fresh model to the cached expert demonstrations for `env_name`.

    Reads the 'observation' and 'actions' lists from the module-level
    `expert_data`, builds a network sized for the environment and trains it
    to map observations to actions.

    Returns:
        The trained model.
    """
    demos = expert_data[env_name]
    expert_obs = demos['observation']
    expert_act = demos['actions']

    model, env = build_model(env_name)
    # Squeeze removes the size-1 axes from the recorded actions.
    train_model(
        model,
        env,
        x=np.array(expert_obs),
        y=np.squeeze(expert_act),
        batch_size=batch_size,
        epochs=epochs,
    )
    return model

def test_model(env_name, model,max_timesteps=None, num_rollouts=20, render=False):
    env = gym.make(env_name)
    max_steps = max_timesteps or env.spec.timestep_limit

    returns = []
    observations = []
    actions = []
    for i in range(num_rollouts):
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = model.predict(obs[np.newaxis])
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if render:
                env.render()
            if steps % 10 == 0:
                print("\rIter{0}/{1}: {2}/{3}".format(i+1, num_rollouts, steps+1, max_steps), end='')
            if steps >= max_steps:
                break
        returns.append(totalr)
    if render:
        env.render(close=True)
    print('. done')

    #print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))

In [56]:
# Fit a model to the cached Ant-v1 expert data (observations -> actions)
# for 30 epochs with a batch size of 32.
model = imitation('Ant-v1', batch_size=32, epochs=30)

[2017-07-06 17:01:44,476] Making new env: Ant-v1


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_19 (Batc (None, 111)               444       
_________________________________________________________________
dense_65 (Dense)             (None, 888)               99456     
_________________________________________________________________
dropout_31 (Dropout)         (None, 888)               0         
_________________________________________________________________
dense_66 (Dense)             (None, 444)               394716    
_________________________________________________________________
dense_67 (Dense)             (None, 444)               197580    
_________________________________________________________________
dense_68 (Dense)             (None, 111)               49395     
_________________________________________________________________
dropout_32 (Dropout)         (None, 111)               0         
__________

In [33]:
# Roll the trained model out in Ant-v1 for 20 episodes and print the return statistics.
# NOTE(review): this cell's execution count (33) precedes the training cell's (56),
# so the recorded output presumably comes from an earlier model -- re-run after training.
test_model('Ant-v1', model, num_rollouts=20, render=False)

[2017-07-06 13:35:02,827] Making new env: Ant-v1


Iter20/20: 1001/1000. done
mean return 3344.86546998
std of return 2076.13817039


In [41]:
# NOTE(review): no cell shown here assigns `env` at top level; this presumably relies on
# leftover kernel state and likely fails on a fresh Restart & Run All -- confirm or remove.
env.spec.id

'Ant-v1'