# Deep RL Assignment 1: Imitation Learning
## Prepare

In [None]:
# Import dependencies
import sys,os
sys.path.append(os.path.abspath('..'))
import tensorflow as tf
import numpy as np
import tf_util
import gym

# Absolute path of the parent directory; the expert policy pickles are
# looked up relative to this location.
home_work_root = os.path.abspath(os.pardir)


def collect_expert_data(env_name, max_timesteps=None, num_rollouts=20, render=False):
    """Roll out the pickled expert policy for ``env_name`` and record what it did.

    The expert is loaded from ``<home_work_root>/experts/<env_name>.pkl`` and
    run for ``num_rollouts`` episodes inside a fresh TensorFlow session. The
    mean and standard deviation of the episode returns are printed.

    Args:
        env_name: Gym environment id; also the stem of the expert pickle file.
        max_timesteps: Per-episode step cap. When falsy (``None`` or 0) the
            environment's ``env.spec.timestep_limit`` is used instead.
        num_rollouts: Number of episodes to run.
        render: When true, ``env.render()`` is called after every step.

    Returns:
        A dict with two parallel lists: ``'observation'`` holds the
        observation fed to the expert at each step, and ``'actions'`` holds
        the value ``policy_fn`` returned for that observation (the policy is
        called on ``obs[None, :]``, i.e. with a leading batch axis added).
    """
    from load_policy import load_policy
    policy_fn = load_policy(os.path.join(home_work_root, 'experts', env_name + '.pkl'))

    with tf.Session():
        tf_util.initialize()
        env = gym.make(env_name)
        returns = []
        observations = []
        actions = []
        try:
            max_steps = max_timesteps or env.spec.timestep_limit

            for i in range(num_rollouts):
                obs = env.reset()
                done = False
                totalr = 0.
                steps = 0
                while not done:
                    action = policy_fn(obs[None, :])
                    observations.append(obs)
                    actions.append(action)
                    obs, r, done, _ = env.step(action)
                    totalr += r
                    steps += 1
                    if render:
                        env.render()
                    if steps % 10 == 0:
                        # `steps` already counts the step just taken, so it is
                        # reported as-is (adding 1 would overstate progress).
                        print("\rIter{0}/{1}: {2}/{3}".format(i+1, num_rollouts, steps, max_steps), end='')
                    if steps >= max_steps:
                        break
                returns.append(totalr)
        finally:
            # The environment is local to this function; release it (and any
            # viewer it opened) even if a rollout raised.
            env.close()
        print('. done')

        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
    return {'observation': observations, 'actions': actions}

def test_policy(env_name, policy, max_timesteps=None, num_rollouts=20, render=False):
    """Placeholder for policy evaluation; currently not implemented.

    Every argument is accepted and ignored, and ``None`` is always returned.
    """
    return

In [None]:
# Environments for which expert demonstrations are gathered.
env_names = ['Humanoid-v1']

# Maps each environment id to the dict returned by collect_expert_data
# (200 expert episodes per environment).
expert_data = {
    name: collect_expert_data(name, num_rollouts=200)
    for name in env_names
}

In [None]:
from keras.models  import Sequential
from keras.layers import Flatten, Dense, Dropout
from keras.layers.normalization import BatchNormalization

def build_model(env_name):
    """Create the gym environment and an untrained Keras network sized to it.

    The network is a ``Sequential`` stack: an optional ``Flatten`` (only when
    the observation space has more than one dimension), batch normalisation,
    a ReLU layer with ``4 * n_obs`` units followed by 25% dropout, a ReLU
    layer with ``2 * n_act`` units, and a linear output layer with ``n_act``
    units. ``n_obs`` is the product of the observation-space dimensions and
    ``n_act`` is the length of the (1-D) action space.

    Args:
        env_name: Gym environment id passed to ``gym.make``.

    Returns:
        A ``(model, env)`` tuple; the model is not compiled.
    """
    env = gym.make(env_name)
    act_shape = env.action_space.shape
    assert len(act_shape) == 1

    from functools import reduce
    obs_shape = env.observation_space.shape
    n_obs = reduce(lambda acc, dim: acc * dim, obs_shape)
    n_act = act_shape[0]

    # Assemble the layers in order, then attach them one by one.
    stack = []
    if len(obs_shape) > 1:
        stack.append(Flatten(input_shape=obs_shape))
    stack.append(BatchNormalization(input_shape=(n_obs,)))
    stack.append(Dense(n_obs*4, activation='relu'))
    stack.append(Dropout(0.25))
    stack.append(Dense(n_act*2, activation='relu'))
    stack.append(Dense(n_act, activation='linear'))

    model = Sequential()
    for layer in stack:
        model.add(layer)

    return model, env

def train_model(model, env, x, y):
    """Compile ``model`` for action regression and fit it on expert data.

    The network built in this file ends in a linear layer and is fit against
    the expert's raw action vectors, so this is a regression problem: the
    loss is mean squared error and the reported metric is mean absolute
    error. (Categorical cross-entropy / accuracy assume class-probability
    targets and are not meaningful for real-valued actions.)

    Args:
        model: Uncompiled Keras model; compiled and trained in place.
        env: Unused; kept so existing callers keep working.
        x: Training inputs (observations), one row per sample.
        y: Training targets (expert actions), one row per sample.
    """
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()
    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae'])
    model.fit(x, y, batch_size=128, epochs=20, verbose=2)

# Train one cloned policy per environment. `model` and `env` from the last
# iteration remain bound at module level for the evaluation cell.
for env_name, rollouts in expert_data.items():
    observations = rollouts['observation']
    actions = rollouts['actions']
    model, env = build_model(env_name)
    train_model(
        model,
        env,
        x=np.asarray(observations),
        # Drops the singleton axes from the stacked action arrays.
        y=np.squeeze(actions),
    )

In [None]:
def test_model(env, model, max_timesteps=None, num_rollouts=20, render=False):
    """Run ``model`` as a policy in ``env`` and print return statistics.

    For each step the current observation is given a leading batch axis,
    passed to ``model.predict``, and the prediction is forwarded unchanged to
    ``env.step``. The mean and standard deviation of the per-episode returns
    are printed; nothing is returned.

    Args:
        env: Gym environment to evaluate in (uses ``reset``, ``step``,
            ``render`` and ``spec.timestep_limit``).
        model: Object exposing ``predict(batch_of_observations)``.
        max_timesteps: Per-episode step cap. When falsy (``None`` or 0),
            ``env.spec.timestep_limit`` is used.
        num_rollouts: Number of evaluation episodes.
        render: When true, ``env.render()`` is called after every step.
    """
    max_steps = max_timesteps or env.spec.timestep_limit

    returns = []
    for i in range(num_rollouts):
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = model.predict(obs[np.newaxis])
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if render:
                env.render()
            if steps % 10 == 0:
                # `steps` already counts the step just taken, so it is
                # reported as-is (adding 1 would overstate progress).
                print("\rIter{0}/{1}: {2}/{3}".format(i+1, num_rollouts, steps, max_steps), end='')
            if steps >= max_steps:
                break
        returns.append(totalr)
    # Ask the environment to tear down its render window.
    env.render(close=True)
    print('. done')

    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))

# Evaluate the most recently trained model for 30 rendered episodes.
test_model(
    env,
    model,
    num_rollouts=30,
    render=True,
)