# Deep RL Assignment 1: Imitation Learning

__[Starter Code](https://github.com/berkeleydeeprlcourse/homework/tree/master/hw1)__

### Imports

In [38]:
import os
import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
import tf_util
import load_policy
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.utils import shuffle

## Generate roll-outs

In [39]:
roll_outs = ["Ant-v2", "HalfCheetah-v2", "Hopper-v2", "Humanoid-v2", "Reacher-v2", "Walker2d-v2"]
#roll_outs = ["Hopper-v2"]
num_rollouts = 20;
render = False;

In [40]:

for roll_out in roll_outs:
    expert = "experts/" + roll_out + ".pkl"
    print("loading and building expert policy for " + roll_out)
    policy_fn = load_policy.load_policy(expert)
    print(roll_out + " loaded and built")

    with tf.Session():
        tf_util.initialize()

        env = gym.make(roll_out)
        # max_steps = env.spec.timestep_limit
        max_steps = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')

        returns = []
        observations = []
        actions = []
        step_count = []
        
        for i in range(num_rollouts):
            # print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                # if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)
            step_count.append(steps)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions),
                       'returns': np.array(returns),
                       'steps': np.array(step_count)
                      }
        
# if the expert_data directory doesn't exist, make it
output_dir = 'expert_data'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)   
    
# store the roll outs in expert_data folder    
for roll_out in roll_outs:
        with open(os.path.join('expert_data', roll_out + '.pkl'), 'wb') as f:
            pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
            
print("Roll-outs generated and stored in data folder.") 

loading and building expert policy for Ant-v2
obs (1, 111) (1, 111)
Ant-v2 loaded and built
returns [4770.287577910028, 4835.357651046034, 4825.587035960754, 4852.304003274105, 4784.8834799021515, 4552.636252745802, 4585.849507322293, 2355.5772964509724, 4664.178043039128, 4710.926928526234, 4668.690227514381, 4812.332008383825, 4766.633324025868, 4893.065891524669, 4834.5992607913495, 4683.787742630899, 2461.9049442401874, 4839.247704650443, 4666.345814550789, 4627.270886627383]
mean return 4509.573279055865
std of return 706.5844080982052
loading and building expert policy for HalfCheetah-v2
obs (1, 17) (1, 17)
HalfCheetah-v2 loaded and built
returns [4200.850132120267, 4129.352976808337, 4095.6153395645674, 4145.425097176482, 4193.860396908735, 4144.751966443285, 4115.864566071722, 4095.4784714263565, 4064.6759552449525, 4197.990465205169, 4134.952769848915, 4210.691976966783, 4041.9232784904316, 4211.146618891761, 4099.566715948844, 4113.147764701846, 4097.108485208171, 4172.104929

## Simple Sequential Model

In [41]:
for roll_out in roll_outs:
    # load policy data
    data = pickle.load(open('expert_data/'+ roll_out + '.pkl', 'rb'))
    print('---------------------------------------------------------------------------------------------------------')
    print(roll_out + ' Policy Data Loaded')
    print('Creating model for... ' + roll_out)
    
    
    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.shape[0]

    # set up sequential model
    model = Sequential()
    model.add(Dense(64, input_shape=(input_dim,), activation='relu'))
    model.add(Dense(3, activation='relu'))
    model.add(Dense(output_dim))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # randomize the data

    x, y = shuffle(data['observations'], data['actions'].reshape(-1, output_dim))

    print('---------------------------------------------------------------------------------------------------------')    
    print('Fitting model for... ' + roll_out)    
    # Train the model, iterating on the data in batches of 32 samples
    model.fit(x, y, validation_split=0.1, epochs=50, batch_size=32, verbose=2)

    print('---------------------------------------------------------------------------------------------------------') 
    print('Running model for... ' + roll_out)    
    # run the model and save for comparison
    observations = []
    actions = []
    returns = []
    step_count = []
    
    for i in range(num_rollouts):
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = model.predict(obs[None,:])
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if render:
                env.render()
                # if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
            if steps >= max_steps:
                break
        returns.append(totalr)
        step_count.append(steps)

    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))

    predict_data = {'observations': np.array(observations),
                    'actions': np.array(actions),
                    'returns': np.array(returns),
                    'steps': np.array(step_count)
                   }
        
    # if the predict_data directory doesn't exist, make it
    output_dir = 'predict_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)   

    # store the roll outs in expert_data folder    
    for roll_out in roll_outs:
            with open(os.path.join('predict_data', roll_out + '.pkl'), 'wb') as f:
                pickle.dump(predict_data, f, pickle.HIGHEST_PROTOCOL)

    print('---------------------------------------------------------------------------------------------------------')
    print('Comparing ' + roll_out + ' to expert.') 
    
    expert = pickle.load(open('expert_data/' + roll_out + '.pkl', 'rb'))
    predict =pickle.load(open('predict_data/' + roll_out + '.pkl', 'rb'))
    
    expert_stats = pd.Series({
        'mean reward': data['returns'].mean(),
        'std reward': data['returns'].std(),
        '% full rollout': (data['steps']/data['steps'].max()).mean()
    })
    predict_stats = pd.Series({
        'mean reward': predict_data['returns'].mean(),
        'std reward': predict_data['returns'].std(),
        '% full rollout': (predict_data['steps']/predict_data['steps'].max()).mean()
    })
    
    df = pd.DataFrame({
        'expert': expert_stats,
        'predict': predict_stats
    })
    
    print('Summary statistics for: ' + roll_out)
    print(df)
    

---------------------------------------------------------------------------------------------------------
Ant-v2 Policy Data Loaded
Creating model for... Ant-v2
---------------------------------------------------------------------------------------------------------
Fitting model for... Ant-v2
Train on 18000 samples, validate on 2000 samples
Epoch 1/50
 - 2s - loss: 0.9867 - mean_absolute_error: 0.7781 - val_loss: 0.5428 - val_mean_absolute_error: 0.5749
Epoch 2/50
 - 1s - loss: 0.4280 - mean_absolute_error: 0.5099 - val_loss: 0.3538 - val_mean_absolute_error: 0.4632
Epoch 3/50
 - 1s - loss: 0.3206 - mean_absolute_error: 0.4433 - val_loss: 0.2896 - val_mean_absolute_error: 0.4214
Epoch 4/50
 - 2s - loss: 0.2713 - mean_absolute_error: 0.4088 - val_loss: 0.2529 - val_mean_absolute_error: 0.3947
Epoch 5/50
 - 1s - loss: 0.2467 - mean_absolute_error: 0.3901 - val_loss: 0.2373 - val_mean_absolute_error: 0.3808
Epoch 6/50
 - 1s - loss: 0.2315 - mean_absolute_error: 0.3783 - val_loss: 0.2189 

Epoch 8/50
 - 1s - loss: 0.5273 - mean_absolute_error: 0.5384 - val_loss: 0.5176 - val_mean_absolute_error: 0.5338
Epoch 9/50
 - 1s - loss: 0.5242 - mean_absolute_error: 0.5360 - val_loss: 0.5154 - val_mean_absolute_error: 0.5316
Epoch 10/50
 - 1s - loss: 0.5219 - mean_absolute_error: 0.5354 - val_loss: 0.5126 - val_mean_absolute_error: 0.5308
Epoch 11/50
 - 1s - loss: 0.5189 - mean_absolute_error: 0.5337 - val_loss: 0.5109 - val_mean_absolute_error: 0.5300
Epoch 12/50
 - 1s - loss: 0.5169 - mean_absolute_error: 0.5328 - val_loss: 0.5096 - val_mean_absolute_error: 0.5292
Epoch 13/50
 - 1s - loss: 0.5155 - mean_absolute_error: 0.5320 - val_loss: 0.5084 - val_mean_absolute_error: 0.5295
Epoch 14/50
 - 1s - loss: 0.5140 - mean_absolute_error: 0.5311 - val_loss: 0.5080 - val_mean_absolute_error: 0.5300
Epoch 15/50
 - 1s - loss: 0.5133 - mean_absolute_error: 0.5308 - val_loss: 0.5058 - val_mean_absolute_error: 0.5266
Epoch 16/50
 - 1s - loss: 0.5126 - mean_absolute_error: 0.5303 - val_loss:

Epoch 18/50
 - 1s - loss: 0.1870 - mean_absolute_error: 0.3430 - val_loss: 0.1887 - val_mean_absolute_error: 0.3453
Epoch 19/50
 - 1s - loss: 0.1862 - mean_absolute_error: 0.3425 - val_loss: 0.1875 - val_mean_absolute_error: 0.3446
Epoch 20/50
 - 1s - loss: 0.1860 - mean_absolute_error: 0.3422 - val_loss: 0.1847 - val_mean_absolute_error: 0.3407
Epoch 21/50
 - 1s - loss: 0.1851 - mean_absolute_error: 0.3413 - val_loss: 0.1844 - val_mean_absolute_error: 0.3421
Epoch 22/50
 - 1s - loss: 0.1847 - mean_absolute_error: 0.3412 - val_loss: 0.1836 - val_mean_absolute_error: 0.3412
Epoch 23/50
 - 1s - loss: 0.1843 - mean_absolute_error: 0.3408 - val_loss: 0.1841 - val_mean_absolute_error: 0.3412
Epoch 24/50
 - 1s - loss: 0.1841 - mean_absolute_error: 0.3407 - val_loss: 0.1836 - val_mean_absolute_error: 0.3412
Epoch 25/50
 - 1s - loss: 0.1836 - mean_absolute_error: 0.3401 - val_loss: 0.1832 - val_mean_absolute_error: 0.3397
Epoch 26/50
 - 1s - loss: 0.1833 - mean_absolute_error: 0.3399 - val_los

Epoch 28/50
 - 1s - loss: 0.1844 - mean_absolute_error: 0.3398 - val_loss: 0.1887 - val_mean_absolute_error: 0.3426
Epoch 29/50
 - 1s - loss: 0.1844 - mean_absolute_error: 0.3398 - val_loss: 0.1910 - val_mean_absolute_error: 0.3430
Epoch 30/50
 - 1s - loss: 0.1843 - mean_absolute_error: 0.3397 - val_loss: 0.1879 - val_mean_absolute_error: 0.3415
Epoch 31/50
 - 1s - loss: 0.1840 - mean_absolute_error: 0.3393 - val_loss: 0.1897 - val_mean_absolute_error: 0.3442
Epoch 32/50
 - 1s - loss: 0.1840 - mean_absolute_error: 0.3395 - val_loss: 0.1883 - val_mean_absolute_error: 0.3420
Epoch 33/50
 - 2s - loss: 0.1835 - mean_absolute_error: 0.3391 - val_loss: 0.1889 - val_mean_absolute_error: 0.3412
Epoch 34/50
 - 1s - loss: 0.1836 - mean_absolute_error: 0.3390 - val_loss: 0.1890 - val_mean_absolute_error: 0.3434
Epoch 35/50
 - 1s - loss: 0.1834 - mean_absolute_error: 0.3391 - val_loss: 0.1885 - val_mean_absolute_error: 0.3409
Epoch 36/50
 - 1s - loss: 0.1829 - mean_absolute_error: 0.3386 - val_los

Epoch 38/50
 - 1s - loss: 0.1811 - mean_absolute_error: 0.3377 - val_loss: 0.1806 - val_mean_absolute_error: 0.3375
Epoch 39/50
 - 1s - loss: 0.1809 - mean_absolute_error: 0.3376 - val_loss: 0.1799 - val_mean_absolute_error: 0.3368
Epoch 40/50
 - 1s - loss: 0.1808 - mean_absolute_error: 0.3376 - val_loss: 0.1804 - val_mean_absolute_error: 0.3370
Epoch 41/50
 - 1s - loss: 0.1806 - mean_absolute_error: 0.3375 - val_loss: 0.1802 - val_mean_absolute_error: 0.3372
Epoch 42/50
 - 1s - loss: 0.1805 - mean_absolute_error: 0.3373 - val_loss: 0.1802 - val_mean_absolute_error: 0.3379
Epoch 43/50
 - 1s - loss: 0.1803 - mean_absolute_error: 0.3368 - val_loss: 0.1815 - val_mean_absolute_error: 0.3377
Epoch 44/50
 - 1s - loss: 0.1801 - mean_absolute_error: 0.3369 - val_loss: 0.1802 - val_mean_absolute_error: 0.3374
Epoch 45/50
 - 1s - loss: 0.1803 - mean_absolute_error: 0.3373 - val_loss: 0.1796 - val_mean_absolute_error: 0.3362
Epoch 46/50
 - 1s - loss: 0.1800 - mean_absolute_error: 0.3367 - val_los

Epoch 48/50
 - 1s - loss: 0.1801 - mean_absolute_error: 0.3366 - val_loss: 0.1794 - val_mean_absolute_error: 0.3361
Epoch 49/50
 - 1s - loss: 0.1801 - mean_absolute_error: 0.3367 - val_loss: 0.1803 - val_mean_absolute_error: 0.3369
Epoch 50/50
 - 1s - loss: 0.1801 - mean_absolute_error: 0.3366 - val_loss: 0.1787 - val_mean_absolute_error: 0.3359
---------------------------------------------------------------------------------------------------------
Running model for... Walker2d-v2
returns [495.9823004293302, 1032.8941219310245, 1037.366490633943, 1037.46990671696, 1035.2419977414654, 1034.2657985363358, 1040.3339929588958, 1035.2732704691675, 463.78894321124, 1041.8043697176386, 1036.2180229548624, 1034.3206955629996, 471.08206619439034, 1033.3647602122662, 489.18579505689564, 1034.1038341740057, 1035.1245480571554, 1039.2934878936978, 1045.9803786331033, 1032.1634804146797]
mean return 925.2629130750029
std of return 222.72608897730007
------------------------------------------------