# Unity ML Agents
## Proximal Policy Optimization (PPO)
This notebook contains an implementation of PPO as described in [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).

In [1]:
import numpy as np
import os
import tensorflow as tf

from ppo.history import *
from ppo.models import *
from ppo.trainer import Trainer
from unityagents import *

  return f(*args, **kwds)


### Hyperparameters

In [2]:
### General parameters
max_steps = 5e5 # Set maximum number of steps to run environment.
run_path = "ppo" # The sub-directory name for model and summary statistics.
load_model = False # Whether to load a saved model.
train_model = True # Whether to train the model.
summary_freq = 500 # Frequency at which to save training statistics.
save_freq = 2000 # Frequency at which to save model.
env_name = "wallarea" # Name of the training environment file.
curriculum_file = "curricula/wall.json" # Curriculum file handed to the environment; the training cell treats the string "None" as "no curriculum".

### Algorithm-specific parameters for tuning
gamma = 0.99 # Reward discount rate.
lambd = 0.95 # Lambda parameter for GAE.
time_horizon = 2048 # How many steps to collect per agent before adding to buffer.
beta = 1e-3 # Strength of entropy regularization.
num_epoch = 5 # Number of gradient descent steps per batch of experiences.
num_layers = 2 # Number of hidden layers between state/observation encoding and value/policy layers.
epsilon = 0.2 # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 5000 # How large the experience buffer should be before gradient descent (alternative value: 2048).
learning_rate = 3e-4 # Model learning rate.
hidden_units = 64 # Number of units in hidden layer.
batch_size = 512 # How many experiences per gradient descent update step (alternative value: 64).
normalize = False # Forwarded to the model and to action selection; presumably toggles input normalization -- TODO confirm.

### Logging dictionary for hyperparameters
# Handed to trainer.write_text under the 'Hyperparameters' tag by the training
# cell. Each key must spell its variable name exactly, otherwise the logged
# run configuration is mislabeled.
hyperparameter_dict = {
    'max_steps': max_steps,
    'run_path': run_path,
    'env_name': env_name,
    'curriculum_file': curriculum_file,
    'gamma': gamma,
    'lambd': lambd,
    'time_horizon': time_horizon,
    'beta': beta,
    'num_epoch': num_epoch,
    'num_layers': num_layers,
    'epsilon': epsilon,
    'buffer_size': buffer_size,
    'learning_rate': learning_rate,
    'hidden_units': hidden_units,
    'batch_size': batch_size,
    'normalize': normalize,
}

### Load the environment

In [3]:
# Start the Unity environment with the configured curriculum and show its
# description (academy, brains, reset parameters).
env = UnityEnvironment(curriculum=curriculum_file, file_name=env_name)
print(env)
# Training below drives the first external brain reported by the environment.
brain_name = env.external_brain_names[0]

INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 1
        Reset Parameters :
		min_wall_height -> 5.5
		max_wall_height -> 6.0
Unity brain name: Brain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 16
        Action space type: discrete
        Action space size (per agent): 6
        Memory space size (per agent): 0
        Action descriptions: , , , , , 


### Train the Agent(s)

In [4]:
# Clear the default TensorFlow graph before the model is built in this cell.
tf.reset_default_graph()

# The literal string "None" stands for "no curriculum".
# NOTE(review): the environment was already created with the original
# curriculum_file value in the earlier cell, so this only affects the code
# below -- confirm that is intended.
curriculum_file = None if curriculum_file == "None" else curriculum_file


def get_progress():
    """Return the curriculum progress measure passed to env.reset().

    Reads the notebook globals curriculum_file, env, steps, max_steps and
    last_reward.

    Returns:
        steps / max_steps when the curriculum measure type is "progress",
        last_reward when it is "reward", and None when no curriculum file is
        set or the measure type is anything else.
    """
    if curriculum_file is None:
        return None
    measure = env._curriculum.measure_type
    if measure == "progress":
        return steps / max_steps
    if measure == "reward":
        return last_reward
    return None

# Build the TensorFlow graph for the PPO model from the hyperparameters above.
ppo_model = create_agent_model(
    env,
    lr=learning_rate,
    h_size=hidden_units,
    epsilon=epsilon,
    beta=beta,
    max_step=max_steps,
    normalize=normalize,
    num_layers=num_layers
)

# Properties of the trained brain that the Trainer is configured with.
external_brain = env.brains[brain_name]
is_continuous = external_brain.action_space_type == "continuous"
use_observations = external_brain.number_observations > 0
use_states = external_brain.state_space_size > 0

# Checkpoints and summary statistics each live in a sub-directory named
# after the run.
model_path = './models/{}'.format(run_path)
summary_path = './summaries/{}'.format(run_path)

for directory in (model_path, summary_path):
    if not os.path.exists(directory):
        os.makedirs(directory)

# Created after the model so they cover every variable in the graph.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Run the PPO training loop, then export the trained graph.
# The environment is closed in a `finally` clause so that an exception or a
# notebook interrupt during training does not leave it open.
try:
    with tf.Session() as sess:
        # Instantiate model parameters
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            # get_checkpoint_state returns None when the directory holds no
            # checkpoint; fail with a clear message instead of an AttributeError.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise IOError(
                    'No checkpoint found in {}. Train a model first or set '
                    'load_model = False.'.format(model_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)
        # The step counter and last mean reward are read back from the graph,
        # so a restored model continues from its saved values.
        steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])
        summary_writer = tf.summary.FileWriter(summary_path)
        info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)
        if train_model:
            trainer.write_text(summary_writer, 'Hyperparameters', hyperparameter_dict, steps)
        while steps <= max_steps:
            if env.global_done:
                info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
            # Decide and take an action
            info = trainer.take_action(info, env, brain_name, steps, normalize)
            trainer.process_experiences(info, time_horizon, gamma, lambd)
            if len(trainer.training_buffer['actions']) > buffer_size and train_model:
                # Perform gradient descent with experience buffer
                trainer.update_model(batch_size, num_epoch)
            if steps % summary_freq == 0 and steps != 0 and train_model:
                # Write training statistics to tensorboard.
                trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
            if steps % save_freq == 0 and steps != 0 and train_model:
                # Save Tensorflow model
                save_model(sess, model_path=model_path, steps=steps, saver=saver)
            # Keep the Python counter and the graph's step variable in sync.
            steps += 1
            sess.run(ppo_model.increment_step)
            if len(trainer.stats['cumulative_reward']) > 0:
                # Store the mean reward in the graph and read it back;
                # get_progress() uses last_reward for reward-based curricula.
                mean_reward = np.mean(trainer.stats['cumulative_reward'])
                sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
                last_reward = sess.run(ppo_model.last_reward)
        # Final save Tensorflow model
        if steps != 0 and train_model:
            save_model(sess, model_path=model_path, steps=steps, saver=saver)
finally:
    env.close()
# Only reached when training finished without an exception.
export_graph(model_path, env_name)

Step: 500. Mean Reward: -1.1209691629955947. Std of Reward: 0.5057771391310419.
Step: 1000. Mean Reward: -1.1973913043478261. Std of Reward: 0.41598382484548757.
Step: 1500. Mean Reward: -1.1871041666666668. Std of Reward: 0.4508291546132588.
Step: 2000. Mean Reward: -1.0796171171171174. Std of Reward: 0.64438738438027.
Saved Model
Step: 2500. Mean Reward: -0.9901081081081085. Std of Reward: 0.736164816438062.
Step: 3000. Mean Reward: -0.956029411764706. Std of Reward: 0.7628448555537043.
Step: 3500. Mean Reward: -1.0393279569892475. Std of Reward: 0.6988829066269944.
Step: 4000. Mean Reward: -0.9672110552763822. Std of Reward: 0.7820478758226928.
Saved Model
Step: 4500. Mean Reward: -1.049004854368932. Std of Reward: 0.7043234066403613.
Step: 5000. Mean Reward: -1.0202525252525254. Std of Reward: 0.696692356252959.
Step: 5500. Mean Reward: -0.8336285714285717. Std of Reward: 0.8519264584529845.
Step: 6000. Mean Reward: -0.8308641975308644. Std of Reward: 0.8584217107049084.
Saved Mode

Step: 49000. Mean Reward: 0.4096069868995631. Std of Reward: 0.6784855490727332.
Step: 49500. Mean Reward: 0.4484177215189871. Std of Reward: 0.645522656933293.
Step: 50000. Mean Reward: 0.2907142857142855. Std of Reward: 0.6642757747516982.
Saved Model
Step: 50500. Mean Reward: 0.4444930875576035. Std of Reward: 0.6396119367833283.
Step: 51000. Mean Reward: 0.3630232558139533. Std of Reward: 0.7218101756637442.
Step: 51500. Mean Reward: 0.3967324561403507. Std of Reward: 0.6805805399824031.
Step: 52000. Mean Reward: 0.3532157676348546. Std of Reward: 0.7429076350894526.
Saved Model
Step: 52500. Mean Reward: 0.27647058823529386. Std of Reward: 0.7577328763357482.
Step: 53000. Mean Reward: 0.3457333333333331. Std of Reward: 0.7156619437819632.
Step: 53500. Mean Reward: 0.5109740259740259. Std of Reward: 0.5622254244489292.
Step: 54000. Mean Reward: 0.43502415458937177. Std of Reward: 0.6095399979315481.
Saved Model
Step: 54500. Mean Reward: 0.38842233009708715. Std of Reward: 0.64892746

Step: 98000. Mean Reward: 0.438862559241706. Std of Reward: 0.6060640520097476.
Saved Model
Step: 98500. Mean Reward: 0.5087179487179485. Std of Reward: 0.5655536406452304.
Step: 99000. Mean Reward: 0.411126126126126. Std of Reward: 0.6419196943560077.
Step: 99500. Mean Reward: 0.41868663594470024. Std of Reward: 0.6826974510257602.
Step: 100000. Mean Reward: 0.35380952380952363. Std of Reward: 0.6546997156613671.
Saved Model
Step: 100500. Mean Reward: 0.4926939655172412. Std of Reward: 0.6246633274990663.
Step: 101000. Mean Reward: 0.4017391304347824. Std of Reward: 0.6725771471761376.
Step: 101500. Mean Reward: 0.49020304568527895. Std of Reward: 0.47945157903637153.
Step: 102000. Mean Reward: 0.43967821782178196. Std of Reward: 0.5681636394637597.
Saved Model
Step: 102500. Mean Reward: 0.3880985915492956. Std of Reward: 0.6587465153610785.
Step: 103000. Mean Reward: 0.34099502487562167. Std of Reward: 0.6826772763589252.
Step: 103500. Mean Reward: 0.36364285714285693. Std of Reward:

Saved Model
Step: 146500. Mean Reward: 0.5989189189189187. Std of Reward: 0.46873977640247566.
Step: 147000. Mean Reward: 0.6553831417624519. Std of Reward: 0.34407024858451596.
Step: 147500. Mean Reward: 0.6335098039215684. Std of Reward: 0.3918474779583435.
Step: 148000. Mean Reward: 0.6351937984496123. Std of Reward: 0.3829482746361604.
Saved Model
Step: 148500. Mean Reward: 0.6065975103734439. Std of Reward: 0.42040515904676756.
Step: 149000. Mean Reward: 0.5455793991416308. Std of Reward: 0.5057493347682458.
Step: 149500. Mean Reward: 0.6608914728682169. Std of Reward: 0.3068039482152509.


INFO:unityagents:
Lesson changed. Now in Lesson 1 : 	min_wall_height -> 0.5, max_wall_height -> 2.0


Step: 150000. Mean Reward: 0.5290823970037452. Std of Reward: 0.4533588759076984.
Saved Model
Step: 150500. Mean Reward: 0.6436764705882351. Std of Reward: 0.3312717922955961.
Step: 151000. Mean Reward: 0.6506201550387595. Std of Reward: 0.33396520019698406.
Step: 151500. Mean Reward: 0.578734939759036. Std of Reward: 0.4814180713911187.
Step: 152000. Mean Reward: 0.5920080321285139. Std of Reward: 0.43907805923794424.
Saved Model
Step: 152500. Mean Reward: 0.6391439688715952. Std of Reward: 0.35072832321452263.
Step: 153000. Mean Reward: 0.6772340425531913. Std of Reward: 0.3635165743594264.
Step: 153500. Mean Reward: 0.6169618055555554. Std of Reward: 0.47936052810207014.
Step: 154000. Mean Reward: 0.5474999999999999. Std of Reward: 0.5815836855804445.
Saved Model
Step: 154500. Mean Reward: 0.6386815068493149. Std of Reward: 0.46556993861308865.
Step: 155000. Mean Reward: 0.6221828358208954. Std of Reward: 0.4334896065179906.
Step: 155500. Mean Reward: 0.5985742971887549. Std of Rewa

INFO:unityagents:
Lesson changed. Now in Lesson 2 : 	min_wall_height -> 1.0, max_wall_height -> 2.5


Step: 180000. Mean Reward: 0.5116216216216214. Std of Reward: 0.45560672314336204.
Saved Model
Step: 180500. Mean Reward: 0.6253497942386831. Std of Reward: 0.389194563821475.
Step: 181000. Mean Reward: 0.5878870292887028. Std of Reward: 0.42681244325184087.
Step: 181500. Mean Reward: 0.5489497716894975. Std of Reward: 0.47283975703538433.
Step: 182000. Mean Reward: 0.5623404255319147. Std of Reward: 0.4453490754498928.
Saved Model
Step: 182500. Mean Reward: 0.6199559471365635. Std of Reward: 0.3176680557658559.
Step: 183000. Mean Reward: 0.6056249999999999. Std of Reward: 0.41504822944098535.
Step: 183500. Mean Reward: 0.62884. Std of Reward: 0.3488232423448874.
Step: 184000. Mean Reward: 0.6247180451127818. Std of Reward: 0.4152818766124944.
Saved Model
Step: 184500. Mean Reward: 0.636706349206349. Std of Reward: 0.3698920170068582.
Step: 185000. Mean Reward: 0.6088385826771652. Std of Reward: 0.443373486744074.
Step: 185500. Mean Reward: 0.6551185770750986. Std of Reward: 0.30419917

Saved Model
Step: 228500. Mean Reward: 0.5550216450216449. Std of Reward: 0.494480683867065.
Step: 229000. Mean Reward: 0.5405627705627704. Std of Reward: 0.49600013458238496.
Step: 229500. Mean Reward: 0.563048245614035. Std of Reward: 0.46933489926955224.


INFO:unityagents:
Lesson changed. Now in Lesson 3 : 	min_wall_height -> 1.5, max_wall_height -> 3.0


Step: 230000. Mean Reward: 0.5288431372549018. Std of Reward: 0.39988961087314495.
Saved Model
Step: 230500. Mean Reward: 0.5842660550458714. Std of Reward: 0.3940575364826174.
Step: 231000. Mean Reward: 0.5847321428571427. Std of Reward: 0.40330655617352795.
Step: 231500. Mean Reward: 0.5646527777777776. Std of Reward: 0.40092311860171026.
Step: 232000. Mean Reward: 0.5398786407766988. Std of Reward: 0.4441596295636006.
Saved Model
Step: 232500. Mean Reward: 0.5622558139534881. Std of Reward: 0.4392071658787243.
Step: 233000. Mean Reward: 0.5622844827586206. Std of Reward: 0.4661033407324752.
Step: 233500. Mean Reward: 0.5803695652173911. Std of Reward: 0.4142528296823222.
Step: 234000. Mean Reward: 0.6233549783549781. Std of Reward: 0.32248141589982693.
Saved Model
Step: 234500. Mean Reward: 0.6248695652173911. Std of Reward: 0.33727195894576734.
Step: 235000. Mean Reward: 0.6205118110236219. Std of Reward: 0.3932918486403658.
Step: 235500. Mean Reward: 0.5914403292181067. Std of Rew

INFO:unityagents:
Lesson changed. Now in Lesson 4 : 	min_wall_height -> 2.0, max_wall_height -> 3.5


Step: 260000. Mean Reward: 0.603450704225352. Std of Reward: 0.3187849307557822.
Saved Model
Step: 260500. Mean Reward: 0.619111111111111. Std of Reward: 0.3625813148475019.
Step: 261000. Mean Reward: 0.5704872881355931. Std of Reward: 0.46385072555847146.
Step: 261500. Mean Reward: 0.572142857142857. Std of Reward: 0.4258951781907404.
Step: 262000. Mean Reward: 0.5535355648535563. Std of Reward: 0.5097767476230646.
Saved Model
Step: 262500. Mean Reward: 0.6001508620689653. Std of Reward: 0.3873243190179325.
Step: 263000. Mean Reward: 0.5885805084745761. Std of Reward: 0.45116280776633827.
Step: 263500. Mean Reward: 0.5635042735042733. Std of Reward: 0.46089135349377836.
Step: 264000. Mean Reward: 0.5562612612612611. Std of Reward: 0.47419988015303.
Saved Model
Step: 264500. Mean Reward: 0.5616063348416287. Std of Reward: 0.4400560677519397.
Step: 265000. Mean Reward: 0.5427722772277226. Std of Reward: 0.44564975105747184.
Step: 265500. Mean Reward: 0.5447737556561083. Std of Reward: 0

Saved Model
Step: 308500. Mean Reward: 0.5890170940170939. Std of Reward: 0.43001850452940876.
Step: 309000. Mean Reward: 0.619478260869565. Std of Reward: 0.36147786865506754.
Step: 309500. Mean Reward: 0.6096666666666665. Std of Reward: 0.408292548983637.
Step: 310000. Mean Reward: 0.4693137254901959. Std of Reward: 0.5252394594959626.
Saved Model
Step: 310500. Mean Reward: 0.5995661157024792. Std of Reward: 0.4204669749619732.
Step: 311000. Mean Reward: 0.608676470588235. Std of Reward: 0.4062141158721122.
Step: 311500. Mean Reward: 0.6054893617021275. Std of Reward: 0.40574396456440814.
Step: 312000. Mean Reward: 0.6088744588744586. Std of Reward: 0.3434483102523339.
Saved Model
Step: 312500. Mean Reward: 0.6174795081967211. Std of Reward: 0.3905669213744161.
Step: 313000. Mean Reward: 0.6030930232558138. Std of Reward: 0.33946516168501994.
Step: 313500. Mean Reward: 0.6325103734439832. Std of Reward: 0.33152549593765346.
Step: 314000. Mean Reward: 0.5907142857142855. Std of Reward

INFO:unityagents:
Lesson changed. Now in Lesson 5 : 	min_wall_height -> 2.5, max_wall_height -> 4.0


Step: 320000. Mean Reward: 0.5436466165413533. Std of Reward: 0.42834404335887927.
Saved Model
Step: 320500. Mean Reward: 0.5769191919191916. Std of Reward: 0.4135505948250531.
Step: 321000. Mean Reward: 0.4474226804123709. Std of Reward: 0.5921514649342204.
Step: 321500. Mean Reward: 0.4802020202020199. Std of Reward: 0.5635251238839704.
Step: 322000. Mean Reward: 0.4568817204301072. Std of Reward: 0.5516051425809693.
Saved Model
Step: 322500. Mean Reward: 0.40629120879120845. Std of Reward: 0.5948751903723309.
Step: 323000. Mean Reward: 0.38267441860465096. Std of Reward: 0.6287488488474454.
Step: 323500. Mean Reward: 0.45502564102564075. Std of Reward: 0.5858545145025775.
Step: 324000. Mean Reward: 0.47322580645161266. Std of Reward: 0.5144545839079672.
Saved Model
Step: 324500. Mean Reward: 0.43154696132596654. Std of Reward: 0.5831326945964711.
Step: 325000. Mean Reward: 0.46089005235602065. Std of Reward: 0.5818419420281926.
Step: 325500. Mean Reward: 0.42050505050505027. Std of 

Saved Model
Step: 368500. Mean Reward: 0.4910937499999997. Std of Reward: 0.48825299918273685.
Step: 369000. Mean Reward: 0.4543850267379676. Std of Reward: 0.5693884714617566.
Step: 369500. Mean Reward: 0.4724734042553189. Std of Reward: 0.5485661913697774.
Step: 370000. Mean Reward: 0.3603023255813952. Std of Reward: 0.6124958674709654.
Saved Model
Step: 370500. Mean Reward: 0.49582887700534733. Std of Reward: 0.5136867192980865.
Step: 371000. Mean Reward: 0.5162686567164176. Std of Reward: 0.4878567784517094.
Step: 371500. Mean Reward: 0.5153640776699026. Std of Reward: 0.5128563061408344.
Step: 372000. Mean Reward: 0.507153846153846. Std of Reward: 0.4764394942865006.
Saved Model
Step: 372500. Mean Reward: 0.4848477157360403. Std of Reward: 0.5318745960718174.
Step: 373000. Mean Reward: 0.5182249999999997. Std of Reward: 0.48524449958242716.
Step: 373500. Mean Reward: 0.47242857142857114. Std of Reward: 0.5147245746562935.
Step: 374000. Mean Reward: 0.4506878306878304. Std of Rewar

Step: 416500. Mean Reward: 0.5978271028037382. Std of Reward: 0.3373506218597743.
Step: 417000. Mean Reward: 0.5383490566037734. Std of Reward: 0.4797836195158333.
Step: 417500. Mean Reward: 0.5237894736842102. Std of Reward: 0.46828281816006617.
Step: 418000. Mean Reward: 0.5226108374384234. Std of Reward: 0.49078386209350555.
Saved Model
Step: 418500. Mean Reward: 0.5146249999999998. Std of Reward: 0.4907937289483232.
Step: 419000. Mean Reward: 0.5051666666666664. Std of Reward: 0.5183757561877048.
Step: 419500. Mean Reward: 0.5551913875598085. Std of Reward: 0.45382975511130885.
Step: 420000. Mean Reward: 0.424366812227074. Std of Reward: 0.5441229599822035.
Saved Model
Step: 420500. Mean Reward: 0.5614386792452828. Std of Reward: 0.4404614657873475.
Step: 421000. Mean Reward: 0.5426341463414632. Std of Reward: 0.4577464819754017.
Step: 421500. Mean Reward: 0.5112499999999998. Std of Reward: 0.5121749579001301.
Step: 422000. Mean Reward: 0.5014540816326527. Std of Reward: 0.48373103

Step: 465000. Mean Reward: 0.5426872246696033. Std of Reward: 0.5086501129596684.
Step: 465500. Mean Reward: 0.5662895927601808. Std of Reward: 0.4570842940669037.
Step: 466000. Mean Reward: 0.5487616822429904. Std of Reward: 0.4638892212153533.
Saved Model
Step: 466500. Mean Reward: 0.593943965517241. Std of Reward: 0.41089560650649903.
Step: 467000. Mean Reward: 0.602200956937799. Std of Reward: 0.35165356184587837.
Step: 467500. Mean Reward: 0.48242990654205586. Std of Reward: 0.5851489004869727.
Step: 468000. Mean Reward: 0.58008658008658. Std of Reward: 0.4417811169283398.
Saved Model
Step: 468500. Mean Reward: 0.6104203539823007. Std of Reward: 0.394470828841569.
Step: 469000. Mean Reward: 0.542455357142857. Std of Reward: 0.4946628070209772.
Step: 469500. Mean Reward: 0.6127354260089686. Std of Reward: 0.38547758154663525.
Step: 470000. Mean Reward: 0.4950602409638552. Std of Reward: 0.47492558747671476.
Saved Model
Step: 470500. Mean Reward: 0.625478260869565. Std of Reward: 0.

INFO:tensorflow:Restoring parameters from ./models/ppo/model-500001.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.


### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [5]:
# Export the saved model so Unity can embed it (see the markdown above).
# NOTE(review): the training cell already ends with this same call, so
# running this cell repeats the export -- confirm whether both are needed.
export_graph(model_path, env_name)

INFO:tensorflow:Restoring parameters from ./models/ppo/model-500001.cptk


INFO:tensorflow:Restoring parameters from ./models/ppo/model-500001.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
