# Augmented Random Search

In [1]:
# Import the libraries
import warnings
warnings.filterwarnings('ignore')
import os
import numpy as np
import gym
from gym import wrappers
import pybullet_envs

In [2]:
# Set Hyperparameters

class Hyperparameters():
    """Container for the ARS training hyperparameters.

    Every value can be overridden with a keyword argument; the defaults are
    the settings that were originally hard-coded in this notebook, so
    ``Hyperparameters()`` behaves exactly as before.

    Parameters
    ----------
    number_of_steps : int
        Number of training iterations (one policy update per iteration).
    episode_length : int
        Maximum number of environment steps played in one rollout.
    learning_rate : float
        Step size α of the policy update.
    number_of_directions : int
        Number N of perturbation directions sampled per iteration.
    number_of_best_directions : int
        Number b of top-scoring directions used in the update (1 <= b <= N).
    noise : float
        Exploration noise ν that scales every perturbation.
    seed : int
        Seed used for the random number generator.
    env_name : str
        Id of the gym environment to create.

    Raises
    ------
    ValueError
        If ``number_of_best_directions`` is not within
        ``[1, number_of_directions]``.
    """

    def __init__(self,
                 number_of_steps = 1000,
                 episode_length = 1000,
                 learning_rate = 2e-2,
                 number_of_directions = 16,
                 number_of_best_directions = 16,
                 noise = 3e-2,
                 seed = 1,
                 env_name = 'HalfCheetahBulletEnv-v0'):

        # b must select at least one and at most N directions; b also appears
        # as a divisor in the policy update, so zero is never valid.
        if not 1 <= number_of_best_directions <= number_of_directions:
            raise ValueError(
                "number_of_best_directions must satisfy "
                "1 <= number_of_best_directions <= number_of_directions"
            )

        self.number_of_steps = number_of_steps
        self.episode_length = episode_length
        self.learning_rate = learning_rate # α
        self.number_of_directions = number_of_directions # N
        self.number_of_best_directions = number_of_best_directions # b
        self.noise = noise # exploration noise ν
        self.seed = seed
        self.env_name = env_name

## 3.2 Normalization of the states

<br>
<br>
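$$ (M + \nu\delta)\,\operatorname{diag}(\Sigma)^{-1/2}\,(x - \mu) $$

where

$M$ -> policy weights
<br>
<br>
$\delta$ -> one perturbation direction
<br>
<br>
$\Sigma$ -> covariance matrix of the observed states
<br>
<br>
$\mu$ -> mean of the observed states
<br>
<br>
$x$ -> current state
<br>
<br>
$\nu$ -> exploration noise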

In [3]:
class Normalizer():
    """Tracks running per-feature statistics of the states and standardises them."""

    def __init__(self, num_of_inputs):
        """Start every running statistic as a zero vector of length num_of_inputs."""

        # n: observation count, mean: running mean,
        # mean_diff: running sum of squared deviations, var: clipped variance
        self.n, self.mean, self.mean_diff, self.var = (
            np.zeros(num_of_inputs) for _ in range(4)
        )

    def observe(self, x):
        """Fold one new state x into the running mean and variance."""

        self.n += 1.
        previous_mean = self.mean.copy()

        # incremental mean update (in place, so the array object is preserved)
        self.mean += (x - previous_mean) / self.n

        # accumulate the squared deviations using the old and the new mean
        self.mean_diff += (x - previous_mean) * (x - self.mean)

        # floor the variance at 1e-2 so normalize() never divides by ~0
        self.var = np.clip(self.mean_diff / self.n, 1e-2, None)

    def normalize(self, inputs):
        """Return inputs centred by the running mean and scaled by the running std."""

        return (inputs - self.mean) / np.sqrt(self.var)

## AI Algorithm

*Rollouts are tuples of* $(\text{reward}_{\text{positive direction}}, \text{reward}_{\text{opposite direction}}, \delta)$

In [4]:
class Policy():
    """Linear policy explored through random perturbations of its weights.

    The action for a state is ``w_transpose.dot(state)``, where
    ``w_transpose`` has shape ``(output_size, input_size)`` and starts at zero.

    Parameters
    ----------
    input_size : int
        Number of entries in a state vector.
    output_size : int
        Number of entries in an action vector.
    hp : object, optional
        Hyperparameters object providing ``noise``, ``number_of_directions``,
        ``number_of_best_directions`` and ``learning_rate``. When omitted, the
        notebook-level ``hp`` variable is looked up each time it is needed,
        which is the original behaviour.
    """

    def __init__(self, input_size, output_size, hp = None):

        self.w_transpose = np.zeros((output_size, input_size))
        self.hp = hp

    def _hyperparameters(self):
        """Return the injected hyperparameters, else the notebook-level ``hp``."""

        # Inside this method `hp` is the module-level name; it is resolved at
        # call time, so the policy may be built before `hp` is defined.
        return hp if self.hp is None else self.hp

    def evaluate(self, input, delta = None, direction = None):
        """Compute the action for ``input``.

        With ``direction`` None the unperturbed weights are used. With
        ``direction == "positive"`` the weights are shifted by ``+noise*delta``;
        any other value shifts them by ``-noise*delta``.
        """

        # to check the value after update
        if direction is None:
            return self.w_transpose.dot(input)

        noise = self._hyperparameters().noise

        # to move in specified direction
        if direction == "positive":
            return (self.w_transpose + noise*delta).dot(input)

        # to move in the opposite direction
        return (self.w_transpose - noise*delta).dot(input)

    def sample_deltas(self):
        """Sample δ1, δ2, . . . , δN with i.i.d. standard normal entries"""

        count = self._hyperparameters().number_of_directions
        return [np.random.randn(*self.w_transpose.shape) for _ in range(count)]

    def update(self, rollouts, sigma_r):
        """Apply one ARS step to the weights, in place.

        ``rollouts`` is an iterable of ``(r_pos, r_neg, delta)`` tuples and
        ``sigma_r`` is the standard deviation of the rewards, used to scale
        the step.

        The update is skipped when ``sigma_r`` is zero or not finite. Dividing
        by it would otherwise fill the weights with inf/NaN. A zero deviation
        means the rewards carry no preference between directions.
        """

        if sigma_r == 0 or not np.isfinite(sigma_r):
            return

        params = self._hyperparameters()
        step = np.zeros(self.w_transpose.shape)

        for r_pos, r_neg, d in rollouts:
            step += (r_pos - r_neg) * d

        self.w_transpose += params.learning_rate / (params.number_of_best_directions * sigma_r) * step

## Exploring the policy on one specific direction and over one episode

In [5]:
def explore(env, normalizer, policy, direction = None, delta = None, episode_length = None):
    """Play one episode and return the sum of its clipped rewards.

    Parameters
    ----------
    env : environment exposing ``reset()`` and ``step(action)``; ``step`` must
        return a 4-tuple ``(state, reward, done, info)``.
    normalizer : object exposing ``observe(state)`` and ``normalize(state)``.
        Its statistics are updated with every visited state.
    policy : object exposing ``evaluate(state, delta, direction)``.
    direction : None to use the unperturbed policy, otherwise the direction
        label forwarded to ``policy.evaluate``.
    delta : perturbation forwarded to ``policy.evaluate``.
    episode_length : maximum number of steps to play. When None, the
        notebook-level ``hp.episode_length`` is used, which is the original
        behaviour.

    Returns
    -------
    The accumulated reward, with each step's reward clipped to [-1, 1].
    """

    max_plays = hp.episode_length if episode_length is None else episode_length

    state = env.reset()
    done = False
    num_plays = 0
    sum_rewards = 0

    while not done and num_plays < max_plays:

        # normalize the state
        normalizer.observe(state)
        state = normalizer.normalize(state)

        # random search in a direction
        action = policy.evaluate(state, delta, direction)
        state, reward, done, _ = env.step(action)

        # clip the extremes [too large -> 1; too small -> -1]
        reward = max(min(reward, 1), -1)

        # add the rewards in that direction
        sum_rewards += reward
        num_plays += 1

    return sum_rewards

## Training

In [6]:
def train(env, policy, normalizer, hp):
    """Run the Augmented Random Search training loop.

    For each of ``hp.number_of_steps`` iterations:

    1. sample perturbations with ``policy.sample_deltas()``;
    2. play one episode per perturbation in the positive direction, then one
       per perturbation in the negative direction;
    3. keep the ``hp.number_of_best_directions`` perturbations with the
       highest ``max(r_pos, r_neg)``;
    4. call ``policy.update`` with those rollouts and σR, the standard
       deviation of the rewards of the kept rollouts;
    5. play one episode with the unperturbed policy and print its reward.

    σR is computed from the 2b rewards that enter the update rather than from
    all 2N collected rewards. The two are numerically equivalent when every
    direction is kept (b == N).

    Parameters
    ----------
    env : environment forwarded to ``explore``.
    policy : object exposing ``sample_deltas()`` and ``update(rollouts, sigma_r)``.
    normalizer : state normalizer forwarded to ``explore``.
    hp : object exposing ``number_of_steps``, ``number_of_directions`` and
        ``number_of_best_directions``.
    """

    directions = range(hp.number_of_directions)

    for step in range(hp.number_of_steps):

        # Initializing the perturbations deltas
        deltas = policy.sample_deltas()

        # Getting the positive rewards in the positive directions
        positive_rewards = [
            explore(env, normalizer, policy, direction = "positive", delta = deltas[k])
            for k in directions
        ]

        # Getting the negative rewards in the negative/opposite directions
        negative_rewards = [
            explore(env, normalizer, policy, direction = "negative", delta = deltas[k])
            for k in directions
        ]

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        # (sorted() is stable, so ties keep their original order)
        order = sorted(
            directions,
            key = lambda k: max(positive_rewards[k], negative_rewards[k]),
            reverse = True,
        )[:hp.number_of_best_directions]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

        # σR: standard deviation of the 2b rewards actually used in the update
        used_rewards = [reward for r_pos, r_neg, _ in rollouts for reward in (r_pos, r_neg)]
        sigma_r = np.array(used_rewards).std()

        # Policy update
        policy.update(rollouts, sigma_r)

        # Printing the final reward of the policy after the update
        reward_evaluation = explore(env, normalizer, policy)
        print('Step:', step, 'Reward:', reward_evaluation)

# Results

In [7]:
# Utility function to make directory (available online)
def make_directory(base, name):
    """Create the directory ``base/name`` if it is missing and return its path.

    Missing parent directories are created as well. An already existing
    directory is left untouched.

    Parameters
    ----------
    base : str
        Parent path.
    name : str
        Directory name (or relative sub-path) appended to ``base``.

    Returns
    -------
    str
        ``os.path.join(base, name)``.

    Raises
    ------
    FileExistsError
        If the path already exists but is not a directory.
    """
    path = os.path.join(base, name)

    # exist_ok=True avoids the race between a separate existence check and
    # the creation call when several processes set up the same directory.
    os.makedirs(path, exist_ok=True)

    return path

# Build the output tree exp/ars/monitor_half_cheetah
# (the misspelled name `work_directoty` is kept in case other cells reference it)
work_directoty = make_directory(base='exp', name='ars')
monitor_directory = make_directory(base=work_directoty, name='monitor_half_cheetah')

### Define instances

In [8]:
hp = Hyperparameters()

# fixing a seed for NumPy (used when sampling the perturbations)
np.random.seed(hp.seed)

# make the environment
env = gym.make(hp.env_name)

# seed the environment's own random number generator as well;
# seeding NumPy alone does not make the rollouts repeatable
env.seed(hp.seed)

# save training videos
env = wrappers.Monitor(env, monitor_directory, force=True) # force -> overwrite monitor files of an earlier run instead of halting

num_of_inputs = env.observation_space.shape[0]
num_of_outputs = env.action_space.shape[0]

# initialize as a perceptron with zero weights
policy = Policy(input_size= num_of_inputs, output_size= num_of_outputs)

normalizer = Normalizer(num_of_inputs= num_of_inputs)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
WalkerBase::__init__ start
[33mWARN: Environment '<class 'pybullet_envs.gym_locomotion_envs.HalfCheetahBulletEnv'>' has deprecated methods '_step' and '_reset' rather than 'step' and 'reset'. Compatibility code invoked. Set _gym_disable_underscore_compat = True to disable this behavior.[0m


### Training

In [9]:
# Run the full ARS training loop on the objects built above
train(env=env, policy=policy, normalizer=normalizer, hp=hp)

Step: 0 Reward: -933.4173354627901
Step: 1 Reward: -954.627670537546
Step: 2 Reward: -939.9312735112593
Step: 3 Reward: -949.6295400080322
Step: 4 Reward: -897.1544945191193
Step: 5 Reward: -909.7915264135893
Step: 6 Reward: -941.9339710666563
Step: 7 Reward: -952.5609928070322
Step: 8 Reward: -931.9801230518622
Step: 9 Reward: -535.3256989332772
Step: 10 Reward: -291.75168933001316
Step: 11 Reward: -160.2747996898627
Step: 12 Reward: -106.35353678049378
Step: 13 Reward: -251.06120464302052
Step: 14 Reward: -53.37257028151366
Step: 15 Reward: -129.02371124346712
Step: 16 Reward: -17.75126755355177
Step: 17 Reward: 3.2058800826702516
Step: 18 Reward: 29.200707107287606
Step: 19 Reward: -9.37288900771242
Step: 20 Reward: -119.76327026636729
Step: 21 Reward: 24.036557575581693
Step: 22 Reward: 65.36978722848575
Step: 23 Reward: 52.249198845872925
Step: 24 Reward: -146.68744098236522
Step: 25 Reward: 164.1375792141783
Step: 26 Reward: 144.78538602094557
Step: 27 Reward: 160.47575977122204


Step: 230 Reward: 751.1851836953922
Step: 231 Reward: 798.9187672115443
Step: 232 Reward: 790.8420795004254
Step: 233 Reward: 806.0492805344406
Step: 234 Reward: 831.471598744019
Step: 235 Reward: 801.2800636568186
Step: 236 Reward: 811.8941251410398
Step: 237 Reward: 801.1559083070007
Step: 238 Reward: 782.6173713767678
Step: 239 Reward: 810.7054900992567
Step: 240 Reward: 853.1453238809738
Step: 241 Reward: 836.294337042002
Step: 242 Reward: 812.0272635790286
Step: 243 Reward: 822.3635681879681
Step: 244 Reward: 823.8351837209249
Step: 245 Reward: 814.4501332066355
Step: 246 Reward: 813.7082311274289
Step: 247 Reward: 837.5057648087738
Step: 248 Reward: 826.3030015221851
Step: 249 Reward: 810.771467453711
Step: 250 Reward: 838.1864045600043
Step: 251 Reward: 798.2936679344194
Step: 252 Reward: 809.3993036252406
Step: 253 Reward: 819.8232427371986
Step: 254 Reward: 826.2032165162772
Step: 255 Reward: 823.8221282339322
Step: 256 Reward: 852.6078482356214
Step: 257 Reward: 848.428337255

Step: 459 Reward: 922.860252650174
Step: 460 Reward: 912.8028077999735
Step: 461 Reward: 922.5408976593667
Step: 462 Reward: 927.2813106543076
Step: 463 Reward: 917.9559148063875
Step: 464 Reward: 926.3872679752573
Step: 465 Reward: 923.400513594767
Step: 466 Reward: 923.6235940776805
Step: 467 Reward: 919.4676074000326
Step: 468 Reward: 930.4356623154823
Step: 469 Reward: 915.5270654340736
Step: 470 Reward: 930.2841219694948
Step: 471 Reward: 925.9486829794273
Step: 472 Reward: 931.9882404532884
Step: 473 Reward: 926.1401280555464
Step: 474 Reward: 924.298893242059
Step: 475 Reward: 928.3783050653906
Step: 476 Reward: 917.1623322351008
Step: 477 Reward: 928.8791940898537
Step: 478 Reward: 922.8639353030009
Step: 479 Reward: 927.7352731970248
Step: 480 Reward: 921.7436990462527
Step: 481 Reward: 917.8069986342055
Step: 482 Reward: 922.5465087237696
Step: 483 Reward: 921.6736147647482
Step: 484 Reward: 909.5423880376017
Step: 485 Reward: 920.0771269641629
Step: 486 Reward: 926.791797844

Step: 688 Reward: 958.9833643308106
Step: 689 Reward: 959.5755588917361
Step: 690 Reward: 958.6637471318389
Step: 691 Reward: 963.2292185382804
Step: 692 Reward: 957.8195864652515
Step: 693 Reward: 963.2293485385369
Step: 694 Reward: 962.9286679410405
Step: 695 Reward: 966.2892732821181
Step: 696 Reward: 957.2384670575265
Step: 697 Reward: 957.0459237595808
Step: 698 Reward: 958.2060382720681
Step: 699 Reward: 956.7881333301713
Step: 700 Reward: 960.1412575136542
Step: 701 Reward: 959.9678910598172
Step: 702 Reward: 961.6041462223906
Step: 703 Reward: 961.3129803637843
Step: 704 Reward: 959.3385614988315
Step: 705 Reward: 956.4487017794651
Step: 706 Reward: 965.4038912834089
Step: 707 Reward: 960.302267396926
Step: 708 Reward: 958.8616348175708
Step: 709 Reward: 960.3080469375058
Step: 710 Reward: 957.8300650246654
Step: 711 Reward: 957.2787893359617
Step: 712 Reward: 963.4921527503028
Step: 713 Reward: 956.5734797721178
Step: 714 Reward: 958.233581443113
Step: 715 Reward: 961.17828528

Step: 917 Reward: 968.3365190354971
Step: 918 Reward: 967.4628680224571
Step: 919 Reward: 966.4764048754212
Step: 920 Reward: 972.3117916396732
Step: 921 Reward: 972.906998719464
Step: 922 Reward: 973.7297494234584
Step: 923 Reward: 968.107740181504
Step: 924 Reward: 968.0622239326831
Step: 925 Reward: 963.7882767604374
Step: 926 Reward: 964.2626421160311
Step: 927 Reward: 964.5433387898112
Step: 928 Reward: 975.2083085684443
Step: 929 Reward: 974.5518390297952
Step: 930 Reward: 966.2223541046975
Step: 931 Reward: 965.9602027172373
Step: 932 Reward: 966.5975677390495
Step: 933 Reward: 967.7212157490505
Step: 934 Reward: 957.3616359591194
Step: 935 Reward: 972.8966204294081
Step: 936 Reward: 972.128131828669
Step: 937 Reward: 967.7843411049867
Step: 938 Reward: 970.7523758654597
Step: 939 Reward: 960.8749541746492
Step: 940 Reward: 969.0471351236696
Step: 941 Reward: 970.4100315950328
Step: 942 Reward: 970.5738867361534
Step: 943 Reward: 973.7469732075651
Step: 944 Reward: 968.733911456