In [1]:
import numpy as np
import gym
from gym import wrappers
from cv2 import VideoWriter, VideoWriter_fourcc

# Single shared environment instance: record() and explore() below both
# reset and step this module-level `env`.
env = gym.make('BipedalWalker-v2')
class Normalizer():
    """Container for the running per-feature statistics used by `normalize`.

    All four attributes are float vectors of the same length, initialised
    to zero:

    * ``n``         - number of observations seen so far (one counter per feature)
    * ``mean``      - running mean of the observations
    * ``mean_diff`` - running sum of ``(x - old_mean) * (x - new_mean)``
    * ``var``       - variance derived from ``mean_diff / n``

    Args:
        size: Length of the observation vector. When omitted, the
            notebook-level ``nb_inputs`` is used, so ``Normalizer()`` keeps
            working exactly as before once ``nb_inputs`` has been defined.
    """
    def __init__(self, size=None):
        if size is None:
            # Backward-compatible fallback: the original constructor read the
            # global directly, which only exists after the driver cell has run.
            size = nb_inputs
        self.n = np.zeros(size)
        self.mean = np.zeros(size)
        self.mean_diff = np.zeros(size)
        self.var = np.zeros(size)
        
class Hp():
    """Hyperparameters of the random-search training loop.

    Every value can be overridden by keyword; calling ``Hp()`` with no
    arguments yields the same settings the notebook used originally.

    Args:
        nb_steps: Number of training iterations (``train`` runs ``nb_steps + 1``).
        episode_length: Maximum number of environment steps per rollout.
        learning_rate: Step size applied to the aggregated weight update.
        nb_directions: Number of random perturbations sampled per iteration.
        nb_best_directions: Number of best perturbations kept for the update.
        noise: Scale applied to a perturbation before adding it to the weights.

    Raises:
        ValueError: If ``nb_best_directions`` exceeds ``nb_directions``; the
            selection step cannot keep more directions than were sampled.
    """
    def __init__(self, nb_steps=1000, episode_length=1000, learning_rate=0.02,
                 nb_directions=16, nb_best_directions=16, noise=0.03):
        if nb_best_directions > nb_directions:
            raise ValueError(
                "nb_best_directions (%r) must not exceed nb_directions (%r)"
                % (nb_best_directions, nb_directions))
        self.nb_steps = nb_steps
        self.episode_length = episode_length
        self.learning_rate = learning_rate
        self.nb_directions = nb_directions
        self.nb_best_directions = nb_best_directions
        self.noise = noise

def record(theta, directory):
    """Play one episode with the linear policy `theta` and save it as a video.

    The episode runs on the notebook-level `env` for at most
    `hp.episode_length` steps. One frame is captured per step, the sum of the
    clipped rewards is printed, and the frames are written with OpenCV.

    Args:
        theta: Policy matrix; each action is ``theta.dot(normalized_state)``.
        directory: Path of the video file to write (a file path such as
            "./noise.avi", despite the parameter name).

    Side effects:
        Updates the running statistics through `normalize`, prints the
        episode reward, writes the video file and closes `env` (also when an
        error interrupts the rollout or the encoding).
    """
    done = False
    state = env.reset()
    sum_rewards = 0
    num_steps = 0
    frames = []
    try:
        while not done and num_steps < hp.episode_length:
            state = normalize(state)
            action = theta.dot(state)
            state, reward, done, _ = env.step(action)
            frames.append(env.render(mode='rgb_array'))
            # Same [-1, 1] reward clipping as in explore().
            reward = max(min(reward, 1), -1)
            sum_rewards += reward
            num_steps += 1
        print(sum_rewards)

        # Nothing was rendered (e.g. episode_length <= 0): there is no frame
        # to take the video size from, so skip the encoding step.
        if not frames:
            return

        FPS = 24
        height, width = frames[0].shape[:2]
        fourcc = VideoWriter_fourcc(*'MP42')
        # VideoWriter expects the frame size as (width, height).
        video = VideoWriter(directory, fourcc, float(FPS), (width, height))
        try:
            if not video.isOpened():
                # OpenCV fails silently otherwise (e.g. missing directory).
                print('Could not open video file for writing: ', directory)
            for frame in frames:
                # Swap channels 0 and 2 on a copy (presumably RGB -> BGR, going
                # by the 'rgb_array' render mode) without touching the
                # captured frame itself.
                swapped = frame.copy()
                swapped[:, :, 0] = frame[:, :, 2]
                swapped[:, :, 2] = frame[:, :, 0]
                video.write(swapped)
        finally:
            video.release()
    finally:
        env.close()
    
def normalize(state):
    """Standardise `state` with running statistics, updating them first.

    The notebook-level `normalizer` is updated in place with the new
    observation (count, incremental mean, accumulated squared deviation),
    then the observation is returned centred on the running mean and divided
    by the running standard deviation.

    Args:
        state: Observation vector with the same length as the normalizer arrays.

    Returns:
        The standardised observation, ``(state - mean) / std``.
    """
    stats = normalizer

    stats.n += 1
    previous_mean = stats.mean.copy()

    # Incremental mean update.
    stats.mean += (state - stats.mean) / stats.n

    # Accumulate the squared deviation using the mean before and after the
    # update, then derive the variance; the floor of 1e-2 keeps the division
    # below away from zero.
    stats.mean_diff += (state - previous_mean) * (state - stats.mean)
    stats.var = (stats.mean_diff / stats.n).clip(min=1e-2)

    # Standard deviation is the square root of the clipped variance.
    return (state - stats.mean) / np.sqrt(stats.var)

def get_deltas():
    """Sample one random perturbation per search direction.

    Returns:
        A list of `hp.nb_directions` arrays drawn from a standard normal
        distribution, each with the same shape as the notebook-level `weights`.
    """
    shape = weights.shape
    return [np.random.randn(*shape) for _ in range(hp.nb_directions)]

def evaluate(state, delta=None, direction=None):
    """Compute the action of the (optionally perturbed) linear policy.

    Args:
        state: Observation vector fed to the policy.
        delta: Perturbation with the shape of `weights`; only used when
            `direction` is given.
        direction: ``None`` for the unperturbed policy, ``"positive"`` to add
            ``hp.noise * delta`` to the weights; any other value subtracts it.

    Returns:
        The product of the selected weight matrix with `state`.
    """
    if direction is None:
        policy = weights
    elif direction == "positive":
        policy = weights + hp.noise*delta
    else:
        policy = weights - hp.noise*delta
    return policy.dot(state)

def explore(direction=None, delta=None, render=False):
    """Run one episode on the notebook-level `env` and return its reward.

    Args:
        direction: Passed to `evaluate`; ``None`` runs the unperturbed
            policy, ``"positive"`` / ``"negative"`` run a perturbed one.
        delta: Perturbation passed to `evaluate` together with `direction`.
        render: When true, `env.render()` is called after every step.
            Defaults to ``False``, so existing callers are unaffected.

    Returns:
        The sum of the per-step rewards, each clipped to ``[-1, 1]``, over
        at most `hp.episode_length` steps.

    Side effects:
        Every step updates the running statistics through `normalize`.
    """
    state = env.reset()
    done = False
    num_plays = 0
    sum_rewards = 0
    while not done and num_plays < hp.episode_length:
        state = normalize(state)
        action = evaluate(state, delta, direction)
        state, reward, done, _ = env.step(action)
        if render:
            env.render()
        # Clip each reward so a single large penalty or bonus cannot
        # dominate the episode total.
        reward = max(min(reward, 1), -1)
        sum_rewards += reward
        num_plays += 1
    return sum_rewards

def select_rewards_delta(positive_rewards, negative_rewards, deltas):
    """Keep the `hp.nb_best_directions` most promising search directions.

    A direction is ranked by the larger of its two rewards. The
    ``hp.nb_directions - hp.nb_best_directions`` lowest-ranked directions
    are dropped; the remaining ones are returned in their original order.

    Args:
        positive_rewards: Reward of each direction with the perturbation added.
        negative_rewards: Reward of each direction with the perturbation subtracted.
        deltas: The perturbations, aligned with the two reward sequences.
            The list is not modified.

    Returns:
        A list of ``(positive_reward, negative_reward, delta)`` tuples for
        the directions that were kept.

    Raises:
        ValueError: If `hp.nb_best_directions` exceeds `hp.nb_directions`.
    """
    r_to_delete = hp.nb_directions - hp.nb_best_directions
    if r_to_delete < 0:
        raise ValueError(
            "hp.nb_best_directions must not exceed hp.nb_directions")

    positive_rewards = np.asarray(positive_rewards)
    negative_rewards = np.asarray(negative_rewards)
    top_rewards = np.maximum(positive_rewards, negative_rewards)

    # argsort is ascending, so the first r_to_delete indices are the
    # directions with the lowest best-case reward.
    dropped = set(np.argsort(top_rewards)[:r_to_delete].tolist())

    return [
        (positive_rewards[k], negative_rewards[k], deltas[k])
        for k in range(len(top_rewards))
        if k not in dropped
    ]

def train(weights):
    """Optimise the linear policy with random-direction search.

    Each iteration samples `hp.nb_directions` perturbations, evaluates the
    policy with every perturbation added and subtracted, keeps the best
    directions and moves the weights along the reward-weighted sum of the
    kept perturbations, scaled by the standard deviation of all rewards.
    Every 5 steps the unperturbed policy is evaluated and printed; training
    stops early once that reward exceeds 300. A video is recorded on early
    stop and every 100 steps.

    Args:
        weights: Policy matrix, updated IN PLACE. `evaluate` and
            `get_deltas` read the notebook-level `weights`, so the array
            passed here has to be that same object for the rollouts to see
            the updates.

    Returns:
        The same `weights` array after training.
    """
    # Local import keeps this block self-contained.
    import os

    video_dir = "./videos"
    # The video writer does not create missing directories.
    os.makedirs(video_dir, exist_ok=True)

    for step in range(hp.nb_steps+1):
        deltas = get_deltas()

        # All positive rollouts first, then all negative ones.
        positive_rewards = [explore(direction="positive", delta=d) for d in deltas]
        negative_rewards = [explore(direction="negative", delta=d) for d in deltas]

        all_rewards = np.array(positive_rewards + negative_rewards)
        std_r = all_rewards.std()

        rollouts = select_rewards_delta(positive_rewards, negative_rewards, deltas)
        change = np.zeros(weights.shape)
        for r_pos, r_neg, d in rollouts:
            change += (r_pos - r_neg) * d

        # If every rollout scored the same, std_r is 0 and the update would
        # be 0/0, turning the weights into NaN; skip the step instead.
        if std_r > 0:
            weights += hp.learning_rate / (hp.nb_best_directions * std_r) * change

        if step % 5 == 0:
            reward_evaluation = explore()
            print('Step: ', step, 'Reward: ', reward_evaluation)
            video_path = video_dir + "/" + str(step) + ".avi"
            if reward_evaluation > 300:
                record(weights, video_path)
                return weights
            if step % 100 == 0:
                record(weights, video_path)
    return weights
        

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
# Seed NumPy (used for the random perturbations) and the environment so a
# fresh "Restart & Run All" reproduces the same training run.
SEED = 0
np.random.seed(SEED)
env.seed(SEED)

# Problem dimensions taken from the environment's spaces. `nb_inputs` must be
# defined before Normalizer() is constructed, because the class reads it.
nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]

# Linear policy: one row per action component, one column per observation
# feature. get_deltas() and evaluate() read this module-level array.
weights = np.zeros((nb_outputs, nb_inputs))
hp = Hp()
normalizer = Normalizer()

# train() updates the array in place and returns the same object, so the
# global `weights` stays in sync with what the rollouts use.
weights = train(weights)


Step:  0 Reward:  3.396299866428019
3.293887263511076
Step:  5 Reward:  6.184347310942232
Step:  10 Reward:  4.3099152809357495
Step:  15 Reward:  3.2708267312283485
Step:  20 Reward:  4.376835916242026
Step:  25 Reward:  5.3402004016314955
Step:  30 Reward:  4.683849774803049
Step:  35 Reward:  -12.157536014746771
Step:  40 Reward:  5.696218181535611
Step:  45 Reward:  3.6902223509357306
Step:  50 Reward:  6.54139221284329
Step:  55 Reward:  9.414753497953038
Step:  60 Reward:  4.698239368437971
Step:  65 Reward:  5.806239317061675
Step:  70 Reward:  10.490950807273032
Step:  75 Reward:  10.318066050365724
Step:  80 Reward:  106.33421207353017
Step:  85 Reward:  8.86372408274554
Step:  90 Reward:  78.72353061390903
Step:  95 Reward:  34.27130771851187
Step:  100 Reward:  92.79043255585516
121.86234917377433
Step:  105 Reward:  51.61904278104626
Step:  110 Reward:  54.833878020978105
Step:  115 Reward:  142.8315329165361
Step:  120 Reward:  136.62695802505746
Step:  125 Reward:  148.85