# Semaine 14 - Reinforcement Learning, Policy Gradient

Dans l'exercice de cette semaine, nous allons implémenter un joueur de Pong avec gym.

Nous aurons besoin d'implémenter plusieurs fonctions afin de faire fonctionner la boucle de jeu.

In [None]:
import numpy as np
import _pickle as pickle
import gym

Fonction sigmoïde

In [None]:
def sigmoid(x):
    """Logistic function: squash ``x`` (scalar or array) into the interval [0, 1]."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

Conversion d'un tableau à 3 dimensions représentant l'image du jeu en un vecteur de 6400 (80x80) valeurs 0 ou 1

In [None]:
def prepro(I):
    """Convert a 210x160x3 uint8 frame into a 6400 (80x80) 1D float vector.

    The frame is cropped to rows 35..194, downsampled by a factor of 2 along
    both image axes keeping only channel 0, then binarized: pixels equal to
    144 or 109 (the two background values) or already 0 become 0.0, and every
    other pixel (paddles, ball) becomes 1.0.

    The input frame is not modified.
    """
    # Crop, downsample and select channel 0 in a single (read-only use) view.
    frame = I[35:195:2, ::2, 0]
    # Build the mask instead of assigning into the view, so the caller's
    # observation is left untouched.
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    # np.float64 rather than the `np.float` alias, which NumPy 1.24 removed.
    return foreground.astype(np.float64).ravel()

Fonction qui recalcule les rewards avec discount

In [None]:
def discount_rewards(r, gamma):
    """Take an array of per-step rewards and compute the discounted reward.

    Walks the rewards backwards in time, accumulating
    ``running = running * gamma + r[t]``. The running sum is reset whenever a
    non-zero reward is met, because in Pong a non-zero reward marks the end of
    a point (this reset is Pong specific).

    Args:
        r: rewards in time order; a 1D array or a column vector (T, 1).
        gamma: discount factor.

    Returns:
        A float64 array with the same shape as ``r``.
    """
    rewards = np.asarray(r, dtype=np.float64)
    flat_rewards = rewards.ravel()
    # Always accumulate in float so integer rewards are not truncated.
    flat_discounted = np.zeros(flat_rewards.size, dtype=np.float64)

    running_add = 0.0
    for t in reversed(range(flat_rewards.size)):
        if flat_rewards[t] != 0:
            running_add = 0.0  # point boundary: do not leak reward across points
        running_add = running_add * gamma + flat_rewards[t]
        flat_discounted[t] = running_add

    discounted_r = flat_discounted.reshape(rewards.shape)
    return discounted_r

Réseau de neurones pour obtenir la probabilité de prendre une action

In [None]:
def policy_forward(model, x):
    """Run the two-layer policy network on one input vector.

    Args:
        model: dict with weight arrays under 'W1' and 'W2'.
        x: input vector compatible with ``model['W1']``.

    Returns:
        (p, h): probability of taking action 2, and the hidden state
        (post-ReLU activations) needed later for backprop.
    """
    h = np.dot(model['W1'], x)
    h[h < 0] = 0  # ReLU nonlinearity
    logp = np.dot(model['W2'], h)
    p = sigmoid(logp)
    return p, h # return probability of taking action 2, and hidden state

Backward propagation, lors du training

In [None]:
def policy_backward(model, epx, eph, epdlogp):
    """Backward pass of the two-layer policy network over a whole episode.

    Args:
        model: dict holding the output-layer weights under 'W2'.
        epx: stacked inputs, one row per step.
        eph: stacked hidden states, one row per step.
        epdlogp: per-step gradient on the output logit (already modulated
            by the advantage), as a column vector.

    Returns:
        dict with the gradients for 'W1' and 'W2'.
    """
    dW2 = np.dot(eph.T, epdlogp).ravel()
    dh = np.outer(epdlogp, model['W2'])
    dh[eph <= 0] = 0  # backprop through the ReLU: no gradient where the unit was off
    dW1 = np.dot(dh.T, epx)
    return {'W1':dW1, 'W2':dW2}

Fonction pour créer une image négative représentant la différence entre l'image précédente et la nouvelle (conserve le mouvement entre les deux étapes)

In [None]:
def difference_image(observation, prev_x):
    """Preprocess the observation and build the network input as a difference image.

    Args:
        observation: raw frame, passed to ``prepro``.
        prev_x: preprocessed previous frame, or None on the first step.

    Returns:
        (x, cur_x): the difference ``cur_x - prev_x`` (all zeros when there is
        no previous frame), and the preprocessed current frame to be passed
        back as ``prev_x`` on the next call.
    """
    cur_x = prepro(observation)
    if prev_x is None:
        # Size the zero vector from the preprocessed frame itself rather than
        # from a module-level constant, so the two can never disagree.
        x = np.zeros_like(cur_x)
    else:
        x = cur_x - prev_x
    return x, cur_x

Fonction pour le choix de l'action

In [None]:
def get_action(model, x):
    """Forward the policy network and sample an action from the returned probability.

    Returns:
        (action, aprob, h): the sampled action (2 with probability ``aprob``,
        otherwise 3), the probability of action 2, and the hidden state.
    """
    aprob, h = policy_forward(model, x)
    # Sample rather than take the argmax, so the policy keeps exploring.
    action = 2 if np.random.uniform() < aprob else 3
    return action, aprob, h

Écriture et récupération de l'historique des inputs, hidden states, actions et rewards de l'épisode

In [None]:
def record_history(xs, hs, dlogps, x, action, aprob, h):
    """Append this step's intermediates to the episode buffers (needed later for backprop).

    ``xs`` receives the observation, ``hs`` the hidden state, and ``dlogps``
    the gradient ``y - aprob`` where ``y`` is a "fake label": 1 when the
    sampled action is 2, otherwise 0. That gradient encourages the action
    that was taken to be taken again
    (see http://cs231n.github.io/neural-networks-2/#losses if confused).
    """
    xs.append(x)
    hs.append(h)
    if action == 2:
        fake_label = 1
    else:
        fake_label = 0
    dlogps.append(fake_label - aprob)

In [None]:
def get_history(xs, hs, dlogps, drs):
    """Stack together all inputs, hidden states, action gradients and rewards of an episode.

    Each list is stacked row-wise with ``np.vstack`` (scalars become rows of
    a column vector). The four lists are then emptied in place so the caller's
    buffers are ready for the next episode.

    Returns:
        (epx, eph, epdlogp, epr): the stacked arrays, one row per step.

    Raises:
        ValueError: if a list is empty (``np.vstack`` needs at least one row).
    """
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    epr = np.vstack(drs)

    # Clear in place (not rebind) so the caller's list objects are reset too.
    for buffer in (xs, hs, dlogps, drs):
        del buffer[:]

    return epx, eph, epdlogp, epr

Fonction appliquant le discount au reward et le normalisant

In [None]:
def standardize_reward(epr, gamma):
    """Compute the discounted reward backwards through time and standardize it.

    The discounted rewards are shifted to zero mean and scaled to unit
    standard deviation, which helps control the variance of the gradient
    estimator. When every discounted reward is identical the standard
    deviation is zero; in that case only the centering is applied, to avoid a
    division by zero.

    Args:
        epr: per-step rewards of the episode.
        gamma: discount factor, forwarded to ``discount_rewards``.

    Returns:
        The standardized discounted rewards.
    """
    discounted_epr = discount_rewards(epr, gamma)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr = discounted_epr - np.mean(discounted_epr)
    spread = np.std(discounted_epr)
    if spread > 0:
        discounted_epr = discounted_epr / spread

    return discounted_epr

Notre fonction du gradient ascent, utilisant l'optimisation rmsprop

In [None]:
def rmsprop_update(model, grad_buffer, rmsprop_cache, decay_rate, lr=None):
    """Apply one RMSProp gradient-ascent step to every parameter of the model.

    Optimizer function, you don't have to understand it, works like Adam.

    For each parameter the leaky average of squared gradients is updated, the
    parameter is moved along the gradient scaled by the inverse square root of
    that average, and the accumulated batch gradient is reset to zero.

    Args:
        model: dict of parameter arrays, updated in place.
        grad_buffer: dict of accumulated gradients, same keys as ``model``.
        rmsprop_cache: dict of running squared-gradient averages, same keys.
        decay_rate: decay factor of the squared-gradient average.
        lr: step size; when None the module-level ``learning_rate`` is used.

    Returns:
        (model, grad_buffer, rmsprop_cache)
    """
    if lr is None:
        lr = learning_rate  # module-level hyperparameter
    for k, v in model.items():
        g = grad_buffer[k] # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
        model[k] += lr * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer
    # Return only after the loop: returning inside it would update just the
    # first parameter and leave the others (and their buffers) untouched.
    return model, grad_buffer, rmsprop_cache

Fonction pour mémoriser diverses données. Le modèle est conservé dans un fichier pour ne pas recommencer à 0.

In [None]:
def book_keeping(model, episode_number, running_reward, reward_sum):
    """Boring book-keeping done at the end of each episode.

    Updates the running mean of the episode reward (exponential average with
    weight 0.01 for the new episode; the first episode initializes it), prints
    a summary line, and every 100 episodes pickles the model to 'save.p'.

    Returns:
        (running_reward, reward_sum): the updated running mean, and 0 as the
        reset reward accumulator for the next episode.
    """
    if running_reward is None:
        running_reward = reward_sum
    else:
        running_reward = running_reward * 0.99 + reward_sum * 0.01
    # Apply the % operator: passing the values as extra print() arguments
    # would leave the %f placeholders unformatted.
    print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
    if episode_number % 100 == 0:
        # Context manager so the checkpoint is flushed and the handle closed.
        with open('save.p', 'wb') as checkpoint:
            pickle.dump(model, checkpoint)
    reward_sum = 0
    return running_reward, reward_sum

# Implémentation du Pong

Il n'y a plus qu'à tester !

In [None]:
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """

# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = True

# model initialization
D = 80 * 80 # input dimensionality: 80x80 grid
if resume:
    # NOTE: unpickling can execute arbitrary code; only resume from a
    # checkpoint file that this script wrote itself.
    with open('save.p', 'rb') as checkpoint:
        model = pickle.load(checkpoint)
else:
    model = {}
    model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
    model['W2'] = np.random.randn(H) / np.sqrt(H)

grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None # used in computing the difference frame
xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0

while True:
    if render: env.render()

    x, prev_x = difference_image(observation, prev_x)

    action, aprob, h = get_action(model, x)

    record_history(xs, hs, dlogps, x, action, aprob, h)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
        episode_number += 1

        epx, eph, epdlogp, epr = get_history(xs, hs, dlogps, drs)
        # start the next episode with empty buffers, otherwise every episode
        # would be re-stacked together with all the previous ones
        xs,hs,dlogps,drs = [],[],[],[]

        discounted_epr = standardize_reward(epr, gamma)

        epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
        grad = policy_backward(model, epx, eph, epdlogp)
        for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

        # perform rmsprop parameter update every batch_size episodes
        if episode_number % batch_size == 0:
            model, grad_buffer, rmsprop_cache = rmsprop_update(model, grad_buffer, rmsprop_cache, decay_rate)

        running_reward, reward_sum = book_keeping(model, episode_number, running_reward, reward_sum)

        observation = env.reset() # reset env
        prev_x = None

    # Checked on every step, not only at episode end: a non-zero reward marks
    # the end of a single point, and an episode contains many points.
    if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
        # Use the % operator so the placeholders are actually filled in.
        print('ep %d: game finished, reward: %f%s' % (episode_number, reward, '' if reward == -1 else ' !!!!!!!!'))