# OpenAI Gym Environment examples
We can also take a look at the NN implementation in the url: https://github.com/stefanknegt/REPS

In [21]:
import scipy
import scipy.optimize as opt
import pdb
import gym
import pylab
import numpy as np

env = gym.make('FrozenLake8x8-v0')
print(env.observation_space.n)
print(env.action_space.n)

64
4


In [22]:
# # print(env.observation_space.high)
# # print(env.observation_space.low)
# # print(env.observation_space)

# # print(env.action_space.high)
# # print(env.action_space.low)
# # print(env.action_space)

# env.n_states = 10*10*16
# env.n_actions = 20

# def discretize(env, n_observations, n_actions):
#     L = []
#     state_dim = env.observation_space.shape[0]
#     action_dim = env.action_space.shape[0]
#     for d in range(state_dim):
#         L.append(np.linspace(env.observation_space.low[d], env.observation_space.high[d], n_observations[d]))
#     for d in range(action_dim):
#         L.append(np.linspace(env.action_space.low[d], env.action_space.high[d], n_actions[d]))
#     res = np.meshgrid(*L)
#     return res[0:len(n_observations)], res[len(n_observations):]

# discrete_states, discrete_actions = discretize(env, [10,10,16], [20])
# print(np.ravel(discrete_actions[0])[41])
# def environment_check(name):
#     '''
#     This function checks the gym environment to get the action and state space.
#     Also it returns the min and max value of the actions.
#     '''
#     env = gym.make(name)
#     state_dim = env.observation_space.shape[0]
#     action_dim = env.action_space.shape[0]
#     action_min = float(env.action_space.low[0])
#     action_max = float(env.action_space.high[0])

#     return state_dim, action_dim, action_min, action_max

In [44]:
class Policy(object):
    def __init__(self, pi):
        n_states, n_actions = np.array(pi).shape()
        self.n_actions = n_actions
        self.n_states = n_states
        self.pi = pi
        
    def draw_action(self,state):
        u = np.random.rand()
        probas = np.cumsum(self.pi[state,:])
        a = 0
        while (a < self.n_actions-1 and (u > probas[a] or self.pi[state,a]==0)):
            a += 1
        return a

In [45]:
def collect_episodes(mdp, policy=None, horizon=None, n_episodes=1):
    paths = []
    for _ in range(n_episodes):
        observations = []
        actions = []
        rewards = []
        next_states = []

        state = mdp.reset()
        for _ in range(horizon):
            action = policy.draw_action(state)
            next_state, reward, terminal = mdp.step(action)
            observations.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            state = copy.copy(next_state)
            if terminal:
                break
        paths.append(dict(
            states=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            next_states=np.array(next_states)
        ))
    return paths

In [46]:
def compute_phi(env, p):
    
    phi = np.zeros((env.observation_space.n,p))
    
    for k in range(env.observation_space.n):
        phi[k,:] = [k,k**2,np.log(k+1)]
    return(phi)
    
def initialize_pi(env):
    pi = np.zeros((env.observation_space.n, env.action_space.n))
    for s in range(env.observation_space.n):
        actions = np.linspace(-2, 2, 20)
        for a in actions:
            pi[s,a] = 1./len(actions)
    print(pi)
    return(pi)
    

In [47]:
def compute_new_policy(eta, policy, phi, theta, samples):
    log_new_pi = np.zeros((policy.n_states,policy.n_actions))
    A = np.zeros((policy.n_states,policy.n_actions))
    counter = np.zeros((policy.n_states,policy.n_actions))
    nb_samples = 0
    for i in range(len(samples)):
        states = samples[i]['states']
        actions = samples[i]['actions']
        rewards = samples[i]['rewards']
        next_states = samples[i]['next_states']

        for j in range(len(states)):
            A[states[j],actions[j]] += rewards[j] + np.dot(phi[next_states[j],:],theta) - np.dot(phi[states[j],:],theta)
            counter[states[j],actions[j]] += 1
            nb_samples += 1
    for s in range(env.observation_space.n):
        for a in range(env.action_space.n):
            if counter[s,a]!=0:
                A[s,a] /= counter[s,a]
    for s in range(policy.n_states):
        for a in range(policy.n_actions):
            argexpo = np.zeros(policy.n_actions)
            if policy.pi[s,a] == 0:
                log_new_pi[s,a] = -float('inf')
            else:
                for b in range(policy.n_actions):
                    argexpo[b] = np.log(policy.pi[s,b]+0.0001) + eta * A[s,b]
                maxi = np.max(argexpo)
                log_new_pi[s,a] = argexpo[a] - np.log(np.sum(np.exp(argexpo - maxi))) - maxi
    print(np.exp(log_new_pi))
    return(Policy(np.exp(log_new_pi)))

In [48]:

def g(theta, eta, phi, samples):
    res = 0
    A = np.zeros((env.observation_space.n,env.action_space.n))
    counter = np.zeros((env.observation_space.n,env.action_space.n))
    nb_samples = 0
    for i in range(len(samples)):
        states = samples[i]['states']
        actions = samples[i]['actions']
        rewards = samples[i]['rewards']
        next_states = samples[i]['next_states']

        for j in range(len(states)):
            A[states[j],actions[j]] += rewards[j] + np.dot(phi[next_states[j],:],theta) - np.dot(phi[states[j],:],theta)
            counter[states[j],actions[j]] += 1
            nb_samples += 1
    for s in range(env.observation_space.n):
        for a in range(env.action_space.n):
            if counter[s,a]!=0:
                A[s,a] /= counter[s,a]
    for i in range(len(samples)):
        states = samples[i]['states']
        actions = samples[i]['actions']
        for j in range(len(states)):
            res += np.exp(eta*A[states[j],actions[j]])
    res /= nb_samples
    return (np.log(res)/eta)

def Dg(theta,eta,phi,samples):
    n_states,p = np.shape(phi)
    numerator = 0
    denominator = 0
    A = np.zeros((env.observation_space.n,env.action_space.n))
    D = np.zeros((env.observation_space.n,env.action_space.n,p))
    counter = np.zeros((env.observation_space.n,env.action_space.n))
    for i in range(len(samples)):
        states = samples[i]['states']
        actions = samples[i]['actions']
        rewards = samples[i]['rewards']
        next_states = samples[i]['next_states']

        for j in range(len(states)):
            A[states[j],actions[j]] += rewards[j] + np.dot(phi[next_states[j],:],theta) - np.dot(phi[states[j],:],theta)
            D[states[j],actions[j],:] += phi[next_states[j],:] - phi[states[j],:]
            counter[states[j],actions[j]] += 1
    for s in range(env.observation_space.n):
        for a in range(env.action_space.n):
            if counter[s,a]!=0:
                A[s,a] /= counter[s,a]
    for s in range(env.observation_space.n):
        for a in range(env.action_space.n):
            if counter[s,a]!=0:
                D[s,a,:] /= counter[s,a]
    for i in range(len(samples)):
        states = samples[i]['states']
        actions = samples[i]['actions']
        for j in range(len(states)):
            numerator += np.exp(eta*A[states[j],actions[j]]) * D[states[j],actions[j]]
            denominator += np.exp(eta*A[states[j],actions[j]])
    return ((1/eta) * numerator / denominator)


In [49]:

def REPS_mirror_descent(env):
    """Relative Entropy Policy Search using Mirror Descent"""
    p = 3    
    # initialization of the distribution
    pi = initialize_pi(env)
    policy = Policy(pi)
    #Tmax =  -100*np.log(10e-6)/(1-env.gamma)
    K = 50
    N = 100
    eta = 0.1
    theta = [0 for i in range(p)]
    phi = compute_phi(env,p)
    for k in range(K):
        print('Iteration n°',k)
        ##### SAMPLING
        samples = collect_episodes(env, policy=policy, horizon=100, n_episodes=N)
        
        #### OPTIMIZE
        theta = opt.fmin_bfgs(g,x0=theta,fprime=Dg,args=(eta,phi,samples))
        
        #### COMPUTE THE NEW POLICY
        policy = compute_new_policy(eta,policy,phi,theta,samples)   
    return(policy, theta, phi)

In [50]:

for i_episode in range(1):
    observation = env.reset()
    action = env.action_space.sample()
    for t in range(500):
        if t%5==0:
            env.render()
        observation, reward, done, info = env.step(action)
        policy = REPS_mirror_descent(env)[0]
        action = policy.draw_action(observation)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
[[ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]
 [ 0.05  0.05  0.05  0.05]




TypeError: 'tuple' object is not callable

In [86]:
env.close()

# Deep Reinforcement Learning: A3C