In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('Pendulum-v0')
env._max_episode_steps = 600

In [28]:
sample = env.action_space.sample()
observation, reward, done, info = env.step(sample)
print(f"Acción tomada: {sample} \n Estado: {observation} \n Reward: {reward} \n Hecho?: {done}")

Acción tomada: [1.3656635] 
 Estado: [ 0.18549325 -0.98264554 -1.30851324] 
 Reward: -1.803035108547747 
 Hecho?: False


In [29]:
env.render()
next_state, reward, done, info = env.step([-2])
env.render()

angle = next_state[0]
vel = next_state[2]

env.close()
print(f"Seno del ángulo: {round(angle,6)} \t Ángulo en radianes: {round(np.arccos(angle), 6)}")

Seno del ángulo: 0.069243 	 Ángulo en radianes: 1.501497


In [220]:
def muestra_entorno(env):
    im = plt.imshow(env.render('rgb_array'))
    plt.show()

def discretize_state(state):
    angle_bins = np.linspace(-1, 1, 40)
    vel_bins = np.linspace(-8, 8, 160)

    return [np.digitize([state[0]], angle_bins)[0]-1, np.digitize([state[2]], vel_bins)[0]-1]

In [221]:
env.seed(123)
env._max_episode_steps = 600

def get_epsilon_greedy_action(q_values, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(40)
    else:
        return np.argmax(q_values)


def sarsa(num_episodes = 10000, ALPHA = 0.2, GAMMA = 1, EPSILON = 0.25):
    acciones = np.linspace(-2,2,40)
    q_values = np.zeros((40, 160, 40))
    last100 = []

    for i in range(num_episodes):
        s = discretize_state(env.reset())
        a = get_epsilon_greedy_action(q_values[s[0],s[1]], EPSILON)
        returns, num_steps, done = 0, 0, False

        while not done:
            # env.render()
            next_state, reward, done, _ = env.step([acciones[a]])
            next_state = discretize_state(next_state)
            a_prime = get_epsilon_greedy_action(q_values[next_state[0],next_state[1]], EPSILON)
            q_values[s[0], s[1], a] = q_values[s[0], s[1], a] + ALPHA * (reward + GAMMA * q_values[next_state[0], next_state[1], a_prime] - q_values[s[0], s[1], a])
            s, a = next_state, a_prime
            num_steps += 1
            returns += reward

        last100.append(returns)
        if len(last100)==100:
            last100.pop(0)
            print(f'Terminado episodio {i} con retorno {returns}, media ultimos 100 {np.mean(last100)}')
    env.close()
    return q_values
            
sarsa()

Terminado episodio 99 con retorno -2953.3317781218757, media ultimos 100 -3764.1047972146166
Terminado episodio 100 con retorno -4215.887675699459, media ultimos 100 -3761.47704884611
Terminado episodio 101 con retorno -2789.258643416453, media ultimos 100 -3745.2311335803884
Terminado episodio 102 con retorno -2763.299588940682, media ultimos 100 -3724.8142820455687
Terminado episodio 103 con retorno -4435.928150392581, media ultimos 100 -3730.147237158031
Terminado episodio 104 con retorno -4428.123869622352, media ultimos 100 -3735.2720348013595
Terminado episodio 105 con retorno -4454.674115350085, media ultimos 100 -3738.6732849705845
Terminado episodio 106 con retorno -2427.1855218478336, media ultimos 100 -3715.6029886229235
Terminado episodio 107 con retorno -3871.630068441394, media ultimos 100 -3712.825213664229
Terminado episodio 108 con retorno -2596.3456713939236, media ultimos 100 -3687.395423574116
Terminado episodio 109 con retorno -4098.545925391702, media ultimos 100 

In [206]:
env.reset()

array([0.92706834, 0.37489239, 0.91564592])

In [219]:
acciones = np.linspace(-2,2,40)
next_state, reward, done, _ = env.step([acciones[2]])
print(next_state)

[-0.13453334  0.99090907  4.06010332]


In [44]:
env.reset()
env.seed(123)
env._max_episode_steps = 600

def q_learning(num_episodes = 4000, ALPHA = 0.1,  GAMMA = 1, EPSILON = 0.25):
    acciones = np.linspace(-2,2,40)
    q_values = np.zeros((40, 160, 40))

    for i in range(num_episodes):
        state = discretize_state(env.reset())
        returns, num_steps, done = 0, 0, False

        while not done:
            # env.render()
            a = get_epsilon_greedy_action(q_values[state[0], state[1]], EPSILON)
            num_steps += 1

            next_state, reward, done, _ = env.step([acciones[a]])
            next_state = discretize_state(next_state)
            returns -= reward

            a_prime = get_epsilon_greedy_action(q_values[next_state[0],next_state[1]], EPSILON)
            
            q_values[state[0], state[1], a] = q_values[state[0], state[1], a] + ALPHA * (reward + GAMMA * np.max(q_values[next_state[0], next_state[1], a_prime]) \
                - q_values[state[0], state[1], a])
            state = next_state

        print(f'Terminado episodio {i} con retorno {returns} en {num_steps} pasos')
    return q_values

q_values_q = q_learning()

Terminado episodio 0 con retorno 3834.317139536525 en 600 pasos
Terminado episodio 1 con retorno 4105.449745800219 en 600 pasos
Terminado episodio 2 con retorno 4932.5878981682945 en 600 pasos
Terminado episodio 3 con retorno 4200.292356879639 en 600 pasos
Terminado episodio 4 con retorno 3622.083335610173 en 600 pasos
Terminado episodio 5 con retorno 3116.8513238208184 en 600 pasos
Terminado episodio 6 con retorno 2946.7544201567575 en 600 pasos
Terminado episodio 7 con retorno 3057.3037792630203 en 600 pasos
Terminado episodio 8 con retorno 3120.2691695238595 en 600 pasos
Terminado episodio 9 con retorno 5219.237049952122 en 600 pasos
Terminado episodio 10 con retorno 3343.399887721451 en 600 pasos
Terminado episodio 11 con retorno 2788.936208450701 en 600 pasos
Terminado episodio 12 con retorno 3861.815024309242 en 600 pasos
Terminado episodio 13 con retorno 3461.970266452869 en 600 pasos
Terminado episodio 14 con retorno 3823.597034856425 en 600 pasos
Terminado episodio 15 con reto

In [244]:
policy_aleatoria = np.zeros((20, 160, 40))
policy_aleatoria[:] = 1 / 40
acciones = np.linspace(-2,2,40)

def sample_policy(state, policy):

    probs = policy[state[0],state[1],:]
    r = np.random.random()

    p_acumulada = probs[0]
    i=0
    while p_acumulada<r:
        i+=1
        p_acumulada+=probs[i]
    return i


def greedify_policy(q_values):
    
    policy = np.zeros((20, 160, 40))

    for i in range(q_values.shape[0]):
      for j in range(q_values.shape[1]):
            v=q_values[i,j]
            policy[i,j,np.argmax(v)]=1.0
    return policy  



env.reset()
index = sample_policy(discretize_state(env.observation_space.sample()), policy_aleatoria)
print(f"Acción tomada: {acciones[index]}")

Acción tomada: -0.6666666666666667


In [232]:
def simula_episodio(policy):
    state = discretize_state(env.reset())
    acciones = np.linspace(-2,2,40)
    recorrido, returns, done = [], 0, False
    
    while not done:
        # state = discretize_state(env.observation_space.sample())
        index = sample_policy(state, policy)
        next_state, reward, done, _ = env.step([acciones[index]])
        next_state = discretize_state(next_state)
        state = next_state

        recorrido.append((state, acciones[index], reward))
        returns += reward

    return returns, recorrido

simula_episodio(policy_aleatoria)

(-4539.213870095841,
 [([4, 87], -0.35897435897435903, -4.417017252709045),
  ([4, 87], -1.282051282051282, -4.56052054825472),
  ([3, 87], 1.0769230769230766, -4.838571961474032),
  ([2, 86], 0.8717948717948718, -5.441832069514177),
  ([1, 85], 0.15384615384615374, -6.313456584149044),
  ([0, 83], 1.692307692307692, -7.348733544391315),
  ([0, 81], -0.2564102564102564, -8.767960140402058),
  ([0, 79], -1.6923076923076923, -10.087736795496326),
  ([0, 77], -1.076923076923077, -11.180303018247638),
  ([0, 76], 0.5641025641025639, -10.288677712120636),
  ([1, 74], -1.6923076923076923, -9.201101096076503),
  ([1, 73], 0.2564102564102564, -7.9864361913071455),
  ([2, 72], -1.8974358974358974, -7.061319451992842),
  ([3, 72], 1.282051282051282, -6.174035375042948),
  ([3, 71], -1.3846153846153846, -5.62542358017498),
  ([3, 71], 1.5897435897435894, -5.1716608965591275),
  ([3, 71], 0.0512820512820511, -4.954420433685631),
  ([3, 71], -0.6666666666666667, -4.885223422581001),
  ([3, 72], -0.

In [None]:
def mc_control(num_pasos=5000, epsilon=0.2, GAMMA = 1):
    qsa = np.zeros((20, 160, 40, 40))
    policy = np.zeros((20, 160, 40, 40))
    # Partimos de la política aleatoria
    policy[:] = 1.0 / 40
    
    returns = []
    # Creamos la estructura
    for i in range(qsa.shape[0]):
      r_i = []
      for j in range(qsa.shape[1]):
        r_j = []
        for k in range(qsa.shape[2]):
          r_j.append([])
        r_i.append(r_j)
      returns.append(r_i)

    for i in range(num_pasos):


# Calculamos la política usando un epsilon de 0.4 para garantizar la exploración
# policy_montecarlo_es = mc_control(epsilon=0.4)

In [238]:
_, episode = simula_episodio(policy_aleatoria)

for state, action, reward in reversed(episode):
    print(state, action, reward)

[8, 89] 1.3846153846153846 -2.7855727920799125
[8, 89] -0.5641025641025641 -3.111376154206029
[8, 89] 1.1794871794871793 -3.7680767045305443
[7, 89] -0.6666666666666667 -4.574997171897949
[6, 88] 1.0769230769230766 -5.7805014737179325
[5, 88] 1.282051282051282 -7.307572252601721
[3, 87] -0.15384615384615397 -8.857240024737338
[1, 85] 0.35897435897435903 -10.576563379928077
[0, 83] 0.4615384615384617 -12.344516964143713
[0, 80] 1.1794871794871793 -12.153727684153866
[0, 77] 0.6666666666666665 -10.275170250795982
[0, 75] -1.282051282051282 -8.159801540396858
[2, 72] 1.0769230769230766 -6.6367326780786025
[4, 71] 0.35897435897435903 -5.217676635266168
[5, 70] 0.0512820512820511 -4.0364767856587624
[7, 69] -1.794871794871795 -3.026846590430093
[9, 69] -0.4615384615384617 -2.457763409410942
[9, 69] -0.4615384615384617 -2.1636985993674123
[10, 69] -0.15384615384615397 -2.121237153498062
[10, 69] 1.8974358974358974 -2.2485557088713546
[10, 69] -0.7692307692307692 -2.6074511826379947
[10, 69] 

In [27]:
# expected sarsa implementation

env.reset()
env.seed(123)
env._max_episode_steps = 600

def expected_sarsa(num_episodes = 4000, ALPHA = 0.1,  GAMMA = 1, EPSILON = 0.25):
    acciones = np.linspace(-2,2,40)
    q_values = np.zeros((40, 160, 40))

    for i in range(num_episodes):
        state = discretize_state(env.reset())
        returns, num_steps, done = 0, 0, False

        while not done:
            # env.render()
            a = get_epsilon_greedy_action(q_values[state[0], state[1]], EPSILON)
            num_steps += 1

            next_state, reward, done, _ = env.step([acciones[a]])
            next_state = discretize_state(next_state)
            returns -= reward

            a_prime = get_epsilon_greedy_action(q_values[next_state[0],next_state[1]], EPSILON)
            
            q_values[state[0], state[1], a] = q_values[state[0], state[1], a] + ALPHA * (reward + GAMMA * np.mean(q_values[next_state[0], next_state[1], a_prime]) \
                - q_values[state[0], state[1], a])
            state = next_state

        print(f'Terminado episodio {i} con retorno {returns} en {num_steps} pasos')
    return q_values

expected_q = expected_sarsa()
    

IndexError: index 24 is out of bounds for axis 0 with size 20

In [193]:
env.step([0.5])
print(env.state)
env.render()


[-3.41160696 -3.74963214]


True

In [194]:
env.close()