## Proyecto 2 - Aprendizaje por refuerzo:

- Pedro Redondo Loureiro
- Pedro Souza López

***

### Carga de librerías, inicialización del entorno y funciones auxiliares.

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

# Shared module-level environment used by every training and simulation cell below.
env = gym.make('Pendulum-v0')
# NOTE(review): `_max_episode_steps` is a private attribute; presumably this
# raises the per-episode step limit to 600 -- confirm against the gym version in use.
env._max_episode_steps = 600

In [25]:
def muestra_entorno(env):
    """Show the current frame of ``env`` as a matplotlib image.

    The environment is rendered in ``'rgb_array'`` mode and the resulting
    array is drawn with ``plt.imshow`` and displayed immediately.
    """
    frame = env.render('rgb_array')
    plt.imshow(frame)
    plt.show()

def discretize_state(state, angle_bins = 21, vel_bins = 20):
    """Map a 3-component continuous observation to three integer bin indices.

    Parameters
    ----------
    state : sequence of 3 floats
        ``state[0]`` and ``state[1]`` are binned on a grid over [-1, 1] and
        ``state[2]`` on a grid over [-8, 8].
        (Presumably cos(theta), sin(theta) and angular velocity of the
        pendulum -- confirm against the environment's observation layout.)
    angle_bins : int
        Number of evenly spaced edges placed on [-1, 1].
    vel_bins : int
        Number of evenly spaced edges placed on [-8, 8].

    Returns
    -------
    list
        Three 0-based indices, each in ``[0, n_edges - 1]``. The top index
        ``n_edges - 1`` is produced only for values at or above the upper
        edge, so a table indexed with these values needs ``angle_bins``
        entries on its first two axes and ``vel_bins`` on the third.
    """
    angle_edges = np.linspace(-1, 1, angle_bins)
    vel_edges = np.linspace(-8, 8, vel_bins)

    def _bin_index(value, edges):
        # np.digitize is 1-based for in-range values, hence the "- 1".
        # A value below the lowest edge would give -1, which numpy indexing
        # silently interprets as "last row"; clamp it to the first bin instead.
        return max(np.digitize([value], edges)[0] - 1, 0)

    return [
        _bin_index(state[0], angle_edges),
        _bin_index(state[1], angle_edges),
        _bin_index(state[2], vel_edges),
    ]

def get_epsilon_greedy_action(q_values, epsilon):
    """Pick an action index with an epsilon-greedy rule.

    One uniform draw from ``np.random.random()`` decides between exploring
    and exploiting. With probability ``epsilon`` a uniformly random index in
    ``range(len(q_values))`` is returned; otherwise the index of the largest
    entry of ``q_values`` (first one on ties, as given by ``np.argmax``).
    """
    explore = np.random.random() < epsilon
    if not explore:
        return np.argmax(q_values)
    return np.random.randint(len(q_values))

In [33]:
def simulacion(politica):
    """Run one rendered episode on the module-level ``env`` acting greedily.

    ``politica`` is indexed with the three indices returned by
    ``discretize_state``; the resulting row is passed to
    ``get_epsilon_greedy_action`` with epsilon 0, so the arg-max entry is
    always chosen. The chosen index selects one of 20 evenly spaced values
    on [-2, 2], which is sent to ``env.step``. Every step is rendered.

    Returns the sum of the rewards collected until ``env.step`` reports done.
    """
    torques = np.linspace(-2, 2, 20)

    state = discretize_state(env.reset())
    total_reward = 0
    finished = False

    while not finished:
        i, j, k = state
        action = get_epsilon_greedy_action(politica[i, j, k], 0)

        observation, reward, finished, _ = env.step([torques[action]])
        state = discretize_state(observation)
        total_reward += reward

        env.render()

    return total_reward

***
### Monte Carlo

In [None]:
# NOTE(review): `motecarlo_prueba` is not defined anywhere in the visible
# notebook (possibly a typo or a removed training cell), so this cell raises
# NameError on a fresh kernel -- confirm where the table should come from.
# The explicit reset is redundant: simulacion() resets the environment itself.
env.reset()
simulacion(motecarlo_prueba)

***
### SARSA

In [27]:
# Reset the shared environment and re-apply the 600-step setting before training.
env.reset()
env._max_episode_steps = 600

def sarsa(num_episodes = 5000, ALPHA = 0.2, GAMMA = 0.9, EPSILON = 0.25):
    """Tabular SARSA on the module-level ``env``.

    Parameters
    ----------
    num_episodes : int
        Number of training episodes.
    ALPHA : float
        Step size of the update.
    GAMMA : float
        Discount factor applied to the next state-action value.
    EPSILON : float
        Exploration probability of the epsilon-greedy behaviour policy.

    Returns
    -------
    numpy.ndarray
        Q-table of shape (21, 21, 20, 20): three discretised state indices
        followed by the index into 20 evenly spaced values on [-2, 2].

    The environment is closed before returning.
    """
    torques = np.linspace(-2, 2, 20)
    q_table = np.zeros((21, 21, 20, len(torques)))

    for _episode in range(num_episodes):
        state = tuple(discretize_state(env.reset()))
        action = get_epsilon_greedy_action(q_table[state], EPSILON)
        finished = False

        while not finished:
            observation, reward, finished, _ = env.step([torques[action]])
            following_state = tuple(discretize_state(observation))

            # On-policy: the action used in the target is the one actually taken next.
            following_action = get_epsilon_greedy_action(q_table[following_state], EPSILON)

            current = state + (action,)
            td_target = reward + GAMMA * q_table[following_state + (following_action,)]
            q_table[current] = q_table[current] + ALPHA * (td_target - q_table[current])

            state, action = following_state, following_action

    env.close()
    return q_table
            
# Train with the default hyperparameters; the table is replayed in the next cell.
sarsa_prueba = sarsa()

In [34]:
# Replay the SARSA table greedily for one rendered episode; the cell output is
# the episode's summed reward. simulacion() resets the environment itself.
env.reset()
simulacion(sarsa_prueba)

-794.2183826986067

***
### Q-learning

In [14]:
# Reset and seed the shared environment before training.
# NOTE(review): env.seed() is called after env.reset(), and exploration in
# get_epsilon_greedy_action draws from np.random, which this cell does not
# seed -- runs are presumably not reproducible; confirm.
env.reset()
env.seed(123)
env._max_episode_steps = 600
    
def q_learning(num_episodes = 5000, ALPHA = 0.1,  GAMMA = 0.9, EPSILON = 0.25):
    """Tabular Q-learning on the module-level ``env``.

    Parameters
    ----------
    num_episodes : int
        Number of training episodes.
    ALPHA : float
        Step size of the update.
    GAMMA : float
        Discount factor applied to the best next-state value.
    EPSILON : float
        Exploration probability of the epsilon-greedy behaviour policy.

    Returns
    -------
    numpy.ndarray
        Q-table of shape (21, 21, 20, 20): three discretised state indices
        followed by the index into 20 evenly spaced values on [-2, 2].
    """
    acciones = np.linspace(-2,2,20)
    # With its default arguments discretize_state() can return index 20 on the
    # first two axes (value exactly at the upper edge) and 19 on the third, so
    # the state axes must be 21 x 21 x 20. A 20 x 20 x 20 table raises
    # IndexError on those edge observations.
    q_values = np.zeros((21, 21, 20, len(acciones)))

    for _episode in range(num_episodes):
        state = discretize_state(env.reset())
        done = False

        while not done:
            a = get_epsilon_greedy_action(q_values[state[0], state[1], state[2]], EPSILON)

            next_state, reward, done, _ = env.step([acciones[a]])
            next_state = discretize_state(next_state)

            # Off-policy target: bootstrap from the best action in the next state.
            best_next = np.max(q_values[next_state[0], next_state[1], next_state[2]])
            current = q_values[state[0], state[1], state[2], a]
            q_values[state[0], state[1], state[2], a] = current + ALPHA * (reward + GAMMA * best_next - current)
            state = next_state

    return q_values

# Train with the default hyperparameters; the table is replayed in the next cell.
q_values_q = q_learning()

In [17]:
# Replay the Q-learning table greedily for one rendered episode.
# simulacion() resets the environment itself.
env.reset()
simulacion(q_values_q)

***
### Expected SARSA

In [20]:
# Reset the shared environment and re-apply the 600-step setting before training.
env.reset()
env._max_episode_steps = 600
    
def expected_sarsa(num_episodes = 5000, ALPHA = 0.1,  GAMMA = 0.9, EPSILON = 0.25):
    """Tabular Expected SARSA on the module-level ``env``.

    The update target uses the expected next-state value under the same
    epsilon-greedy policy that selects actions: with probability ``EPSILON``
    a uniformly random action, otherwise the greedy one.

    Parameters
    ----------
    num_episodes : int
        Number of training episodes.
    ALPHA : float
        Step size of the update.
    GAMMA : float
        Discount factor applied to the expected next-state value.
    EPSILON : float
        Exploration probability of the epsilon-greedy policy.

    Returns
    -------
    numpy.ndarray
        Q-table of shape (21, 21, 20, 20): three discretised state indices
        followed by the index into 20 evenly spaced values on [-2, 2].
    """
    acciones = np.linspace(-2,2,20)
    # With its default arguments discretize_state() can return index 20 on the
    # first two axes and 19 on the third, so the state axes must be
    # 21 x 21 x 20 to avoid IndexError on edge observations.
    q_values = np.zeros((21, 21, 20, len(acciones)))

    for _episode in range(num_episodes):
        state = discretize_state(env.reset())
        done = False

        while not done:
            a = get_epsilon_greedy_action(q_values[state[0], state[1], state[2]], EPSILON)

            next_state, reward, done, _ = env.step([acciones[a]])
            next_state = discretize_state(next_state)

            next_q = q_values[next_state[0], next_state[1], next_state[2]]
            # Expectation under epsilon-greedy, not a plain mean (which would
            # be the expectation under a uniformly random policy).
            expected_next = EPSILON * np.mean(next_q) + (1 - EPSILON) * np.max(next_q)

            current = q_values[state[0], state[1], state[2], a]
            q_values[state[0], state[1], state[2], a] = current + ALPHA * (reward + GAMMA * expected_next - current)
            state = next_state

    return q_values

# Train with Expected SARSA (this previously called q_learning(), so the
# "Expected SARSA" results were really a second Q-learning run).
expected_sarsa_prueba = expected_sarsa()

In [22]:
# Replay the table stored in expected_sarsa_prueba greedily for one rendered
# episode. simulacion() resets the environment itself.
env.reset()
simulacion(expected_sarsa_prueba)

In [244]:
# Uniform policy table: for each of the 20 x 160 state cells, every one of the
# 40 entries on the last axis is set to 1/40.
policy_aleatoria = np.zeros((20, 160, 40))
policy_aleatoria[:] = 1 / 40
# Module-level grid of 40 evenly spaced values on [-2, 2]; the index sampled
# from the policy is looked up in this grid further down in the cell.
acciones = np.linspace(-2,2,40)

def sample_policy(state, policy):
    """Sample an action index from a tabular stochastic policy.

    Parameters
    ----------
    state : sequence
        Only ``state[0]`` and ``state[1]`` are used, as indices into the
        first two axes of ``policy``.
    policy : numpy.ndarray
        Array whose last axis holds the action probabilities for each state.

    Returns
    -------
    int
        The smallest index whose cumulative probability reaches a uniform
        draw from ``np.random.random()``. If rounding (or an unnormalised
        row) leaves the total probability below the draw, the last index is
        returned instead of running past the end of the row.
    """
    probs = policy[state[0],state[1],:]
    r = np.random.random()

    # First position where the running sum is >= r (inverse-CDF sampling).
    cumulative = np.cumsum(probs)
    i = int(np.searchsorted(cumulative, r, side='left'))

    # Guard against cumulative[-1] < r, which would otherwise index out of bounds.
    return min(i, len(probs) - 1)


def greedify_policy(q_values):
    """Build the deterministic greedy policy for a Q-table.

    Parameters
    ----------
    q_values : array-like
        Q-table whose last axis enumerates actions; all leading axes index
        the state.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``q_values`` holding 1.0 at the arg-max
        action of every state and 0.0 elsewhere. The shape is derived from
        ``q_values`` rather than fixed, so tables of any size are handled.
    """
    q_values = np.asarray(q_values)
    policy = np.zeros(q_values.shape)

    # Visit every state cell (all axes except the trailing action axis).
    for state_index in np.ndindex(*q_values.shape[:-1]):
        best_action = np.argmax(q_values[state_index])
        policy[state_index + (best_action,)] = 1.0
    return policy



# Smoke test: sample one action index from the uniform policy for a randomly
# drawn observation and print the corresponding value from `acciones`.
# NOTE(review): sample_policy reads only state[0] and state[1], while the
# discretize_state defined earlier in this notebook returns three indices --
# this cell was presumably written for a different discretisation; confirm.
env.reset()
index = sample_policy(discretize_state(env.observation_space.sample()), policy_aleatoria)
print(f"Acción tomada: {acciones[index]}")

Acción tomada: -0.6666666666666667


In [None]:
def mc_control(num_pasos=5000, epsilon=0.2, GAMMA = 1):
    """Set up the data structures for a Monte Carlo control loop (unfinished).

    Allocates ``qsa`` and ``policy`` arrays of shape (20, 160, 40, 40), fills
    ``policy`` with 1/40, and builds ``returns`` as a 20 x 160 x 40 nested
    list of empty lists.

    NOTE(review): the final ``for`` loop has no body in the source, so this
    cell does not parse as written; ``epsilon`` and ``GAMMA`` are not used by
    the visible code and nothing is returned.
    """
    qsa = np.zeros((20, 160, 40, 40))
    policy = np.zeros((20, 160, 40, 40))
    # Start from the uniform random policy
    policy[:] = 1.0 / 40
    
    returns = []
    # Build the nested list structure: one empty list per (i, j, k) cell
    for i in range(qsa.shape[0]):
      r_i = []
      for j in range(qsa.shape[1]):
        r_j = []
        for k in range(qsa.shape[2]):
          r_j.append([])
        r_i.append(r_j)
      returns.append(r_i)

    for i in range(num_pasos):
        # NOTE(review): loop body missing in the source -- implementation incomplete.


# Calculamos la política usando un epsilon de 0.4 para garantizar la exploración
# policy_montecarlo_es = mc_control(epsilon=0.4)

In [238]:
# NOTE(review): `simula_episodio` is not defined in the visible part of the
# notebook; this cell depends on kernel state from a cell that is not shown.
_, episode = simula_episodio(policy_aleatoria)

# Walk the episode from its last step to its first, printing each
# (state, action, reward) triple.
for state, action, reward in reversed(episode):
    print(state, action, reward)

[8, 89] 1.3846153846153846 -2.7855727920799125
[8, 89] -0.5641025641025641 -3.111376154206029
[8, 89] 1.1794871794871793 -3.7680767045305443
[7, 89] -0.6666666666666667 -4.574997171897949
[6, 88] 1.0769230769230766 -5.7805014737179325
[5, 88] 1.282051282051282 -7.307572252601721
[3, 87] -0.15384615384615397 -8.857240024737338
[1, 85] 0.35897435897435903 -10.576563379928077
[0, 83] 0.4615384615384617 -12.344516964143713
[0, 80] 1.1794871794871793 -12.153727684153866
[0, 77] 0.6666666666666665 -10.275170250795982
[0, 75] -1.282051282051282 -8.159801540396858
[2, 72] 1.0769230769230766 -6.6367326780786025
[4, 71] 0.35897435897435903 -5.217676635266168
[5, 70] 0.0512820512820511 -4.0364767856587624
[7, 69] -1.794871794871795 -3.026846590430093
[9, 69] -0.4615384615384617 -2.457763409410942
[9, 69] -0.4615384615384617 -2.1636985993674123
[10, 69] -0.15384615384615397 -2.121237153498062
[10, 69] 1.8974358974358974 -2.2485557088713546
[10, 69] -0.7692307692307692 -2.6074511826379947
[10, 69] 

In [27]:
# expected sarsa implementation

# Reset and seed the shared environment before training.
# NOTE(review): exploration draws from np.random, which is not seeded here.
env.reset()
env.seed(123)
env._max_episode_steps = 600

def expected_sarsa(num_episodes = 4000, ALPHA = 0.1,  GAMMA = 1, EPSILON = 0.25):
    """Tabular Expected SARSA over a two-index state, 40 actions.

    The update target is the expected next-state value under the
    epsilon-greedy policy used to act: ``EPSILON`` times the mean of the next
    state's action values plus ``1 - EPSILON`` times their maximum.

    Parameters
    ----------
    num_episodes : int
        Number of training episodes.
    ALPHA : float
        Step size of the update.
    GAMMA : float
        Discount factor applied to the expected next-state value.
    EPSILON : float
        Exploration probability of the epsilon-greedy policy.

    Returns
    -------
    numpy.ndarray
        Q-table of shape (40, 160, 40).

    NOTE(review): only ``state[0]`` and ``state[1]`` index the table, whereas
    the ``discretize_state`` defined earlier in this notebook returns three
    indices -- this version was presumably written for a different
    discretisation; confirm. This definition also reuses the name of the
    earlier ``expected_sarsa`` and replaces it once the cell runs.
    """
    acciones = np.linspace(-2,2,40)
    q_values = np.zeros((40, 160, 40))

    for i in range(num_episodes):
        state = discretize_state(env.reset())
        returns, num_steps, done = 0, 0, False

        while not done:
            a = get_epsilon_greedy_action(q_values[state[0], state[1]], EPSILON)
            num_steps += 1

            next_state, reward, done, _ = env.step([acciones[a]])
            next_state = discretize_state(next_state)
            # Accumulate the return with its true sign, as the other trainers do.
            returns += reward

            next_q = q_values[next_state[0], next_state[1]]
            # Expectation over the epsilon-greedy policy. The previous code took
            # np.mean of a single sampled entry, which is just that entry
            # (i.e. plain SARSA).
            expected_next = EPSILON * np.mean(next_q) + (1 - EPSILON) * np.max(next_q)

            current = q_values[state[0], state[1], a]
            q_values[state[0], state[1], a] = current + ALPHA * (reward + GAMMA * expected_next - current)
            state = next_state

        print(f'Terminado episodio {i} con retorno {returns} en {num_steps} pasos')
    return q_values

# NOTE(review): the stored output of this cell is an IndexError (index 24 out
# of bounds for an axis of size 20), which does not match the table shape
# defined above -- presumably it was last run against different definitions.
expected_q = expected_sarsa()
    

IndexError: index 24 is out of bounds for axis 0 with size 20

In [193]:
# Scratch check: apply a single action of 0.5, print the environment's
# internal `state` attribute, and render the frame.
env.step([0.5])
print(env.state)
env.render()


[-3.41160696 -3.74963214]


True