# Reinforcement Learning, 2024-01
## Tarea 4 - Algoritmos de aprendizaje por refuerzo con aproximación de funciones

> Daniel Villar González, 201923374.  
> Daniel Alvarez, 201911320.

## **Descripción de tarea**

Considere el problema Cart Pole implementado en el entorno de Gymnasium descrito. El objetivo de este taller es comparar algoritmos de RL tabulares con sus contrapartes que utilizan aproximación de funciones.

### ***Requerimientos***
1. La recompensa vista en un estado terminal es cero.
2. El siguiente estado visto por un agente en un estado terminal es igual al mismo, cumpliendo con las dinámicas del MDP.
3. Para este caso, como existe aprendizaje, no se conocen de primera mano las probabilidades de transición, por lo que el agente debe aprender.
4. Existen varios episodios donde se actualiza la matriz Q, con el fin de conocer los mayores valores del par estado-acción.

## **Librerias**

In [1]:
import numpy as np
import random
import gym

In [11]:


def Random_games():
    env = gym.make("CartPole-v1")
    # Each of this episode is its own game.
    for episode in range(10):
        env.reset()
        total_reward = 0
        # this is each frame, up to 500...but we wont make it that far with random.
        for t in range(500):
            # This will display the environment
            # Only display if you really want to see it.
            # Takes much longer to display it.
            env.render()
            
            # This will just create a sample action in any environment.
            # In this environment, the action can be 0 or 1, which is left or right
            action = env.action_space.sample()

            # this executes the environment with an action, 
            # and returns the observation of the environment, 
            # the reward, if the env is over, and other info.
            #print(env.step(action))
            next_state, reward, done, info, _ = env.step(action)
            total_reward += reward
            # lets print everything in one line:
             # Print information
            print("Step:", t, " Action:", action, " Reward:", reward, "NextState:", next_state, " Done:", done, "Info:", info)
            
            if done:
                print("Episode", episode, "terminated after", t+1, "timesteps with total reward", total_reward)
                if total_reward>=500:
                    print("El episodio se ha completado de manera exitosa.")
                else:
                    print("El episodio no se completó exitosamente.")
                break  # Exit the loop if the episode is done
                
Random_games()

Step: 0  Action: 1  Reward: 1.0 NextState: [ 0.01518179  0.1978054   0.0426091  -0.2937135 ]  Done: False Info: False
Step: 1  Action: 1  Reward: 1.0 NextState: [ 0.01913789  0.3922948   0.03673483 -0.5726595 ]  Done: False Info: False
Step: 2  Action: 0  Reward: 1.0 NextState: [ 0.02698379  0.19667754  0.02528164 -0.268634  ]  Done: False Info: False
Step: 3  Action: 1  Reward: 1.0 NextState: [ 0.03091734  0.39142972  0.01990896 -0.5532369 ]  Done: False Info: False
Step: 4  Action: 1  Reward: 1.0 NextState: [ 0.03874593  0.5862665   0.00884422 -0.8395814 ]  Done: False Info: False
Step: 5  Action: 1  Reward: 1.0 NextState: [ 0.05047126  0.7812666  -0.0079474  -1.1294699 ]  Done: False Info: False
Step: 6  Action: 0  Reward: 1.0 NextState: [ 0.0660966   0.58624965 -0.0305368  -0.8392902 ]  Done: False Info: False
Step: 7  Action: 0  Reward: 1.0 NextState: [ 0.07782159  0.39155766 -0.0473226  -0.5563648 ]  Done: False Info: False
Step: 8  Action: 0  Reward: 1.0 NextState: [ 0.08565275 

  gym.logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


In [13]:
import numpy as np
import gym

class SarsaAgent:
    def __init__(self, num_states, num_actions, discount_factor=0.99, alpha=0.1, epsilon=0.1):
        self.q_table = np.zeros((num_states, num_actions))
        self.discount_factor = discount_factor
        self.alpha = alpha
        self.epsilon = epsilon
        self.epsilon_init = epsilon

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(len(self.q_table[state]))
        else:
            return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state, next_action):
        predict = self.q_table[state, action]
        target = reward + self.discount_factor * self.q_table[next_state, next_action]
        self.q_table[state, action] += self.alpha * (target - predict)

    def reset_epsilon(self):
        self.epsilon = self.epsilon_init

# Función para discretizar el espacio de estados
def discretize_state(state, num_states):
    num_bins = int(round(num_states ** (1/4)))  # Calcular el número de bins en base a la raíz cuarta del número de estados
    bins = np.array([np.linspace(-4.8, 4.8, num_bins),          # Car position
                     np.linspace(-5, 5, num_bins),              # Car velocity
                     np.linspace(-0.418, 0.418, num_bins),     # Pole angle
                     np.linspace(-5, 5, num_bins)])            # Pole velocity
    discretized_state = [np.digitize(s, bins[i]) - 1 for i, s in enumerate(state)]
    return tuple(discretized_state)

# Función para entrenar el agente SARSA
def train_sarsa(env, agent, num_episodes):
    for episode in range(num_episodes):
        state = env.reset()
        state = discretize_state(state, num_states)
        action = agent.choose_action(state)
        terminal = False
        while not terminal:
            next_state, reward, terminal, _ = env.step(action)
            next_state = discretize_state(next_state, num_states)
            next_action = agent.choose_action(next_state)
            agent.update_q_table(state, action, reward, next_state, next_action)
            state = next_state
            action = next_action
        agent.reset_epsilon()

# Función para obtener la política del agente
def get_policy(q_table):
    return np.argmax(q_table, axis=1)

# Crear el entorno CartPole-v1
env = gym.make("CartPole-v1")

# Definir el agente SARSA
num_states = 300
num_actions = env.action_space.n
sarsa_agent = SarsaAgent(num_states, num_actions)

# Entrenar el agente SARSA
train_sarsa(env, sarsa_agent, num_episodes=10000)

# Obtener la política del agente
policy = get_policy(sarsa_agent.q_table)
print("Política aprendida:", policy)


TypeError: '<' not supported between instances of 'dict' and 'dict'