In [4]:
# 1. Introducción a los principales algoritmos de RL:
# Define el entorno del juego
class Environment:
  def __init__(self):
    self.state_space = [0, 1, 2, 3]
    self.action_space = [0, 1]
    self.rewards = {0: -1, 1: -1, 2: -1, 3: -10}

# Crea una instancia del entorno
env = Environment()

# Muestra información del entorno
print("Estados posibles: ", env.state_space)
print("Acciones posibles: ", env.action_space)
print("Recompensas: ", env.rewards)

Estados posibles:  [0, 1, 2, 3]
Acciones posibles:  [0, 1]
Recompensas:  {0: -1, 1: -1, 2: -1, 3: -10}


In [6]:
# 2. Q-Learning:
import numpy as np

# Inicializa la tabla Q con valores arbitriarios
Q = np.zeros((len(env.state_space), len(env.action_space)))

# Define los parametros del algoritmo
alpha = 0.1
gamma = 0.9
epsilon = 0.1
# Entrena el agente utilizando Q-learning
for _ in range(1000):
  state = np.random.choice(env.state_space)
  while state != 3:
    action = np.random.choice(env.action_space)
    next_state = state + action
    reward = env.rewards[next_state]
    Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
    state = next_state

# Muestra la función Q-valor aprendida
print("Función Q-valor aprendida: ")
print(Q)

Función Q-valor aprendida: 
[[ -8.73860796  -9.96879051]
 [ -9.97085492  -9.99334293]
 [ -9.99386377 -10.        ]
 [  0.           0.        ]]


In [7]:
# 3. Sarsa
# Reinicia la tabla Q con valores arbitriarios
Q = np.zeros((len(env.state_space), len(env.action_space)))

# Entrena el agente usando Sarsa
for _ in range(1000):
  state = np.random.choice(env.state_space)
  action = np.random.choice(env.action_space)
  while state != 3:
    next_state = state + action
    next_action = np.random.choice(env.action_space)
    reward = env.rewards[next_state]
    Q[state, action] = Q[state, action] + alpha * (reward + gamma * Q[next_state, next_action] - Q[state, action])
    state = next_state
    action = next_action

# Muestra la función Q-valor aprendida
print("Función Q-valor aprendida: ")
print(Q)

Función Q-valor aprendida: 
[[ -9.99995077  -9.99999999]
 [-10.         -10.        ]
 [-10.         -10.        ]
 [  0.           0.        ]]


In [15]:
# 4. Política de Gradiente de Montecarlo
import numpy as np
# Crea una instancia del entorno
env = Environment()
# Inicia la política con probabilidades uniformes
policy = np.ones((len(env.state_space), len(env.action_space))) / len(env.action_space)

max_steps = 1000  # Define el límite máximo de pasos
steps = 0

# Define la función de recompensa promedio
def average_reward(Q):
  return np.mean([Q[state, np.argmax(policy[state])] for state in env.state_space])

# Entrena la política utilizando Gradiente de Montecarlo
for _ in range(1000):
  state = np.random.choice(env.state_space)
  while state != 3 and steps < max_steps:
    action = np.random.choice(env.action_space, p=policy[state])
    next_state = state + action
    reward = env.rewards[next_state]
    gradient = np.zeros_like(policy[state])
    gradient[action] = 1
    alpha = 0.01
    policy[state] += alpha * gradient * (reward - average_reward(Q))
    steps += 1
    # Normalize probabilities to sum to 1
    policy[state] /= np.sum(policy[state]) # Add this line to normalize probabilities
    state = next_state

# Muestra la política aprendida
print("Política aprendida: ")
print(policy)

Política aprendida: 
[[4.17391207e-01 5.82608793e-01]
 [6.66124302e-01 3.33875698e-01]
 [1.00000000e+00 1.74474133e-27]
 [5.00000000e-01 5.00000000e-01]]
