### Instalamos e importamos tanto las librerías como el FrozenLake v0

In [None]:
!pip install gym==0.17.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gym
import numpy as np
import random as rd
from IPython.display import clear_output

# Fixed seed so that "Restart & Run All" reproduces the same exploration sequence.
SEED = 42
rd.seed(SEED)  # drives the explore/exploit coin flip in the action-selection policy

# 4x4 FrozenLake on the default map with slipping disabled.
env = gym.make('FrozenLake-v0', desc=None, map_name="4x4", is_slippery=False)
env.seed(SEED)
env.action_space.seed(SEED)  # drives env.action_space.sample() during exploration

### Visualizamos el espacio de 4x4 y ejecutamos la tabla de recompensas

In [None]:
# Reset the environment and draw the grid; the highlighted cell in the
# rendered output is the agent's current position.
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [None]:
# Show the sizes of the discrete action and state spaces.
print(f"Action Space {env.action_space}")
print(f"State Space {env.observation_space}")

Action Space Discrete(4)
State Space Discrete(16)


In [None]:
# Reward/transition table: env.P[state][action] is a list of
# (probability, next_state, reward, done) tuples.

env.P

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 4, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 5, 0.0, True)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 6, 0.0, False)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 7, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 8, 0.0, False)],
  2: [(1.0, 5, 0.0, True)],
  3: [(1.0, 0, 0.0, False)]},
 5: {0: [(1.0, 5, 0, True)],
  1: [(1.0, 5, 0, True)],
  2: [(1.0, 5, 0, True)],
  3: [(1.0, 5, 0, True)]},
 6: {0: [(1.0, 5, 0.0, True)],
  1: [(1.0, 10, 0.0, False)],
  2: [(1.0, 7, 0.0, True)],
  3: [(1.0, 2, 0.0, False)]},
 7: {0: [(1.0, 7, 0, True)],
  1: [(1.0, 7, 0, True)],
  2: [(1.0, 7, 0, True)],
  3: [(1.0, 7, 0, True)]},
 8: {0: [(1.0, 8, 0.0, False)],
  1: [(1.0, 12, 0.0, True)],
  2: [(

### Definimos la posición de salida, que siempre será la misma (estado 0)

In [None]:
# Read the environment's current state index.
state = env.s
print("State:", state)  # In this case the agent always starts at position 0

State: 0


In [None]:
# Transitions available from position 0, keyed by action index

env.P[0]

{0: [(1.0, 0, 0.0, False)],
 1: [(1.0, 4, 0.0, False)],
 2: [(1.0, 1, 0.0, False)],
 3: [(1.0, 0, 0.0, False)]}

## Action Space:



*   0: Left
*   1: Down
*   2: Right
*   3: Up

## Rewards:

*   Reach goal(G): +1
*   Reach hole(H): 0
*   Reach frozen(F): 0

## Definimos la tabla Q

In [None]:
# Q-table: one row per state, one column per action, every entry starting at 0.
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

In [None]:
# Q-values of the actions available in state 0 (all zeros before training).
q_table[0]

array([0., 0., 0., 0.])

## Definimos la política epsilon-greedy

In [None]:
def greedy(epsilon,q_table,state,env):
    """Choose an action with an epsilon-greedy rule.

    Args:
        epsilon: Probability threshold for exploring. A uniform draw from
            ``rd.random()`` below this value triggers a random action.
        q_table: 2-D array indexed as ``q_table[state]`` giving one Q-value
            per action.
        state: Index of the current state (row of ``q_table``).
        env: Environment whose ``action_space.sample()`` supplies random actions.

    Returns:
        A sampled action when exploring, otherwise the index of the largest
        Q-value in ``q_table[state]`` (the first one on ties).
    """
    exploring = rd.random() < epsilon
    if not exploring:
        # Exploit: best known action for this state.
        return np.argmax(q_table[state])
    # Explore: let the environment pick any valid action.
    return env.action_space.sample()

### Entrenamos y ejecutamos el programa

In [None]:
# Hyperparameters
alpha = 0.7 # learning rate
gamma = 0.95 # discount factor
# Exploration rate passed to greedy(). It is never decayed, so with 1.0 every
# training action is sampled at random; the Q-learning update below still
# bootstraps from the best next action, independent of the action taken.
epsilon = 1.0

# Per-episode results: steps taken and number of falls into a hole
all_timestep = []
all_penalties = []

# Number of training episodes
episodes = 1001

for i in range(episodes):
    state = env.reset()

    timestep, penalties, reward = 0, 0, 0
    done = False

    while not done:
        action = greedy(epsilon,q_table,state,env) # pick an action with the epsilon-greedy policy

        next_state, reward, done, info = env.step(action) # apply the chosen action

        old_value = q_table[state, action] # current Q-value of the chosen action in the current state
        next_max = np.max(q_table[next_state]) # best Q-value reachable from the new state

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max) # Q-learning update
        q_table[state, action] = new_value

        # Every step except reaching the goal has reward 0, so `reward == 0`
        # alone would count almost every step. A penalty is an episode that
        # ends without reward for a reason other than the step limit, i.e.
        # the agent fell into a hole.
        if done and reward == 0 and not info.get("TimeLimit.truncated", False):
            penalties += 1

        state = next_state
        timestep += 1

    # Record this episode's results (the lists were previously left empty).
    all_timestep.append(timestep)
    all_penalties.append(penalties)

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 1000
Training finished.



In [None]:
# A bare `env.s` is not echoed because it is not the last expression of the
# cell, so print the state explicitly before rendering the grid.
print("State:", env.s)
env.render()

  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG


In [None]:
q_table # Display the values of the Q-table

array([[0.73509164, 0.77378067, 0.77378053, 0.73509164],
       [0.73509164, 0.        , 0.81450582, 0.77378051],
       [0.77378032, 0.85737469, 0.7737798 , 0.81450563],
       [0.81450506, 0.        , 0.77377803, 0.77377975],
       [0.77378062, 0.81450597, 0.        , 0.73509161],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.90249971, 0.        , 0.8145026 ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450594, 0.        , 0.85737471, 0.77378064],
       [0.81450578, 0.9024978 , 0.90249971, 0.        ],
       [0.85737469, 0.9499998 , 0.        , 0.85735857],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.89605336, 0.94999835, 0.8573433 ],
       [0.90236487, 0.94983622, 0.99999995, 0.90238926],
       [0.        , 0.        , 0.        , 0.        ]])

In [None]:
class bcolors:
    """ANSI escape sequences used to colour the printed rewards."""
    RED= '\u001b[31m'
    GREEN= '\u001b[32m'
    RESET= '\u001b[0m'

# env.reset() already returns the start state, so the former manual
# `env.s = 0` was dropped. NOTE(review): assigning an attribute on the object
# returned by gym.make may set it on a wrapper rather than on the underlying
# environment, leaving a stale `env.s` behind -- confirm against the gym version.
state = env.reset()
done = False

timestep, penalties, reward = 0, 0, 0
total_reward = 0

while not done:

  action = np.argmax(q_table[state]) # always exploit: best learned action for this state
  state, reward, done, info = env.step(action) # apply the chosen action

  # Count a penalty only when the episode ends without reward for a reason
  # other than the step limit, i.e. the agent fell into a hole.
  if done and reward == 0 and not info.get("TimeLimit.truncated", False):
      penalties += 1

  timestep += 1
  total_reward += reward

  # Print each step
  clear_output(wait=True)
  env.render()
  print("")
  # Red while the value is still zero, green once a reward has been collected.
  step_colour = bcolors.GREEN if reward != 0 else bcolors.RED
  total_colour = bcolors.GREEN if total_reward != 0 else bcolors.RED
  print(f"Recompensa actual: {step_colour}{reward}{bcolors.RESET}")
  print(f"Recompensa total: {total_colour}{total_reward}{bcolors.RESET}")
  print("")
  print('Estado actual', state)

print("Timesteps taken: {}".format(timestep))
print("Penalties incurred: {}".format(penalties))

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m

Recompensa actual: [32m1.0[0m
Recompensa total: [32m1.0[0m

Estado actual 15
Timesteps taken: 6
Penalties incurred: 5
