In [1]:
from create_env_gs import (make_env, run_ql_search, run_pi_search, run_vi_search)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from gymnasium.spaces import Tuple, Discrete
from bettermdptools.utils.plots import Plots

# Seed NumPy's global RNG up front; the same seed value is also handed to make_env.
seed = 123
np.random.seed(seed)

# Settings for the 4x4 Frozen Lake environment. Every name below is forwarded
# to make_env as a keyword argument.
mdp_fl_S, size_fl_S = 'FrozenLake-v1', 4
is_slippery_fl_S, prob_frozen_fl_S = True, 0.9
render_mode_fl_S, ep_steps_fl_S = 'ansi', 100

# Build the environment that all of the search cells pass as `process`.
frozenlakeS = make_env(
    mdp=mdp_fl_S,
    size=size_fl_S,
    slip=is_slippery_fl_S,
    render=render_mode_fl_S,
    prob_frozen=prob_frozen_fl_S,
    seed=seed,
    ep_steps=ep_steps_fl_S,
)

# FL 4x4 hyperparameter grids, grouped by kind of parameter.
# Suffixes: ql = Q-learning, pi = policy iteration, vi = value iteration.

# Episode (QL) / iteration (PI, VI) budgets
iters_ql_fl_S = 100_000
iters_pi_fl_S = 100_000
iters_vi_fl_S = 100_000

# Discount factors: one independent list per algorithm, so editing one grid
# never changes another.
gamma_ql_fl_S = [0.9, 0.99, 0.999]
gamma_pi_fl_S = [0.9, 0.99, 0.999]
gamma_vi_fl_S = [0.9, 0.99, 0.999]

# Q-learning only: epsilon decay ratios and initial learning rates
epsilon_decay_ql_fl_S = [0.8, 0.9, 0.99]
init_alpha_ql_fl_S = [0.3, 0.5, 0.7]

# PI / VI only: theta values to sweep
theta_pi_fl_S = [1e-5, 1e-7, 1e-9]
theta_vi_fl_S = [1e-5, 1e-7, 1e-9]


[41mS[0mFFF
FFHF
FFFF
FFFG



$$
\textbf{Frozen Lake 4x4}\\~\\
\textbf{Q Learning}\\
\textbf{Gamma}
$$

In [2]:
# Q-learning sweep over the discount factor only. No decay ratio or initial
# alpha is passed; the run log reports edr=0.9 and ialpha=0.5 for every run.
ql_fl_S_gamma = run_ql_search(
    process=frozenlakeS,
    gamma=gamma_ql_fl_S,
    n_episodes=iters_ql_fl_S,
)
# Show which gamma values the result dict is keyed by.
ql_fl_S_gamma.keys()

q_learning: gamma=0.9; edr=0.9; ialpha=0.5; episodes=100000


  if not isinstance(terminated, (bool, np.bool8)):


runtime = 39.61 seconds
Avg. episode reward:  0.9989654534999999
###################

q_learning: gamma=0.99; edr=0.9; ialpha=0.5; episodes=100000
runtime = 38.16 seconds
Avg. episode reward:  0.9988951035
###################

q_learning: gamma=0.999; edr=0.9; ialpha=0.5; episodes=100000
runtime = 45.38 seconds
Avg. episode reward:  0.9986743170000002
###################



dict_keys([0.9, 0.99, 0.999])

$$
\textbf{Q Learning}\\
\textbf{Epsilon Decay Ratio}
$$

In [3]:
# Q-learning sweep over the epsilon decay ratio only. Gamma and initial alpha
# are not passed; the run log reports gamma=0.99 and ialpha=0.5 for every run.
ql_fl_S_edr = run_ql_search(
    process=frozenlakeS,
    epsilon_decay_ratio=epsilon_decay_ql_fl_S,
    n_episodes=iters_ql_fl_S,
)
# Show which decay ratios the result dict is keyed by.
ql_fl_S_edr.keys()

q_learning: epsilon_decay_ratio=0.8; gamma=0.99; ialpha=0.5; episodes=100000
runtime = 37.82 seconds
Avg. episode reward:  0.9989654570000001
###################

q_learning: epsilon_decay_ratio=0.9; gamma=0.99; ialpha=0.5; episodes=100000
runtime = 37.83 seconds
Avg. episode reward:  0.9988951035
###################

q_learning: epsilon_decay_ratio=0.99; gamma=0.99; ialpha=0.5; episodes=100000
runtime = 38.76 seconds
Avg. episode reward:  0.9989652690000002
###################



dict_keys([0.8, 0.9, 0.99])

$$
\textbf{Q Learning}\\
\textbf{Init Alpha}
$$

In [4]:
# Q-learning sweep over the initial learning rate only. Gamma and the decay
# ratio are not passed; the run log reports gamma=0.99 and edr=0.9 for every run.
ql_fl_S_alpha = run_ql_search(
    process=frozenlakeS,
    init_alpha=init_alpha_ql_fl_S,
    n_episodes=iters_ql_fl_S,
)
# Show which initial alphas the result dict is keyed by.
ql_fl_S_alpha.keys()

q_learning: init_alpha=0.3; gamma=0.99; edr=0.9; episodes=100000
runtime = 37.57 seconds
Avg. episode reward:  0.9988950645
###################

q_learning: init_alpha=0.5; gamma=0.99; edr=0.9; episodes=100000
runtime = 33.88 seconds
Avg. episode reward:  0.9988951035
###################

q_learning: init_alpha=0.7; gamma=0.99; edr=0.9; episodes=100000
runtime = 37.02 seconds
Avg. episode reward:  0.9989651335
###################



dict_keys([0.3, 0.5, 0.7])

$$
\textbf{Policy Iteration}\\
\textbf{Gamma}
$$

In [5]:
# Policy-iteration sweep over the discount factor. Theta is not passed; the
# run log reports theta=1e-10 for every run.
fl_S_pi_gam = run_pi_search(
    process=frozenlakeS,
    gamma=gamma_pi_fl_S,
    n_iters=iters_pi_fl_S,
)
# Show which gamma values the result dict is keyed by.
fl_S_pi_gam.keys()

PI: gamma=0.9; theta=1e-10; iters=100000
runtime = 0.01 seconds
Avg. episode reward:  0.99894572
###################

PI: gamma=0.99; theta=1e-10; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  0.99894572
###################

PI: gamma=0.999; theta=1e-10; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  0.99894572
###################



dict_keys([0.9, 0.99, 0.999])

$$
\textbf{Policy Iteration}\\
\textbf{Theta}
$$

In [6]:
# Policy-iteration sweep over theta. Gamma is not passed; the run log reports
# gamma=1.0 for every run, which differs from the values in gamma_pi_fl_S.
fl_S_pi_theta = run_pi_search(
    process=frozenlakeS,
    theta=theta_pi_fl_S,
    n_iters=iters_pi_fl_S,
)
# Show which theta values the result dict is keyed by.
fl_S_pi_theta.keys()

PI: theta=1e-05; gamma=1.0; iters=100000
runtime = 0.01 seconds
Avg. episode reward:  0.9990088735
###################

PI: theta=1e-07; gamma=1.0; iters=100000
runtime = 0.01 seconds
Avg. episode reward:  0.9990088735
###################

PI: theta=1e-09; gamma=1.0; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  0.9990088735
###################



dict_keys([1e-05, 1e-07, 1e-09])

$$
\textbf{Value Iteration}\\
\textbf{Gamma}
$$

In [7]:
# Value-iteration sweep over the discount factor. Theta is not passed; the
# run log reports theta=1e-10 for every run.
fl_S_vi_gam = run_vi_search(
    process=frozenlakeS,
    gamma=gamma_vi_fl_S,
    n_iters=iters_vi_fl_S,
)
# Show which gamma values the result dict is keyed by.
fl_S_vi_gam.keys()

VI: gamma=0.9; theta=1e-10; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  0.99894572
###################

VI: gamma=0.99; theta=1e-10; iters=100000
runtime = 0.03 seconds
Avg. episode reward:  0.99894572
###################

VI: gamma=0.999; theta=1e-10; iters=100000
runtime = 0.03 seconds
Avg. episode reward:  0.99894572
###################



dict_keys([0.9, 0.99, 0.999])

In [8]:
# Unpack the 'V', 'pi' and 'vi_track' entries of the gamma=0.9 value-iteration
# result in one statement. The generator's loop variable does not leak into
# the notebook namespace.
V_vi_gamma, pi_vi_gamma, v_track_vi_gamma = (
    fl_S_vi_gam[0.9][field] for field in ('V', 'pi', 'vi_track')
)
# Quick sanity check: the recorded output shows V with shape (16,), pi as a
# dict keyed 0-15, and vi_track with shape (100000, 16).
print(V_vi_gamma.shape, pi_vi_gamma, v_track_vi_gamma.shape)

(16,) {0: 1, 1: 2, 2: 3, 3: 2, 4: 1, 5: 0, 6: 0, 7: 2, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 2, 15: 0} (100000, 16)


In [9]:
# Disabled cell: presumably draws V_vi_gamma as a heat map on the 4x4 grid.
# NOTE(review): dead code in the saved notebook; re-enable it or delete the cell before sharing.
# size=(4,4)
# Plots.values_heat_map(V_vi_gamma, "Frozen Lake\nValue Iteration State Values", size)

In [10]:
# Disabled cell: plots a per-iteration summary of v_track_vi_gamma against iterations.
# NOTE(review): the variable is named "max" but the expression takes np.mean over
# axis 1 (one value per row of the (100000, 16) track), with trailing zeros trimmed.
# If this is re-enabled, rename it (e.g. mean_value_per_iter) or switch to np.max.
# max_value_per_iter = np.trim_zeros(np.mean(v_track_vi_gamma, axis=1), 'b')
# Plots.v_iters_plot(max_value_per_iter, "Frozen Lake\nMean Value v Iterations")

In [11]:
# Disabled cell: maps pi_vi_gamma's action indices to arrow glyphs and plots the
# policy on the 4x4 grid.
# NOTE(review): dead code in the saved notebook; re-enable it or delete the cell before sharing.
# fl_actions = {0: "←", 1: "↓", 2: "→", 3: "↑"}
# fl_map_size=(4, 4)
# title="FL Mapped Policy\nArrows represent best action"
# val_max, policy_map = Plots.get_policy_map(pi_vi_gamma, V_vi_gamma, fl_actions, fl_map_size)
# Plots.plot_policy(val_max, policy_map, fl_map_size, title)


$$
\textbf{Value Iteration}\\
\textbf{Theta}
$$

In [12]:
# Value-iteration sweep over theta. Gamma is not passed; the run log reports
# gamma=1.0 for every run, which differs from the values in gamma_vi_fl_S.
fl_S_vi_theta = run_vi_search(
    process=frozenlakeS,
    theta=theta_vi_fl_S,
    n_iters=iters_vi_fl_S,
)
# Show which theta values the result dict is keyed by.
fl_S_vi_theta.keys()

VI: theta=1e-05; gamma=1.0; iters=100000
runtime = 0.03 seconds
Avg. episode reward:  0.9990088735
###################

VI: theta=1e-07; gamma=1.0; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  0.9990088735
###################

VI: theta=1e-09; gamma=1.0; iters=100000
runtime = 0.03 seconds
Avg. episode reward:  0.9990088735
###################



dict_keys([1e-05, 1e-07, 1e-09])