In [None]:
from create_env_gs import (make_env, run_ql_search, run_pi_search, run_vi_search)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from gymnasium.spaces import Tuple, Discrete

# One seed for the notebook: it seeds NumPy's global RNG here and is also
# passed as seed= to every make_env call further down in this cell.
seed = 123
np.random.seed(seed)

# make environments

# Large Frozen Lake. Each setting keeps its own notebook-level name and is
# forwarded to make_env by keyword.
# NOTE(review): the id is 'FrozenLake8x8-v1' while size is 16 and the section
# headings call this the 16x16 lake -- presumably make_env builds the map from
# `size` rather than from the id; confirm against create_env_gs.
mdp_fl_L, size_fl_L = 'FrozenLake8x8-v1', 16
is_slippery_fl_L, render_mode_fl_L = True, 'ansi'
prob_frozen_fl_L, ep_steps_fl_L = 0.9, 400

frozenlakeL = make_env(
    mdp=mdp_fl_L,
    size=size_fl_L,
    slip=is_slippery_fl_L,
    render=render_mode_fl_L,
    prob_frozen=prob_frozen_fl_L,
    seed=seed,
    ep_steps=ep_steps_fl_L,
)

# Small Frozen Lake. Same keyword set as the large lake, with a smaller size
# and a shorter ep_steps value.
mdp_fl_S, size_fl_S = 'FrozenLake-v1', 4
is_slippery_fl_S, render_mode_fl_S = True, 'ansi'
prob_frozen_fl_S, ep_steps_fl_S = 0.9, 100

frozenlakeS = make_env(
    mdp=mdp_fl_S,
    size=size_fl_S,
    slip=is_slippery_fl_S,
    render=render_mode_fl_S,
    prob_frozen=prob_frozen_fl_S,
    seed=seed,
    ep_steps=ep_steps_fl_S,
)


# Blackjack. The two Frozen-Lake-only settings (slip, prob_frozen) are passed
# explicitly as None.
mdp_bj = 'Blackjack-v1'
render_mode_bj = None  # alternative left by the author: 'rgb_array'
ep_steps_bj = 100

# Here `size` is a gymnasium Tuple space of three Discrete components rather
# than an int as in the Frozen Lake cells.
size_bj = Tuple([
    Discrete(32),
    Discrete(11),
    Discrete(2),
])

# Keyword order follows the Frozen Lake calls; everything is passed by name.
blackjack = make_env(
    mdp=mdp_bj,
    size=size_bj,
    slip=None,
    render=render_mode_bj,
    prob_frozen=None,
    seed=seed,
    ep_steps=ep_steps_bj,
)

# FL 16x16
# Settings for the large Frozen Lake searches. Each list is handed to one of
# the run_*_search helpers in its own cell, together with the iteration budget
# of the same algorithm.

# QL -- budget is passed to run_ql_search as n_episodes
iters_ql_fl_L = 2_000
gamma_ql_fl_L = [0.90, 0.99, 0.999]
epsilon_decay_ql_fl_L = [0.80, 0.90, 0.99]
init_alpha_ql_fl_L = [0.30, 0.50, 0.70]

# PI -- budget is passed to run_pi_search as n_iters
iters_pi_fl_L = 1_000
gamma_pi_fl_L = [0.90, 0.99, 0.999]
theta_pi_fl_L = [1e-5, 1e-7, 1e-9]

# VI -- budget is passed to run_vi_search as n_iters
iters_vi_fl_L = 1_000
gamma_vi_fl_L = [0.90, 0.99, 0.999]
theta_vi_fl_L = [1e-5, 1e-7, 1e-9]

# FL 4x4
# Settings for the small Frozen Lake searches; the values mirror the large
# lake's but live under their own names.

# QL -- budget is passed to run_ql_search as n_episodes
iters_ql_fl_S = 2_000
gamma_ql_fl_S = [0.90, 0.99, 0.999]
epsilon_decay_ql_fl_S = [0.80, 0.90, 0.99]
init_alpha_ql_fl_S = [0.30, 0.50, 0.70]

# PI -- budget is passed to run_pi_search as n_iters
iters_pi_fl_S = 1_000
gamma_pi_fl_S = [0.90, 0.99, 0.999]
theta_pi_fl_S = [1e-5, 1e-7, 1e-9]

# VI -- budget is passed to run_vi_search as n_iters
iters_vi_fl_S = 1_000
gamma_vi_fl_S = [0.90, 0.99, 0.999]
theta_vi_fl_S = [1e-5, 1e-7, 1e-9]

# BJ
# Settings for the Blackjack searches.

# QL -- budget is passed to run_ql_search as n_episodes
iters_ql_bj = 2_000
gamma_ql_bj = [0.90, 0.99, 0.999]
# NOTE(review): this name carries an extra `_edr` suffix and its values differ
# from the Frozen Lake lists ([0.80, 0.90, 0.99]) -- confirm both are intended.
epsilon_decay_ql_bj_edr = [0.90, 0.99, 0.999]
init_alpha_ql_bj = [0.30, 0.50, 0.70]

# PI -- budget is passed to run_pi_search as n_iters
iters_pi_bj = 1_000
gamma_pi_bj = [0.90, 0.99, 0.999]
theta_pi_bj = [1e-5, 1e-7, 1e-9]

# VI -- budget is passed to run_vi_search as n_iters
iters_vi_bj = 1_000
gamma_vi_bj = [0.90, 0.99, 0.999]
theta_vi_bj = [1e-5, 1e-7, 1e-9]

$$
\textbf{Frozen Lake 16x16}\\~\\
\textbf{Q Learning}\\
\textbf{Gamma}
$$

In [None]:
# Q-learning on the large Frozen Lake with only the gamma list supplied; every
# other run_ql_search argument is left at whatever that helper defaults to.
ql_fl_L_gamma = run_ql_search(
    process=frozenlakeL,
    gamma=gamma_ql_fl_L,
    n_episodes=iters_ql_fl_L,
)
# Last expression of the cell: display the keys of the returned object.
ql_fl_L_gamma.keys()

$$
\textbf{Q Learning}\\
\textbf{Epsilon Decay Ratio}
$$

In [None]:
# Q-learning on the large Frozen Lake with only the epsilon_decay_ratio list
# supplied; the remaining arguments are left to run_ql_search.
ql_fl_L_edr = run_ql_search(
    process=frozenlakeL,
    epsilon_decay_ratio=epsilon_decay_ql_fl_L,
    n_episodes=iters_ql_fl_L,
)
# Last expression of the cell: display the keys of the returned object.
ql_fl_L_edr.keys()

$$
\textbf{Q Learning}\\
\textbf{Init Alpha}
$$

In [None]:
# Q-learning on the large Frozen Lake with only the init_alpha list supplied;
# the remaining arguments are left to run_ql_search.
ql_fl_L_alpha = run_ql_search(
    process=frozenlakeL,
    init_alpha=init_alpha_ql_fl_L,
    n_episodes=iters_ql_fl_L,
)
# Last expression of the cell: display the keys of the returned object.
ql_fl_L_alpha.keys()

$$
\textbf{Policy Iteration}\\
\textbf{Gamma}
$$

In [None]:
# Policy iteration on the large Frozen Lake with only the gamma list supplied;
# the remaining arguments are left to run_pi_search.
fl_L_pi_gam = run_pi_search(
    process=frozenlakeL,
    gamma=gamma_pi_fl_L,
    n_iters=iters_pi_fl_L,
)
# Last expression of the cell: display the keys of the returned object.
fl_L_pi_gam.keys()

$$
\textbf{Policy Iteration}\\
\textbf{Theta}
$$

In [None]:
# Policy iteration on the large Frozen Lake with only the theta list supplied;
# the remaining arguments are left to run_pi_search.
fl_L_pi_theta = run_pi_search(
    process=frozenlakeL,
    theta=theta_pi_fl_L,
    n_iters=iters_pi_fl_L,
)
# Last expression of the cell: display the keys of the returned object.
fl_L_pi_theta.keys()

$$
\textbf{Value Iteration}\\
\textbf{Gamma}
$$

In [None]:
# Value iteration on the large Frozen Lake with only the gamma list supplied;
# the remaining arguments are left to run_vi_search.
fl_L_vi_gam = run_vi_search(
    process=frozenlakeL,
    gamma=gamma_vi_fl_L,
    n_iters=iters_vi_fl_L,
)
# Last expression of the cell: display the keys of the returned object.
fl_L_vi_gam.keys()

$$
\textbf{Value Iteration}\\
\textbf{Theta}
$$

In [None]:
# Value iteration on the large Frozen Lake with only the theta list supplied;
# the remaining arguments are left to run_vi_search.
fl_L_vi_theta = run_vi_search(
    process=frozenlakeL,
    theta=theta_vi_fl_L,
    n_iters=iters_vi_fl_L,
)
# Last expression of the cell: display the keys of the returned object.
fl_L_vi_theta.keys()

$$
\textbf{Frozen Lake 4x4}\\~\\
\textbf{Q Learning}\\
\textbf{Gamma}
$$

In [None]:
# Q-learning on the small Frozen Lake with only the gamma list supplied; the
# remaining arguments are left to run_ql_search.
ql_fl_S_gamma = run_ql_search(
    process=frozenlakeS,
    gamma=gamma_ql_fl_S,
    n_episodes=iters_ql_fl_S,
)
# Last expression of the cell: display the keys of the returned object.
ql_fl_S_gamma.keys()

$$
\textbf{Q Learning}\\
\textbf{Epsilon Decay Ratio}
$$

In [None]:
# Q-learning on the small Frozen Lake with only the epsilon_decay_ratio list
# supplied; the remaining arguments are left to run_ql_search.
ql_fl_S_edr = run_ql_search(
    process=frozenlakeS,
    epsilon_decay_ratio=epsilon_decay_ql_fl_S,
    n_episodes=iters_ql_fl_S,
)
# Last expression of the cell: display the keys of the returned object.
ql_fl_S_edr.keys()

$$
\textbf{Q Learning}\\
\textbf{Init Alpha}
$$

In [None]:
# Q-learning on the small Frozen Lake with only the init_alpha list supplied;
# the remaining arguments are left to run_ql_search.
ql_fl_S_alpha = run_ql_search(
    process=frozenlakeS,
    init_alpha=init_alpha_ql_fl_S,
    n_episodes=iters_ql_fl_S,
)
# Last expression of the cell: display the keys of the returned object.
ql_fl_S_alpha.keys()

$$
\textbf{Policy Iteration}\\
\textbf{Gamma}
$$

In [None]:
# Policy iteration on the small Frozen Lake with only the gamma list supplied;
# the remaining arguments are left to run_pi_search.
fl_S_pi_gam = run_pi_search(
    process=frozenlakeS,
    gamma=gamma_pi_fl_S,
    n_iters=iters_pi_fl_S,
)
# Last expression of the cell: display the keys of the returned object.
fl_S_pi_gam.keys()

$$
\textbf{Policy Iteration}\\
\textbf{Theta}
$$

In [None]:
# Policy iteration on the small Frozen Lake with only the theta list supplied;
# the remaining arguments are left to run_pi_search.
fl_S_pi_theta = run_pi_search(
    process=frozenlakeS,
    theta=theta_pi_fl_S,
    n_iters=iters_pi_fl_S,
)
# Last expression of the cell: display the keys of the returned object.
fl_S_pi_theta.keys()

$$
\textbf{Value Iteration}\\
\textbf{Gamma}
$$

In [None]:
# Value iteration on the small Frozen Lake with only the gamma list supplied;
# the remaining arguments are left to run_vi_search.
fl_S_vi_gam = run_vi_search(
    process=frozenlakeS,
    gamma=gamma_vi_fl_S,
    n_iters=iters_vi_fl_S,
)
# Last expression of the cell: display the keys of the returned object.
fl_S_vi_gam.keys()

$$
\textbf{Value Iteration}\\
\textbf{Theta}
$$

In [None]:
# Value iteration on the small Frozen Lake with only the theta list supplied;
# the remaining arguments are left to run_vi_search.
fl_S_vi_theta = run_vi_search(
    process=frozenlakeS,
    theta=theta_vi_fl_S,
    n_iters=iters_vi_fl_S,
)
# Last expression of the cell: display the keys of the returned object.
fl_S_vi_theta.keys()

$$
\textbf{Blackjack}\\~\\
\textbf{Q Learning}\\
\textbf{Gamma}
$$

In [None]:
# Q-learning on Blackjack with only the gamma list supplied; the remaining
# arguments are left to run_ql_search.
# NOTE(review): the sibling Q-learning results use a `ql_` prefix; the name
# `q_bj_gamma` is kept as-is in case other cells refer to it.
q_bj_gamma = run_ql_search(
    process=blackjack,
    gamma=gamma_ql_bj,
    n_episodes=iters_ql_bj,
)
# Last expression of the cell: display the keys of the returned object.
q_bj_gamma.keys()

$$
\textbf{Q Learning}\\
\textbf{Epsilon Decay Ratio}
$$

In [None]:
# Q-learning on Blackjack with only the epsilon_decay_ratio list supplied; the
# remaining arguments are left to run_ql_search.
ql_bj_edr = run_ql_search(
    process=blackjack,
    epsilon_decay_ratio=epsilon_decay_ql_bj_edr,
    n_episodes=iters_ql_bj,
)
# Last expression of the cell: display the keys of the returned object.
ql_bj_edr.keys()

$$
\textbf{Q Learning}\\
\textbf{Init Alpha}
$$

In [None]:
# Q-learning on Blackjack with only the init_alpha list supplied; the
# remaining arguments are left to run_ql_search.
ql_bj_alpha = run_ql_search(
    process=blackjack,
    init_alpha=init_alpha_ql_bj,
    n_episodes=iters_ql_bj,
)
# Last expression of the cell: display the keys of the returned object.
ql_bj_alpha.keys()

$$
\textbf{Policy Iteration}\\
\textbf{Gamma}
$$

In [None]:
# Policy iteration on Blackjack with only the gamma list supplied; the
# remaining arguments are left to run_pi_search.
bj_pi_gamma = run_pi_search(
    process=blackjack,
    gamma=gamma_pi_bj,
    n_iters=iters_pi_bj,
)
# Last expression of the cell: display the keys of the returned object.
bj_pi_gamma.keys()

$$
\textbf{Policy Iteration}\\
\textbf{Theta}
$$

In [None]:
# Policy iteration on Blackjack with only the theta list supplied; the
# remaining arguments are left to run_pi_search.
bj_pi_theta = run_pi_search(
    process=blackjack,
    theta=theta_pi_bj,
    n_iters=iters_pi_bj,
)
# Last expression of the cell: display the keys of the returned object.
bj_pi_theta.keys()

$$
\textbf{Value Iteration}\\
\textbf{Gamma}
$$

In [None]:
# Value iteration on Blackjack with only the gamma list supplied; the
# remaining arguments are left to run_vi_search.
bj_vi_gamma = run_vi_search(
    process=blackjack,
    gamma=gamma_vi_bj,
    n_iters=iters_vi_bj,
)
# Last expression of the cell: display the keys of the returned object.
bj_vi_gamma.keys()

$$
\textbf{Value Iteration}\\
\textbf{Theta}
$$

In [None]:
# Value iteration on Blackjack with only the theta list supplied; the
# remaining arguments are left to run_vi_search.
bj_vi_theta = run_vi_search(
    process=blackjack,
    theta=theta_vi_bj,
    n_iters=iters_vi_bj,
)
# Last expression of the cell: display the keys of the returned object.
bj_vi_theta.keys()