In [1]:
from create_env_gs import (make_env, run_ql_search, run_pi_search, run_vi_search)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from gymnasium.spaces import Tuple, Discrete

# Reproducibility: seed NumPy's global RNG; the same value is also handed to make_env.
seed = 123
np.random.seed(seed)

# Blackjack environment settings.
mdp_bj = 'Blackjack-v1'
render_mode_bj = None  # alternative: 'rgb_array'
# Three discrete observation components with 32, 11 and 2 values respectively.
size_bj = Tuple([Discrete(32), Discrete(11), Discrete(2)])
ep_steps_bj = 100

# Build the environment that every search cell below receives as `process`.
# slip / prob_frozen are passed as None here
# (presumably options for a different environment -- confirm in create_env_gs).
blackjack = make_env(
    mdp=mdp_bj,
    size=size_bj,
    slip=None,
    render=render_mode_bj,
    seed=seed,
    prob_frozen=None,
    ep_steps=ep_steps_bj,
)

# ---- Blackjack hyperparameter grids ----
# Each list holds the candidate values swept by one search cell further down.

# Q-learning: episode budget, discount factors, epsilon decay ratios, initial learning rates.
iters_ql_bj = 100_000
gamma_ql_bj = [0.9, 0.99, 0.999]
epsilon_decay_ql_bj_edr = [0.9, 0.99, 0.999]
init_alpha_ql_bj = [0.3, 0.5, 0.7]

# Policy iteration: iteration budget, discount factors, theta values.
iters_pi_bj = 100_000
gamma_pi_bj = [0.9, 0.99, 0.999]
theta_pi_bj = [1e-5, 1e-7, 1e-9]

# Value iteration: iteration budget, discount factors, theta values.
iters_vi_bj = 100_000
gamma_vi_bj = [0.9, 0.99, 0.999]
theta_vi_bj = [1e-5, 1e-7, 1e-9]

$$
\textbf{Blackjack}\\~\\
\textbf{Q-Learning}\\
\textbf{Gamma}
$$

In [2]:
# Q-learning on Blackjack: sweep the discount factor over gamma_ql_bj.
# The recorded output shows the other settings at edr=0.9 and ialpha=0.5.
q_bj_gamma = run_ql_search(
    process=blackjack,
    gamma=gamma_ql_bj,
    n_episodes=iters_ql_bj,
)
# Display the result keys (one per gamma value tried).
q_bj_gamma.keys()

q_learning: gamma=0.9; edr=0.9; ialpha=0.5; episodes=100000


  if not isinstance(terminated, (bool, np.bool8)):


runtime = 13.88 seconds
Avg. episode reward:  -0.04967
###################

q_learning: gamma=0.99; edr=0.9; ialpha=0.5; episodes=100000
runtime = 13.92 seconds
Avg. episode reward:  -0.0528
###################

q_learning: gamma=0.999; edr=0.9; ialpha=0.5; episodes=100000
runtime = 13.79 seconds
Avg. episode reward:  -0.05934
###################



dict_keys([0.9, 0.99, 0.999])

$$
\textbf{Q-Learning}\\
\textbf{Epsilon Decay Ratio}
$$

In [3]:
# Q-learning on Blackjack: sweep the epsilon decay ratio over epsilon_decay_ql_bj_edr.
# The recorded output shows the other settings at gamma=0.99 and ialpha=0.5.
ql_bj_edr = run_ql_search(
    process=blackjack,
    epsilon_decay_ratio=epsilon_decay_ql_bj_edr,
    n_episodes=iters_ql_bj,
)
# Display the result keys (one per decay ratio tried).
ql_bj_edr.keys()

q_learning: epsilon_decay_ratio=0.9; gamma=0.99; ialpha=0.5; episodes=100000
runtime = 13.49 seconds
Avg. episode reward:  -0.0528
###################

q_learning: epsilon_decay_ratio=0.99; gamma=0.99; ialpha=0.5; episodes=100000
runtime = 13.44 seconds
Avg. episode reward:  -0.04956
###################

q_learning: epsilon_decay_ratio=0.999; gamma=0.99; ialpha=0.5; episodes=100000
runtime = 13.45 seconds
Avg. episode reward:  -0.0527
###################



dict_keys([0.9, 0.99, 0.999])

$$
\textbf{Q-Learning}\\
\textbf{Init Alpha}
$$

In [4]:
# Q-learning on Blackjack: sweep the initial learning rate over init_alpha_ql_bj.
# The recorded output shows the other settings at gamma=0.99 and edr=0.9.
ql_bj_alpha = run_ql_search(
    process=blackjack,
    init_alpha=init_alpha_ql_bj,
    n_episodes=iters_ql_bj,
)
# Display the result keys (one per initial alpha tried).
ql_bj_alpha.keys()

q_learning: init_alpha=0.3; gamma=0.99; edr=0.9; episodes=100000
runtime = 13.39 seconds
Avg. episode reward:  -0.0568
###################

q_learning: init_alpha=0.5; gamma=0.99; edr=0.9; episodes=100000
runtime = 13.27 seconds
Avg. episode reward:  -0.0528
###################

q_learning: init_alpha=0.7; gamma=0.99; edr=0.9; episodes=100000
runtime = 13.14 seconds
Avg. episode reward:  -0.04952
###################



dict_keys([0.3, 0.5, 0.7])

$$
\textbf{Policy Iteration}\\
\textbf{Gamma}
$$

In [5]:
# Policy iteration on Blackjack: sweep the discount factor over gamma_pi_bj.
# The recorded output shows theta at 1e-10 for every run.
bj_pi_gamma = run_pi_search(
    process=blackjack,
    gamma=gamma_pi_bj,
    n_iters=iters_pi_bj,
)
# Display the result keys (one per gamma value tried).
bj_pi_gamma.keys()

PI: gamma=0.9; theta=1e-10; iters=100000
runtime = 0.04 seconds
Avg. episode reward:  -0.0492
###################

PI: gamma=0.99; theta=1e-10; iters=100000
runtime = 0.04 seconds
Avg. episode reward:  -0.04524
###################

PI: gamma=0.999; theta=1e-10; iters=100000
runtime = 0.04 seconds
Avg. episode reward:  -0.04524
###################



dict_keys([0.9, 0.99, 0.999])

$$
\textbf{Policy Iteration}\\
\textbf{Theta}
$$

In [6]:
# Policy iteration on Blackjack: sweep theta over theta_pi_bj.
# NOTE(review): the recorded output shows gamma=1.0 for these runs, whereas the
# Q-learning sweeps report gamma=0.99 -- confirm the differing default is intended.
bj_pi_theta = run_pi_search(
    process=blackjack,
    theta=theta_pi_bj,
    n_iters=iters_pi_bj,
)
# Display the result keys (one per theta value tried).
bj_pi_theta.keys()

PI: theta=1e-05; gamma=1.0; iters=100000
runtime = 0.05 seconds
Avg. episode reward:  -0.04524
###################

PI: theta=1e-07; gamma=1.0; iters=100000
runtime = 0.03 seconds
Avg. episode reward:  -0.04524
###################

PI: theta=1e-09; gamma=1.0; iters=100000
runtime = 0.04 seconds
Avg. episode reward:  -0.04524
###################



dict_keys([1e-05, 1e-07, 1e-09])

$$
\textbf{Value Iteration}\\
\textbf{Gamma}
$$

In [7]:
# Value iteration on Blackjack: sweep the discount factor over gamma_vi_bj.
# The recorded output shows theta at 1e-10 for every run.
bj_vi_gamma = run_vi_search(
    process=blackjack,
    gamma=gamma_vi_bj,
    n_iters=iters_vi_bj,
)
# Display the result keys (one per gamma value tried).
bj_vi_gamma.keys()

VI: gamma=0.9; theta=1e-10; iters=100000
runtime = 0.03 seconds
Avg. episode reward:  -0.0492
###################

VI: gamma=0.99; theta=1e-10; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  -0.04524
###################

VI: gamma=0.999; theta=1e-10; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  -0.04524
###################



dict_keys([0.9, 0.99, 0.999])

$$
\textbf{Value Iteration}\\
\textbf{Theta}
$$

In [8]:
# Value iteration on Blackjack: sweep theta over theta_vi_bj.
# NOTE(review): the recorded output shows gamma=1.0 for these runs, whereas the
# Q-learning sweeps report gamma=0.99 -- confirm the differing default is intended.
bj_vi_theta = run_vi_search(
    process=blackjack,
    theta=theta_vi_bj,
    n_iters=iters_vi_bj,
)
# Display the result keys (one per theta value tried).
bj_vi_theta.keys()

VI: theta=1e-05; gamma=1.0; iters=100000
runtime = 0.03 seconds
Avg. episode reward:  -0.04524
###################

VI: theta=1e-07; gamma=1.0; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  -0.04524
###################

VI: theta=1e-09; gamma=1.0; iters=100000
runtime = 0.02 seconds
Avg. episode reward:  -0.04524
###################



dict_keys([1e-05, 1e-07, 1e-09])