In [1]:
import typing as tp
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots


cards = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10]


def player_policy_func(diler_score: int, player_score: int, player_has_ace: bool) -> int:
    if player_has_ace:
        if player_score < 20:
            return 1
    else:
        if player_score < 16:
            return 1
    return 0

policy_data_wo_ace = np.zeros((10, 10), dtype=int)
for i in range(10):
    for j in range(10):
        policy_data_wo_ace[i, j] = player_policy_func(i + 1, j + 12, False)
policy_data_with_ace = np.zeros((10, 10), dtype=int)
for i in range(10):
    for j in range(10):
        policy_data_with_ace[i, j] = player_policy_func(i + 1, j + 12, True)


def run_blackjack(dealer_score: int, player_score: int, player_has_ace: bool,
                  start_step: int = 0) -> tp.Tuple[float, int]:
    if player_has_ace:
        action = policy_data_with_ace[dealer_score - 1, player_score - 12]
    else:
        action = policy_data_wo_ace[dealer_score - 1, player_score - 12]
    if action > 0:
        rand_card = np.random.choice(cards)
        if rand_card == 1:
            if player_score > 10:
                player_score += 1
            else:
                player_score += 11
                player_has_ace = True
    if player_score > 21:
        if player_has_ace:
            player_score -= 10
            player_has_ace = False
            return run_blackjack(dealer_score, player_score, False, start_step+1)
        return -1, start_step
    if dealer_score == 1:
        dealer_score += 10
        diler_has_ace = True
    else:
        diler_has_ace = False
    while dealer_score < 17:
        rand_card = np.random.choice(cards)
        if rand_card == 1:
            if diler_has_ace:
                if dealer_score > 10:
                    dealer_score += 1
                else:
                    dealer_score += 11
                    diler_has_ace = True
        else:
            dealer_score += rand_card
        if dealer_score > 21:
            if diler_has_ace:
                dealer_score -= 10
                diler_has_ace = False
            else:
                return 1, start_step + 1
    if dealer_score > player_score:
        return -1, start_step + 1
    if dealer_score == player_score:
        return 0, start_step + 1
    return 1, start_step + 1


def sample_rewards() -> tp.Tuple[np.array, np.array]:
    rewards_wo_ace = np.zeros((10, 10), dtype=float)
    for i in range(10):
        for j in range(10):
            rewards_wo_ace[i, j], _ = run_blackjack(i + 1, j + 12, False)
    rewards_with_ace = np.zeros((10, 10), dtype=float)
    for i in range(10):
        for j in range(10):
            rewards_with_ace[i, j], _ = run_blackjack(i + 1, j + 12, True)
    return rewards_wo_ace, rewards_with_ace

value_data_count = np.ones((10, 10), dtype=int)
value_data_wo_ace, value_data_with_ace = sample_rewards()
# value_data_wo_ace, value_data_with_ace = np.zeros((10, 10), dtype=float), np.zeros((10, 10), dtype=float)

def refine_value_iteration(delta_factor: float):
    global value_data_count, value_data_wo_ace, value_data_with_ace
    
    s_value_data_wo_ace, s_value_data_with_ace = sample_rewards()
    next_value_data_count = value_data_count + 1
    
    inv_next_count = 1 / next_value_data_count.astype(float)
    prev_ratio = value_data_count.astype(float) * inv_next_count
    
    value_data_wo_ace = value_data_wo_ace * prev_ratio + s_value_data_wo_ace * inv_next_count
    value_data_with_ace = value_data_wo_ace * prev_ratio + s_value_data_with_ace * inv_next_count
    
    value_data_count = next_value_data_count
    
    # value_data_wo_ace = value_data_wo_ace + (s_value_data_wo_ace - value_data_wo_ace) * delta_factor
    # value_data_with_ace = value_data_with_ace + (s_value_data_with_ace - value_data_with_ace) * delta_factor
            

In [2]:
from tqdm import tqdm

def estimate_policy(n_experiments: int = 1_000_000) -> float:
    sum_reward = 0
    for _ in tqdm(range(n_experiments)):
        players_cards = [np.random.choice(cards), np.random.choice(cards)]
        player_has_ace = False
        if players_cards[0] == 1:
            player_has_ace = True
            player_score = 11 + players_cards[1]
        elif players_cards[1] == 1:
            player_has_ace = True
            player_score = 11 + players_cards[0]
        else:
            player_score = players_cards[0] + players_cards[1]
        dealer_score = np.random.choice(cards)
        reward, _ = run_blackjack(dealer_score, player_score, player_has_ace)
        sum_reward += reward
    return sum_reward / n_experiments

default_policy_reward = estimate_policy()
default_policy_reward

  0%|          | 0/1000000 [00:00<?, ?it/s]

100%|██████████| 1000000/1000000 [01:06<00:00, 15095.40it/s]


-0.194249

In [3]:
from tqdm import tqdm
from pathlib import Path
import shutil
import imageio

temp_dir = Path('tmp')
if temp_dir.exists():
    shutil.rmtree(temp_dir)
temp_dir.mkdir()

camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=-1.25, y=1.25, z=1.25)
)

diler_axis = np.linspace(1, 11, 10)
player_axis = np.linspace(12, 21, 10)

def draw_value_function(title: str, values_wo_ace: np.ndarray, values_with_ace: np.ndarray) -> go.Figure:
    fig = make_subplots(
        rows=1, cols=2,
        shared_xaxes=False,
        specs=[[{'type': 'surface'}, {'type': 'surface'}]],
        subplot_titles=("No ace", "Ace"))

    fig.add_trace(go.Surface(x=diler_axis, y=player_axis, z=values_wo_ace, colorscale='YlGnBu'), col=1, row=1)
    fig.add_trace(go.Surface(x=diler_axis, y=player_axis, z=values_with_ace, colorscale='YlGnBu'), col=2, row=1)
    fig.layout.scene1.camera=camera
    fig.layout.scene2.camera=camera
    fig.update_layout(scene_camera=camera, title=title,
                      margin=dict(r=25, l=25, b=10, t=80),
                      width=1000,
                      showlegend=False)
    fig.update_scenes(xaxis_title_text='Dealer',  
                      yaxis_title_text='Player',  
                      zaxis_title_text='Reward')
    return fig

value_data_count = np.ones((10, 10), dtype=int)
value_data_wo_ace, value_data_with_ace = sample_rewards()
image_paths = []
for frame_idx in tqdm(range(101)):
    if frame_idx > 0:
        for _ in range(10):
            refine_value_iteration(5e-4)

    fig = draw_value_function(f'Value function on step: {frame_idx * 10}', value_data_wo_ace, value_data_with_ace)

    image_path = temp_dir / f'{frame_idx}.png'
    fig.write_image(image_path)
    image_paths.append(imageio.imread(image_path))
imageio.mimsave('blackjack_default_value_function.mp4', image_paths)



100%|██████████| 101/101 [02:27<00:00,  1.46s/it]


In [4]:
def blackjack_only_one_step(dealer_score: int, player_score: int,
                            player_has_ace: bool, action: int) -> tp.Tuple[float, tp.Optional[tp.Tuple[int, int, bool]]]:
    if action > 0:
        rand_card = np.random.choice(cards)
        if rand_card == 1:
            if player_score > 10:
                player_score += 1
            else:
                player_score += 11
                player_has_ace = True
    if player_score > 21:
        if player_has_ace:
            player_score -= 10
            player_has_ace = False
            return 0, (dealer_score, player_score, player_has_ace)
        return -1, None
    if dealer_score == 1:
        dealer_score += 10
        diler_has_ace = True
    else:
        diler_has_ace = False
    while dealer_score < 17:
        rand_card = np.random.choice(cards)
        if rand_card == 1:
            if diler_has_ace:
                if dealer_score > 10:
                    dealer_score += 1
                else:
                    dealer_score += 11
                    diler_has_ace = True
        else:
            dealer_score += rand_card
        if dealer_score > 21:
            if diler_has_ace:
                dealer_score -= 10
                diler_has_ace = False
            else:
                return 1, None
    if dealer_score > player_score:
        return -1, None
    if dealer_score == player_score:
        return 0, None
    return 1, None


q_wo_ace = np.random.uniform(-1, 1, size=(10, 10, 2))
q_with_ace = np.random.uniform(-1, 1, size=(10, 10, 2))

def update_q(update_factor: float = 1e-3):
    for i in range(10):
        for j in range(10):
            for action in range(2):
                reward, next_state = blackjack_only_one_step(i + 1, j + 12, False, action)
                if next_state is not None:
                    next_dealer_score, next_player_score, next_player_has_ace = next_state
                    if next_player_has_ace:
                        next_q = reward + q_with_ace[next_dealer_score - 1, next_player_score - 12].max()
                    else:
                        next_q = reward + q_wo_ace[next_dealer_score - 1, next_player_score - 12].max()
                else:
                    next_q = reward
                q_wo_ace[i, j, action] += (next_q - q_wo_ace[i, j, action]) * update_factor
                reward, next_state = blackjack_only_one_step(i + 1, j + 12, True, action)
                if next_state is not None:
                    next_dealer_score, next_player_score, next_player_has_ace = next_state
                    if next_player_has_ace:
                        next_q = reward + q_with_ace[next_dealer_score - 1, next_player_score - 12].max()
                    else:
                        next_q = reward + q_wo_ace[next_dealer_score - 1, next_player_score - 12].max()
                else:
                    next_q = reward
                q_with_ace[i, j, action] += (next_q - q_with_ace[i, j, action]) * update_factor
                
if temp_dir.exists():
    shutil.rmtree(temp_dir)
temp_dir.mkdir()

image_paths = []
for frame_idx in tqdm(range(101)):
    if frame_idx > 0:
        for _ in range(100):
            update_q(5e-4)

    cur_v_wo_ace = q_wo_ace.max(axis=2)
    cur_v_with_ace = q_with_ace.max(axis=2)
    fig = draw_value_function(f'New value function on step: {frame_idx * 100}', cur_v_wo_ace, cur_v_with_ace)

    image_path = temp_dir / f'{frame_idx}.png'
    fig.write_image(image_path)
    image_paths.append(imageio.imread(image_path))
imageio.mimsave('blackjack_optimal_value_function.mp4', image_paths)




100%|██████████| 101/101 [04:34<00:00,  2.72s/it]


In [5]:
policy_data_wo_ace = q_wo_ace.argmax(axis=2)
policy_data_with_ace = q_with_ace.argmax(axis=2)

new_policy_reward = estimate_policy()
new_policy_reward

100%|██████████| 1000000/1000000 [01:09<00:00, 14478.72it/s]


-0.177865