In [61]:
!pip install gymnasium[toy-text]
!pip install tqdm
!pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Objetivo do Jogo

O jogo começa com o croupier (dealer) tendo uma carta virada para cima e uma virada para baixo, enquanto o jogador tem duas cartas viradas para cima. Todas as cartas são retiradas de um baralho infinito (ou seja, com reposição).

Os valores das cartas são:
- Cartas de figura (Valete, Dama, Rei) têm um valor de 10 pontos.
- Ases podem contar como 11 (chamado de 'ás utilizável') ou 1.
- Cartas numéricas (2-10) têm um valor igual ao seu número.

O jogador tem a soma das cartas que possui. O jogador pode solicitar cartas adicionais (pedir/hit) até decidir parar (manter/stick) ou exceder 21 (estourar/bust, perda imediata).

Depois que o jogador mantém, o croupier revela sua carta virada para baixo e compra cartas até que sua soma seja 17 ou maior. Se o croupier estourar, o jogador ganha.

Se nem o jogador nem o croupier estourarem, o resultado (ganhar, perder, empatar) é decidido por quem tiver a soma mais próxima de 21.

# Objetivo do Agente

O agente deve tentar ganhar o jogo escolhendo se continua (hit) ou para a compra de cartar (stick).

# Caracteristicas do Ambiente

## Espaço de Observação

A observação consiste em uma tupla de 3 elementos contendo: a soma atual do jogador, o valor da carta visível do croupier (1-10, onde 1 é o ás) e se o jogador possui um ás utilizável (0 ou 1).

## Espaço de Ações

O agente tem 2 ações possíveis, são elas:
- STICK
- HIT

## Recompensas
As recompensas possíveis:
- Vencer: +1
- Perder: -1
- Empate: +0
Se ganhar o jogo com blackjack natural:
- (se natural for Verdadeiro): +1.5
- (se natural for Falso): +1

# Setup do Ambiente

Cria o ambiente que será utilizado: "Blackjack", e suas dependecias.

In [62]:
import gymnasium as gym

env = gym.make("Blackjack-v1", natural=False, sab=False)
env.reset()

((17, 10, 0), {})

In [63]:
env.render()

  gym.logger.warn(


# Tabela Q

In [64]:
import numpy as np

shape = tuple(map(lambda x: x.n.item(), env.observation_space)) + (env.action_space.n, )

q_table = np.zeros(shape, dtype=np.float32)
q_table.shape

(32, 11, 2, 2)

In [65]:
import random
from IPython.display import clear_output
import numpy as np

## Treinar tabela Q

In [66]:
from typing import NamedTuple, Any

class FloatRange:
    def __init__(self, start, stop, step):
        if step == 0:
            raise ValueError("step must not be zero")
        self.start = start
        self.stop = stop
        self.step = step
        # Precompute the length
        self._len = max(0, int((stop - start) / step + (1e-10)))  # small tolerance for floats

    def __len__(self):
        return self._len

    def __getitem__(self, index):
        if isinstance(index, slice):
            # Support slicing
            start, stop, stride = index.indices(len(self))
            return FloatRange(self.start + start * self.step,
                              self.start + stop * self.step,
                              self.step * stride)
        if index < 0:
            index += len(self)
        if index < 0 or index >= len(self):
            raise IndexError("FloatRange index out of range")
        return self.start + index * self.step

    def __iter__(self):
        current = self.start
        for _ in range(len(self)):
            yield current
            current += self.step

    def __repr__(self):
        return f"FloatRange({self.start}, {self.stop}, {self.step})"

class Permute:

    def __init__(self, **kwargs):
        self.args = tuple(kwargs.keys())
        self.options = kwargs
        self.counter = [0 for _ in range(len(self.args))]

    def __iter__(self):
        self.counter = [0 for _ in range(len(self.args))]
        return self

    def __next__(self):
        output_value = {
            option_name: self.options[option_name][count]
            for option_name, count in zip(self.args, self.counter)
        }

        self.counter[0] += 1

        for i, count in enumerate(self.counter):
            if count >= len(self.options[self.args[i]]):
                if i == len(self.counter) - 1:
                    raise StopIteration

                self.counter[i] = 0
                self.counter[i + 1] += 1
                continue

            break

        return output_value


In [67]:
from typing import Callable, Any
from tqdm.auto import trange
import random
import numpy as np

def sample(eps, state, env, q_table):
    if random.random() < eps:
        return env.action_space.sample()
    return np.argmax(q_table[state])


def train_epoch(alpha, gamma, eps, env, q_table, use_sarsa: bool = False):
    state, _ = env.reset()
    finished = False

    action = sample(eps, state, env, q_table)
    while not finished:
        next_state, reward, finished, _, info = env.step(action)

        old_q = q_table[*state, action]
        if use_sarsa:
            next_action = sample(eps, next_state, env, q_table)
            next_value = q_table[*next_state, next_action]
        else:
            next_value = np.max(q_table[next_state])

        q_table[*state, action] = (
            (1 - alpha) * old_q
            + alpha * (reward + gamma * next_value)
        )

        state = next_state
        if use_sarsa:
            action = next_action
        else:
            action = sample(eps, state, env, q_table)

Epoch = int
Params = tuple[float, float, float]
def train(q_table, env,
          alpha, gamma, eps,
          n_epochs=10000,
          reduce_eps_every_n_epochs=1000,
          eps_mul=0.5,
          use_sarsa: bool = False,
          callbacks: list[tuple[int, Callable[[Epoch, tuple[Params]], None]]] = []):

    for i in trange(1, n_epochs):

        train_epoch(alpha, gamma, eps, env, q_table, use_sarsa)

        if i % reduce_eps_every_n_epochs == 0:
            eps *= eps_mul

        for interval, callback in callbacks:
            if i % interval == 0:
                callback(i, (alpha, gamma, eps))

    return q_table

def evaluate(q_table, env, epochs = 10):
    lost_like_a_little_bitch = 0
     
    for _ in range(epochs):
        state, _ = env.reset()
        finished = False

        while not finished:
            action = np.argmax(q_table[state])
            state, reward, finished, _, info = env.step(action)

            if reward == -1:
                lost_like_a_little_bitch += 1
        
    return lost_like_a_little_bitch

def play_for_us(q_table, epochs = 10):
    env = gym.make("Blackjack-v1", render_mode='human', natural=False, sab=False)

    for _ in range(epochs):
        state, _ = env.reset()
        finished = False

        while not finished:
            action = np.argmax(q_table[state])
            state, reward, finished, _, info = env.step(action)

    env.close()


# Testar diferentes permutações de argumentos

Usando a classe Permute podemos testar todas as combinações de argumentos, similar ao GrindSearch. Colocamos todos os resultados de alpha, gamma e esp numa tabela e comparamos quantos jogos foram perdidos em 10 jogadas.

In [72]:
alpha = [0.01, 0.05, 0.1, 0.5, 1.0]
gamma = FloatRange(0.1, 1.0, 0.1)
eps = [0.1, 0.2, 0.3, 0.4, 0.5]

permutations = Permute(alpha=alpha, gamma=gamma, eps=eps)

table = {'sarsa': [], 'q-learning': []}

for mode in ['q-learning', 'sarsa']:
    for permut in permutations:
        q_table[:] = 0
        train(q_table, env, permut['alpha'], permut['gamma'], permut['eps'])
        lost = evaluate(q_table, env)

        permut['lost'] = lost
        table[mode].append(permut)

100%|██████████| 9999/9999 [00:01<00:00, 5106.36it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5088.01it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5449.26it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5170.46it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5387.82it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5821.79it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5696.14it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5495.28it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5049.68it/s]
100%|██████████| 9999/9999 [00:02<00:00, 4938.07it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5180.94it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5132.33it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5174.06it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5034.04it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5308.90it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5511.13it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5610.40it/s]
100%|██████████| 9999/9999 [00:01<00:00, 5534.76it/s]
100%|██████████| 9999/9999 [

In [74]:
import pandas as pd

sarsa = pd.DataFrame(table['sarsa'])
sarsa.sort_values(by='lost')

Unnamed: 0,alpha,gamma,eps,lost
7,0.10,0.2,0.1,1
5,0.01,0.2,0.1,2
10,0.01,0.3,0.1,2
47,0.10,0.1,0.2,2
63,0.50,0.4,0.2,2
...,...,...,...,...
187,0.10,0.2,0.5,9
184,1.00,0.1,0.5,9
29,1.00,0.6,0.1,10
154,1.00,0.4,0.4,10


In [75]:
q_learning = pd.DataFrame(table['q-learning'])
q_learning.sort_values(by='lost')

Unnamed: 0,alpha,gamma,eps,lost
56,0.05,0.3,0.2,2
90,0.01,0.1,0.3,2
145,0.01,0.3,0.4,2
21,0.05,0.5,0.1,3
22,0.10,0.5,0.1,3
...,...,...,...,...
59,1.00,0.3,0.2,10
24,1.00,0.5,0.1,10
149,1.00,0.3,0.4,10
189,1.00,0.2,0.5,10
