In [1]:
from typing import Optional

import gym
from gym import spaces
from gym.utils import seeding


def cmp(a, b):
    return float(a > b) - float(a < b)


# 1 = Ace, 2-10 = Number cards, Jack/Queen/King = 10
deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]


def draw_card(np_random):
    return int(np_random.choice(deck))


def draw_hand(np_random):
    return [draw_card(np_random), draw_card(np_random)]


def usable_ace(hand):  # Does this hand have a usable ace?
    return 1 in hand and sum(hand) + 10 <= 21


def sum_hand(hand):  # Return current hand total
    if usable_ace(hand):
        return sum(hand) + 10
    return sum(hand)


def is_bust(hand):  # Is this hand a bust?
    return sum_hand(hand) > 21


def score(hand):  # What is the score of this hand (0 if bust)
    return 0 if is_bust(hand) else sum_hand(hand)


def is_natural(hand):  # Is this hand a natural blackjack?
    return sorted(hand) == [1, 10]


class BlackjackEnv(gym.Env):
    """Simple blackjack environment
    Blackjack is a card game where the goal is to obtain cards that sum to as
    near as possible to 21 without going over.  They're playing against a fixed
    dealer.
    Face cards (Jack, Queen, King) have point value 10.
    Aces can either count as 11 or 1, and it's called 'usable' at 11.
    This game is placed with an infinite deck (or with replacement).
    The game starts with dealer having one face up and one face down card, while
    player having two face up cards. (Virtually for all Blackjack games today).
    The player can request additional cards (hit=1) until they decide to stop
    (stick=0) or exceed 21 (bust).
    After the player sticks, the dealer reveals their facedown card, and draws
    until their sum is 17 or greater.  If the dealer goes bust the player wins.
    If neither player nor dealer busts, the outcome (win, lose, draw) is
    decided by whose sum is closer to 21.  The reward for winning is +1,
    drawing is 0, and losing is -1.
    The observation of a 3-tuple of: the players current sum,
    the dealer's one showing card (1-10 where 1 is ace),
    and whether or not the player holds a usable ace (0 or 1).
    This environment corresponds to the version of the blackjack problem
    described in Example 5.1 in Reinforcement Learning: An Introduction
    by Sutton and Barto.
    http://incompleteideas.net/book/the-book-2nd.html
    """

    def __init__(self, natural=False, sab=False):
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Tuple(
            (spaces.Discrete(32), spaces.Discrete(11), spaces.Discrete(2))
        )

        # Flag to payout 1.5 on a "natural" blackjack win, like casino rules
        # Ref: http://www.bicyclecards.com/how-to-play/blackjack/
        self.natural = natural

        # Flag for full agreement with the (Sutton and Barto, 2018) definition. Overrides self.natural
        self.sab = sab

    def step(self, action):
        assert self.action_space.contains(action)
        if action:  # hit: add a card to players hand and return
            self.player.append(draw_card(self.np_random))
            if is_bust(self.player):
                done = True
                reward = -1.0
            else:
                done = False
                reward = 0.0
        else:  # stick: play out the dealers hand, and score
            done = True
            while sum_hand(self.dealer) < 17:
                self.dealer.append(draw_card(self.np_random))
            reward = cmp(score(self.player), score(self.dealer))
            if self.sab and is_natural(self.player) and not is_natural(self.dealer):
                # Player automatically wins. Rules consistent with S&B
                reward = 1.0
            elif (
                not self.sab
                and self.natural
                and is_natural(self.player)
                and reward == 1.0
            ):
                # Natural gives extra points, but doesn't autowin. Legacy implementation
                reward = 1.5
        return self._get_obs(), reward, done, {}

    def _get_obs(self):
        return (sum_hand(self.player), self.dealer[0], usable_ace(self.player))

    def reset(self, seed: Optional[int] = None):
        super().reset(seed=seed)
        self.dealer = draw_hand(self.np_random)
        self.player = draw_hand(self.np_random)
        return self._get_obs()

In [141]:
import torch

class BJnn(torch.nn.Module):
    def __init__(self):
        super(BJnn, self).__init__()
        self.layer1 = torch.nn.Linear(3, 512)
        self.layer2 = torch.nn.Linear(512, 256)
        self.layer3 = torch.nn.Linear(256, 128)
        self.layer4 = torch.nn.Linear(128, 64)
        self.layer5 = torch.nn.Linear(64, 32)
        self.layer6 = torch.nn.Linear(32, 8)
        self.layer7 = torch.nn.Linear(8, 2)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.relu(self.layer4(x))
        x = self.relu(self.layer5(x))
        x = self.relu(self.layer6(x))
        x = self.sigmoid(self.layer7(x))
        x = x / x.sum()
        return x
    
model = BJnn()

In [151]:
import torch
import numpy as np

round_num = 10
episode = 3000
learning_rate = 0.001
epsilon = 0.01

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
env = gym.make('Blackjack-v1')

for i in range(round_num):
    reward_sum = 0
    for j in range(episode):
        p_all = torch.tensor([0.0])
        p_all = p_all.to(device)
        
        state = env.reset()
        if state[2]:
            s = torch.tensor([float(state[0]), float(state[1]), 1.0])
        else:
            s = torch.tensor([float(state[0]), float(state[1]), 0.0])

        end = False
        while not end:
            s = s.to(device)
            p = model(s) # p[0] is the possibility of hit
            #print(p.data)
            
            rand_num = np.random.random()
            if rand_num > epsilon:
                if p.data[0] > p.data[1]:
                    a = True
                    p_coeff = torch.tensor([1, 0])
                else:
                    a = False
                    p_coeff = torch.tensor([0, 1])
            else:
                rand_num = np.random.random()
                if rand_num < p.data[0]:
                    a = True
                    p_coeff = torch.tensor([1, 0])
                else:
                    a = False
                    p_coeff = torch.tensor([0, 1])
                
            p_coeff = p_coeff.to(device)
            p_all += (p * p_coeff).sum()
            
            state = env.step(a)
            reward = state[1]
            end = state[2]
            if end:
                break

            if state[0][2]:
                s = torch.tensor([float(state[0][0]), float(state[0][1]), 1.0])
            else:
                s = torch.tensor([float(state[0][0]), float(state[0][1]), 0.0])

        reward_sum += reward
        loss = -1 * reward * p_all

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('round', i, 'average reward =', reward_sum / episode)
    

round 0 average reward = -0.08966666666666667
round 1 average reward = -0.07266666666666667
round 2 average reward = -0.10433333333333333
round 3 average reward = -0.11266666666666666
round 4 average reward = -0.1
round 5 average reward = -0.07766666666666666
round 6 average reward = -0.085
round 7 average reward = -0.10533333333333333
round 8 average reward = -0.094
round 9 average reward = -0.07866666666666666


In [152]:
torch.save(model, '20211213BlackJack.pkl')