In [1]:
import numpy as np 
from typing import Callable
import torch
import torch.nn.functional as F
import random

In [2]:
# Type aliases used by the annotations throughout the notebook.
Card = int              # a card value (ACE..KING below) or a rank index (as returned by drawCard)
Hand = torch.Tensor     # one hand
Hands = torch.Tensor    # a batch of hands, one row per game
Action = int
Actions = torch.Tensor  # a batch of actions, one entry per game
Policy = Callable[[Hand, Hand], Action]
MultiPolicy = Callable[[Hands, Hands], Actions]
Deck = torch.Tensor     # one deck
Decks = torch.Tensor    # a batch of decks, one row per game

# Blackjack value of every rank. The ace is stored with its low value (1);
# ten, jack, queen and king all share the value 10.
ACE: Card = 1
TWO: Card = 2
THREE: Card = 3
FOUR: Card = 4
FIVE: Card = 5
SIX: Card = 6
SEVEN: Card = 7
EIGHT: Card = 8
NINE: Card = 9
TEN: Card = 10
JACK: Card = 10
QUEEN: Card = 10
KING: Card = 10

# Value of each rank, indexed by rank (0 = ace, ..., 12 = king).
card_values: torch.Tensor = torch.tensor(
	[ACE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, JACK, QUEEN, KING],
	dtype=torch.float32,
)
num_cards: int = len(card_values)
# Rank indices 0 .. num_cards - 1.
cards: torch.Tensor = torch.arange(num_cards, dtype=torch.int64)

# Converted to a plain int so that the value really matches its annotation
# (the builtin max() over a tensor would return a 0-dim tensor).
max_card_value: int = int(card_values.max().item())
max_score: int = 21

In [3]:
# Integer codes of the player/dealer actions.
HIT: Action = 0
STAND: Action = 1
DOUBLE: Action = 2
SPLIT: Action = 3
SURRENDER: Action = 4

# Human-readable label of every action code.
actions_str: dict[Action, str] = dict(zip(
	(HIT, STAND, DOUBLE, SPLIT, SURRENDER),
	('Hit', 'Stand', 'Double', 'Split', 'Surrender'),
))

In [4]:
# Scale factor of the shoe: generateRandomDeck starts from 4 * N copies of each rank.
N = 7
# Equal to the number of copies of one rank in a full deck.
max_card_per_hand = N * 4
hand_played_per_simulation = 100
# Number of repetitions averaged by the simulate*Repeat helpers.
nb_simulation = 5
# Weight of one repetition in those averages.
ratio_mean = 1 / nb_simulation

In [5]:
def minScore(ace=0, two=0, three=0, four=0, five=0, six=0, seven=0, eight=0, nine=0, ten=0, jack=0, queen=0, king=0) -> int:
	"""
	Minimal score of a set of cards given as one count per rank.
	Every ace is counted with its low value (ACE = 1); no ace is ever promoted to 11 here.

	Each argument is the number of cards of that rank.
	Returns the sum of count * value over all ranks.
	"""
	counts = (ace, two, three, four, five, six, seven, eight, nine, ten, jack, queen, king)
	values = (ACE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, JACK, QUEEN, KING)
	return sum(count * value for count, value in zip(counts, values))

In [6]:
def _enumerateValidHands(values, budget):
	"""
	Yield every tuple of per-rank counts whose minimal score is at most `budget`.

	`values` holds the value of each rank, in rank order; `budget` is the score
	still available. Tuples are produced in lexicographic order (the first rank
	varies slowest, the last rank fastest), which is the order the indices of
	`valid_hands_to_index` rely on.
	"""
	if not values:
		yield ()
		return
	value, remaining_values = values[0], values[1:]
	# A rank can appear at most budget // value times without exceeding the budget.
	for count in range(budget // value + 1):
		for tail in _enumerateValidHands(remaining_values, budget - count * value):
			yield (count, *tail)


# Maps every hand (tuple of 13 per-rank counts, ace first, king last) whose
# minimal score does not exceed max_score to a unique, consecutive index.
valid_hands_to_index: dict[tuple[int, ...], int] = {
	hand: index
	for index, hand in enumerate(_enumerateValidHands(
		(ACE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, JACK, QUEEN, KING),
		max_score,
	))
}
# Kept for backward compatibility: total number of valid hands.
idx = len(valid_hands_to_index)

Notre approche est la suivante :
+ Créer un réseau de neurones capable de jouer efficacement en ayant connaissance de l'état du deck
+ Créer un réseau de neurones capable d'évaluer la mise optimale à jouer en ayant connaissance de l'état du deck

Pour accélérer le processus d'apprentissage, nous avons besoin de faire jouer plusieurs parties simultanément.  
On va donc mettre au point des fonctions agissant sur des matrices dont chaque ligne simule une partie différente.

In [7]:
def drawCard(deck: Deck) -> Card:
	"""
	Sample one rank from `deck`, using the entries of `deck` as sampling weights,
	remove one card of that rank from `deck` (in place) and return the rank index.
	"""
	picked = int(torch.multinomial(deck, num_samples=1))
	deck[picked] = deck[picked] - 1
	return picked

def drawCards(decks: Decks, indices) -> torch.Tensor:
	"""
	Draw one card from each selected deck.

	This function assumes `decks` is a tensor of shape (N, 13), where N is the number of decks.
	`indices` is a tensor containing the indices of the decks to draw a card from.

	The selected rows of `decks` are updated in place (one card of the drawn rank is removed).
	Returns an int64 tensor with one drawn rank index per selected deck; when no deck is
	selected an empty int64 tensor is returned and `decks` is left untouched.
	"""
	selected_decks = decks[indices]
	# Callers routinely pass an empty selection (e.g. when nobody hits any more):
	# handle it explicitly instead of relying on how multinomial treats empty batches.
	if selected_decks.shape[0] == 0:
		return torch.empty(0, dtype=torch.int64, device=decks.device)
	drawn = torch.multinomial(selected_decks, num_samples=1).squeeze(1)
	decks[indices, drawn] -= 1
	return drawn

In [8]:
def generateRandomDeck() -> Deck:
	"""
	Build a randomly depleted deck.
	Starts from a full deck (4 * N cards of each rank), then removes a random
	number of cards, between 1 and num_cards * N * 2, one draw at a time.
	At least half of the cards therefore remain in the returned deck.
	"""
	deck = torch.full((num_cards,), 4 * N, dtype=torch.float32)
	remaining_draws = random.randint(1, num_cards * N * 2)
	while remaining_draws > 0:
		drawCard(deck)
		remaining_draws -= 1
	return deck

def generateRandomDecks(num_decks: int) -> Decks:
	"""
	Build `num_decks` independent random decks and stack them, one deck per row.
	"""
	decks = []
	for _ in range(num_decks):
		decks.append(generateRandomDeck())
	return torch.stack(decks)

In [9]:
def generateRandomInitialHand() -> Hand:
	"""
	Generate a random starting position: one dealer card and two player cards.
	The ranks are picked uniformly from `cards`, independently of any deck.

	Returns a float32 vector of length 1 + num_cards: the dealer's rank index
	followed by the player's per-rank card counts.
	"""
	# Same draw order as before: dealer first, then the two player cards.
	dealer_card = int(random.choice(cards))
	player_cards = (int(random.choice(cards)), int(random.choice(cards)))
	hand = torch.zeros(num_cards, dtype=torch.float32)
	for card in player_cards:
		hand[card] += 1
	return torch.cat([torch.tensor([dealer_card], dtype=torch.float32), hand])

def generateRandomInitialHands(num_games: int) -> Hands:
	"""
	Generate `num_games` random starting positions (see generateRandomInitialHand)
	and stack them, one game per row.
	"""
	rows = []
	for _ in range(num_games):
		rows.append(generateRandomInitialHand())
	return torch.stack(rows)

In [10]:
def generateRandomInitialGames(num_games: int) -> torch.Tensor:
	"""
	Generate `num_games` random initial games, one per row.
	A row is the concatenation of a random deck and a random starting position
	(dealer card followed by the player's hand).
	"""
	# Decks are generated before the hands, which fixes the order of the random draws.
	decks = generateRandomDecks(num_games)
	initial_hands = generateRandomInitialHands(num_games)
	return torch.cat((decks, initial_hands), dim=1)

In [11]:
def handsScoreMin(hands: Hands) -> torch.Tensor:
	"""
	Minimal score of each hand of the batch.
	Every ace is counted with its low value (1).

	`hands` has one row per hand and one column per rank (card counts).
	Returns a tensor with one score per row.
	"""
	# card_values is already a tensor: use it directly rather than re-wrapping it
	# with torch.tensor(), which copies the data and emits a UserWarning.
	return torch.sum(hands * card_values.to(torch.float32), dim=1)

def handsScore(hands: Hands) -> torch.Tensor:
	"""
	Best score of each hand of the batch.
	One ace is counted as 11 instead of 1 (i.e. 10 points are added to the minimal
	score) when the hand holds an ace and the result does not exceed max_score (=21).
	"""
	low_score = handsScoreMin(hands)
	holds_ace = hands[:, 0] > 0
	promotion_fits = low_score + 10 <= max_score
	return torch.where(holds_ace & promotion_fits, low_score + 10, low_score)

In [12]:
def gamesFinished(player_hands: Hands, actions: Actions) -> torch.Tensor:
	"""
	Tell, for every game of the batch, whether the hand needs no further action.
	This is the case when the last action is STAND or when the minimal score
	of the hand exceeds max_score (bust).
	"""
	stood = actions == STAND
	busted = handsScoreMin(player_hands) > max_score
	return stood | busted

On peut aussi implémenter les politiques classiques des dealers sous forme multi-games:

In [13]:
def standOn17Hard(dealer_hands: Hands, player_hands: Hands) -> Actions:
	"""
	Dealer policy: STAND as soon as the best score (an ace counted as 11 when
	possible) reaches 17, HIT otherwise. `player_hands` is not used.
	"""
	decisions = torch.full((dealer_hands.shape[0],), HIT, dtype=torch.int64)
	decisions[handsScore(dealer_hands) >= 17] = STAND
	return decisions

def standOn17Soft(dealer_hands: Hands, player_hands: Hands) -> Actions:
	"""
	Dealer policy that hits a soft 17: STAND when the minimal score (aces counted
	as 1) is at least 17, or when the best score (an ace counted as 11 when
	possible) is at least 18; HIT otherwise. `player_hands` is not used.

	Looking at the minimal score alone made the dealer keep hitting soft 18-21,
	including a two-card 21, so the dealer could never hold a blackjack.

	NOTE(review): the names standOn17Hard / standOn17Soft may be swapped with
	respect to the usual "dealer stands on soft 17" wording - confirm the intended rule.
	"""
	hard_score = handsScoreMin(dealer_hands)
	best_score = handsScore(dealer_hands)
	must_stand = torch.logical_or(hard_score >= 17, best_score >= 18)
	return torch.where(must_stand, torch.tensor(STAND), torch.tensor(HIT))

In [14]:
# Policy applied to the dealer by finishGames; it is called on whole batches of hands.
dealer_policy: MultiPolicy = standOn17Soft

On doit aussi adapter les fonctions qui déterminent le résultat d'une partie 

In [15]:
def checkBlackjacks(hands: Hands, scores: torch.Tensor) -> torch.Tensor:
	"""
	Tell, for every hand of the batch, whether it is a blackjack:
	a score equal to max_score reached with exactly two cards.
	`scores` holds the already computed score of each hand.
	"""
	reaches_max = scores == max_score
	has_two_cards = hands.sum(dim=1) == 2
	return reaches_max & has_two_cards

In [16]:
def handsComparisonPlayerPOV(dealer_hands: Hands, player_hands: Hands) -> torch.Tensor:
	"""
	Compare the final dealer and player hands of every game, from the player's point of view.
	Return, per game: 1 if the player wins, 1.5 if the player wins with a blackjack,
	0 for a draw and -1 if the dealer wins (or the player is bust).
	Blackjacks of the dealer and of the player are both taken into account.

	The order of the assignments below is really important: each rule overrides
	the result of the previous ones.
	"""
	result = torch.zeros(dealer_hands.shape[0])
	score_dealers = handsScore(dealer_hands)
	score_players = handsScore(player_hands)
	dealer_blackjacks = checkBlackjacks(dealer_hands, score_dealers)
	player_blackjacks = checkBlackjacks(player_hands, score_players)
	# Plain score comparison; equal scores keep the initial 0 (draw).
	result[score_dealers > score_players] = -1
	result[score_dealers < score_players] = 1
	# A bust dealer loses even though its score is the higher one.
	result[score_dealers > max_score] = 1
	# Blackjacks override the score comparison; two blackjacks cancel out.
	result[dealer_blackjacks] = -1
	result[player_blackjacks] = 1.5
	result[torch.logical_and(dealer_blackjacks, player_blackjacks)] = 0
	# A bust player always loses, whatever the dealer holds.
	result[score_players > max_score] = -1
	return result

In [17]:
def finishGames(dealer_hands: Hands, player_hands: Hands, decks: Decks) -> torch.Tensor:
	"""
	Play the dealer's turn of every game, then resolve the games.

	The dealer follows `dealer_policy`; every HIT draws one card from the matching
	row of `decks`. `dealer_hands` and `decks` are modified in place.
	Return the reward for the player (see handsComparisonPlayerPOV).
	"""
	need_actions = torch.ones(dealer_hands.shape[0], dtype=torch.bool)
	while torch.any(need_actions):
		actions = dealer_policy(dealer_hands, player_hands)
		hit_indices = torch.where(actions == HIT)[0]
		drawn = drawCards(decks, hit_indices)
		dealer_hands.index_add_(0, hit_indices, F.one_hot(drawn, num_classes=num_cards).to(torch.float32))
		# The dealer is the one acting here, so the stop condition must look at
		# the dealer's hand (stand or bust), not at the player's.
		need_actions = torch.logical_not(gamesFinished(dealer_hands, actions))
	return handsComparisonPlayerPOV(dealer_hands, player_hands)

On peut enfin simuler un ensemble de parties simultanément

In [18]:
@torch.no_grad()
def simulateGames(hands_and_decks: torch.Tensor, model_play):
	"""
	Play every game of the batch to its end, the player following `model_play`.

	Each row of `hands_and_decks` holds a deck (columns 0-12), the dealer's card
	(column 13) and the player's hand (columns 14+). The tensor is modified in place.
	Return the gain of the player for every game.
	"""
	batch = hands_and_decks.shape[0]
	# Slices are views: drawing cards below updates hands_and_decks itself,
	# so the model always sees the current decks and hands.
	decks = hands_and_decks[:, :13]
	dealer_cards = hands_and_decks[:, 13]
	player_hands = hands_and_decks[:, 14:]
	pending = torch.ones(batch, dtype=torch.bool)
	while pending.any():
		# Games that are already over implicitly stand.
		actions = torch.full((batch,), STAND, dtype=torch.int64)
		scores = model_play(hands_and_decks[pending])
		actions[pending] = scores.argmax(dim=1)
		hit_indices = (actions == HIT).nonzero(as_tuple=True)[0]
		new_cards = F.one_hot(drawCards(decks, hit_indices), num_classes=13)
		player_hands.index_add_(0, hit_indices, new_cards.to(torch.float32))
		pending = ~gamesFinished(player_hands, actions)
	dealer_hands = F.one_hot(dealer_cards.to(torch.int64), num_classes=num_cards).to(torch.float32)
	return finishGames(dealer_hands, player_hands, decks)
	

In [19]:
@torch.no_grad()
def simulateGameRepeat(hands_and_deck: torch.Tensor, model_play):
	"""
	Simulate the same initial games `nb_simulation` times.
	Every repetition works on a copy, so `hands_and_deck` is left untouched.
	Return the average gain of the player for every game.
	"""
	average = torch.zeros(hands_and_deck.shape[0])
	for _ in range(nb_simulation):
		gains = simulateGames(hands_and_deck.clone(), model_play)
		average += gains / nb_simulation
	return average

In [20]:
@torch.no_grad()
def simulateGameAfterFirstAction(hands_and_deck:torch.Tensor, model_play, actions: torch.Tensor):
	"""
	Simulate games whose first player action is imposed.

	`actions` holds the first action of every game; the following actions are
	chosen by `model_play`. `hands_and_deck` is modified in place, `actions` is not.
	Once the player is done, the dealer's turn is simulated `nb_simulation` times
	on copies and the gains are averaged.
	Return the average gain of the player for every game.
	"""
	# Work on a private copy: the loop below overwrites the actions, and callers
	# reuse the same tensor for several simulations.
	actions = actions.clone()
	num_games = hands_and_deck.shape[0]
	decks = hands_and_deck[:, :13]
	dealer_cards = hands_and_deck[:, 13]
	player_hands = hands_and_deck[:, 14:]
	need_actions = torch.logical_not(gamesFinished(player_hands, actions))
	while torch.any(need_actions):
		# Apply the pending actions first, then ask the model for the next ones.
		hit_indices = torch.where(actions == HIT)[0]
		player_hands.index_add_(0, hit_indices, F.one_hot(drawCards(decks, hit_indices), num_classes=num_cards).to(torch.float32))
		actions[need_actions] = torch.argmax(model_play(hands_and_deck[need_actions]), dim=1)
		actions[torch.logical_not(need_actions)] = STAND
		need_actions = torch.logical_not(gamesFinished(player_hands, actions))
	dealer_hands = F.one_hot(dealer_cards.to(torch.int64), num_classes=num_cards).to(torch.float32)
	
	mean_score = torch.zeros(num_games)
	for _ in range(nb_simulation):
		# finishGames mutates its arguments, hence the copies.
		dealer_hands_copy = dealer_hands.clone()
		player_hands_copy = player_hands.clone()
		decks_copy = decks.clone()
		mean_score += finishGames(dealer_hands_copy, player_hands_copy, decks_copy) * ratio_mean
	return mean_score

In [21]:
@torch.no_grad()
def simulateGameAfterFirstActionRepeat(hands_and_deck: torch.Tensor, model_play, actions:torch.Tensor):
	"""
	Repeat simulateGameAfterFirstAction `nb_simulation` times from the same
	initial games and the same imposed first actions.
	Neither `hands_and_deck` nor `actions` is modified.
	Return the average gain of the player for every game.
	"""
	mean_score = torch.zeros(hands_and_deck.shape[0])
	for _ in range(nb_simulation):
		# Fresh copies for every repetition: the simulation works in place, and
		# the imposed first actions must be the same each time.
		games_copy = hands_and_deck.clone()
		first_actions = actions.clone()
		mean_score += simulateGameAfterFirstAction(games_copy, model_play, first_actions) * ratio_mean
	return mean_score

On peut maintenant paramétrer l'architecture choisie et créer des modèles

In [22]:
# Widths of the hidden layers of the bet model (see makeBetModel).
bet_model_hidden_layers: list[int] = [13]
# Widths of the hidden layers of the play models (see makePlayModel / makePlayModelPrior).
play_model_hidden_layers: list[int] = [27]


In [23]:
def makeBetModel() -> torch.nn.Module:
	"""
	Create the bet model: a fully connected network with 13 inputs, the hidden
	layers listed in `bet_model_hidden_layers` and 1 output.
	Every linear layer, including the last one, is followed by a ReLU.
	"""
	widths = [13, *bet_model_hidden_layers, 1]
	modules = []
	for fan_in, fan_out in zip(widths[:-1], widths[1:]):
		modules += [torch.nn.Linear(fan_in, fan_out, dtype=torch.float32), torch.nn.ReLU()]
	return torch.nn.Sequential(*modules)

def makePlayModel() -> torch.nn.Module:
	"""
	Create the play model: a fully connected network with 27 inputs, the hidden
	layers listed in `play_model_hidden_layers` and 2 outputs.
	Hidden layers use a ReLU; the output layer uses a Tanh.
	"""
	widths = [27, *play_model_hidden_layers, 2]
	modules = []
	for fan_in, fan_out in zip(widths[:-1], widths[1:]):
		modules += [torch.nn.Linear(fan_in, fan_out, dtype=torch.float32), torch.nn.ReLU()]
	# The activation that follows the last linear layer is a Tanh instead of a ReLU.
	modules[-1] = torch.nn.Tanh()
	return torch.nn.Sequential(*modules)

def makePlayModelPrior():
	"""
	Create the play model trained by trainPlayModelPrior: same architecture as
	makePlayModel (27 inputs, `play_model_hidden_layers`, 2 outputs) but with a
	Softmax over the two outputs instead of a Tanh.
	"""
	widths = [27, *play_model_hidden_layers, 2]
	modules = []
	for position in range(len(widths) - 1):
		modules.append(torch.nn.Linear(widths[position], widths[position + 1], dtype=torch.float32))
		modules.append(torch.nn.ReLU())
	# The activation that follows the last linear layer is a Softmax instead of a ReLU.
	modules[-1] = torch.nn.Softmax(dim=1)
	return torch.nn.Sequential(*modules)

In [24]:
# Content of "gain.npy", loaded on the first call of modelBaseline only.
_baseline_gains_matrix = None

def modelBaseline(hands_and_decks: torch.Tensor) -> torch.Tensor:
	"""
	Baseline "model" that looks its outputs up in the matrix of Q1 ("gain.npy").

	For every game, the matrix is indexed by the player's hand (through
	`valid_hands_to_index`) and by the dealer's card; the entry found is returned
	as the two outputs of the game. The decks are ignored.
	The file is read once and cached, because this function is called at every
	step of the simulation loops.

	Raises KeyError if a player hand is not in `valid_hands_to_index`.
	"""
	global _baseline_gains_matrix
	if _baseline_gains_matrix is None:
		_baseline_gains_matrix = torch.tensor(np.load("gain.npy"), dtype=torch.float32)
	goal_gains_matrix = _baseline_gains_matrix
	goal_gains = torch.zeros((hands_and_decks.shape[0], 2))
	for i, game in enumerate(hands_and_decks):
		dealer_card = int(game[13].item())
		hand_key = tuple(game[14:].to(torch.int64).tolist())
		goal_gains[i] = goal_gains_matrix[valid_hands_to_index[hand_key], dealer_card]
	return goal_gains

In [25]:
def simulateGameWithBet(hands_and_decks: torch.Tensor, model_play, model_bet):
	"""
	Simulate games where the stake is chosen by `model_bet`.

	The bet is computed from the decks as they are before the hand is played;
	the player then follows `model_play`, and the dealer's turn is simulated
	`nb_simulation` times on copies. `hands_and_decks` is modified in place.
	Return, for every game, the average gain of the player multiplied by the bet.
	"""
	num_games = hands_and_decks.shape[0]
	decks = hands_and_decks[:, :13]
	dealer_cards = hands_and_decks[:, 13]
	player_hands = hands_and_decks[:, 14:]
	# `decks` is a view that the draws below modify in place: the bet has to be
	# evaluated first so that it only depends on the deck state before the hand.
	bet = model_bet(decks).squeeze(1)
	need_actions = torch.ones(num_games, dtype=torch.bool)
	while torch.any(need_actions):
		actions = torch.full((num_games,), STAND, dtype=torch.int64)
		actions[need_actions] = torch.argmax(model_play(hands_and_decks[need_actions]), dim=1)
		hit_indices = torch.where(actions == HIT)[0]
		player_hands.index_add_(0, hit_indices, F.one_hot(drawCards(decks, hit_indices), num_classes=num_cards).to(torch.float32))
		need_actions = torch.logical_not(gamesFinished(player_hands, actions))
	dealer_hands = F.one_hot(dealer_cards.to(torch.int64), num_classes=num_cards).to(torch.float32)
	mean_score = torch.zeros(num_games)
	for _ in range(nb_simulation):
		# finishGames mutates its arguments, hence the copies.
		dealer_hands_copy = dealer_hands.clone()
		player_hands_copy = player_hands.clone()
		decks_copy = decks.clone()
		mean_score += finishGames(dealer_hands_copy, player_hands_copy, decks_copy) / nb_simulation * bet
	return mean_score

In [26]:
def simulateGameWithBetRepeat(hands_and_decks: torch.Tensor, model_play, model_bet):
	"""
	Repeat simulateGameWithBet `nb_simulation` times from the same initial games.
	Every repetition works on a copy, so `hands_and_decks` is left untouched.
	Return the average gain of the player for every game.
	"""
	average = torch.zeros(hands_and_decks.shape[0])
	for _ in range(nb_simulation):
		gains = simulateGameWithBet(hands_and_decks.clone(), model_play, model_bet)
		average += gains / nb_simulation
	return average


In [27]:
@torch.no_grad()
def evalModel(model_baseline, model_play, batch_size: int):
	"""
	Compare `model_play` with `model_baseline` on the same `batch_size` random games.
	`model_play` is switched to evaluation mode.
	Return (average gain of model_play, average gain of model_baseline).
	"""
	model_play.eval()
	games = generateRandomInitialGames(batch_size)
	baseline_games = games.clone()
	# The model is simulated before the baseline, as the random draws are shared.
	model_gain = simulateGameRepeat(games, model_play).mean()
	baseline_gain = simulateGameRepeat(baseline_games, model_baseline).mean()
	return model_gain, baseline_gain

In [28]:
def trainPlayModel(model_baseline, model_play, optimizer_play, batch_size: int, num_epochs: int):
	"""
	Train `model_play` to predict the gain of each first action.

	At every epoch a batch of random games is generated; the target of output 0
	(resp. 1) is the simulated average gain when the first action is HIT
	(resp. STAND) and the model plays the rest of the hand. The loss is the mean
	squared error between the outputs of the model and these targets.
	Every 100 epochs the model is compared with `model_baseline` and the result printed.
	Runs num_epochs + 1 epochs (0 .. num_epochs inclusive).
	"""
	for epoch in range(num_epochs + 1):
		# evalModel switches the model to eval mode: switch back at every epoch.
		model_play.train()
		hands_and_decks = generateRandomInitialGames(batch_size)

		goal_gains = torch.zeros((batch_size, 2))
		for column, first_action in enumerate((HIT, STAND)):
			first_actions = torch.full((batch_size,), first_action, dtype=torch.int64)
			goal_gains[:, column] = simulateGameAfterFirstActionRepeat(hands_and_decks.clone(), model_play, first_actions)
		optimizer_play.zero_grad()
		actions = model_play(hands_and_decks)
		loss = F.mse_loss(actions, goal_gains)
		loss.backward()
		optimizer_play.step()
		if epoch % 100 == 0:
			average_gain, average_gain_baseline = evalModel(model_baseline, model_play, batch_size)
			print(f"epoch: {epoch}, loss: {loss.item()}, average gain: {average_gain}, average gain baseline: {average_gain_baseline}")

In [29]:
def trainPlayModelPrior(model_baseline, model_play, optimizer_play, batch_size: int, num_epochs: int):
	"""
	Train `model_play` to imitate the matrix stored in "gain.npy".

	At every epoch a batch of random games is generated; the target of a game is
	the softmax of the matrix entry selected by the player's hand (through
	`valid_hands_to_index`) and the dealer's card. The loss is the cross-entropy
	between these target distributions and the outputs of the model.

	`model_play` is expected to output probabilities (its last layer is a Softmax,
	as built by makePlayModelPrior): the cross-entropy is therefore computed
	directly on the log of the outputs. F.cross_entropy expects unnormalised
	logits and would apply a second softmax.
	Every 100 epochs the model is compared with `model_baseline` and the result printed.
	Runs num_epochs + 1 epochs (0 .. num_epochs inclusive).
	"""
	goal_gains_matrix = torch.tensor(np.load("gain.npy"), dtype=torch.float32)
	for epoch in range(num_epochs + 1):
		# evalModel switches the model to eval mode: switch back at every epoch.
		model_play.train()
		hands_and_decks = generateRandomInitialGames(batch_size)
		goal_gains = torch.zeros((batch_size, 2))

		for i, game in enumerate(hands_and_decks):
			dealer_card = int(game[13].item())
			hand_key = tuple(game[14:].to(torch.int64).tolist())
			goal_gains[i] = torch.softmax(goal_gains_matrix[valid_hands_to_index[hand_key], dealer_card], dim=0)
		optimizer_play.zero_grad()
		probabilities = model_play(hands_and_decks)
		# Clamp before the log so that a probability of exactly 0 cannot produce -inf.
		log_probabilities = torch.log(probabilities.clamp_min(1e-12))
		loss = -(goal_gains * log_probabilities).sum(dim=1).mean()
		loss.backward()
		optimizer_play.step()
		if epoch % 100 == 0:
			average_gain, average_gain_baseline = evalModel(model_baseline, model_play, batch_size)
			print(f"epoch: {epoch}, loss: {loss.item()}, average gain: {average_gain}, average gain baseline: {average_gain_baseline}")

In [30]:
def trainBetModel(model_play, model_bet,  optimizer_bet, batch_size: int, hand_played_per_deck: int, num_epochs: int):
	"""
	Train `model_bet` to predict the stake to play on a deck.

	At every epoch `batch_size` random decks are generated and
	`hand_played_per_deck` random hands are simulated on each of them with
	`model_play` (kept in evaluation mode). The gain of a hand is the best of the
	simulated gains after a first HIT and after a first STAND. The target bet of a
	deck is (max(-0.1, mean gain) + 0.1) * 100, and the loss is the mean squared
	error between the output of `model_bet` and this target.
	The average gain obtained with the bets is printed every 100 epochs and once
	more at the end.

	Raises ValueError if `hand_played_per_deck` is lower than 1.
	Does nothing if `num_epochs` is lower than 1.
	"""
	if hand_played_per_deck < 1:
		raise ValueError("hand_played_per_deck must be at least 1")
	if num_epochs < 1:
		return
	model_play.eval()
	for epoch in range(num_epochs):
		initial_decks = generateRandomDecks(batch_size)
		gain_deck = torch.zeros((batch_size, hand_played_per_deck))
		for i in range(hand_played_per_deck):
			hands = generateRandomInitialHands(batch_size)
			hands_and_decks = torch.cat([initial_decks, hands], dim=1)
			goal_gains = torch.zeros((batch_size, 2))
			actions_hit = torch.tensor([HIT] * hands_and_decks.shape[0])
			goal_gains[:, 0] = simulateGameAfterFirstActionRepeat(hands_and_decks.clone(), model_play, actions_hit)
			actions_stand = torch.tensor([STAND] * hands_and_decks.shape[0])
			goal_gains[:, 1] = simulateGameAfterFirstActionRepeat(hands_and_decks.clone(), model_play, actions_stand)
			gain_deck[:, i] = torch.max(goal_gains, 1).values
		# Games of the last simulated hand, reused for the evaluation printed below.
		eval_games = hands_and_decks.clone()
		gain_deck = (torch.max(torch.tensor(-0.1), gain_deck.mean(dim=1)) + 0.1) * 100
		bet = model_bet(initial_decks).squeeze(1)
		optimizer_bet.zero_grad()
		# The target is the per-deck bet derived from all the hands played on the
		# deck, not the gains of the last hand only.
		loss = F.mse_loss(bet, gain_deck)
		loss.backward()
		optimizer_bet.step()
		if epoch % 100 == 0:
			gains = simulateGameWithBetRepeat(eval_games, model_play, model_bet)
			average_gain = torch.mean(gains)
			print(f"epoch: {epoch}, loss: {loss.item()}, average gain: {average_gain}")
	gains = simulateGameWithBetRepeat(eval_games, model_play, model_bet)
	average_gain = torch.mean(gains)
	print(f"epoch: {epoch}, loss: {loss.item()}, average gain: {average_gain}")

In [31]:
# Hyper-parameters passed to the training calls in the cells below.
batch_size = 512  # number of games generated per epoch
num_epochs = 1000
hand_played_per_deck = 4  # hands simulated on each deck by trainBetModel

On est maintenant en mesure de faire jouer un modèle (non entraîné)

In [None]:
# Build the softmax-output play model and fit it to the targets read from "gain.npy"
# (see trainPlayModelPrior); modelBaseline is the reference used in the printed comparison.
model_play = makePlayModelPrior()
optimizer_play = torch.optim.Adam(model_play.parameters(), lr=0.0001)
trainPlayModelPrior(modelBaseline, model_play, optimizer_play, batch_size, num_epochs)

In [None]:
# Build the tanh-output play model and train it on simulated gains (see trainPlayModel).
# NOTE: this rebinds `model_play`, replacing any model created by an earlier cell.
model_play = makePlayModel()
optimizer_play = torch.optim.Adam(model_play.parameters(), lr=0.0001)
trainPlayModel(modelBaseline, model_play, optimizer_play, batch_size, num_epochs)

In [None]:
# Build the bet model and train it with the current `model_play` as the player (see trainBetModel).
model_bet = makeBetModel()
optimizer_bet = torch.optim.Adam(model_bet.parameters(), lr=0.0001)
trainBetModel(model_play, model_bet, optimizer_bet, batch_size, hand_played_per_deck, num_epochs)