In [93]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd


# Notebook-wide pandas display settings: show up to 21 columns and render floats with two decimals.
pd.set_option("display.max_columns", 21)
pd.set_option("display.float_format", "{:.2f}".format)

### Paramètres

Dans un premier temps on considère que les as valent toujours 1  
On considère aussi que le blackjack (As + Bûche) est un état comme les autres

In [None]:
# A card is represented only by the number of points it is worth.
card = int

# Pip cards: the ace counts for 1, the others for their face value.
ACE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN = range(1, 11)
# Face cards are all worth as much as a ten.
JACK = QUEEN = KING = TEN

# One entry per rank, so the value 10 appears four times.
card_values: list[card] = [
	ACE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE,
	TEN, JACK, QUEEN, KING,
]

# Largest value a single card can have.
max_card_value: int = max(card_values)
# Highest score that is not a burst.
max_score: int = 21

On ne considère également que deux actions possibles pour le joueur : tirer une carte (HIT) ou rester (STAND)

In [95]:
# Decisions are encoded as plain integers.
action = int

# HIT: draw one more card; STAND: keep the current score.
HIT, STAND = 0, 1

On considère aussi un tirage avec remise, on a donc une distribution uniforme des cartes à tout moment

In [96]:
def probaDraw(card: card) -> float:
	"""
	Probability of drawing one given entry of `card_values`.

	Every entry is equally likely, so the `card` argument does not influence the result.
	A value listed several times in `card_values` is accounted for by callers that iterate
	over every entry of the list.
	"""
	nb_entries = len(card_values)
	return 1 / nb_entries

### Calcul de la politique optimale du croupier

In [97]:
def probaHit(score: int) -> float:
	"""
	Probability that drawing one more card from `score` does not go over `max_score`.
	"""
	safe_draws = (
		probaDraw(value)
		for value in card_values
		if score + value <= max_score
	)
	return sum(safe_draws)

def probaHitList(score: int) -> list[tuple[int, float]]:
	"""
	List the outcomes of one draw that keep the score at or below `max_score`.

	Returns one `(new_score, probability)` pair per entry of `card_values` such that
	`score + value <= max_score`, in the order of `card_values`. A value listed several
	times in `card_values` therefore yields several pairs.
	"""
	return [
		(score + value, probaDraw(value))
		for value in card_values
		if score + value <= max_score
	]

In [98]:
def probaBurst(score: int) -> float:
	"""
	Probability that drawing one more card from `score` goes over the limit.

	This is the complement of `probaHit(score)`.
	"""
	proba_safe = probaHit(score)
	return 1 - proba_safe

def probaBurstList(score: int) -> list[tuple[int, float]]:
	"""
	List the outcomes of one draw that push the score above `max_score`.

	Returns one `(new_score, probability)` pair per entry of `card_values` such that
	`score + value > max_score`, in the order of `card_values`. A value listed several
	times in `card_values` therefore yields several pairs.
	"""
	return [
		(score + value, probaDraw(value))
		for value in card_values
		if score + value > max_score
	]

In [99]:
def getGainMatrixDealerFromPolicy(policy) -> np.ndarray:
	"""
	Compute the dealer's expected gain for every pair of scores when the dealer follows `policy`.

	`policy(player_score, dealer_score)` is called with 1-based scores and must return HIT or STAND.

	Returns a `(max_score, max_score)` array. Entry `[sp, sd]` is the expected gain of a dealer
	holding `sd + 1` against a player holding `sp + 1`: +1 for a win, 0 for a draw,
	-1 for a loss or a burst.

	Raises ValueError if `policy` returns anything other than HIT or STAND.
	"""
	# What a draw leads to depends only on the dealer's score, so evaluate it once per score
	# instead of once per (player, dealer) pair.
	scores = range(1, max_score + 1)
	hit_outcomes = [probaHitList(score) for score in scores]
	burst_probas = [probaBurst(score) for score in scores]

	GainMatrix = np.zeros((max_score, max_score))
	for sp in range(max_score):
		# Go from the highest dealer score down: the outcomes of a hit are looked up in
		# entries of the same row that must already be filled in.
		for sd in range(max_score - 1, -1, -1):
			dealer_action = policy(sp + 1, sd + 1)
			if dealer_action == HIT:
				expected_gain = 0.0
				for new_sd, proba in hit_outcomes[sd]:
					expected_gain += proba * GainMatrix[sp, new_sd - 1]
				# A burst is worth -1 to the dealer.
				GainMatrix[sp, sd] = expected_gain - burst_probas[sd]
			elif dealer_action == STAND:
				# The higher score wins, equal scores are a draw.
				GainMatrix[sp, sd] = 1 if sp < sd else -1 if sp > sd else 0
			else:
				raise ValueError(f"Invalid dealer action: {dealer_action!r}")
	return GainMatrix

On peut maintenant définir différentes politiques et calculer leurs gains moyens  

On implémente la stratégie classique "stand on 17"  

En considérant que le joueur est seul face au croupier on peut alors intuitivement définir la politique optimale du croupier de la façon suivante:  
+  Si le croupier gagne déjà au score: STAND
+  Si le croupier est perdant au score: HIT
+  S'il y a égalité : choisir la meilleure option entre HIT et STAND (partie nulle)

In [100]:
def standOn17(sp: int, sd: int) -> action:
	"""
	Dealer policy "stand on 17": keep drawing until the dealer's score reaches 17.

	The player's score `sp` is ignored; only the dealer's score `sd` matters.
	"""
	if sd >= 17:
		return STAND
	return HIT

In [101]:
def dealerOptimalPolicy(sp: int, sd: int) -> action:
	"""
	Dealer policy that looks at the player's score `sp` as well as the dealer's score `sd`.

	- Dealer behind: HIT.
	- Dealer ahead: STAND.
	- Tie: HIT when a draw is at least as likely to stay within the limit as to burst,
	  otherwise STAND.
	"""
	tied = sp == sd
	if not tied:
		return HIT if sp > sd else STAND

	proba_safe = probaHit(sp)
	proba_burst = probaBurst(sp)
	if proba_safe >= proba_burst:
		return HIT
	return STAND

In [109]:
def printGainMatrix(gainMatrix: np.ndarray) -> pd.DataFrame:
	"""
	Wrap a gain matrix in a DataFrame whose rows and columns are labelled from 1.

	Despite its name the function prints nothing: it returns the DataFrame, which the
	notebook displays when the call is the last expression of a cell.

	The labels are derived from the matrix shape, so a matrix with zero rows is
	handled as well. Raises ValueError if `gainMatrix` is not two-dimensional.
	"""
	matrix = np.asarray(gainMatrix)
	if matrix.ndim != 2:
		raise ValueError(f"gainMatrix must be two-dimensional, got {matrix.ndim} dimension(s)")
	nb_rows, nb_cols = matrix.shape
	return pd.DataFrame(matrix, index=range(1, nb_rows + 1), columns=range(1, nb_cols + 1))

In [110]:
# Dealer gain matrix when the dealer follows the stand-on-17 policy.
dealerGain17 = getGainMatrixDealerFromPolicy(standOn17)
# Last expression of the cell: the returned DataFrame is the cell's output.
printGainMatrix(dealerGain17)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
1,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
2,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
3,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
4,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
5,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
6,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
7,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
8,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
9,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0
10,0.4,0.25,0.22,0.18,0.14,0.1,0.43,0.47,0.51,0.54,0.58,0.03,-0.04,-0.11,-0.17,-0.23,1.0,1.0,1.0,1.0,1.0


In [111]:
# Dealer gain matrix when the dealer follows dealerOptimalPolicy (which also looks at the player's score).
dealerOptimalGain = getGainMatrixDealerFromPolicy(dealerOptimalPolicy)
# Last expression of the cell: the returned DataFrame is the cell's output.
printGainMatrix(dealerOptimalGain)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
10,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Avec ces gains on peut maintenant calculer la politique optimale du joueur face à cet adversaire

In [112]:
def getGainMatrixPlayerOptimalPolicy(dealerGainMatrix: np.ndarray) -> np.ndarray:
	"""
	Compute the player's expected gain when the player always picks the better of HIT and STAND.

	Returns a `(max_score, max_card_value)` array. Entry `[sp, sd]` is the expected gain of a
	player holding `sp + 1` against a dealer whose score is `sd + 1`. Standing is worth the
	opposite of `dealerGainMatrix[sp, sd]`; a burst is worth -1.
	"""
	values = np.zeros((max_score, max_card_value))
	# Walk the player scores from the highest down: the outcomes of a hit are looked up in
	# rows that must already be filled in.
	for row in reversed(range(max_score)):
		# The outcomes of a draw depend only on the player's score.
		transitions = probaHitList(row + 1)
		bust_proba = probaBurst(row + 1)
		for col in reversed(range(max_card_value)):
			draw_value = 0
			for next_score, weight in transitions:
				draw_value += weight * values[next_score - 1, col]
			draw_value -= bust_proba

			keep_value = -dealerGainMatrix[row, col]
			values[row, col] = max(draw_value, keep_value)
	return values

In [None]:
# Player gain matrix against a dealer following dealerOptimalPolicy.
playerOptimalGain = getGainMatrixPlayerOptimalPolicy(dealerOptimalGain)
# Last expression of the cell: the returned DataFrame is the cell's output.
printGainMatrix(playerOptimalGain)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
1,-0.12,-0.03,-0.01,0.01,0.03,0.03,-0.01,-0.06,-0.12,-0.2
2,-0.24,-0.15,-0.13,-0.11,-0.09,-0.09,-0.13,-0.18,-0.24,-0.32
3,-0.26,-0.18,-0.16,-0.14,-0.12,-0.12,-0.15,-0.2,-0.26,-0.34
4,-0.28,-0.2,-0.18,-0.16,-0.14,-0.15,-0.18,-0.23,-0.29,-0.36
5,-0.31,-0.23,-0.2,-0.17,-0.15,-0.17,-0.21,-0.25,-0.31,-0.38
6,-0.3,-0.2,-0.17,-0.15,-0.12,-0.13,-0.23,-0.28,-0.33,-0.4
7,-0.26,-0.16,-0.13,-0.1,-0.08,-0.08,-0.13,-0.25,-0.3,-0.37
8,-0.2,-0.1,-0.07,-0.05,-0.02,-0.02,-0.05,-0.13,-0.26,-0.32
9,-0.13,-0.02,0.0,0.02,0.05,0.05,0.02,-0.02,-0.12,-0.26
10,-0.03,0.07,0.09,0.12,0.14,0.14,0.11,0.07,0.02,-0.11


In [114]:
# Player gain matrix against a dealer following the stand-on-17 policy.
playerOptimalGain17 = getGainMatrixPlayerOptimalPolicy(dealerGain17)
# Last expression of the cell: the returned DataFrame is the cell's output.
printGainMatrix(playerOptimalGain17)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
1,-0.07,0.05,0.08,0.1,0.13,0.16,0.07,0.01,-0.07,-0.16
2,-0.19,-0.08,-0.05,-0.02,0.01,0.05,-0.05,-0.12,-0.2,-0.28
3,-0.21,-0.09,-0.06,-0.03,0.0,0.04,-0.08,-0.14,-0.22,-0.3
4,-0.23,-0.11,-0.08,-0.04,-0.01,0.03,-0.11,-0.17,-0.24,-0.32
5,-0.24,-0.12,-0.09,-0.05,-0.02,0.02,-0.14,-0.2,-0.27,-0.34
6,-0.25,-0.13,-0.09,-0.06,-0.03,0.01,-0.17,-0.22,-0.29,-0.37
7,-0.22,-0.1,-0.07,-0.03,0.0,0.03,-0.08,-0.21,-0.27,-0.34
8,-0.14,-0.02,0.01,0.04,0.07,0.1,0.06,-0.07,-0.21,-0.28
9,-0.06,0.07,0.09,0.12,0.15,0.18,0.13,0.08,-0.06,-0.21
10,0.04,0.16,0.18,0.21,0.23,0.26,0.21,0.16,0.09,-0.05


On peut à présent calculer le gain global du joueur

In [120]:
def computePlayerGlobalGain(playerGainMatrix: np.ndarray) -> float:
	"""
	Average the player's gain over every opening deal.

	An opening deal is two player cards and one dealer card, each taken from `card_values`.
	The gain read from `playerGainMatrix` for the resulting scores is weighted by the
	probability of drawing each of the three cards.
	"""
	total = 0
	for first in card_values:
		p_first = probaDraw(first)
		for second in card_values:
			p_second = probaDraw(second)
			# Matrix rows are indexed by score - 1.
			row = first + second - 1
			for upcard in card_values:
				total += playerGainMatrix[row, upcard - 1] * p_first * p_second * probaDraw(upcard)
	return total

In [123]:
# Overall expected gain of the player against a dealer following dealerOptimalPolicy...
gain = computePlayerGlobalGain(playerOptimalGain)
# ...and against a dealer following the stand-on-17 policy.
gain17 = computePlayerGlobalGain(playerOptimalGain17)
# First printed line: dealerOptimalPolicy case; second line: stand-on-17 case.
print(gain)
print(gain17)


-0.1304676807062667
-0.057426856004846094
