# Blackjack MDP

In [1]:
import numpy as np


In [2]:
class BlackJack:
    """Simplified blackjack environment plus a tabular, epsilon-greedy learner.

    State is the tuple ``(player_value, show_card, usable_ace)`` where
    ``show_card`` is the dealer's first card.  Actions are ``0`` (stick) and
    ``1`` (twist, i.e. draw another card).  Q-values are kept only for player
    totals 12..21; below 12 the player always twists.
    """

    def __init__(self, alpha=0.1, epsilon=0.3):
        """Create the agent.

        Args:
            alpha: step size used when moving a Q-value towards its target.
            epsilon: probability of picking a random action while choosing.
        """
        self.alpha = alpha
        self.epsilon = epsilon
        self.done = False                   # True once the player's turn is over
        self.available_actions = [0, 1]     # 0 = stick, 1 = twist
        self.player_s_a = []                # [state, action] pairs seen this episode
        self.state = (0, 0, False)          # (player_value, show_card, usable_ace)
        self.player_Q = {}                  # state -> {action: Q-value}
        for player_value in range(12, 22):
            for show_card in range(1, 11):
                for usable_ace in [True, False]:
                    # Action 1 is inserted first so that ties in the greedy
                    # choice resolve to "twist".  Sticking on 21 starts at 1,
                    # every other entry starts at 0.
                    self.player_Q[(player_value, show_card, usable_ace)] = {
                        1: 0,
                        0: 1 if player_value == 21 else 0,
                    }

    @staticmethod
    def dealCard():
        """Draw one card value; 10 is listed four times (10, J, Q, K), ace is 1."""
        card = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10])
        return card

    def dealHand(self, show=False):
        """Deal a two-card hand.

        An ace in the hand is counted as 11 (only one of them when both cards
        are aces) and flagged as usable.

        Returns:
            ``(value, usable_ace)``, or ``(value, usable_ace, first_card)``
            when ``show`` is true.
        """
        hand = [self.dealCard(), self.dealCard()]
        usable_ace = 1 in hand
        value = sum(hand) + 10 if usable_ace else sum(hand)

        if show:
            return value, usable_ace, hand[0]
        return value, usable_ace

    def nextState(self, action):
        """Apply ``action`` to the current state and return the resulting state.

        Sets ``self.done`` when the player sticks or goes bust.  ``self.state``
        itself is not modified; the caller assigns the returned tuple.
        """
        value, show_card, usable_ace = self.state

        if action != 1:                     # stick: the player's turn ends
            self.done = True
            return (value, show_card, usable_ace)

        card = self.dealCard()              # twist
        if card == 1:
            if value < 11:
                value += 11                 # ace counted high
                usable_ace = True
            else:
                value += 1
        else:
            value += card

        if value > 21:
            if usable_ace:
                value -= 10                 # demote the ace from 11 to 1
                usable_ace = False
            else:
                self.done = True            # bust

        return (value, show_card, usable_ace)

    def chooseAction(self):
        """Pick an action for the current state.

        Always twists on 11 or less.  Otherwise explores with probability
        ``epsilon`` and acts greedily on ``player_Q`` the rest of the time.
        """
        if self.state[0] <= 11:
            return 1

        if np.random.uniform(0, 1) <= self.epsilon:     # exploratory move
            return np.random.choice(self.available_actions)

        # Greedy move; max() keeps the first best action in insertion order.
        q_values = self.player_Q[self.state]
        return max(q_values, key=q_values.get)

    def dealer(self, d_value, usable_ace, done):
        """Advance the dealer's hand by one step.

        The dealer stops on 17 or more, or when bust.  ``done`` is accepted
        for interface compatibility but is not read here.

        Returns:
            ``(d_value, usable_ace, finished)``.
        """
        if d_value > 21:
            if usable_ace:
                d_value -= 10               # demote the ace from 11 to 1
                usable_ace = False
            else:
                return d_value, usable_ace, True
        if d_value >= 17:
            return d_value, usable_ace, True

        card = self.dealCard()
        if card == 1:
            if d_value < 11:
                return d_value + 11, True, False
            return d_value + 1, usable_ace, False
        return d_value + card, usable_ace, False

    def identifyWinner(self, player_value, dealer_value):
        """Return the outcome from the player's side: 1 win, 0 draw, -1 loss."""
        if player_value > 21:
            # NOTE(review): a double bust is scored as a draw here; under the
            # usual casino rule a bust player loses regardless -- confirm intent.
            if dealer_value > 21:
                return 0                    # both bust: draw
            return -1                       # player bust: dealer wins

        if dealer_value > 21:
            return 1                        # dealer bust: player wins
        if player_value > dealer_value:
            return 1                        # player wins
        if dealer_value > player_value:
            return -1                       # dealer wins
        return 0                            # draw

    def allocateReward(self, player_value, dealer_value):
        """Back up the episode outcome through the recorded state-action pairs.

        Pairs are visited last to first.  Each Q-value moves a fraction
        ``alpha`` towards the current target; the unrounded result becomes the
        target for the preceding pair, and the stored value is rounded to two
        decimals.
        """
        reward = self.identifyWinner(player_value, dealer_value)

        for state, action in reversed(self.player_s_a):
            current = self.player_Q[state][action]
            reward = current + self.alpha * (reward - current)
            # NOTE(review): rounding drops any update smaller than 0.005.
            self.player_Q[state][action] = round(reward, 2)

    def reset(self):
        """Clear per-episode data so a new hand can be dealt."""
        self.player_s_a = []
        self.state = (0, 0, False)
        self.done = False

    def _dealInitialHands(self):
        """Deal player then dealer, set ``self.state``; return (p_value, d_value, d_usable_ace)."""
        p_value, p_usable_ace = self.dealHand(show=False)
        d_value, d_usable_ace, show_card = self.dealHand(show=True)
        self.state = (p_value, show_card, p_usable_ace)
        return p_value, d_value, d_usable_ace

    def _playerTurn(self, record):
        """Play the player's hand until stick or bust, optionally recording (state, action)."""
        while True:
            action = self.chooseAction()
            # Gate on the CURRENT total: Q-values exist only for 12..21, and a
            # hand dealt below 12 must start being recorded once it gets there.
            if record and self.state[0] >= 12:
                self.player_s_a.append([self.state, action])
            self.state = self.nextState(action)
            if self.done:
                break

    def _dealerTurn(self, d_value, d_usable_ace):
        """Step the dealer until the hand is finished and return the final total."""
        hand_finished = False
        while not hand_finished:
            d_value, d_usable_ace, hand_finished = self.dealer(
                d_value, d_usable_ace, hand_finished)
        return d_value

    def trainModel(self, episodes=1000):
        """Play ``episodes`` hands and update ``player_Q`` after each one.

        Hands where either side is dealt 21 are skipped without learning.
        Progress is printed every 1000 episodes.
        """
        for m in range(episodes):
            if m % 1000 == 0:
                print("Episode:", m)

            p_value, d_value, d_usable_ace = self._dealInitialHands()

            if p_value != 21 and d_value != 21:
                self._playerTurn(record=True)
                d_value = self._dealerTurn(d_value, d_usable_ace)
                self.allocateReward(self.state[0], d_value)

            self.reset()

    def play(self, games=1000):
        """Play ``games`` hands greedily, without learning.

        Returns:
            A length-3 numpy array of counts: player wins, draws, losses.
        """
        self.reset()
        saved_epsilon = self.epsilon
        self.epsilon = 0                    # evaluate the greedy policy only
        result = np.zeros(3)                # player win, draw, lose

        try:
            for game in range(games):
                p_value, d_value, d_usable_ace = self._dealInitialHands()

                if p_value == 21 or d_value == 21:
                    # A dealt 21 settles the hand immediately.
                    if p_value == d_value:
                        result[1] += 1
                    elif p_value > d_value:
                        result[0] += 1
                    else:
                        result[2] += 1
                else:
                    self._playerTurn(record=False)
                    d_value = self._dealerTurn(d_value, d_usable_ace)

                    winner = self.identifyWinner(self.state[0], d_value)
                    if winner == 1:
                        result[0] += 1
                    elif winner == 0:
                        result[1] += 1
                    else:
                        result[2] += 1

                self.reset()
        finally:
            # Restore exploration so later training is not silently greedy.
            self.epsilon = saved_epsilon

        return result
        

In [3]:
if __name__ == "__main__":
    # Learn Q-values over half a million self-played hands.
    env = BlackJack()
    env.trainModel(episodes=500000)
    print("Finished training")

    # Evaluate the learned policy; result holds [wins, draws, losses].
    result = env.play(games=2000)
    print(result)

Episode: 0
Episode: 1000
Episode: 2000
Episode: 3000
Episode: 4000
Episode: 5000
Episode: 6000
Episode: 7000
Episode: 8000
Episode: 9000
Episode: 10000
Episode: 11000
Episode: 12000
Episode: 13000
Episode: 14000
Episode: 15000
Episode: 16000
Episode: 17000
Episode: 18000
Episode: 19000
Episode: 20000
Episode: 21000
Episode: 22000
Episode: 23000
Episode: 24000
Episode: 25000
Episode: 26000
Episode: 27000
Episode: 28000
Episode: 29000
Episode: 30000
Episode: 31000
Episode: 32000
Episode: 33000
Episode: 34000
Episode: 35000
Episode: 36000
Episode: 37000
Episode: 38000
Episode: 39000
Episode: 40000
Episode: 41000
Episode: 42000
Episode: 43000
Episode: 44000
Episode: 45000
Episode: 46000
Episode: 47000
Episode: 48000
Episode: 49000
Episode: 50000
Episode: 51000
Episode: 52000
Episode: 53000
Episode: 54000
Episode: 55000
Episode: 56000
Episode: 57000
Episode: 58000
Episode: 59000
Episode: 60000
Episode: 61000
Episode: 62000
Episode: 63000
Episode: 64000
Episode: 65000
Episode: 66000
Episode: