# Blackjack Implementation

Assumptions:
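- Cards are drawn from an infinite deck (i.e. with replacement), so counting cards gives no advantage
- Usable ace vs. non-usable ace: an ace is "usable" when it can count as 11 without busting the hand; otherwise it counts as 1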

In [41]:
import numpy as np

class Player():
    """A blackjack hand drawn from an infinite deck.

    Cards are stored by value in ``self.cards``: an ace is stored as 1 and
    the value 10 is drawn with probability 4/13 (every other value 1/13).
    """

    # Possible card values and the probability of drawing each one.
    CARD_VALUES = np.arange(1, 11)
    CARD_PROBS = np.array([1 / 13] * 9 + [4 / 13])

    def __init__(self):
        # Start from an empty hand and deal the two opening cards at random.
        self.cards = np.array([], dtype="int")
        self.hit(2)

    def value(self):
        """Return ``(total, usable_ace)`` for the current hand.

        ``usable_ace`` is True when the hand holds an ace that can be counted
        as 11 without the total exceeding 21; ``total`` then includes the
        extra 10 points. Otherwise every ace counts as 1.
        """
        hand_sum = int(np.sum(self.cards))

        # Only one ace can ever count as 11 (two would already be 22), so the
        # ace is usable exactly when adding 10 keeps the hand at 21 or below.
        if 1 in self.cards and hand_sum + 10 <= 21:
            return (hand_sum + 10, True)

        return (hand_sum, False)

    def hit(self, n=1):
        """Draw ``n`` random cards and append them to the hand."""
        drawn = np.random.choice(self.CARD_VALUES, size=n, p=self.CARD_PROBS)
        self.cards = np.append(self.cards, drawn)

    def bust(self):
        """Return True when the hand total exceeds 21."""
        # value() returns a (total, usable_ace) tuple; compare the total only.
        return self.value()[0] > 21

class Blackjack():
    """One round of blackjack between a model-driven player and a dealer.

    ``model`` must provide ``hit(state, dealer_card)`` returning a truthy
    value when the player should draw another card. ``state`` is the tuple
    returned by the player's ``value()`` and ``dealer_card`` is the first
    card of the dealer's hand.
    """

    def __init__(self, model):
        self.player = Player()
        self.dealer = Player()
        self.model = model

    @staticmethod
    def _total(hand):
        """Return the numeric total of ``hand``, dropping the usable-ace flag."""
        # value() returns (total, usable_ace); only the total decides the game.
        return hand.value()[0]

    def reward(self):
        """Return +1 for a player win, 0 for a draw and -1 for a loss."""
        player_total = self._total(self.player)
        if player_total > 21:
            return -1

        dealer_total = self._total(self.dealer)
        if dealer_total > 21:
            return 1

        if player_total > dealer_total:
            return 1
        if player_total < dealer_total:
            return -1
        return 0

    def play(self):
        """Play the round to completion and return its reward."""
        # Player's turn: keep drawing while the model asks for a card and the
        # hand has not gone bust.
        while self.model.hit(self.player.value(), self.dealer.cards[0]):
            self.player.hit()
            if self._total(self.player) > 21:
                break

        # Dealer's turn: draw until reaching 17 or more. A bust total is
        # above 21, so it ends the loop as well.
        while self._total(self.dealer) < 17:
            self.dealer.hit()

        return self.reward()

# Setup

In [None]:
class Policy():
    """Interface for a policy over integer-coded states and actions.

    Both methods raise ``NotImplementedError`` here; subclasses are expected
    to override them.
    """

    def action_prob(self, state: int, action: int) -> float:
        r"""Return \pi(a|s), the probability of taking ``action`` in ``state``.

        input:
            state, action
        return:
            \pi(a|s)
        """
        raise NotImplementedError()

    def action(self, state: int) -> int:
        """Return the action chosen in ``state``.

        input:
            state
        return:
            action
        """
        raise NotImplementedError()

# Monte Carlo

## Variations
- Exploring Starts (for pi)
    - First-Visit MC (for v)
    - Every-Visit MC (for v)
- On-Policy MC Control (for pi)
    - First-Visit MC (for v)
    - Every-Visit MC (for v)
- Off-Policy (for pi) (Importance Sampling)
    - MC Control
    - Incremental Implementation

In [None]:
class MonteCarlo():
    """Tabular Monte Carlo learner; the estimation step is not implemented yet."""

    def __init__(self):
        # Two zero-initialised tables of identical shape.
        # NOTE(review): axes are presumably (player total, usable ace,
        # dealer card, action) — confirm before indexing.
        self.Q = np.zeros((21, 2, 11, 2))
        # NOTE(review): presumably a per-entry accumulator (visit counts or
        # importance-sampling weights) paired with Q — confirm.
        self.C = np.zeros((21, 2, 11, 2))

    def estimate(self):
        """Placeholder for the Monte Carlo estimation routine."""
        # The method had no body, which is a syntax error; fail explicitly
        # until the algorithm is written.
        raise NotImplementedError()

# Temporal Difference

## Sarsa

## Q-Learning

## Expected Sarsa