In [0]:
import numpy as np

# for plotting
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

# visualize plots in the jupyter notebook
# check more https://goo.gl/U3Ai8R
%matplotlib inline

# for generating random values
import random

# for representing things like card value or colors
from enum import Enum  

# for copying python objects
import copy

In [0]:
def plot_value_function(agent_type, value_function, method, title='Value Function', usables=None, generate_gif=False, train_steps=None, save=None, transpose=True, agent=None):
    """
    Plots a slice of a value function as a surface plot, like in: https://goo.gl/aF2doj

    By default (generate_gif=False) a single surface is drawn and either shown
    or written to `save`. With generate_gif=True the agent is trained between
    frames and the frames are saved as '<full title>.gif'.

    Args:
        agent_type (string): Label prepended to the plot title.
        value_function: Array indexed as
            value_function[:, :, usables[0], usables[1], usables[2]]; the
            remaining 2-D slice is what gets plotted.
        method (string): Label appended to the plot title.
        title (string): Base plot title.
        usables: Three indices selecting the slice to plot. Defaults to [0, 0, 0].
        generate_gif (boolean): If want to save plots as a gif.
        train_steps: Shown in the title. If it is not None and
            generate_gif = True, it is also the number of steps to train the
            agent at each frame.
        save: Path handed to plt.savefig for the static plot. When None the
            figure is shown instead. Ignored when generate_gif is True.
        transpose (boolean): Transpose the 2-D slice before plotting.
        agent: Object exposing train(steps), iterations and wins. Only needed
            when generate_gif is True. When omitted, a notebook-level variable
            named `agent` is used if one exists, which is what earlier versions
            of this function relied on implicitly.

    Raises:
        ValueError: If generate_gif is True and no agent is available.
    """
    if usables is None:
        usables = [0, 0, 0]

    if generate_gif and agent is None:
        # Backward compatibility: the animation code used to read a global
        # called `agent` without ever receiving it as an argument.
        agent = globals().get('agent')
        if agent is None:
            raise ValueError("generate_gif=True requires an agent; pass agent=...")

    title += ' (' + str(train_steps) + ' Episodes, Usables ' + str(usables) + ', ' + method + ')'
    title = agent_type + ' ' + title
    # you can change figsize to change the size of the graph
    fig = plt.figure(title, figsize=(10, 5))

    # explanation about this line: https://goo.gl/LH5E7i
    ax = fig.add_subplot(111, projection='3d')

    V = value_function[:, :, usables[0], usables[1], usables[2]]
    if transpose:
        V = np.transpose(V)

    def plot_frame(ax):
        """Draw the surface for V on `ax` and return the surface artist."""
        # Both axes start at index 1, so row 0 and column 0 of V are not drawn.
        x_range = np.arange(1, V.shape[0])
        y_range = np.arange(1, V.shape[1])

        # creates a grid representation of x_range and y_range
        X, Y = np.meshgrid(x_range, y_range)

        # value of every (x, y) grid point via integer-array indexing
        Z = V[X, Y]

        # check documentation for details: https://goo.gl/etEhPP
        ax.set_xlabel('Dealer Showing')
        ax.set_ylabel('Player Sum')
        ax.set_zlabel('Value')
        return ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm,
                               linewidth=0, antialiased=False)

    if not generate_gif:
        # A static plot needs exactly one frame, so draw it directly instead of
        # relying on an animation object that is never saved.
        plot_frame(ax)
        ax.set_title(title)
        if save is None:
            plt.show()
            plt.close()
        else:
            plt.savefig(save)
            plt.close('all')
        return

    print('gif will be saved as %s' % title)

    def animate(frame):
        """Redraw the surface, then train the agent before the next frame."""
        ax.clear()
        surf = plot_frame(ax)

        # cool math to increase number of steps as we go
        if train_steps is None:
            step_size = int(min(max(1, agent.iterations), 2 ** 16))
        else:
            step_size = train_steps

        # NOTE(review): later frames only change if agent.train updates
        # value_function in place (V is derived from it by slicing) - confirm.
        agent.train(step_size)
        ax.set_title('%s MC score: %s frame: %s' % (title, float(agent.wins)/agent.iterations*100, frame))

        fig.canvas.draw()
        return surf

    ani = animation.FuncAnimation(fig, animate, 32, repeat=False)

    # requires gif writer
    ani.save(title + '.gif', writer='imagemagick', fps=3)

In [0]:
class Color(Enum):
    """Color of a card."""
    RED = 0
    BLACK = 1


class Card(object):
    """A playing card with an integer value and a Color."""

    def __init__(self, color=None, value=None):
        """Create a card, drawing at random whatever is not specified.

        Args:
            color: Color.RED or Color.BLACK to force the color; any other
                value (including None) draws a random color.
            value: Card value to use; None draws a random value.
        """
        # The value is resolved before the color so that the order of calls
        # into `random` is the same as before `value` could be supplied.
        if value is None:
            self.value = self._get_random_value()
        else:
            self.value = value

        if isinstance(color, Color):
            self.color = color
        else:
            self.color = self._get_random_color()

    def _get_random_value(self):
        """Generates integers between 1 and 10."""
        return random.randint(1, 10)

    def _get_random_color(self):
        """Generates random colors.

        Color.RED with 1/3 and Color.BLACK with 2/3 probability.
        """
        random_number = random.random()
        if random_number <= 1/3.0:
            return Color.RED
        else:
            return Color.BLACK


class Deck(object):
    """Source of cards; every call builds a new, independent Card."""

    def sample_card(self, color=None, value=None):
        """Return a new Card, optionally forcing its color and/or value."""
        return Card(color, value)

In [0]:
class Player(object):
    """Base class for players; subclasses supply the decision rule."""

    def policy(self, state):
        """Return the action to take in `state`.

        Raises:
            NotImplementedError: Always, until a subclass overrides it.
        """
        # NotImplemented is a comparison sentinel and is not callable;
        # NotImplementedError is the exception meant for abstract methods.
        raise NotImplementedError()

class Dealer(object):
    """Dealer that follows a fixed threshold rule."""

    def policy(self, state):
        """Pick the dealer's action from `state.dealer_sum`.

        Returns Action.STICK when the dealer sum is 25 or more and
        Action.HIT otherwise.
        """
        return Action.STICK if state.dealer_sum >= 25 else Action.HIT

In [0]:
class State(object):
    """Snapshot of a game: both sums, the dealer's shown card and usable flags."""

    def __init__(self, agent_sum=0, show_card=0, agent_usables=None, dealer_sum=0, dealer_usables=None, is_terminal=False):
        """Store the given fields on the instance.

        Args:
            agent_sum: Current sum of the agent.
            show_card: Value of the dealer card that is shown.
            agent_usables: Three status flags for the agent, one per special
                card value. Defaults to [0, 0, 0].
            dealer_sum: Current sum of the dealer.
            dealer_usables: Same as agent_usables, for the dealer.
            is_terminal: Whether the game is over.
        """
        self.show_card = show_card
        self.agent_sum = agent_sum
        self.dealer_sum = dealer_sum
        # Each state gets its own lists. A mutable default (or a list passed in
        # by the caller) would otherwise be shared between states, and the
        # flags are modified in place by item assignment.
        self.agent_usables = [0, 0, 0] if agent_usables is None else list(agent_usables)
        self.dealer_usables = [0, 0, 0] if dealer_usables is None else list(dealer_usables)
        self.is_terminal = is_terminal

    def get_state(self):
        """Return (agent_sum, show_card, usable_1, usable_2, usable_3) for the agent."""
        usables = self.agent_usables
        return (self.agent_sum, self.show_card, usables[0], usables[1], usables[2])

    
class Action(Enum):
    """Moves that a policy can return."""
    # HIT: draw another card.
    HIT = 0
    # STICK: stop drawing; in Environment.step this starts the dealer's turn.
    STICK = 1

In [0]:
class Environment(object):
    """Card game environment in which an agent plays against a fixed dealer.

    Black cards add their value to a sum and red cards subtract it. Black
    cards of value 1, 2 or 3 are "special": the first time one is drawn it is
    counted with a +10 bonus if that does not bust the hand. A hand is bust
    when its sum is below 0 or above `agent_max_value`.
    """

    def __init__(self):
        self.deck = Deck()
        self.dealer = Dealer()
        self.agent_max_value = 31  # largest sum that is not a bust
        self.dealer_max_value = 10  # max value the dealer can get when taking the first card
        self.usable_1_values = 3  # can take states 0(not-present), 1(usable) or 2(non-usable) for card-value 1
        self.usable_2_values = 3  # can take states 0(not-present), 1(usable) or 2(non-usable) for card-value 2
        self.usable_3_values = 3  # can take states 0(not-present), 1(usable) or 2(non-usable) for card-value 3
        self.actions_count = 2  # number of possible actions in each state

    def take_card(self, color=None, value=None):
        """Draw a card and return its signed value (black positive, red negative)."""
        card = self.deck.sample_card(color, value)
        if card.color == Color.BLACK:
            return card.value
        return -card.value

    def check_bust(self, sum):
        """Return True when `sum` is below 0 or above `agent_max_value`.

        The previous expression, `(sum > 0) or (sum < 31)`, was true for every
        number, so every hand was reported as bust.
        """
        return sum < 0 or sum > self.agent_max_value

    def _apply_special_card(self, usables, current_sum, card_value):
        """Update `usables` in place for a drawn card and return the value to add.

        Cards other than 1, 2 and 3 are returned unchanged. For a special card
        seen for the first time (status 0) the +10 bonus is applied and the
        status becomes 1, unless the bonus would bust the hand. In every other
        case the status becomes 2 and the plain value is returned.
        """
        if card_value not in (1, 2, 3):
            return card_value

        idx = card_value - 1
        boosted_sum = current_sum + card_value + 10
        if usables[idx] == 0 and not self.check_bust(boosted_sum):
            usables[idx] = 1
            return card_value + 10

        usables[idx] = 2
        return card_value

    def update_agent(self, s, card_value):
        """Apply the special-card rule to the agent's hand.

        Modifies `s.agent_usables` in place; `s.agent_sum` is left for the
        caller to update.

        Returns:
            (s, value): the same state object and the value to add to the sum.
        """
        card_value = self._apply_special_card(s.agent_usables, s.agent_sum, card_value)
        return s, card_value

    def update_dealer(self, s, card_value):
        """Apply the special-card rule to the dealer's hand.

        Modifies `s.dealer_usables` in place; `s.dealer_sum` is left for the
        caller to update.

        Returns:
            (s, value): the same state object and the value to add to the sum.
        """
        card_value = self._apply_special_card(s.dealer_usables, s.dealer_sum, card_value)
        return s, card_value

    def initial_state(self):
        """Deal one card to the agent and one to the dealer.

        Returns:
            A new State, already terminal if either first card busts its hand.
        """
        agent_pick = self.take_card()
        dealer_pick = self.take_card()
        s = State(agent_usables=[0, 0, 0], dealer_usables=[0, 0, 0])
        s, agent_pick = self.update_agent(s, agent_pick)
        s, dealer_pick = self.update_dealer(s, dealer_pick)
        s.agent_sum += agent_pick
        s.dealer_sum += dealer_pick
        s.show_card = dealer_pick
        s.is_terminal = self.check_bust(s.agent_sum) or self.check_bust(s.dealer_sum)
        return s

    def dealer_turn(self, s):
        """Let the dealer draw until its policy sticks or it goes bust.

        `s` is modified in place and returned; `is_terminal` is True afterwards
        only if the dealer went bust.
        """
        action = None
        while not s.is_terminal and action != Action.STICK:
            action = self.dealer.policy(s)
            if action == Action.HIT:
                card = self.take_card()
                s, dealer_value = self.update_dealer(s, card)
                s.dealer_sum += dealer_value
            s.is_terminal = self.check_bust(s.dealer_sum)
        return s

    def get_reward_bust(self, state):
        """Compare final sums: 1 if the agent is higher, 0 if equal, -1 otherwise."""
        if state.agent_sum > state.dealer_sum:
            return 1
        elif state.agent_sum == state.dealer_sum:
            return 0
        return -1

    def step(self, s, a):
        """Apply action `a` in state `s`.

        Args:
            s: Current State. It is not modified.
            a: Action.STICK to end the agent's turn; any other value draws a card.

        Returns:
            (next_s, r): the resulting state and the reward. The reward is 1
            if the dealer busts, -1 if the agent busts, the sum comparison
            when both stand, and 0 while the game continues.
        """
        if s.is_terminal:
            print('terminal state')

        # Deep copy: a shallow copy would share the usables lists with `s`,
        # so updating the copy would silently change the caller's state.
        next_s = copy.deepcopy(s)
        r = 0

        if a == Action.STICK:
            # if the player sticks then it's dealer turn
            next_s = self.dealer_turn(next_s)
            if next_s.is_terminal:
                # dealer went bust
                r = 1
            else:
                next_s.is_terminal = True
                r = self.get_reward_bust(next_s)
        else:
            agent_card = self.take_card()
            next_s, agent_card_value = self.update_agent(next_s, agent_card)
            next_s.agent_sum += agent_card_value
            next_s.is_terminal = self.check_bust(next_s.agent_sum)
            if next_s.is_terminal:
                r = -1

        return next_s, r
