In [4]:
import numpy as np
import sys
if "../../" not in sys.path:
  sys.path.append("../../") 
from lib.envs.blackjack import BlackjackEnv
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
from matplotlib import cm

In [5]:
%matplotlib widget

In [50]:
class Model():
    
    def __init__(self):
        
        self.is_training = True
        self.epsilon = .1
        self.env = BlackjackEnv()
        # Set of state
        self.set_of_states = {
            "dealer_showing": (1,10),
            "player_sum": (12,21),
            "usable_ace": (0,1),
        }
        # Value function
        self.V = np.zeros((   
            self.set_of_states["dealer_showing"][1]-self.set_of_states["dealer_showing"][0]+1,
            self.set_of_states["player_sum"][1]-self.set_of_states["player_sum"][0]+1,
            self.set_of_states["usable_ace"][1]-self.set_of_states["usable_ace"][0]+1))
        
        # State-Action function
        self.Q = np.random.random((   
            self.set_of_states["dealer_showing"][1]-self.set_of_states["dealer_showing"][0]+1,
            self.set_of_states["player_sum"][1]-self.set_of_states["player_sum"][0]+1,
            self.set_of_states["usable_ace"][1]-self.set_of_states["usable_ace"][0]+1,
            2
        ))+2
        
        self.N = np.zeros(self.Q.shape)
        
        self.policy = np.random.choice([0,1],(   
            self.set_of_states["dealer_showing"][1]-self.set_of_states["dealer_showing"][0]+1,
            self.set_of_states["player_sum"][1]-self.set_of_states["player_sum"][0]+1,
            self.set_of_states["usable_ace"][1]-self.set_of_states["usable_ace"][0]+1))
        #self.policy[:,:,:] = 1
        #self.policy[:,-2:,:] = 0
        self.gamma = 1
        
        
        self.Returns = []
        
        for i in range(self.Q.shape[0]):
            a = []
            for j in range(self.Q.shape[1]):
                b = []
                for k in range(self.Q.shape[2]):
                    c = []
                    for l in range(self.Q.shape[3]):
                        c.append([])
                    b.append(c)
                a.append(b)
            self.Returns.append(a)
            
        self.Returns_V = []
        
        for i in range(self.V.shape[0]):
            a = []
            for j in range(self.V.shape[1]):
                b = []
                for k in range(self.V.shape[2]):
                    b.append([])
                a.append(b)
            self.Returns_V.append(a)
        
        
    def iterate_policy(self, episodes):
        
        for i in tqdm(range(episodes)):
            
            states, rewards, actions = self.generate_episode()
            states = states[:-1]            
            
            G = 0
            for t in range(0,len(states))[::-1]:
                G = self.gamma * G + rewards[t]
                
                #print(f"G:{G}")
                case_pos = (states[t][0]-self.set_of_states["dealer_showing"][0],states[t][1]-self.set_of_states["player_sum"][0],states[t][2]-self.set_of_states["usable_ace"][0])
                
                self.N[case_pos[0]][case_pos[1]][case_pos[2]][int(actions[t])] += 1
                self.Q[case_pos[0]][case_pos[1]][case_pos[2]][int(actions[t])] += (G-self.Q[case_pos[0]][case_pos[1]][case_pos[2]][int(actions[t])]) /self.N[case_pos[0]][case_pos[1]][case_pos[2]][int(actions[t])]
                self.policy = np.argmax(self.Q,axis=-1)
                
                #self.Returns[case_pos[0]][case_pos[1]][case_pos[2]][int(actions[t])].append(G)
                #self.Q[case_pos[0]][case_pos[1]][case_pos[2]][int(actions[t])] = np.mean(self.Returns[case_pos[0]][case_pos[1]][case_pos[2]][int(actions[t])])
                #self.policy = np.argmax(self.Q,axis=-1)
            
            
    def strategy(self, observation):
        state = [observation[1],observation[0],int(observation[2])]
        case_pos = (state[0]-self.set_of_states["dealer_showing"][0],state[1]-self.set_of_states["player_sum"][0],state[2]-self.set_of_states["usable_ace"][0])
        
        if self.is_training:
            return(np.random.choice([0,1,self.policy[case_pos[0]][case_pos[1]][case_pos[2]]],p=[self.epsilon/2,self.epsilon/2,1-self.epsilon]))
        else:
            return(self.policy[case_pos[0]][case_pos[1]][case_pos[2]])

    def print_observation(self, observation):
        score, dealer_score, usable_ace = observation
        print("Player Score: {} (Usable Ace: {}), Dealer Score: {}".format(
              score, usable_ace, dealer_score))
        
    def aproximateV(self,episodes):
        for i in tqdm(range(episodes)):
            states, rewards, actions = self.generate_episode()
            #print(f"States: {states}")
            #print(f"Rewards: {rewards}")
            #print(f"Actions: {actions}")
            states = states[:-1]
            G = 0
            for t in range(0,len(states))[::-1]:
                
                G = self.gamma * G + rewards[t]
                #print(states[t])
                case_pos = (states[t][0]-self.set_of_states["dealer_showing"][0],states[t][1]-self.set_of_states["player_sum"][0],states[t][2]-self.set_of_states["usable_ace"][0])
                
                #self.NV[case_pos[0]][case_pos[1]][case_pos[2]] += 1
                #self.V[case_pos[0]][case_pos[1]][case_pos[2]] += (G-self.V[case_pos[0]][case_pos[1]][case_pos[2]]) /self.NV[case_pos[0]][case_pos[1]][case_pos[2]]
                self.Returns_V[states[t][0]-self.set_of_states["dealer_showing"][0]][states[t][1]-self.set_of_states["player_sum"][0]][states[t][2]-self.set_of_states["usable_ace"][0]].append(G)
                self.V[states[t][0]-self.set_of_states["dealer_showing"][0]][states[t][1]-self.set_of_states["player_sum"][0]][states[t][2]-self.set_of_states["usable_ace"][0]] = np.mean(self.Returns_V[states[t][0]-self.set_of_states["dealer_showing"][0]][states[t][1]-self.set_of_states["player_sum"][0]][states[t][2]-self.set_of_states["usable_ace"][0]])
            
            
            
    def generate_episode(self):
        observation = self.env.reset()
        #self.print_observation(observation)
        states, rewards, actions = [], [], []
        states.append([observation[1],observation[0],int(observation[2])])
        
        for t in range(100):
            
            action = self.strategy(observation)
            actions.append(action)
            #print(f"action: {action}")
            
            observation, reward, done, _ = self.env.step(action)
            rewards.append(reward)
            states.append([observation[1],observation[0],int(observation[2])])
            
            #self.print_observation(observation)
            if done:
                
                #print("Game end. Reward: {}\n".format(float(reward)))
                break
        return states, rewards, actions

In [51]:
#0 stay
#1 hit

In [None]:
m = Model()
m.iterate_policy(500000)

  0%|          | 0/500000 [00:00<?, ?it/s]

In [48]:

m.aproximateV(100000)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [49]:
fig = plt.figure(figsize=plt.figaspect(0.5))
#fig.set_size_inches(10, 10)

# Make data.
X = np.arange(12,22)
Y = np.arange(1,11)
X, Y = np.meshgrid(X, Y)

ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_surface(X, Y, m.V[:,:,0], linewidth=0, antialiased=False,cmap=cm.plasma)

ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.plot_surface(X, Y, m.V[:,:,1], linewidth=0, antialiased=False,cmap=cm.plasma)


plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [46]:
fig = plt.figure(figsize=plt.figaspect(0.5))
#fig.set_size_inches(10, 10)

# Make data.
X = np.arange(12,22)
Y = np.arange(1,11)
X, Y = np.meshgrid(X, Y)

ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_surface(X, Y, m.policy[:,:,0], linewidth=0, antialiased=False,cmap=cm.plasma)

ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.plot_surface(X, Y, m.policy[:,:,1], linewidth=0, antialiased=False,cmap=cm.plasma)


plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [47]:
fig = plt.figure(figsize=plt.figaspect(0.5))
ax = fig.add_subplot(1, 2, 1)
ax.imshow(m.policy[:,:,0],cmap="gray")
ax = fig.add_subplot(1, 2, 2)
ax.imshow(m.policy[:,:,1],cmap="gray")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …