In [65]:
import numpy as np 
import matplotlib.pyplot as plt
from tqdm import tqdm 
from matplotlib import cm

In [56]:
%matplotlib widget

In [7]:
list(range(0,4)[::-1])

[3, 2, 1, 0]

In [57]:
class Model():
    def __init__(self):
        self.cards = [1,2,3,4,5,6,7,8,9,10,10,10,10]
        self.set_of_states = {
            "dealer_showing": (1,10),
            "player_sum": (12,21),
            "usable_ace": (0,1),
        }
        self.V = np.zeros((   
            self.set_of_states["dealer_showing"][1]-self.set_of_states["dealer_showing"][0]+1,
            self.set_of_states["player_sum"][1]-self.set_of_states["player_sum"][0]+1,
            self.set_of_states["usable_ace"][1]-self.set_of_states["usable_ace"][0]+1))
        
        self.policy = np.zeros((   
            self.set_of_states["dealer_showing"][1]-self.set_of_states["dealer_showing"][0]+1,
            self.set_of_states["player_sum"][1]-self.set_of_states["player_sum"][0]+1,
            self.set_of_states["usable_ace"][1]-self.set_of_states["usable_ace"][0]+1))
        
        self.policy[:,-2:,:] = 1
        self.Returns = []
        
        self.gamma = .5
        
        
        for i in self.V:
            a = []
            for j in self.V:
                b = []
                for k in self.V:
                    b.append([])
                a.append(b)
            self.Returns.append(a)
        self.R = self.V.copy()
        
        
        
    
    def aproximateV(self,episodes):
        for i in tqdm(range(episodes)):
            states, rewards, actions = self.generate_episode()
            G = 0
            for t in range(0,len(states))[::-1]:
                G = self.gamma * G + rewards[t]
                #print(states[t])
                self.Returns[states[t][0]-self.set_of_states["dealer_showing"][0]][states[t][1]-self.set_of_states["player_sum"][0]][states[t][2]-self.set_of_states["usable_ace"][0]].append(G)
                self.V[states[t][0]-self.set_of_states["dealer_showing"][0]][states[t][1]-self.set_of_states["player_sum"][0]][states[t][2]-self.set_of_states["usable_ace"][0]] = np.mean(self.Returns[states[t][0]-self.set_of_states["dealer_showing"][0]][states[t][1]-self.set_of_states["player_sum"][0]][states[t][2]-self.set_of_states["usable_ace"][0]])
            
            
    def generate_episode(self):
        
        dfc = np.random.choice(self.cards)
        dealer_sum = dfc + np.random.choice(self.cards)
        state = (dfc, 11 + np.random.choice(self.cards), 1)
        states = []
        rewards = []
        actions = []
        
        #print(f"dealer_sum:{dealer_sum}")
        #print(f"player_sum:{state[1]}")
        while True:
            states.append(state)
            actions.append(self.policy[state[0]-self.set_of_states["dealer_showing"][0],state[1]-self.set_of_states["player_sum"][0],state[2]-self.set_of_states["usable_ace"][0]])
            state,r,dealer_sum = self.transition_model(state, dealer_sum)
            rewards.append(r)
            
            #print(f"dealer_sum:{dealer_sum}")
            #print(f"state:{state}")
            #print(f"reward:{r}")
            
            if state[0] == -1:
                break
        #print(f"states:{states}")
        #print(f"rewards:{rewards}")
        #print(f"actions:{actions}")
        return states, rewards, actions
                
                
    def transition_model(self, state, dealer_sum):
        
        #Player move
        reward = 0
        #print(f'state_index:{(state[0]-self.set_of_states["dealer_showing"][0],state[1]-self.set_of_states["player_sum"][0],state[2]-self.set_of_states["usable_ace"][0])}')
        if self.policy[state[0]-self.set_of_states["dealer_showing"][0],state[1]-self.set_of_states["player_sum"][0],state[2]-self.set_of_states["usable_ace"][0]] == 0: #take   
            #print("Player hits")
            player_sum = state[1] + np.random.choice(self.cards)
            #print(f"player_sum:{player_sum}")
            if player_sum > 21 and state[2] == 1:
                player_sum -= 10
                next_state = (state[0],player_sum,0)
            elif  player_sum <= 21:
                next_state = (state[0],player_sum,state[2])
            else:
                #player loses
                next_state = (-1,player_sum,-1)
                reward = -1
                #print("Player burst")
        else:
            #print("Player stiks")
            player_sum = state[1]
            next_state = state
            
        # Dealer move
        if dealer_sum < 17:
            #print("Dealer hits")
            dealer_sum += np.random.choice(self.cards)
            #print(f"dealer toma:{dealer_sum}")
            if dealer_sum > 21:
                next_state = (-1,player_sum,-1)
                reward = 1
                
                #print("Dialer burst")
        
        if  self.policy[state[0]-self.set_of_states["dealer_showing"][0],state[1]-self.set_of_states["player_sum"][0],state[2]-self.set_of_states["usable_ace"][0]] == 1 and dealer_sum <= 21: #End game
            if dealer_sum > player_sum:
                reward = -1
            elif dealer_sum == player_sum:
                reward = 0
            else:
                reward = 1
            next_state = (-1,player_sum,-1)
            #print("No more moves")
        return(next_state, reward, dealer_sum)
    
m = Model()

In [61]:
m.aproximateV(500000)

100%|██████████| 500000/500000 [10:25<00:00, 799.04it/s] 


In [63]:
m1 = Model()
m1.aproximateV(10000)

100%|██████████| 10000/10000 [00:01<00:00, 9270.04it/s]


In [73]:
fig = plt.figure(figsize=plt.figaspect(0.5))
fig.set_size_inches(10, 10)
ax = fig.add_subplot(2, 2, 1, projection='3d')

# Make data.
X = np.arange(12,22)
Y = np.arange(1,11)
X, Y = np.meshgrid(X, Y)

ax.plot_surface(X, Y, m.V[:,:,0], linewidth=0, antialiased=False,cmap=cm.plasma)

ax = fig.add_subplot(2, 2, 2, projection='3d')

ax.plot_surface(X, Y, m.V[:,:,1], linewidth=0, antialiased=False,cmap=cm.plasma)

ax = fig.add_subplot(2, 2, 3, projection='3d')

ax.plot_surface(X, Y, m1.V[:,:,0], linewidth=0, antialiased=False, cmap=cm.coolwarm)

ax = fig.add_subplot(2, 2, 4, projection='3d')

ax.plot_surface(X, Y, m1.V[:,:,1], linewidth=0, antialiased=False, cmap=cm.coolwarm)

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …