In [20]:
import gym
import numpy as np
import numpy as np
import scipy.interpolate as interp
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import PyQt5
import random

%matplotlib qt

## TD(lambda) Value Function Iteration

In [21]:
# Prediction
class BlackJack:
    """Tabular backward-view TD(lambda) evaluation of a fixed Blackjack policy.

    State values live in ``self.v`` and eligibility traces in ``self.e``; both
    are dicts keyed by ``"<player sum>_<dealer card>_<usable ace flag>"``.

    Args:
        MAX_EPISODES: Number of episodes played by ``value_iteration``.
    """

    def __init__(self, MAX_EPISODES=500000):
        self.env = gym.make('Blackjack-v0')
        self.MAX_EPISODES = MAX_EPISODES
        self.ALPHA = 0.5   # step size
        self.GAMMA = 0.95  # discount factor
        self.LAMBDA = 0.5  # trace-decay parameter

        # One table entry per combination of the three discrete observation
        # components, enumerated in the same order for both tables.
        obs_space = self.env.observation_space
        keys = [
            f"{p_hand}_{d_card}_{useable_ace}"
            for p_hand in range(obs_space[0].n)
            for d_card in range(obs_space[1].n)
            for useable_ace in range(obs_space[2].n)
        ]
        self.v = dict.fromkeys(keys, 0)  # state values, initiated to 0
        self.e = dict.fromkeys(keys, 0)  # eligibility traces, initiated to 0

    @staticmethod
    def _state_key(state):
        """Return the table key for an observation ``(player sum, dealer card, ace flag)``."""
        return f"{state[0]}_{state[1]}_{int(state[2])}"

    def action(self, player_hand):
        """Sample an action from the fixed policy being evaluated.

        Action 0 is chosen with probability 0.8 when ``player_hand > 18`` and
        with probability 0.2 otherwise; action 1 gets the remaining mass.
        (Presumably 0 = stick and 1 = hit in gym's Blackjack -- confirm.)

        NB: What about the usable ace? The policy currently ignores it.
        """
        probs = [0.8, 0.2] if player_hand > 18 else [0.2, 0.8]
        action = np.random.choice(np.arange(2), p=probs)
        return action

    def run_episode(self):
        """Play one episode and apply backward-view TD(lambda) updates.

        After every step the TD error is distributed over *all* states with a
        non-zero eligibility trace, not just the current one. Traces are
        cleared at the start of the episode, and the value of the successor is
        taken to be 0 once the episode has terminated.
        """
        # Traces must not carry credit over from a previous, unrelated episode.
        self.e.update(dict.fromkeys(self.e, 0))
        # Only states visited in this episode can have a non-zero trace, so
        # decaying / updating just those is equivalent to sweeping the table.
        active = set()
        decay = self.LAMBDA * self.GAMMA

        state = self.env.reset()
        done = False
        while not done:
            s_key = self._state_key(state)

            # Choose action and take a step
            a = self.action(state[0])
            new_state, r, done, _ = self.env.step(a)

            # Update eligibilities: decay every live trace, bump the current state.
            for key in active:
                self.e[key] *= decay
            self.e[s_key] += 1
            active.add(s_key)

            # A terminal successor is worth 0. The environment may report a
            # terminal observation that equals a regular state (the original
            # code then bootstrapped from that state's own estimate).
            bootstrap = 0.0 if done else self.GAMMA * self.v[self._state_key(new_state)]
            td_error = r + bootstrap - self.v[s_key]

            # Move every eligible state towards the target in proportion to its trace.
            step = self.ALPHA * td_error
            for key in active:
                self.v[key] += step * self.e[key]

            state = new_state

    def value_iteration(self):
        """Run ``MAX_EPISODES`` episodes, printing progress every 10000 episodes."""
        for episode in range(self.MAX_EPISODES):
            if episode % 10000 == 0:
                print(f"Value Iteration {episode}")
            self.run_episode()

In [22]:
# Build the TD(lambda) evaluator with its default episode budget and train it;
# progress is printed every 10000 episodes.
bj = BlackJack()
bj.value_iteration()

Value Iteration 0
Value Iteration 10000
Value Iteration 20000
Value Iteration 30000
Value Iteration 40000
Value Iteration 50000
Value Iteration 60000
Value Iteration 70000
Value Iteration 80000
Value Iteration 90000
Value Iteration 100000
Value Iteration 110000
Value Iteration 120000
Value Iteration 130000
Value Iteration 140000
Value Iteration 150000
Value Iteration 160000
Value Iteration 170000
Value Iteration 180000
Value Iteration 190000
Value Iteration 200000
Value Iteration 210000
Value Iteration 220000
Value Iteration 230000
Value Iteration 240000
Value Iteration 250000
Value Iteration 260000
Value Iteration 270000
Value Iteration 280000
Value Iteration 290000
Value Iteration 300000
Value Iteration 310000
Value Iteration 320000
Value Iteration 330000
Value Iteration 340000
Value Iteration 350000
Value Iteration 360000
Value Iteration 370000
Value Iteration 380000
Value Iteration 390000
Value Iteration 400000
Value Iteration 410000
Value Iteration 420000
Value Iteration 430000
Va

In [23]:
# Collect (player sum, dealer card, value) triples for the usable-ace states
# with a player sum of 12-21 and plot them as an interpolated 3D surface.
X = []
Y = []
Z = []
for key, value in bj.v.items():
    # Keys have the form "<player sum>_<dealer card>_<usable ace flag>".
    p_hand, d_card, useable_ace = (int(part) for part in key.split('_'))
    # Fix: test the ace flag of *this* key. The original tested the constant
    # string '2_10_1', so non-usable-ace states were never filtered out and
    # every (X, Y) point appeared twice with different values.
    if useable_ace == 1 and 11 < p_hand < 22 and d_card < 12:  # Usable ace
        X.append(p_hand)
        Y.append(d_card)
        Z.append(value)

# Resample the scattered state values onto a regular 10x10 grid.
plotx, ploty = np.meshgrid(np.linspace(np.min(X), np.max(X), 10),
                           np.linspace(np.min(Y), np.max(Y), 10))
plotz = interp.griddata((X, Y), Z, (plotx, ploty), method='linear')

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(plotx, ploty, plotz, cstride=1, rstride=1, cmap='viridis')  # or 'hot'
ax.set_xlabel('Player sum')
ax.set_ylabel('Dealer showing')
ax.set_zlabel('State value')
ax.set_title('TD(lambda) state values (usable ace)')
plt.show()

<mpl_toolkits.mplot3d.art3d.Poly3DCollection at 0x123926110>

## Backward View SARSA(lambda)

In [None]:
# Control (on-policy SARSA(lambda) with an epsilon-greedy policy)
class BlackJackSARSA:
    """Tabular backward-view SARSA(lambda) control for gym's Blackjack.

    Action values live in ``self.q`` and eligibility traces in ``self.e``;
    both are dicts keyed by
    ``"<player sum>_<dealer card>_<usable ace flag>_<action>"``.

    Args:
        MAX_EPISODES: Number of episodes played by ``value_iteration``.
        EPSILON: Exploration probability used by ``epsilon_greedy``. The
            original code read ``self.eps`` without ever setting it; 0.1 is a
            default chosen here -- tune as needed.
    """

    def __init__(self, MAX_EPISODES=500000, EPSILON=0.1):
        self.env = gym.make('Blackjack-v0')
        self.MAX_EPISODES = MAX_EPISODES
        self.ALPHA = 0.5   # step size
        self.GAMMA = 0.95  # discount factor
        self.LAMBDA = 0.5  # trace-decay parameter
        self.eps = EPSILON

        # One table entry per (observation components, action) combination,
        # enumerated in the same order for both tables.
        obs_space = self.env.observation_space
        keys = [
            f"{p_hand}_{d_card}_{useable_ace}_{action}"
            for p_hand in range(obs_space[0].n)
            for d_card in range(obs_space[1].n)
            for useable_ace in range(obs_space[2].n)
            for action in range(self.env.action_space.n)
        ]
        self.q = dict.fromkeys(keys, 0)  # action values, initiated to 0
        self.e = dict.fromkeys(keys, 0)  # eligibility traces, initiated to 0

    @staticmethod
    def _state_action_key(state, action):
        """Return the table key for an observation and an action."""
        return f"{state[0]}_{state[1]}_{int(state[2])}_{action}"

    def action(self, player_hand):
        """Sample from a fixed threshold policy (not used by ``run_episode``).

        Action 0 is chosen with probability 0.8 when ``player_hand > 18`` and
        with probability 0.2 otherwise.

        NB: What about the usable ace? The policy currently ignores it.
        """
        probs = [0.8, 0.2] if player_hand > 18 else [0.2, 0.8]
        action = np.random.choice(np.arange(2), p=probs)
        return action

    def greedy_action(self, state):
        """Return the action with the highest Q-value in ``state``.

        Ties resolve to the lowest action index (``np.argmax`` semantics).
        """
        q_values = [
            self.q[self._state_action_key(state, candidate)]
            for candidate in range(self.env.action_space.n)
        ]
        return int(np.argmax(q_values))

    def epsilon_greedy(self, state):
        """Return the greedy action with probability ``1 - eps``, else a sampled one."""
        if np.random.random() >= self.eps:
            return self.greedy_action(state)

        return self.env.action_space.sample()

    def run_episode(self):
        """Play one episode and apply backward-view SARSA(lambda) updates.

        The first action is drawn uniformly at random, as in the original
        code; every later action comes from ``epsilon_greedy``. After each
        step the TD error is distributed over all state-action pairs with a
        non-zero trace. Traces are cleared at the start of the episode, and a
        terminal successor contributes 0 to the target.
        """
        # Traces must not carry credit over from a previous, unrelated episode.
        self.e.update(dict.fromkeys(self.e, 0))
        # Only pairs visited in this episode can have a non-zero trace.
        active = set()
        decay = self.LAMBDA * self.GAMMA

        state = self.env.reset()
        a = random.randrange(self.env.action_space.n)
        done = False
        while not done:
            s_a_key = self._state_action_key(state, a)

            # Take action a
            new_state, r, done, _ = self.env.step(a)

            # Choose new action from S' (only meaningful if S' is non-terminal)
            # and build the SARSA target from Q(S', A').
            if done:
                new_a = None
                target = r
            else:
                new_a = self.epsilon_greedy(new_state)
                target = r + self.GAMMA * self.q[self._state_action_key(new_state, new_a)]
            td_error = target - self.q[s_a_key]

            # Update eligibilities: decay every live trace, bump the current pair.
            for key in active:
                self.e[key] *= decay
            self.e[s_a_key] += 1
            active.add(s_a_key)

            # Move every eligible pair towards the target in proportion to its trace.
            step = self.ALPHA * td_error
            for key in active:
                self.q[key] += step * self.e[key]

            state, a = new_state, new_a

    def value_iteration(self):
        """Run ``MAX_EPISODES`` episodes, printing progress every 10000 episodes."""
        for episode in range(self.MAX_EPISODES):
            if episode % 10000 == 0:
                print(f"Value Iteration {episode}")
            self.run_episode()