In [1]:
#!pip install gymnasium

In [2]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import random
from tqdm import tqdm

import gymnasium as gym

In [3]:
# Agent
from keras.models import Model, Sequential
from keras.layers import Dense, Input
from keras.optimizers import Adam
from collections import deque

class DDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        self.tau = 0.01
        self.batch_size = 64
        self.memory = deque(maxlen=10000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.leaning_rate = 0.001
        self.model = self.build_model()
        self.target_model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(128, input_dim = self.state_size, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_size, activation='relu'))
        model.compile(loss='mse',optimizer='adam')
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def replay(self):
        minibatch = self.sample(self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                predicted_action = np.argmax(self.model.predict(next_state,verbose=0)[0])
                target = reward + self.gamma * self.target_model.predict(next_state, verbose=0)[0][predicted_action]
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state,target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_model(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1- self.tau)
        self.target_model.set_weights(target_weights)

    def save(self, file):
        self.model.save_weights(file)

    def load(self, file):
        self.model.load_weights(file)
            

2023-12-11 13:19:44.622060: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 13:19:44.657564: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 13:19:44.658210: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Train
env = gym.make("Blackjack-v1", sab=True)
state_size_, action_size_ = 3, 2
dqn_agent = DDQNAgent(state_size_, action_size_)
try:
    dqn_agent.load('model.h5')
except:
    print("No previous model detected")
n_episodes = 15000
n_steps = 10
stand, hit = 0, 0
bj_win = 0
bj_games = 0
money = 0
capacity = 10000
cur_money = money
total_step = 0
mon = []

for episode in tqdm(range(n_episodes)):
    bj_games += 1
    cur_state_, _ = env.reset()
    for step in range(n_steps):
        total_step += 1
        cur_state_ = np.reshape(cur_state_, [1, state_size_])
        action = dqn_agent.act(cur_state_)
        if action == 0:
            stand += 1
        else:
            hit += 1
        observation, reward, done, _, _ = env.step(action)
        observation = np.reshape(observation, [1, state_size_])
        dqn_agent.remember(cur_state_, action, reward, observation, done)
        if total_step % dqn_agent.batch_size == 0:
            dqn_agent.replay()
            dqn_agent.update_target_model()
        cur_state_ = observation
        if done:
            if reward > 0:
                bj_win += 1
            money += reward
            break
    if episode % 100 == 0:
        money_change = money - cur_money
        mon.append(money_change)
        dqn_agent.save("model.h5")
        print(f"Stands: {stand}, hits: {hit}, win: {bj_win/bj_games}")
        stand = 0
        hit = 0
        bj_win = 0
        bj_games = 0
        cur_money = money

  0%|                                                                                                                                                              | 0/15000 [00:00<?, ?it/s]

Stands: 1, hits: 0, win: 1.0


  1%|▉                                                                                                                                                    | 97/15000 [00:11<29:31,  8.41it/s]

Stands: 62, hits: 70, win: 0.22


  1%|█▉                                                                                                                                                  | 195/15000 [00:23<29:11,  8.45it/s]

Stands: 73, hits: 58, win: 0.36


  2%|██▉                                                                                                                                                 | 295/15000 [00:35<28:58,  8.46it/s]

Stands: 74, hits: 68, win: 0.28


  3%|███▊                                                                                                                                                | 386/15000 [00:48<31:36,  7.70it/s]

Stands: 67, hits: 58, win: 0.29


  3%|████▋                                                                                                                                               | 479/15000 [00:59<30:43,  7.88it/s]

Stands: 67, hits: 71, win: 0.26


  4%|██████                                                                                                                                              | 612/15000 [01:11<23:20, 10.28it/s]

Stands: 72, hits: 47, win: 0.3


  5%|███████▏                                                                                                                                            | 728/15000 [01:24<17:58, 13.23it/s]

Stands: 71, hits: 58, win: 0.26


  5%|████████                                                                                                                                            | 821/15000 [01:35<19:21, 12.20it/s]

Stands: 68, hits: 71, win: 0.29


  6%|████████▊                                                                                                                                           | 899/15000 [01:47<25:08,  9.35it/s]

Stands: 79, hits: 59, win: 0.34


  7%|█████████▊                                                                                                                                         | 1006/15000 [01:58<19:56, 11.70it/s]

Stands: 77, hits: 65, win: 0.35


  7%|██████████▉                                                                                                                                        | 1118/15000 [02:18<30:03,  7.70it/s]

Stands: 77, hits: 61, win: 0.34


  8%|████████████▏                                                                                                                                      | 1239/15000 [02:31<20:12, 11.35it/s]

Stands: 65, hits: 66, win: 0.28


  9%|████████████▊                                                                                                                                      | 1305/15000 [02:44<37:36,  6.07it/s]

Stands: 75, hits: 53, win: 0.24


  9%|█████████████▏                                                                                                                                     | 1345/15000 [02:49<28:45,  7.91it/s]


KeyboardInterrupt: 

In [None]:
# ax = np.linspace(0, n_episodes,num = n_episodes//150)
plt.plot(mon)
plt.title('Rewards',fontsize=12)
plt.grid()
plt.show()
print(money)
print(bj_win)