In [1]:
#!pip install gymnasium

In [1]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import random
from tqdm import tqdm

import gymnasium as gym

In [2]:
# Agent
from keras.models import Model, Sequential
from keras.layers import Dense, Input
from keras.optimizers import Adam
from collections import deque
import gc

print("Change back to 1.0 epsilon")
class DDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        self.tau = 0.1
        self.batch_size = 100
        self.memory = deque(maxlen=10000)
        self.gamma = 0.99
        self.epsilon = 0.1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.double_dqn = True

    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim = self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        
        # model.add(Dense(128, input_dim = self.state_size, activation='relu'))
        # model.add(Dense(64, activation='relu'))
        # model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_size, activation='relu'))
        model.compile(loss='mse',optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # if self.epsilon > self.epsilon_min:
        #     self.epsilon *= self.epsilon_decay
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def replay(self):
        minibatch = self.sample(self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state, verbose=0)
            if not done:
                # if self.double_dqn:
                    predicted_action = np.argmax(target[0])#self.model.predict(next_state,verbose=0)[0])
                    target_q = self.target_model.predict(next_state, verbose=0)[0][predicted_action]
                    target[0][action] = reward + self.gamma * target_q
                # else:
                #     target_q = self.target_model.predict(next_state, verbose=0)[0]
                #     target[0][action] = reward + self.gamma * max(target_q)
            else:
                target[0][action] = reward
            self.model.fit(state,target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        gc.collect()

    def update_target_model(self):
        #TODO: Try training without a target network
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1- self.tau)
        self.target_model.set_weights(target_weights)

    def save(self, file):
        self.model.save_weights(file)

    def load(self, file):
        self.model.load_weights(file)
        self.target_model.load_weights(file)
            

2023-12-16 21:21:26.080606: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-16 21:21:26.130539: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-16 21:21:26.131278: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Change back to 1.0 epsilon


In [None]:
# Train
env = gym.make("CartPole-v1")
state_size_, action_size_ = 4, 2
dqn_agent = DDQNAgent(state_size_, action_size_)
try:
    dqn_agent.load('cartpole.h5')
except:
    print("No previous model detected")
n_episodes = 1000
n_steps = 200
# capacity = 10000

streak_step = 0
total_step = 0

save_episodes = 10
for episode in tqdm(range(n_episodes)):
    cur_state_, _ = env.reset()
    for step in range(n_steps):
        streak_step += 1
        total_step += 1
        cur_state_ = np.reshape(cur_state_, [1, state_size_])
        action = dqn_agent.act(cur_state_)
        observation, reward, done, _, _ = env.step(action)
        observation = np.reshape(observation, [1, state_size_])
        if done:
            reward = -5
        dqn_agent.remember(cur_state_, action, reward, observation, done)
        if total_step % dqn_agent.batch_size == 0:
            dqn_agent.replay()
            dqn_agent.update_target_model()
        cur_state_ = observation
        if done:
            break
    if (episode + 1) % save_episodes == 0:
        dqn_agent.save("cartpole.h5")
        print(f"Avg game: {streak_step / save_episodes}, Epsilon: {dqn_agent.epsilon}")
        streak_step = 0

  1%|▌                                                     | 10/1000 [00:04<06:43,  2.45it/s]

Avg game: 9.6, Epsilon: 0.1


  2%|█                                                     | 20/1000 [00:24<09:46,  1.67it/s]

Avg game: 9.6, Epsilon: 0.099


  3%|█▌                                                    | 30/1000 [00:44<10:21,  1.56it/s]

Avg game: 9.9, Epsilon: 0.09801


  4%|██▏                                                   | 40/1000 [01:04<11:08,  1.44it/s]

Avg game: 10.2, Epsilon: 0.0970299


  5%|██▋                                                   | 50/1000 [01:23<10:43,  1.48it/s]

Avg game: 10.4, Epsilon: 0.096059601


  6%|███▏                                                  | 60/1000 [01:43<10:10,  1.54it/s]

Avg game: 10.0, Epsilon: 0.09509900499


  7%|███▊                                                  | 70/1000 [02:02<09:47,  1.58it/s]

Avg game: 10.0, Epsilon: 0.0941480149401


  8%|████▎                                                 | 80/1000 [02:20<08:35,  1.78it/s]

Avg game: 9.1, Epsilon: 0.093206534790699


  9%|████▊                                                 | 90/1000 [02:38<09:55,  1.53it/s]

Avg game: 9.9, Epsilon: 0.09227446944279201


 10%|█████▎                                               | 100/1000 [02:56<09:10,  1.63it/s]

Avg game: 9.7, Epsilon: 0.09135172474836409


 11%|█████▊                                               | 110/1000 [03:15<09:53,  1.50it/s]

Avg game: 9.8, Epsilon: 0.09043820750088044


 12%|██████▎                                              | 120/1000 [03:33<10:20,  1.42it/s]

Avg game: 10.1, Epsilon: 0.08953382542587164


 13%|██████▉                                              | 130/1000 [03:53<12:28,  1.16it/s]

Avg game: 9.5, Epsilon: 0.08863848717161292


 14%|███████▍                                             | 140/1000 [04:12<09:55,  1.45it/s]

Avg game: 10.3, Epsilon: 0.08775210229989679


 15%|███████▉                                             | 150/1000 [04:30<08:59,  1.58it/s]

Avg game: 9.8, Epsilon: 0.08687458127689782


 16%|████████▍                                            | 160/1000 [04:49<11:29,  1.22it/s]

Avg game: 9.2, Epsilon: 0.08600583546412884


 17%|█████████                                            | 170/1000 [05:07<10:51,  1.27it/s]

Avg game: 9.9, Epsilon: 0.08514577710948755


 18%|█████████▌                                           | 180/1000 [05:26<10:02,  1.36it/s]

Avg game: 9.7, Epsilon: 0.08429431933839267


 19%|██████████                                           | 190/1000 [05:44<12:50,  1.05it/s]

Avg game: 9.8, Epsilon: 0.08345137614500873


 20%|██████████▌                                          | 200/1000 [06:03<12:32,  1.06it/s]

Avg game: 10.4, Epsilon: 0.08261686238355864


 21%|███████████▏                                         | 210/1000 [06:21<10:32,  1.25it/s]

Avg game: 10.1, Epsilon: 0.08179069375972306


 22%|███████████▋                                         | 220/1000 [06:40<10:43,  1.21it/s]

Avg game: 9.8, Epsilon: 0.08097278682212583


 23%|████████████▏                                        | 230/1000 [07:00<13:02,  1.02s/it]

Avg game: 10.2, Epsilon: 0.08016305895390458


 24%|████████████▋                                        | 240/1000 [07:23<12:53,  1.02s/it]

Avg game: 10.2, Epsilon: 0.07936142836436554


 24%|████████████▊                                        | 242/1000 [07:24<09:51,  1.28it/s]

In [None]:
# Doesn't render in jupyter
# obs = env.reset()
# done = False
# while not done:
#     action = dqn_agent.act(obs)
#     obs, rewards, done, _, _ = env.step(action)
#     env.render()
# env.close()