In [1]:
#!pip install gymnasium

In [3]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import random
from tqdm import tqdm

import gymnasium as gym

In [4]:
# Agent
from keras.models import Model, Sequential
from keras.layers import Dense, Input
from keras.optimizers import Adam
from collections import deque
import gc

# Reminder emitted every time this cell runs: DDQNAgent.__init__ below starts
# with epsilon = 0.1 instead of 1.0.
print("Change back to 1.0 epsilon")
class DDQNAgent:
    """Double-DQN agent with a replay buffer and a softly updated target network.

    Two identically shaped Keras networks are kept: ``model`` (online network,
    trained in ``replay``) and ``target_model`` (blended towards ``model`` by
    ``update_target_model`` using factor ``tau``).

    States handed to ``act``/``remember`` are expected to be arrays of shape
    ``(1, state_size)`` because they are passed straight to ``predict``/``fit``.
    """

    def __init__(self, state_size, action_size):
        """Create both networks and the (initially empty) replay memory.

        Args:
            state_size: number of features in one observation.
            action_size: number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size

        self.tau = 0.5                      # blend factor for the soft target update
        self.batch_size = 100               # transitions replayed per call to replay()
        self.memory = deque(maxlen=10000)   # oldest transitions are dropped first
        self.gamma = 0.99                   # discount factor
        # NOTE(review): the notebook prints "Change back to 1.0 epsilon"; 0.1 looks
        # like a temporary value for resuming from saved weights - confirm.
        self.epsilon = 0.1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99           # multiplicative decay applied once per replay()
        self.learning_rate = 0.001
        self.model = self.build_model()
        self.target_model = self.build_model()
        # Start the target network as an exact copy of the online network so the
        # first bootstrap targets do not come from unrelated random weights.
        self.target_model.set_weights(self.model.get_weights())
        self.double_dqn = True              # False -> plain DQN targets (max over target net)

    def build_model(self):
        """Build and compile the Q-network: two 24-unit ReLU layers and a linear head."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        # Q-values are unbounded and can be negative (the training loop uses a
        # negative terminal reward), so the output layer must be linear; a ReLU
        # head clamps them at zero and makes negative targets unreachable.
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the argmax of the online network's Q-values.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def sample(self, batch_size):
        """Return up to ``batch_size`` transitions drawn without replacement.

        The request is clamped to the current memory size so a short buffer
        yields a smaller batch instead of raising ``ValueError``.
        """
        return random.sample(self.memory, min(batch_size, len(self.memory)))

    def replay(self):
        """Train the online network on one sampled minibatch and decay epsilon.

        For each transition the target for the taken action is ``reward`` when
        the transition is terminal, otherwise ``reward + gamma * Q_target``:
        with ``double_dqn`` the next action is chosen by the online network on
        ``next_state`` and evaluated by the target network; without it the
        maximum target-network value is used. Does nothing when memory is empty.
        """
        if not self.memory:
            return
        minibatch = self.sample(self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state, verbose=0)
            if done:
                target[0][action] = reward
            else:
                next_q_target = self.target_model.predict(next_state, verbose=0)[0]
                if self.double_dqn:
                    # Select with the online net on the NEXT state, evaluate with
                    # the target net. (Previously the argmax was taken over the
                    # Q-values of the current state.)
                    next_q_online = self.model.predict(next_state, verbose=0)[0]
                    bootstrap = next_q_target[int(np.argmax(next_q_online))]
                else:
                    bootstrap = np.max(next_q_target)
                target[0][action] = reward + self.gamma * bootstrap
            # NOTE(review): one predict/fit round-trip per transition is slow;
            # batching would be faster but changes the number of gradient steps.
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        gc.collect()

    def update_target_model(self):
        """Soft-update the target network: ``target = tau * online + (1 - tau) * target``."""
        #TODO: Try training without a target network
        blended = [
            online * self.tau + frozen * (1 - self.tau)
            for online, frozen in zip(self.model.get_weights(),
                                      self.target_model.get_weights())
        ]
        self.target_model.set_weights(blended)

    def save(self, file):
        """Write the online network's weights to ``file``."""
        self.model.save_weights(file)

    def load(self, file):
        """Load weights from ``file`` into both the online and the target network."""
        self.model.load_weights(file)
        self.target_model.load_weights(file)
            

Change back to 1.0 epsilon


In [5]:
# Train
env = gym.make("CartPole-v1")
state_size_, action_size_ = 4, 2
dqn_agent = DDQNAgent(state_size_, action_size_)
try:
    dqn_agent.load('cartpole.h5')
except Exception:
    # Best-effort resume: a missing or incompatible checkpoint just means we
    # start from scratch. `Exception` (not a bare except) keeps Ctrl-C and
    # SystemExit working while the weights are being loaded.
    print("No previous model detected")
n_episodes = 1000
n_steps = 200  # hard cap on steps per episode
# capacity = 10000

streak_step = 0  # steps played since the last progress report
total_step = 0   # steps played overall; drives the replay schedule

save_episodes = 10  # checkpoint and report every this many episodes
for episode in tqdm(range(n_episodes)):
    cur_state_, _ = env.reset()
    for step in range(n_steps):
        streak_step += 1
        total_step += 1
        cur_state_ = np.reshape(cur_state_, [1, state_size_])
        action = dqn_agent.act(cur_state_)
        # Gymnasium returns two separate end-of-episode flags; only `terminated`
        # is a true terminal state, `truncated` is a time-limit cut-off.
        observation, reward, terminated, truncated, _ = env.step(action)
        observation = np.reshape(observation, [1, state_size_])
        if terminated:
            reward = -5  # penalise terminating the episode
        dqn_agent.remember(cur_state_, action, reward, observation, terminated)
        if total_step % dqn_agent.batch_size == 0:
            dqn_agent.replay()
            dqn_agent.update_target_model()
        cur_state_ = observation
        if terminated or truncated:
            break
    if (episode + 1) % save_episodes == 0:
        dqn_agent.save("cartpole.h5")
        print(f"Avg game: {streak_step / save_episodes}, Epsilon: {dqn_agent.epsilon}")
        streak_step = 0

  1%|▌                                                     | 10/1000 [00:04<07:02,  2.34it/s]

Avg game: 9.3, Epsilon: 0.1


  2%|█                                                     | 20/1000 [00:23<09:15,  1.76it/s]

Avg game: 9.8, Epsilon: 0.099


  3%|█▌                                                    | 30/1000 [00:41<09:03,  1.79it/s]

Avg game: 10.0, Epsilon: 0.09801


  4%|██▏                                                   | 40/1000 [00:59<08:29,  1.88it/s]

Avg game: 9.3, Epsilon: 0.0970299


  5%|██▋                                                   | 50/1000 [01:18<11:26,  1.38it/s]

Avg game: 9.8, Epsilon: 0.096059601


  6%|███▏                                                  | 60/1000 [01:38<11:08,  1.41it/s]

Avg game: 10.0, Epsilon: 0.09509900499


  7%|███▊                                                  | 70/1000 [01:57<10:41,  1.45it/s]

Avg game: 9.5, Epsilon: 0.0941480149401


  8%|████▎                                                 | 80/1000 [02:16<12:00,  1.28it/s]

Avg game: 10.4, Epsilon: 0.093206534790699


  9%|████▊                                                 | 90/1000 [02:35<10:45,  1.41it/s]

Avg game: 10.0, Epsilon: 0.09227446944279201


 10%|█████▎                                               | 100/1000 [02:57<11:52,  1.26it/s]

Avg game: 9.8, Epsilon: 0.09135172474836409


 11%|█████▊                                               | 110/1000 [03:17<12:35,  1.18it/s]

Avg game: 9.5, Epsilon: 0.09043820750088044


 12%|██████▎                                              | 120/1000 [03:36<11:19,  1.29it/s]

Avg game: 9.2, Epsilon: 0.08953382542587164


 13%|██████▉                                              | 130/1000 [03:54<13:40,  1.06it/s]

Avg game: 9.6, Epsilon: 0.08863848717161292


 14%|███████▍                                             | 140/1000 [04:13<13:36,  1.05it/s]

Avg game: 10.0, Epsilon: 0.08775210229989679


 15%|███████▉                                             | 150/1000 [04:33<14:28,  1.02s/it]

Avg game: 9.9, Epsilon: 0.08687458127689782


 16%|████████▍                                            | 160/1000 [04:52<16:43,  1.19s/it]

Avg game: 9.2, Epsilon: 0.08600583546412884


 17%|█████████                                            | 170/1000 [05:10<15:53,  1.15s/it]

Avg game: 9.5, Epsilon: 0.08514577710948755


 18%|█████████▌                                           | 180/1000 [05:29<16:37,  1.22s/it]

Avg game: 10.7, Epsilon: 0.08429431933839267


 19%|██████████                                           | 190/1000 [05:48<16:54,  1.25s/it]

Avg game: 9.9, Epsilon: 0.08345137614500873


 20%|██████████▌                                          | 200/1000 [06:09<17:33,  1.32s/it]

Avg game: 9.6, Epsilon: 0.08261686238355864


 21%|███████████▏                                         | 210/1000 [06:29<20:11,  1.53s/it]

Avg game: 9.3, Epsilon: 0.08179069375972306


 22%|███████████▋                                         | 220/1000 [06:48<27:02,  2.08s/it]

Avg game: 9.6, Epsilon: 0.08097278682212583


 23%|████████████▏                                        | 230/1000 [07:08<20:48,  1.62s/it]

Avg game: 9.9, Epsilon: 0.08016305895390458


 24%|████████████▌                                        | 236/1000 [07:13<23:23,  1.84s/it]


KeyboardInterrupt: 

In [None]:
# Doesn't render in jupyter
# obs = env.reset()
# done = False
# while not done:
#     action = dqn_agent.act(obs)
#     obs, rewards, done, _, _ = env.step(action)
#     env.render()
# env.close()

In [1]:
from torch import nn
from torch.nn import functional


class Model(nn.Module):
    """Fully connected network: two 32-unit SELU hidden layers and a linear output layer."""

    def __init__(self, input_features, output_values):
        """Create the three linear layers.

        Args:
            input_features: size of the input vector.
            output_values: number of output values.
        """
        super().__init__()
        hidden_units = 32
        # Layers are created in fc1 -> fc2 -> fc3 order and keep these attribute
        # names, which are also the keys used in the state dict.
        self.fc1 = nn.Linear(input_features, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_values)

    def forward(self, x):
        """Apply fc1 and fc2 with SELU activations, then fc3 without an activation."""
        hidden = functional.selu(self.fc1(x))
        hidden = functional.selu(self.fc2(hidden))
        return self.fc3(hidden)

In [3]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import gymnasium as gym
from collections import deque
import random

# Parameters
use_cuda = True  # use the GPU when one is available (see `device` below)
episode_limit = 100  # number of training episodes run by main()
target_update_delay = 2  # update target net every target_update_delay episodes
test_delay = 10  # main() runs a test episode every test_delay training episodes
learning_rate = 1e-4  # Adam learning rate used in fit()
epsilon = 1  # initial epsilon
min_epsilon = 0.1  # exploration rate get_action() uses when none is passed
epsilon_decay = 0.9 / 2.5e3  # amount subtracted from epsilon after each training step
gamma = 0.99  # discount applied to the bootstrapped next-state value
memory_len = 10000  # maximum number of transitions kept in replay memory

env = gym.make('CartPole-v1')
n_features = len(env.observation_space.high)
n_actions = env.action_space.n

memory = deque(maxlen=memory_len)
# each memory entry begins with: (state, action, env_reward, next_state)
device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
criterion = nn.MSELoss()
policy_net = Model(n_features, n_actions).to(device)
target_net = Model(n_features, n_actions).to(device)
# The target network starts as an exact copy of the policy network and is only
# ever used for inference, hence eval mode.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()


def get_states_tensor(sample, states_idx):
    """Collect one state field from every entry of ``sample`` into a float32 tensor.

    Args:
        sample: sequence of memory entries (indexable records).
        states_idx: position of the state inside each entry.

    Returns:
        A ``(len(sample), n_features)`` float32 tensor that does not require grad;
        row ``i`` holds the first ``n_features`` values of ``sample[i][states_idx]``.
    """
    rows = [
        [entry[states_idx][col].item() for col in range(n_features)]
        for entry in sample
    ]
    # The reshape keeps the (0, n_features) shape for an empty sample.
    return torch.tensor(rows, dtype=torch.float32).reshape(len(sample), n_features)


def normalize_state(state):
    """Rescale the first four entries of ``state`` in place; returns ``None``.

    Entries 0 and 1 are divided by 2.5, entries 2 and 3 by 0.3.
    """
    divisors = (2.5, 2.5, 0.3, 0.3)
    for index, divisor in enumerate(divisors):
        state[index] /= divisor


def state_reward(state, env_reward):
    """Return ``env_reward`` reduced by a penalty based on ``state[0]`` and ``state[2]``.

    The penalty is ``(|state[0]| + |state[2]|) / 2.5``.
    """
    penalty = (abs(state[0]) + abs(state[2])) / 2.5
    return env_reward - penalty

def get_action(state, e=min_epsilon):
    """Pick an action epsilon-greedily.

    Args:
        state: observation to evaluate when acting greedily.
        e: exploration probability; defaults to ``min_epsilon`` as it was when
            this function was defined.

    Returns:
        A random action index with probability ``e``, otherwise the index of
        the largest output of ``policy_net`` for ``state``.
    """
    if random.random() >= e:
        # exploit
        state_tensor = torch.tensor(state, dtype=torch.float32, device=device)
        return policy_net(state_tensor).argmax().item()

    # explore
    return random.randrange(0, n_actions)


def fit(model, inputs, labels):
    """Run one pass of mini-batch (size 5) Adam training of ``model`` on ``inputs``/``labels``.

    Args:
        model: the ``nn.Module`` to train; it is left in eval mode afterwards.
        inputs: tensor of training inputs, first dimension is the sample axis.
        labels: tensor of regression targets, aligned with ``inputs``.

    Returns:
        The mean per-sample loss over the pass, or ``0.0`` when ``inputs`` is empty.
    """
    inputs = inputs.to(device)
    labels = labels.to(device)
    n_samples = len(inputs)
    train_dl = DataLoader(TensorDataset(inputs, labels), batch_size=5)

    # Reuse one Adam instance per model. Building a fresh optimizer on every
    # call throws away Adam's running moment estimates, so each call would
    # restart from a cold optimizer state.
    optimizer = getattr(model, '_fit_optimizer', None)
    if optimizer is None:
        optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
        model._fit_optimizer = optimizer

    model.train()
    total_loss = 0.0

    for x, y in train_dl:
        loss = criterion(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The criterion returns a per-batch mean; weight it by the batch size so
        # the final division by the sample count is a true average (the last
        # batch may be smaller than 5).
        total_loss += loss.item() * len(x)

    model.eval()

    return total_loss / n_samples if n_samples else 0.0


def optimize_model(train_batch_size=100):
    """Sample replay memory and fit ``policy_net`` to Double-DQN targets.

    For each sampled transition the target for the taken action is
    ``state_reward(next_state, env_reward)`` plus, unless the transition is
    marked terminal, ``gamma`` times the target network's value of the action
    the policy network prefers in the next state. All other action values keep
    the policy network's current estimate.

    Memory entries are read positionally as
    ``(state, action, env_reward, next_state[, terminal])``; entries without
    the optional fifth field are treated as non-terminal.

    Args:
        train_batch_size: maximum number of transitions to sample; clamped to
            the current memory size. Nothing happens when memory is empty.
    """
    train_batch_size = min(train_batch_size, len(memory))
    if train_batch_size == 0:
        return
    train_sample = random.sample(memory, train_batch_size)

    state = get_states_tensor(train_sample, 0)
    next_state = get_states_tensor(train_sample, 3)

    # These forward passes only produce regression targets, so no autograd
    # graph is needed for any of them.
    with torch.no_grad():
        q_estimates = policy_net(state.to(device))
        next_state_q_estimates = target_net(next_state.to(device))
        next_actions = policy_net(next_state.to(device)).argmax(dim=1)

    for i, transition in enumerate(train_sample):
        action, env_reward = transition[1], transition[2]
        terminal = len(transition) > 4 and bool(transition[4])
        target = state_reward(next_state[i], env_reward)
        if not terminal:
            # Bootstrap only from states the episode actually continues from;
            # a terminal state has no future return.
            next_action = next_actions[i].item()
            target = target + gamma * next_state_q_estimates[i][next_action].item()
        q_estimates[i][action] = target

    fit(policy_net, state, q_estimates)


def train_one_episode():
    """Play one training episode, learning after every step.

    Each step picks an action with the current global ``epsilon``, stores the
    transition in ``memory``, runs ``optimize_model(100)`` and lowers
    ``epsilon`` by ``epsilon_decay`` (never below ``min_epsilon``).

    Returns:
        ``(score, reward)``: the summed environment reward and the summed
        shaped reward from ``state_reward`` for the episode.
    """
    global epsilon
    current_state, _ = env.reset()
    normalize_state(current_state)
    done = False
    score = 0
    reward = 0
    while not done:
        action = get_action(current_state, epsilon)
        # Gymnasium reports the end of an episode through two flags. Ignoring
        # `truncated` would keep stepping past the time limit, and the loop
        # would never end for a policy that does not terminate the episode.
        next_state, env_reward, terminated, truncated, _ = env.step(action)
        normalize_state(next_state)
        # The trailing `terminated` flag lets a learner skip bootstrapping from
        # terminal states; readers of the first four fields are unaffected.
        memory.append((current_state, action, env_reward, next_state, terminated))
        current_state = next_state
        score += env_reward
        reward += state_reward(next_state, env_reward)

        optimize_model(100)

        # Linear decay with a floor: without the clamp epsilon keeps shrinking
        # past min_epsilon and eventually turns negative (no exploration).
        epsilon = max(min_epsilon, epsilon - epsilon_decay)
        done = terminated or truncated

    return score, reward


def test():
    state, _ = env.reset()
    normalize_state(state)
    done = False
    score = 0
    reward = 0
    while not done:
        action = get_action(state)
        state, env_reward, done, _, _ = env.step(action)
        normalize_state(state)
        score += env_reward
        reward += state_reward(state, env_reward)

    return score, reward


def main():
    """Train for ``episode_limit`` episodes with periodic target syncs and test runs.

    After every episode the score is printed. The target network is
    re-synchronised on episodes whose zero-based index is a multiple of
    ``target_update_delay``. A test episode runs every ``test_delay`` episodes,
    plus once at the end when ``episode_limit`` is not a multiple of
    ``test_delay``. Whenever a test reward beats the best so far, the policy
    weights are saved to ``policy_net.pth``.
    """
    best_test_reward = 0

    def evaluate(episode_number):
        # Run one test episode, report it and checkpoint on a new best reward.
        nonlocal best_test_reward
        test_score, test_reward = test()
        print(f'Test Episode {episode_number}: test score: {test_score} - test reward: {test_reward}')
        if test_reward > best_test_reward:
            print('New best test reward. Saving model')
            best_test_reward = test_reward
            torch.save(policy_net.state_dict(), 'policy_net.pth')

    for i in range(episode_limit):
        episode_number = i + 1
        score, reward = train_one_episode()

        print(f'Episode {episode_number}: score: {score} - reward: {reward}')

        sync_target = i % target_update_delay == 0
        if sync_target:
            target_net.load_state_dict(policy_net.state_dict())
            target_net.eval()

        if episode_number % test_delay == 0:
            evaluate(episode_number)

    # Final evaluation when the loop did not already end on a test episode.
    if episode_limit % test_delay != 0:
        evaluate(episode_limit)

    print(f'best test reward: {best_test_reward}')


# Start training when this code is executed as the main module.
if __name__ == '__main__':
    main()

Episode 1: score: 13.0 - reward: 11.21826354041696
Episode 2: score: 14.0 - reward: 12.34218921512365
Episode 3: score: 15.0 - reward: 12.88738244101405
Episode 4: score: 11.0 - reward: 9.394015821814536
Episode 5: score: 27.0 - reward: 24.708146405220027
Episode 6: score: 15.0 - reward: 13.197613238915801
Episode 7: score: 14.0 - reward: 12.32804586738348
Episode 8: score: 67.0 - reward: 58.684392908215536
Episode 9: score: 10.0 - reward: 8.721277001313865
Episode 10: score: 27.0 - reward: 24.680841796845204
Test Episode 10: test score: 12.0 - test reward: 10.585097850859166
New best test reward. Saving model
Episode 11: score: 19.0 - reward: 16.828974480926988
Episode 12: score: 15.0 - reward: 12.62376758605242
Episode 13: score: 62.0 - reward: 53.331599742174156
Episode 14: score: 25.0 - reward: 21.99505076408386
Episode 15: score: 9.0 - reward: 7.689670917391777
Episode 16: score: 44.0 - reward: 39.324059683084485
Episode 17: score: 30.0 - reward: 26.901629936695095
Episode 18: sco