In [2]:
import gymnasium as gym
import numpy as np
from datetime import datetime, timedelta
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
class Config():
    """Hyper-parameters for the CartPole DQN experiment.

    All settings are plain attributes. Any of them can be overridden by keyword,
    e.g. ``Config(total_steps=50_000)``; derived values (``decay_step``) are
    computed *after* overrides are applied so they stay consistent.

    Raises:
        AttributeError: if an override names an option that does not exist.
    """

    def __init__(self, **overrides):
        # env
        self.env_name = "CartPole-v1"
        self.gamma = 0.99
        self.num_action = 2
        self.state_dim = 4

        # replay
        self.buffer_size = 100000
        self.batchsize = 64

        # training
        self.total_episodes = 500000
        self.total_steps = 500000
        self.learning_rate = 2.3e-3
        self.weight_decay = 1e-4
        self.start_training_step = 1000
        self.train_frequency = 256
        self.epochs = 128
        self.test_frequency = 10000
        self.save_frequency = 50000
        self.save_path = 'best_model.pth'
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        ## target network
        self.use_soft_update = True
        self.update_frequency = 10
        self.tau = 0.005

        ## episode
        self.init_epsilon = 1.
        self.end_epsilon = 0.04
        self.exploration_fraction = 0.16

        # An explicit decay_step wins over the derived one.
        explicit_decay_step = overrides.pop("decay_step", None)
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise AttributeError(f"Unknown config option: {name!r}")
            setattr(self, name, value)

        if explicit_decay_step is not None:
            self.decay_step = explicit_decay_step
        else:
            # Epsilon is annealed against the global environment-step counter,
            # so the horizon must be measured in steps. Fall back to the
            # episode budget only when no step budget is configured.
            horizon = self.total_steps if self.total_steps is not None else self.total_episodes
            self.decay_step = horizon * self.exploration_fraction

config = Config()

In [4]:
class Replay_Buffer():
    """Fixed-capacity ring buffer of transitions held in preallocated NumPy arrays.

    Once ``buffer_size`` transitions have been added, new ones overwrite the
    oldest slot.
    """

    def __init__(self, buffer_size, state_dim):
        self.buffer_size = buffer_size
        self.real_size = 0  # number of valid transitions currently stored
        self.index = 0      # slot the next transition is written to

        # Typed storage: values are kept in the dtype they are consumed in,
        # so sampling needs no float64 -> float32 / float -> long round trip.
        self.states = np.zeros((buffer_size, state_dim), dtype=np.float32)
        self.actions = np.zeros((buffer_size,), dtype=np.int64)
        self.rewards = np.zeros((buffer_size,), dtype=np.float32)
        self.dones = np.zeros((buffer_size,), dtype=bool)
        self.next_states = np.zeros((buffer_size, state_dim), dtype=np.float32)

    def add(self, state, action, reward, next_state, done):
        """Write one transition at the current slot and advance the ring pointer."""
        self.states[self.index] = state
        self.actions[self.index] = action
        self.rewards[self.index] = reward
        self.next_states[self.index] = next_state
        self.dones[self.index] = done

        self.real_size = min(self.real_size + 1, self.buffer_size)
        self.index = (self.index + 1) % self.buffer_size

    def sample(self, batchsize):
        """Draw ``batchsize`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of CPU
            tensors: states/next_states are float32 ``(batchsize, state_dim)``;
            actions is int64, rewards and dones are float32, each ``(batchsize,)``.

        Raises:
            ValueError: if fewer than ``batchsize`` transitions are stored.
        """
        if batchsize > self.real_size:
            raise ValueError(
                f"Cannot sample {batchsize} transitions: only {self.real_size} stored"
            )
        idxs = np.random.choice(self.real_size, batchsize, replace=False)
        # Fancy indexing copies, so the returned tensors never alias the buffer.
        return (
            torch.from_numpy(self.states[idxs]),
            torch.from_numpy(self.actions[idxs]).reshape(-1),
            torch.from_numpy(self.rewards[idxs]).reshape(-1),
            torch.from_numpy(self.next_states[idxs]),
            torch.from_numpy(self.dones[idxs].astype(np.float32)).reshape(-1),
        )


In [None]:
class Model(nn.Module):
    """Dueling Q-network: a shared two-layer ReLU trunk feeding a scalar
    state-value head (``V``) and a per-action advantage head (``A``)."""

    def __init__(self, state_dim=4, num_action=2):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(state_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.V = nn.Linear(width, 1)
        self.A = nn.Linear(width, num_action)

    def forward(self, state):
        """Return Q-values with one entry per action along the last dimension."""
        features = torch.relu(self.fc2(torch.relu(self.fc1(state))))
        advantage = self.A(features)
        value = self.V(features)
        # Subtract the mean advantage over actions before combining with V.
        return value + advantage - advantage.mean(dim=-1, keepdim=True)

In [6]:
def update_weights(model, target_model, use_soft_update, tau):
    """Move ``target_model``'s parameters toward ``model``'s, in place.

    With ``use_soft_update`` each target parameter becomes
    ``tau * online + (1 - tau) * target``; otherwise it is overwritten with the
    online parameter. ``model`` is left untouched.
    """
    with torch.no_grad():
        for target_param, online_param in zip(target_model.parameters(), model.parameters()):
            if use_soft_update:
                new_value = tau * online_param.data + (1.0 - tau) * target_param.data
            else:
                new_value = online_param.data
            target_param.data.copy_(new_value)

In [None]:
def select_action(state, model, epsilon, num_action, config):
    """Pick an action epsilon-greedily.

    Args:
        state: a single observation (array-like; any numeric dtype).
        model: callable mapping a ``(1, state_dim)`` float32 tensor to Q-values.
        epsilon: probability of acting randomly. A value below 0 makes the
            choice always greedy; a value of 1 or more makes it always random.
        num_action: number of discrete actions to sample from when exploring.
        config: object exposing ``device``.

    Returns:
        The chosen action as a plain Python ``int`` in both branches.
    """
    if np.random.rand() > epsilon:
        # Coerce to float32 so float64 / list observations do not crash the
        # float32 network; only build the tensor when it is actually needed.
        obs = torch.as_tensor(
            np.asarray(state), dtype=torch.float32, device=config.device
        ).unsqueeze(0)
        with torch.no_grad():
            return int(model(obs)[0].argmax().item())
    return int(np.random.choice(num_action, 1)[0])

In [8]:
def decay_epsilon(step, config):
    """Linearly anneal epsilon from ``config.init_epsilon`` to
    ``config.end_epsilon`` over the first ``config.decay_step`` steps, then
    hold it at ``config.end_epsilon``."""
    start, floor = config.init_epsilon, config.end_epsilon
    if step < config.decay_step:
        progress = step / config.decay_step
        return start - progress * (start - floor)
    return floor

In [None]:
def train(model, target_model, replay_buffer, batchsize, gamma, optimizer, device=None):
    """Run one gradient step of DQN on a minibatch sampled from ``replay_buffer``.

    The TD target is ``r + gamma * (1 - done) * max_a' Q_target(s', a')`` and the
    loss is the mean squared error between that target and ``Q(s, a)``.

    Args:
        model: online Q-network being optimised.
        target_model: network used to evaluate next-state values (not updated here).
        replay_buffer: object whose ``sample(batchsize)`` returns
            ``(states, actions, rewards, next_states, dones)`` tensors.
        batchsize: number of transitions to sample.
        gamma: discount factor.
        optimizer: optimiser over ``model``'s parameters.
        device: device to compute on; defaults to the device of ``model``'s
            parameters, so the function does not depend on any global config.

    Returns:
        The scalar loss as a Python float.
    """
    if device is None:
        device = next(model.parameters()).device

    states, actions, rewards, next_states, dones = replay_buffer.sample(batchsize)
    states = states.to(device)
    next_states = next_states.to(device)
    actions = actions.to(device).long().reshape(-1, 1)
    # Force float32 so a float64 reward tensor cannot promote the whole loss.
    rewards = rewards.to(device=device, dtype=torch.float32)
    dones = dones.to(device=device, dtype=torch.float32)

    with torch.no_grad():
        next_Qs = target_model(next_states).max(dim=-1).values
        targets = rewards + gamma * (1.0 - dones) * next_Qs

    # Q(s, a) for the action actually taken in each sampled transition.
    Qs = model(states).gather(1, actions).squeeze(1)
    loss = F.mse_loss(Qs, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

    
def test(config, model, max_test_rewards):
    """Play one greedy evaluation episode and checkpoint the model on a new best.

    A fresh environment is created for the episode and always closed afterwards,
    even if stepping raises. When the episode return strictly exceeds
    ``max_test_rewards``, ``model.state_dict()`` is saved to ``config.save_path``.

    Returns:
        ``(max_test_rewards, total_rewards, step)``: the (possibly updated) best
        return, this episode's return, and this episode's length.
    """
    env = gym.make(config.env_name, render_mode="rgb_array")
    total_rewards = 0.
    step = 0.
    start_time = datetime.now()
    try:
        state, info = env.reset()
        done = False
        while not done:
            # epsilon = -1 makes select_action always take the greedy branch.
            action = select_action(state, model, -1, config.num_action, config)
            state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            total_rewards += reward
            step += 1
    finally:
        # Release the environment; one is created per evaluation call.
        env.close()
    if total_rewards > max_test_rewards:
        max_test_rewards = total_rewards
        torch.save(model.state_dict(), config.save_path)
    print(f"Test Episode: Steps: {step} Rewards: {total_rewards}, Max_Test_Rewards: {max_test_rewards}, Duration: {datetime.now() - start_time}")
    return max_test_rewards, total_rewards, step

In [None]:
# --- Training run: build components, then interact/learn until the step or
# --- episode budget is exhausted.
replay_buffer = Replay_Buffer(config.buffer_size, config.state_dim)
model = Model(config.state_dim, config.num_action).to(config.device)
target_model = Model(config.state_dim, config.num_action).to(config.device)
# The target network must start as an exact copy of the online network. A soft
# (tau-weighted) update here would leave it almost entirely at its own random
# initialisation.
target_model.load_state_dict(model.state_dict())
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
step = 0
env = gym.make(config.env_name, render_mode="rgb_array")
epsilon = config.init_epsilon
max_test_rewards = 0.
episode_rewards = []
episode_steps = []
max_episode_rewards = 0.
test_episode_rewards = []
test_episode_steps = []
losses = []

try:
    for episode in range(config.total_episodes):
        if config.total_steps is not None and step > config.total_steps:
            break
        start_time = datetime.now()
        state, info = env.reset()
        done = False
        total_rewards = 0.
        episode_step = 0.
        while not done:
            step += 1
            episode_step += 1
            action = select_action(state, model, epsilon, config.num_action, config)
            next_state, reward, terminated, truncated, info = env.step(action)
            total_rewards += reward
            done = terminated or truncated
            # Store only true termination. On a time-limit truncation the next
            # state is still a valid state, so the TD target must keep
            # bootstrapping from it instead of treating it as terminal.
            replay_buffer.add(state, action, reward, next_state, terminated)
            state = next_state
            if step % config.train_frequency == 0 and step >= config.start_training_step:
                for _ in range(config.epochs):
                    loss = train(model, target_model, replay_buffer, config.batchsize, config.gamma, optimizer)
                    losses.append(loss)
            if step % config.test_frequency == 0:
                max_test_rewards, test_rewards, test_steps = test(config, model, max_test_rewards)
                test_episode_rewards.append(test_rewards)
                test_episode_steps.append(test_steps)
            if step % config.save_frequency == 0:
                torch.save(model.state_dict(), f"{step}.pth")
            epsilon = decay_epsilon(step, config)

            # Soft updates run every step; hard updates only every
            # `update_frequency` steps (the modulo must be compared to 0).
            if config.use_soft_update or step % config.update_frequency == 0:
                update_weights(model, target_model, config.use_soft_update, config.tau)
        episode_rewards.append(total_rewards)
        episode_steps.append(episode_step)
        max_episode_rewards = max(max_episode_rewards, total_rewards)
        if episode % 100 == 0:
            mean_reward = np.mean(episode_rewards[-100:])
            # No losses exist until training starts; avoid the empty-mean warning.
            mean_loss = np.mean(losses[-1000:]) if losses else np.nan
            print(f"Step {step}, Episode {episode}: Steps: {episode_step}, Rewards: {total_rewards}, Mean_Rewards: {mean_reward:.4f}, Max_Rewards: {max_episode_rewards}, Loss: {mean_loss}, Duration: {datetime.now() - start_time}, epsilon: {epsilon:.6f}")
finally:
    # Release the training environment even if the run is interrupted.
    env.close()

In [None]:
import pickle 
# Persist the per-episode training/evaluation series and the loss history,
# each converted to a NumPy array, in a single pickle file.
with open("log.pickle", "wb") as f:
    pickle.dump(
        {
            key: np.array(series)
            for key, series in (
                ("rewards", episode_rewards),
                ("steps", episode_steps),
                ("test rewards", test_episode_rewards),
                ("test steps", test_episode_steps),
                ("losses", losses),
            )
        },
        f,
    )

In [None]:
import matplotlib.pyplot as plt

def trailing_mean(values, window):
    """Return, for each position, the mean of the last ``window`` entries of
    ``values`` ending there (fewer entries near the start)."""
    arr = np.asarray(values, dtype=float)  # convert once, not per iteration
    return [arr[max(0, i - window + 1):i + 1].mean() for i in range(len(arr))]


# The k-th evaluation runs when the global step counter reaches
# k * test_frequency, so derive the x-axis from the evaluations actually
# recorded; this stays aligned even if the run stopped before total_steps.
test_steps = config.test_frequency * np.arange(1, len(test_episode_rewards) + 1)
# Global step at which each training episode finished.
train_steps = np.cumsum(episode_steps)

mean_rewards = trailing_mean(episode_rewards, 100)
mean_rewards_last10 = trailing_mean(episode_rewards, 10)
mean_test_rewards = trailing_mean(test_episode_rewards, 10)

plt.figure(figsize=(10, 5))
plt.plot(train_steps, mean_rewards, label='Mean Rewards (Last 100)')
plt.plot(test_steps, mean_test_rewards, label='Mean Test Rewards (Last 10)')
plt.scatter(test_steps, test_episode_rewards, label='Test Rewards', alpha=0.6, s=20, color = 'green')
plt.xlabel('Step')
plt.ylabel('Reward')
plt.title('Train vs Test Rewards over Steps')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Smoothed training and evaluation reward curves only (no per-evaluation points).
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_steps, mean_rewards, label='Mean Rewards (Last 100)')
ax.plot(test_steps, mean_test_rewards, label='Mean Test Rewards (Last 10)')
ax.set_xlabel('Step')
ax.set_ylabel('Reward')
ax.set_title('Train vs Test Rewards over Steps')
ax.legend()
ax.grid(True)
plt.show()