In [10]:
from torch import nn
import torch
import gym
from collections import deque
import random
import numpy as np
import itertools

In [11]:
# --- DQN hyperparameters -----------------------------------------------------
GAMMA = 0.99                 # discount factor applied to the bootstrapped target
BATCH_SIZE = 32              # number of transitions sampled from the replay buffer per step
BUFFER_SIZE = 50_000         # maxlen of the replay buffer deque
MIN_REPLAY_SIZE = 1_000      # random-policy transitions collected before training starts
EPSILON_START = 1.0          # exploration rate at step 0
EPSILON_END = 0.02           # exploration rate once the decay has finished
EPSILON_DECAY = 10_000       # number of steps over which epsilon is linearly interpolated
TARGET_UPDATE_FREQ = 1_000   # presumably steps between target-network syncs -- TODO confirm, not used in the visible cells

In [12]:
class Network(nn.Module):
    """Two-layer fully connected network mapping an observation to one Q-value per action.

    Args:
        env: Object exposing ``observation_space.shape`` (used to size the input
            layer) and ``action_space.n`` (used to size the output layer).
        hidden_size: Width of the single hidden layer. Both linear layers are
            built from this one value so their shapes always agree.
    """

    def __init__(self, env, hidden_size=64):
        super().__init__()

        # Flattened observation size, e.g. shape (4,) -> 4 input features.
        in_features = int(np.prod(env.observation_space.shape))

        self.net = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, env.action_space.n),
        )

    def forward(self, x):
        """Return the Q-values produced by the network for input ``x``."""
        return self.net(x)

    def act(self, obs):
        """Return the greedy action (index of the largest Q-value) for one observation.

        Args:
            obs: A single observation convertible to a float32 tensor.

        Returns:
            The selected action index as a Python ``int``.
        """
        obs_t = torch.as_tensor(obs, dtype=torch.float32)

        # Add a leading batch dimension; no gradients are needed to pick an action.
        with torch.no_grad():
            q_values = self(obs_t.unsqueeze(0))

        max_q_index = torch.argmax(q_values, dim=1)[0]
        action = max_q_index.item()

        return action

# Create the CartPole-v1 environment (rendered in "human" mode).
env = gym.make("CartPole-v1", render_mode="human")

# Running total of the reward collected in the episode currently in progress.
episode_reward = 0.0

# Bounded FIFO containers: transitions for training, and recent episode returns
# (the latter starts out holding two zeros and keeps at most 100 entries).
replay_buffer = deque(maxlen=BUFFER_SIZE)
rew_buffer = deque((0, 0), maxlen=100)

# Build the online network first, then the target network, and start the
# target off as an exact copy of the online weights.
online_net, target_net = Network(env), Network(env)

target_net.load_state_dict(online_net.state_dict())



<All keys matched successfully>

In [13]:
# Initialize the replay buffer with transitions gathered by a uniformly random policy.
# With the 5-value step() API used below, reset() returns (observation, info), so the
# observation has to be unpacked before it is stored or handed to the network.
obs, info = env.reset()
for i in range(MIN_REPLAY_SIZE):
    action = env.action_space.sample()
    next_obs, reward, done, truncated, info = env.step(action)

    # Only `done` (true termination) is stored: it is what masks the bootstrap
    # term in the target further down.
    replay_buffer.append((obs, action, reward, next_obs, done))

    obs = next_obs

    # The episode is over on termination OR truncation; both require a reset.
    if done or truncated:
        obs, info = env.reset()

# Training
obs, info = env.reset()

for step in itertools.count():
    # Linearly interpolate epsilon from EPSILON_START to EPSILON_END over EPSILON_DECAY steps
    # (np.interp clamps to EPSILON_END afterwards).
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])

    rnd_sample = random.random()  # uniform random number in [0, 1)

    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
    if rnd_sample < epsilon:
        action = env.action_space.sample()
    else:
        action = online_net.act(obs)  # greedy action for the current observation

    next_obs, reward, done, truncated, info = env.step(action)

    replay_buffer.append((obs, action, reward, next_obs, done))

    obs = next_obs

    episode_reward += reward

    if done or truncated:
        obs, info = env.reset()

        rew_buffer.append(episode_reward)
        episode_reward = 0.0

    # Sample a minibatch of stored transitions and stack each field into an array.
    transitions = random.sample(replay_buffer, BATCH_SIZE)

    obses = np.asarray([t[0] for t in transitions])
    actions = np.asarray([t[1] for t in transitions])
    rews = np.asarray([t[2] for t in transitions])
    dones = np.asarray([t[4] for t in transitions])
    new_obses = np.asarray([t[3] for t in transitions])

    obses_t = torch.as_tensor(obses, dtype=torch.float32)
    actions_t = torch.as_tensor(actions, dtype=torch.int64)
    rews_t = torch.as_tensor(rews, dtype=torch.float32)
    dones_t = torch.as_tensor(dones, dtype=torch.float32)
    new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)

    # Bootstrapped targets from the target network. The max is taken WITHOUT
    # keepdim so the result is (batch,), matching rews_t / dones_t; a (batch, 1)
    # tensor here would silently broadcast the target to (batch, batch).
    with torch.no_grad():
        target_q_values = target_net(new_obses_t)
        max_target_q_values = target_q_values.max(dim=1)[0]

        targets = rews_t + GAMMA * max_target_q_values * (1 - dones_t)

    q_values = online_net(obses_t)

    # Q-value of the action actually taken in each sampled transition, shape (batch,).
    action_q_values = torch.gather(input=q_values, dim=1, index=actions_t.unsqueeze(-1)).squeeze(-1)

    # NOTE(review): the cell ends here -- no loss is computed from
    # (action_q_values, targets), no optimizer step is taken, and target_net is
    # never re-synced, so this loop does not train anything yet and never exits.


KeyboardInterrupt: 