In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import time
import os

# Load the bar data from CSV and parse the timestamp column.
data = pd.read_csv('D:\\dados\\bar_M15_V03_data_01-01-2023_a_31-08-2024.csv')
data['DateTime'] = pd.to_datetime(data['DateTime'])

# "Valor" is a copy of "Close" taken BEFORE scaling, so it keeps the raw price.
data['Valor'] = data['Close']

# Columns rescaled by the MinMaxScaler ("Valor" and "Gatilho" are left out).
cols_to_normalize = [
    # candle prices, volume and candle geometry
    'Open', 'High', 'Low', 'Close', 'Volume',
    'PavioSuperior', 'PavioInferior', 'Corpo', 'Range',
    # simple moving averages
    'SMA4', 'SMA8', 'SMA12', 'SMA20', 'SMA50', 'SMA100', 'SMA200',
    # oscillators
    'StochasticoK', 'StochasticoD', 'RSI',
    'MACD', 'MACDSignal', 'MACDHistogram',
    # average true range and day-level open/high/low
    'atr8', 'atr14', 'atr28', 'dayO', 'dayH', 'dayL',
]
scaler = MinMaxScaler()
data[cols_to_normalize] = scaler.fit_transform(data[cols_to_normalize])

# Cast the scaled columns and "Valor" to float32 to avoid dtype mismatches later.
data = data.astype(dict.fromkeys(cols_to_normalize + ['Valor'], 'float32'))


In [2]:
import gym
from gym import spaces

class TradingEnv(gym.Env):
    """Bar-by-bar trading environment over a pre-computed feature table.

    The environment walks ``data`` one row per step. Three columns play a
    special role and are excluded from the observation:

    * ``Valor``    - price used to open and close positions;
    * ``DateTime`` - timestamp recorded in the trade log;
    * ``Gatilho``  - trigger flag: trading is only allowed on rows where it
      equals 1, and any open position is closed automatically on rows where
      it does not.

    Every other column is part of the observation, followed by the current
    position (0 = flat, 1 = long, -1 = short).

    Actions: 0 = hold, 1 = buy, 2 = sell. Buying while short, or selling while
    long, closes the position; buying or selling while flat opens one.

    Reward: ``-TRADE_COST`` when a position is opened; the price difference in
    the direction of the trade minus ``TRADE_COST`` when it is closed; 0
    otherwise.

    Args:
        data: pandas DataFrame containing at least the ``Valor``, ``DateTime``
            and ``Gatilho`` columns. The index is reset, so rows are addressed
            positionally.
    """

    # Fee charged as negative reward on opening, and subtracted from the
    # realised profit on closing.
    TRADE_COST = 0.25

    def __init__(self, data):
        super(TradingEnv, self).__init__()
        self.data = data.reset_index(drop=True)
        self.current_step = 0
        self.position = 0  # 0 = flat, 1 = long, -1 = short
        self.entry_price = 0.0
        self.entry_step = None
        self.entry_datetime = None
        # Exit bookkeeping of the most recently closed trade.
        self.exit_price = None
        self.exit_step = None
        self.exit_datetime = None
        self.action_space = spaces.Discrete(3)  # 0 = hold, 1 = buy, 2 = sell

        # One slot per feature column (all columns minus Valor/DateTime/Gatilho)
        # plus one trailing slot for the position.
        n_obs = len(data.columns) - 3 + 1
        # Feature bounds stay at [0, 1] as before (assumes the feature columns
        # were min-max scaled - TODO confirm for columns outside the scaler).
        low = np.zeros(n_obs, dtype=np.float32)
        high = np.ones(n_obs, dtype=np.float32)
        # The position slot can be -1 (short), so its lower bound must be -1.
        low[-1] = -1.0
        self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
        self.trades = []

    def reset(self):
        """Rewind to the first row, clear position and trade log, return the first observation."""
        self.current_step = 0
        self.position = 0
        self.entry_price = 0.0
        self.entry_step = None
        self.entry_datetime = None
        self.exit_price = None
        self.exit_step = None
        self.exit_datetime = None
        self.trades = []
        return self._next_observation()

    def _next_observation(self):
        """Return the feature values of the current row with the position appended, as float32."""
        obs = self.data.iloc[self.current_step].drop(['Valor', 'DateTime', 'Gatilho']).values
        obs = np.append(obs, self.position)
        return obs.astype(np.float32)

    def _open_position(self, direction, price, info):
        """Open a long (``direction=1``) or short (``direction=-1``) position at ``price``.

        Records the entry in ``info['trade']`` and returns the opening cost.
        """
        self.position = direction
        self.entry_price = price
        self.entry_step = self.current_step
        self.entry_datetime = self.data['DateTime'].iloc[self.current_step]
        info['trade'] = {
            'type': 'buy' if direction == 1 else 'sell',
            'entry_step': self.entry_step,
            'entry_price': self.entry_price,
            'entry_datetime': self.entry_datetime
        }
        return self.TRADE_COST

    def _close_position(self, price, info):
        """Close the open position at ``price``.

        Records the exit in ``info['trade']``, appends the completed trade to
        ``self.trades`` and returns the realised profit (net of ``TRADE_COST``).
        """
        was_long = self.position == 1
        self.exit_price = price
        if was_long:
            profit = self.exit_price - self.entry_price - self.TRADE_COST
        else:
            profit = self.entry_price - self.exit_price - self.TRADE_COST
        self.exit_step = self.current_step
        self.exit_datetime = self.data['DateTime'].iloc[self.current_step]
        info['trade'] = {
            'type': 'close_long' if was_long else 'close_short',
            'exit_step': self.exit_step,
            'exit_price': self.exit_price,
            'exit_datetime': self.exit_datetime,
            'profit': profit
        }
        # The trade log labels a trade by the side that OPENED it.
        self.trades.append({
            'type': 'buy' if was_long else 'sell',
            'entry_step': self.entry_step,
            'entry_price': self.entry_price,
            'entry_datetime': self.entry_datetime,
            'exit_step': self.exit_step,
            'exit_price': self.exit_price,
            'exit_datetime': self.exit_datetime,
            'profit': profit
        })
        self.position = 0
        self.entry_step = None
        self.entry_datetime = None
        return profit

    def step(self, action):
        """Apply ``action`` on the current row and advance to the next one.

        Args:
            action: 0 = hold, 1 = buy, 2 = sell.

        Returns:
            Tuple ``(obs, reward, done, info)``. ``done`` becomes True on the
            step taken from the second-to-last row. ``info['trade']`` is set
            whenever a position was opened or closed on this step.

        Raises:
            IndexError: if called when no further row is available (i.e. after
                the episode has finished without calling ``reset``).
        """
        last_row = len(self.data) - 1
        if self.current_step >= last_row:
            raise IndexError("step() called past the end of the data; call reset() first")

        done = self.current_step >= last_row - 1
        reward = 0
        info = {}

        current_price = self.data['Valor'].iloc[self.current_step]
        gatilho = int(self.data['Gatilho'].iloc[self.current_step])

        if gatilho == 1:
            # Trigger active: the agent's action is honoured.
            if action == 1:  # buy
                if self.position == 0:
                    reward -= self._open_position(1, current_price, info)
                elif self.position == -1:
                    reward += self._close_position(current_price, info)
            elif action == 2:  # sell
                if self.position == 0:
                    reward -= self._open_position(-1, current_price, info)
                elif self.position == 1:
                    reward += self._close_position(current_price, info)
            # Any other action (hold) leaves the position untouched.
        elif self.position in (1, -1):
            # Trigger inactive: force-close whatever is open, ignoring the action.
            reward += self._close_position(current_price, info)

        self.current_step += 1

        obs = self._next_observation()
        return obs, reward, done, info


## Bloco 3: Criar o Agente Rainbow DQN usando PyTorch

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import collections
import random
import math
import numpy as np

# Device selection: run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Noisy linear layer
class NoisyLinear(nn.Module):
    """Linear layer whose weight and bias are perturbed by learnable noise.

    The effective parameters in training mode are ``mu + sigma * epsilon``,
    where ``mu`` and ``sigma`` are learned and ``epsilon`` is a noise buffer
    refreshed by :meth:`reset_noise`. The weight noise is the outer product of
    one noise vector per output and one per input, each passed through
    ``sign(x) * sqrt(|x|)``. In eval mode only ``mu`` is used.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        sigma_init: base value for the initial ``sigma`` (divided by
            ``sqrt(in_features)`` for weights and ``sqrt(out_features)`` for
            biases).
    """

    def __init__(self, in_features, out_features, sigma_init=0.017):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        self.sigma_init = sigma_init

        weight_shape = (out_features, in_features)
        self.weight_mu = nn.Parameter(torch.empty(*weight_shape))
        self.weight_sigma = nn.Parameter(torch.empty(*weight_shape))
        self.register_buffer('weight_epsilon', torch.zeros(*weight_shape))

        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer('bias_epsilon', torch.zeros(out_features))

        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        """Initialise ``mu`` uniformly in +-1/sqrt(in_features) and ``sigma`` to constants."""
        bound = 1 / math.sqrt(self.in_features)
        # Weight first, then bias: keeps the order of the random draws.
        for mu in (self.weight_mu, self.bias_mu):
            mu.data.uniform_(-bound, bound)
        self.weight_sigma.data.fill_(self.sigma_init / math.sqrt(self.in_features))
        self.bias_sigma.data.fill_(self.sigma_init / math.sqrt(self.out_features))

    def reset_noise(self):
        """Draw fresh noise for the weight and bias epsilon buffers."""
        noise_in = self._scale_noise(self.in_features)
        noise_out = self._scale_noise(self.out_features)
        self.weight_epsilon.copy_(torch.outer(noise_out, noise_in))
        self.bias_epsilon.copy_(noise_out)

    def forward(self, x):
        """Apply the layer: noisy parameters in training mode, plain ``mu`` otherwise."""
        if not self.training:
            return nn.functional.linear(x, self.weight_mu, self.bias_mu)

        noisy_weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
        noisy_bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        return nn.functional.linear(x, noisy_weight, noisy_bias)

    def _scale_noise(self, size):
        """Return ``size`` standard-normal samples transformed by ``sign(x) * sqrt(|x|)``."""
        raw = torch.randn(size)
        return torch.sign(raw) * torch.sqrt(torch.abs(raw))

# Rainbow DQN network
class RainbowDQN(nn.Module):
    """Dueling Q-network with noisy advantage and value heads.

    A shared linear+ReLU feature layer feeds two heads built from
    ``NoisyLinear`` layers: an advantage head with one output per action and
    a value head with a single output. Q-values are combined as
    ``value + advantage - mean(advantage)``, where the mean is taken over the
    action axis separately for every sample.

    Args:
        obs_size: number of features in one observation.
        n_actions: number of discrete actions.
    """

    def __init__(self, obs_size, n_actions):
        super(RainbowDQN, self).__init__()
        self.feature = nn.Sequential(
            nn.Linear(obs_size, 128),
            nn.ReLU()
        )
        # Advantage stream
        self.advantage = nn.Sequential(
            NoisyLinear(128, 128),
            nn.ReLU(),
            NoisyLinear(128, n_actions)
        )
        # Value stream
        self.value = nn.Sequential(
            NoisyLinear(128, 128),
            nn.ReLU(),
            NoisyLinear(128, 1)
        )

    def forward(self, x):
        """Return Q-values with the same leading shape as ``x`` and ``n_actions`` in the last axis."""
        x = self.feature(x)
        advantage = self.advantage(x)
        value = self.value(x)
        # Dueling combination. The advantage must be centred per sample over the
        # action axis; a global ``advantage.mean()`` would mix every sample of a
        # batch together and bias the Q-values used in the training step.
        q_values = value + advantage - advantage.mean(dim=-1, keepdim=True)
        return q_values

    def reset_noise(self):
        """Resample the noise of every ``NoisyLinear`` layer in the network."""
        for module in self.modules():
            if isinstance(module, NoisyLinear):
                module.reset_noise()

# Prioritized replay buffer
class PrioritizedReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions sampled in proportion to priority.

    Args:
        capacity: maximum number of stored transitions; the oldest entry is
            overwritten once the buffer is full.
        alpha: exponent applied to the priorities before they are normalised
            into sampling probabilities.
    """

    def __init__(self, capacity, alpha=0.6):
        self.capacity, self.alpha = capacity, alpha

        self.buffer = []
        self.pos = 0  # slot that the next push writes to

        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def push(self, *args):
        """Store one transition (the positional arguments) with the current maximum priority."""
        if self.buffer:
            initial_priority = self.priorities.max()
        else:
            # The very first transition has nothing to compare against.
            initial_priority = 1.0

        transition = tuple(args)
        if len(self.buffer) >= self.capacity:
            self.buffer[self.pos] = transition
        else:
            self.buffer.append(transition)

        self.priorities[self.pos] = initial_priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` transitions with probability proportional to ``priority ** alpha``.

        Expects each stored transition to be
        ``(state, action, reward, next_state, done)`` with ``state`` and
        ``next_state`` as tensors that can be concatenated along dim 0.

        Returns:
            ``(states, actions, rewards, next_states, dones, indices, weights)``.
            All but ``indices`` are tensors on ``device``; ``weights`` are the
            importance-sampling weights ``(N * p) ** -beta`` scaled so the
            largest one is 1.
        """
        stored = len(self.buffer)
        if stored == self.capacity:
            active = self.priorities
        else:
            active = self.priorities[:self.pos]

        scaled = active ** self.alpha
        probs = scaled / scaled.sum()

        indices = np.random.choice(stored, batch_size, p=probs)
        samples = [self.buffer[i] for i in indices]

        weights = (stored * probs[indices]) ** (-beta)
        weights = np.asarray(weights / weights.max(), dtype=np.float32)

        columns = tuple(zip(*samples))

        def as_column(values, dtype):
            # Build a (batch, 1) tensor on the target device.
            return torch.tensor(values, dtype=dtype, device=device).unsqueeze(1)

        states = torch.cat(columns[0]).to(device)
        actions = as_column(columns[1], torch.long)
        rewards = as_column(columns[2], torch.float32)
        next_states = torch.cat(columns[3]).to(device)
        dones = as_column(columns[4], torch.float32)
        weights = as_column(weights, torch.float32)

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priority stored at each of ``batch_indices``."""
        for slot, priority in zip(batch_indices, batch_priorities):
            self.priorities[slot] = priority


## Bloco 4: Treinamento do Agente Rainbow DQN

In [4]:
# Training configuration
num_episodes = 100  # number of passes over the data (one episode each)
gamma = 0.99  # discount factor applied to the next-state Q-value in the target
batch_size = 256  # transitions per update; training starts once the buffer holds this many
learning_rate = 5e-4  # Adam learning rate
memory_size = 10000  # replay buffer capacity
target_update = 1000  # the target network is synced when frame_idx is a multiple of this
beta_start = 0.4  # initial beta for the importance-sampling weights
beta_frames = num_episodes * len(data)  # beta is annealed linearly to 1.0 over this many frames
alpha = 0.6  # priority exponent passed to the replay buffer

# Create the environment
env = TradingEnv(data)

# Network input/output sizes are taken from the environment's spaces
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

# Instantiate the online network and a target network that starts as an exact copy
q_net = RainbowDQN(obs_size, n_actions).to(device)
target_net = RainbowDQN(obs_size, n_actions).to(device)
target_net.load_state_dict(q_net.state_dict())

# Define the optimizer (only the online network is optimised)
optimizer = optim.Adam(q_net.parameters(), lr=learning_rate)

# Initialise the prioritized replay buffer
replay_buffer = PrioritizedReplayBuffer(memory_size, alpha=alpha)

# Initialise the list of best episodes (the training loop keeps the top 10 by total reward)
best_episodes = []

# Greedy action selection; exploration comes from the noisy layers of q_net
def select_action(state):
    """Return the index of the largest Q-value that ``q_net`` predicts for ``state``.

    No gradient is tracked. The argmax is taken over the flattened output, so
    ``state`` is expected to hold a single observation.
    """
    with torch.no_grad():
        best = torch.argmax(q_net(state))
    return best.item()

# Training loop.
#
# For every episode the agent walks the whole environment once, storing each
# transition in the prioritized replay buffer and performing one Double-DQN
# update per step once the buffer holds at least `batch_size` transitions.
# After each episode a summary is printed, and the model weights and trade log
# are written to `save_dir` if the episode ranks among the 10 best by total
# reward so far.
save_dir = "4.18.3.1"
os.makedirs(save_dir, exist_ok=True)

beta = beta_start
frame_idx = 0  # Global frame counter used to anneal beta
for episode in range(num_episodes):
    start_time = time.time()
    obs = env.reset()
    obs = torch.FloatTensor(obs).unsqueeze(0).to(device)
    done = False
    total_reward = 0
    steps = 0
    actions_count = {0: 0, 1: 0, 2: 0}
    wins = 0
    losses = 0
    win_total = 0
    lose_total = 0
    trades = []
    current_trade = None

    while not done:
        steps += 1
        frame_idx += 1

        # Select an action
        action = select_action(obs)

        # Apply the action to the environment
        obs_next, reward, done, info = env.step(action)
        obs_next = torch.FloatTensor(obs_next).unsqueeze(0).to(device)

        # Store the transition in the replay memory
        replay_buffer.push(obs, action, reward, obs_next, done)

        # Advance the state
        obs = obs_next

        # Update the per-action counters
        actions_count[action] += 1

        # Accumulate the episode reward
        total_reward += reward

        # Resample the noise of the noisy layers
        q_net.reset_noise()
        target_net.reset_noise()

        # Track trades reported by the environment
        if 'trade' in info:
            trade_info = info['trade']
            if trade_info['type'] in ['buy', 'sell']:
                # A position was opened: start a new trade record
                current_trade = {
                    'type': trade_info['type'],
                    'entry_step': trade_info['entry_step'],
                    'entry_price': trade_info['entry_price'],
                    'entry_datetime': trade_info['entry_datetime'],
                    'exit_step': None,
                    'exit_price': None,
                    'exit_datetime': None,
                    'profit': None
                }
            elif trade_info['type'] in ['close_long', 'close_short']:
                # The open position was closed: complete and log the record
                current_trade['exit_step'] = trade_info['exit_step']
                current_trade['exit_price'] = trade_info['exit_price']
                current_trade['exit_datetime'] = trade_info['exit_datetime']
                current_trade['profit'] = trade_info['profit']
                trades.append(current_trade.copy())
                if current_trade['profit'] > 0:
                    wins += 1
                    win_total += current_trade['profit']
                elif current_trade['profit'] < 0:
                    losses += 1
                    lose_total += current_trade['profit']
                current_trade = None

        # Train once the replay memory holds enough transitions
        if len(replay_buffer.buffer) >= batch_size:
            beta = min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)
            states, actions_batch, rewards_batch, next_states, dones, indices, weights = replay_buffer.sample(batch_size, beta)

            # Current Q-value of the action that was taken
            q_values = q_net(states).gather(1, actions_batch)

            # Double DQN target: the online net picks the action, the target net evaluates it
            with torch.no_grad():
                next_actions = q_net(next_states).argmax(1, keepdim=True)
                next_q_values = target_net(next_states).gather(1, next_actions)
                target_q_values = rewards_batch + gamma * next_q_values * (1 - dones)

            # TD errors become the new priorities (small epsilon keeps them non-zero)
            td_errors = (q_values - target_q_values).detach().cpu().numpy().flatten()
            new_priorities = np.abs(td_errors) + 1e-6
            replay_buffer.update_priorities(indices, new_priorities)

            # Importance-weighted loss
            loss = (weights * nn.functional.mse_loss(q_values, target_q_values, reduction='none')).mean()

            # Optimise the online network
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Sync the target network
            if frame_idx % target_update == 0:
                target_net.load_state_dict(q_net.state_dict())

    # Wall-clock time of the episode
    end_time = time.time()
    episode_time = end_time - start_time

    win_rate = wins / (wins + losses) if (wins + losses) > 0 else 0
    print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward:.2f}, Win Rate: {win_rate:.2f}, "
          f"Wins: {wins}, Losses: {losses}, Steps: {steps}, Time: {episode_time:.2f}s")
    print(f"Ações: Manter={actions_count[0]}, Comprar={actions_count[1]}, Vender={actions_count[2]}")
    print(f"Ganhos Totais: {win_total:.2f}, Perdas Totais: {lose_total:.2f}")

    # state_dict() returns references to the live parameters, which keep
    # changing as training continues. Clone them so the weights stored with
    # this episode stay the ones the episode actually finished with.
    model_snapshot = collections.OrderedDict(
        (name, tensor.detach().clone()) for name, tensor in q_net.state_dict().items()
    )

    # Collect the episode summary
    episode_info = {
        'episode': episode + 1,
        'total_reward': total_reward,
        'win_rate': win_rate,
        'wins': wins,
        'losses': losses,
        'actions_count': actions_count.copy(),
        'win_total': win_total,
        'lose_total': lose_total,
        'steps': steps,
        'episode_time': episode_time,
        'model_state_dict': model_snapshot,
        'trades': trades.copy()
    }

    # Add the episode to the ranking and keep only the top 10
    best_episodes.append(episode_info)
    best_episodes = sorted(best_episodes, key=lambda x: x['total_reward'], reverse=True)[:10]

    # Save model and log if this episode made the top 10. Identity is used
    # instead of `in`, which would fall back to comparing the dicts (and the
    # tensors inside them) by value.
    if any(kept is episode_info for kept in best_episodes):
        model_path = os.path.join(save_dir, f"model_episode_{episode_info['episode']}.pth")
        torch.save(episode_info['model_state_dict'], model_path)
        episode_info['model_path'] = model_path

        # Save the full trade log
        log_path = os.path.join(save_dir, f"log_episode_{episode_info['episode']}.csv")
        trades_df = pd.DataFrame(episode_info['trades'])
        trades_df.to_csv(log_path, index=False)
        episode_info['log_path'] = log_path

        print(f"Modelo e log do episódio {episode_info['episode']} salvos em: {model_path} e {log_path}\n")

# Show the top 10 episodes once training is finished
print("\nTreinamento finalizado.")
print("Top 10 Melhores Episódios:")
for idx, ep in enumerate(best_episodes, 1):
    print(f"Rank {idx}: Episode {ep['episode']}, Total Reward: {ep['total_reward']:.2f}, "
          f"Win Rate: {ep['win_rate']:.2f}, Wins: {ep['wins']}, Losses: {ep['losses']}, "
          f"Ações: {ep['actions_count']}, Steps: {ep['steps']}, Time: {ep['episode_time']:.2f}s")


Episode 1/100, Total Reward: -519.25, Win Rate: 0.50, Wins: 679, Losses: 677, Steps: 36754, Time: 365.83s
Ações: Manter=22467, Comprar=7943, Vender=6344
Ganhos Totais: 20170.00, Perdas Totais: -20349.25
Modelo e log do episódio 1 salvos em: 4.18.3.1\model_episode_1.pth e 4.18.3.1\log_episode_1.csv

Episode 2/100, Total Reward: -291.25, Win Rate: 0.45, Wins: 286, Losses: 345, Steps: 36754, Time: 394.33s
Ações: Manter=26202, Comprar=4110, Vender=6442
Ganhos Totais: 14333.75, Perdas Totais: -14465.50
Modelo e log do episódio 2 salvos em: 4.18.3.1\model_episode_2.pth e 4.18.3.1\log_episode_2.csv

Episode 3/100, Total Reward: -919.00, Win Rate: 0.48, Wins: 347, Losses: 372, Steps: 36754, Time: 395.97s
Ações: Manter=27863, Comprar=3049, Vender=5842
Ganhos Totais: 14505.00, Perdas Totais: -15243.75
Modelo e log do episódio 3 salvos em: 4.18.3.1\model_episode_3.pth e 4.18.3.1\log_episode_3.csv

Episode 4/100, Total Reward: -2070.50, Win Rate: 0.46, Wins: 345, Losses: 404, Steps: 36754, Time: 3