In [2]:
# Bloco 1: Preparar os Dados

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import time

# Load the dataset and parse the timestamp column into datetimes.
data = pd.read_csv('D:\\dados\\bar_M1_data_07-08-2024.csv')
data['DateTime'] = pd.to_datetime(data['DateTime'])

# "Valor" is an unscaled copy of "Close"; it is taken before normalization so
# it keeps the raw price.
data['Valor'] = data['Close']

# Columns rescaled by MinMaxScaler ("Valor" and "Gatilho" are left untouched).
cols_to_normalize = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'PavioSuperior', 'PavioInferior', 'Corpo', 'Range',
    'SMA50', 'SMA100', 'SMA200',
    'StochasticoK', 'StochasticoD', 'RSI',
    'MACD', 'MACDSignal', 'MACDHistogram',
]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[cols_to_normalize])
data[cols_to_normalize] = scaled_values

# Cast the scaled columns and "Valor" to float32 to avoid dtype mismatches
# further down the pipeline.
float32_columns = dict.fromkeys(cols_to_normalize + ['Valor'], 'float32')
data = data.astype(float32_columns)

# Bloco 2: Criar o Ambiente com Gerenciamento de Posição

import gym
from gym import spaces

class TradingEnv(gym.Env):
    """Trading environment that walks a DataFrame one row per step.

    The agent holds at most one unit: flat (0), long (1) or short (-1).
    Actions are 0 = hold, 1 = buy, 2 = sell. Buying while flat opens a long,
    selling while flat opens a short; buying while short or selling while
    long closes the position. Buying while already long (or selling while
    already short) leaves the position unchanged and is treated as a hold.

    The reward of a step is the profit or loss, in raw "Valor" units, of the
    position that was held over the bar that just elapsed, minus
    ``trade_cost`` whenever a position is opened or closed.

    Args:
        data: DataFrame containing the feature columns plus "Valor",
            "DateTime" and "Gatilho" (those three are excluded from the
            observation).
        trade_cost: Flat cost charged on every open and every close.
    """

    # Columns that are not part of the observation vector.
    _NON_FEATURE_COLUMNS = ['Valor', 'DateTime', 'Gatilho']

    def __init__(self, data, trade_cost=0.25):
        super().__init__()
        self.data = data.reset_index(drop=True)
        self.trade_cost = trade_cost
        self.current_step = 0
        self.position = 0  # 0 = flat, 1 = long, -1 = short
        self.action_space = spaces.Discrete(3)  # 0 = hold, 1 = buy, 2 = sell

        # Observation = every column except the three excluded ones, plus the
        # current position appended as the last component.
        obs_dim = len(data.columns) - len(self._NON_FEATURE_COLUMNS) + 1
        low = np.zeros(obs_dim, dtype=np.float32)
        high = np.ones(obs_dim, dtype=np.float32)
        # The position component can be -1 (short), so its lower bound must
        # be -1 rather than 0 for observations to lie inside the space.
        low[-1] = -1.0
        self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)

    def reset(self):
        """Rewind to the first row with a flat position; return the first observation."""
        self.current_step = 0
        self.position = 0
        return self._next_observation()

    def _next_observation(self):
        """Return the current row's features followed by the position, as float32."""
        row = self.data.iloc[self.current_step]
        obs = row.drop(self._NON_FEATURE_COLUMNS).values
        obs = np.append(obs, self.position)  # expose the position to the agent
        return obs.astype(np.float32)

    def step(self, action):
        """Advance one row and apply ``action``.

        Returns:
            A ``(observation, reward, done, info)`` tuple. ``done`` becomes
            True once the last row of the data has been reached.
        """
        self.current_step += 1
        done = self.current_step >= len(self.data) - 1

        # Price move over the bar that just elapsed.
        prices = self.data['Valor']
        price_change = prices.iloc[self.current_step] - prices.iloc[self.current_step - 1]

        # Profit or loss of the position held during that bar. Crediting it
        # up front guarantees it is never dropped, including when the agent
        # repeats the action that matches its open position (buy while long,
        # sell while short), which previously yielded a reward of 0.
        reward = float(self.position * price_change)

        if action == 1:  # buy
            if self.position == 0:
                self.position = 1  # open long
                reward -= self.trade_cost
            elif self.position == -1:
                self.position = 0  # close short
                reward -= self.trade_cost
        elif action == 2:  # sell
            if self.position == 0:
                self.position = -1  # open short
                reward -= self.trade_cost
            elif self.position == 1:
                self.position = 0  # close long
                reward -= self.trade_cost
        # Any other case (hold, or a redundant buy/sell) keeps the position.

        obs = self._next_observation()
        return obs, reward, done, {}

# Bloco 3: Criar o Agente DQN usando PyTorch

import torch
import torch.nn as nn
import torch.optim as optim
import collections
import random

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Build the environment over the prepared dataset.
env = TradingEnv(data)

# Network input and output sizes come from the environment's spaces.
n_actions = env.action_space.n
obs_size = env.observation_space.shape[0]

# Q-network definition.
class DQN(nn.Module):
    """Fully connected Q-network with two 128-unit ReLU hidden layers.

    Maps an observation of size ``obs_size`` to ``n_actions`` outputs.
    """

    def __init__(self, obs_size, n_actions):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(obs_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input ``x``."""
        q_values = self.model(x)
        return q_values

# Build the online Q-network and a target network that starts as an exact copy.
q_net = DQN(obs_size, n_actions).to(device)
target_net = DQN(obs_size, n_actions).to(device)
target_net.load_state_dict(q_net.state_dict())

# Only the online network's parameters are handed to the optimizer.
optimizer = optim.Adam(params=q_net.parameters(), lr=1e-4)

# DQN hyperparameters.
memory_size = 10_000
batch_size = 64
gamma = 0.99
epsilon_start, epsilon_end, epsilon_decay = 1.0, 0.01, 0.995
target_update = 10  # sync the target network every 10 episodes

# Replay memory: a bounded deque that discards its oldest entries when full.
memory = collections.deque(maxlen=memory_size)

# Epsilon-greedy action selection.
def select_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action from {0, 1, 2} is
    returned; otherwise the index of the largest output of ``q_net(state)``.
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice([0, 1, 2])

    # Greedy branch: no gradients are needed for inference.
    with torch.no_grad():
        best_action = q_net(state).argmax()
        return best_action.item()

# Bloco 4: Treinamento do Agente DQN com Logs

# DQN training loop. Each episode is one full pass over the environment's
# data; every step stores a transition in the replay memory and, once the
# memory holds at least `batch_size` transitions, performs one gradient step.
num_episodes = 100  # Number of training episodes
epsilon = epsilon_start

for episode in range(num_episodes):
    obs = env.reset()
    # Add a leading batch dimension so the observation can be fed to the network.
    obs = torch.FloatTensor(obs).unsqueeze(0).to(device)
    done = False
    # Per-episode statistics, reported after the episode ends.
    total_reward = 0
    steps = 0
    actions_count = {0: 0, 1: 0, 2: 0}
    wins = 0
    losses = 0
    win_total = 0
    lose_total = 0

    while not done:
        steps += 1

        # Choose an action with the epsilon-greedy policy
        action = select_action(obs, epsilon)

        # Apply the action in the environment
        obs_next, reward, done, _ = env.step(action)
        obs_next = torch.FloatTensor(obs_next).unsqueeze(0).to(device)

        # Store the transition in the replay memory
        memory.append((obs, action, reward, obs_next, done))

        # Move on to the next state
        obs = obs_next
        total_reward += reward

        # Update the per-action counters
        actions_count[action] += 1

        # Track wins and losses per step: a step with positive reward counts
        # as a win, one with negative reward as a loss; zero-reward steps are
        # ignored.
        if reward > 0:
            wins += 1
            win_total += reward
        elif reward < 0:
            losses += 1
            lose_total += reward

        # Print a log line whenever "Gatilho" equals 1 on the current row
        # NOTE(review): this reads the module-level `data`, not `env.data`;
        # the two are assumed to be row-aligned — confirm.
        gatilho = int(data['Gatilho'].iloc[env.current_step])
        if gatilho == 1:
            print(f"Episode: {episode + 1}, Step: {env.current_step}, Action: {action}, Reward: {reward:.2f}, Position: {env.position}")

        # Train the network once the memory holds enough transitions
        if len(memory) >= batch_size:
            batch = random.sample(memory, batch_size)
            states, actions_batch, rewards_batch, next_states, dones = zip(*batch)

            # Stack the sampled transitions into batch tensors; actions,
            # rewards and done flags become column vectors.
            states = torch.cat(states).to(device)
            actions_batch = torch.tensor(actions_batch, dtype=torch.long, device=device).unsqueeze(1)
            rewards_batch = torch.tensor(rewards_batch, dtype=torch.float32, device=device).unsqueeze(1)
            next_states = torch.cat(next_states).to(device)
            dones = torch.tensor(dones, dtype=torch.float32, device=device).unsqueeze(1)

            # Current Q-value of the action that was actually taken
            q_values = q_net(states).gather(1, actions_batch)

            # Target Q-value from the target network; `(1 - dones)` removes
            # the bootstrap term for terminal transitions.
            with torch.no_grad():
                next_q_values = target_net(next_states).max(1)[0].unsqueeze(1)
            target_q_values = rewards_batch + gamma * next_q_values * (1 - dones)

            # Compute the loss
            loss = nn.MSELoss()(q_values, target_q_values)

            # Optimize the online network
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Epsilon decay
    # NOTE(review): decay is applied once per episode, so with
    # epsilon_decay = 0.995 and 100 episodes epsilon only drops to about 0.61.
    epsilon = max(epsilon_end, epsilon_decay * epsilon)

    # Sync the target network when the 0-based episode index is a multiple of
    # `target_update` (this includes the very first episode).
    if episode % target_update == 0:
        target_net.load_state_dict(q_net.state_dict())

    # Episode summary; the win rate is 0 when no step had a non-zero reward.
    win_rate = wins / (wins + losses) if (wins + losses) > 0 else 0
    print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward:.2f}, Win Rate: {win_rate:.2f}, Wins: {wins}, Losses: {losses}, Epsilon: {epsilon:.4f}, Steps: {steps}")
    print(f"Ações: Manter={actions_count[0]}, Comprar={actions_count[1]}, Vender={actions_count[2]}")
    print(f"Ganhos Totais: {win_total:.2f}, Perdas Totais: {lose_total:.2f}\n")

print("Treinamento finalizado.")


Episode: 1, Step: 630, Action: 1, Reward: 0.00, Position: 1
Episode: 1, Step: 631, Action: 1, Reward: 0.00, Position: 1
Episode: 1, Step: 632, Action: 0, Reward: 0.75, Position: 1
Episode: 1, Step: 633, Action: 0, Reward: 4.50, Position: 1
Episode: 1, Step: 634, Action: 2, Reward: -1.50, Position: 0
Episode: 1, Step: 635, Action: 1, Reward: -0.25, Position: 1
Episode: 1, Step: 636, Action: 0, Reward: -0.75, Position: 1
Episode: 1, Step: 637, Action: 0, Reward: -1.25, Position: 1
Episode: 1, Step: 638, Action: 2, Reward: -0.50, Position: 0
Episode: 1, Step: 639, Action: 0, Reward: 0.00, Position: 0
Episode: 1, Step: 640, Action: 0, Reward: 0.00, Position: 0
Episode: 1, Step: 641, Action: 0, Reward: 0.00, Position: 0
Episode: 1, Step: 642, Action: 0, Reward: 0.00, Position: 0
Episode: 1, Step: 643, Action: 2, Reward: -0.25, Position: -1
Episode: 1, Step: 644, Action: 0, Reward: 1.75, Position: -1
Episode: 1, Step: 645, Action: 1, Reward: -0.25, Position: 0
Episode: 1, Step: 646, Action: 

KeyboardInterrupt: 