In [6]:
# Bloco 1: Preparar os Dados

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import time

# Load the dataset and parse the timestamp column into datetimes
data = pd.read_csv('D:\\dados\\bar_M1_data_07-08-2024.csv')
data['DateTime'] = pd.to_datetime(data['DateTime'])

# "Valor" keeps an unscaled copy of "Close"; it is created before the scaling
# step below and is not part of the scaled column list
data['Valor'] = data['Close']

# Columns rescaled by MinMaxScaler ("Valor" and "Gatilho" are left untouched)
cols_to_normalize = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'PavioSuperior', 'PavioInferior', 'Corpo', 'Range',
    'SMA50', 'SMA100', 'SMA200',
    'StochasticoK', 'StochasticoD', 'RSI',
    'MACD', 'MACDSignal', 'MACDHistogram',
]
scaler = MinMaxScaler().fit(data[cols_to_normalize])
data[cols_to_normalize] = scaler.transform(data[cols_to_normalize])

# Cast the scaled columns and "Valor" to float32 so downstream code sees a
# single floating-point type
data = data.astype(dict.fromkeys([*cols_to_normalize, 'Valor'], 'float32'))

# Bloco 2: Criar o Ambiente

import gym
from gym import spaces

class TradingEnv(gym.Env):
    """Bar-by-bar trading environment backed by a pandas DataFrame.

    Every row of ``data`` is one time step. The observation for a step is that
    row without the ``Valor``, ``DateTime`` and ``Gatilho`` columns, as a
    float32 vector. Actions are 0 = hold, 1 = buy, 2 = sell.

    Rewards are computed from the ``Valor`` column: buying earns the change
    from the previous row to the current one, selling earns the opposite
    change, and both pay a fixed cost of ``TRADE_COST``. Holding earns 0.

    The feature matrix and the price column are converted to NumPy arrays once
    in ``__init__`` so that ``step`` does not go through pandas row extraction
    on every call. Changes made to ``self.data`` after construction are
    therefore not reflected in observations or rewards.
    """

    # Columns that are never part of the observation vector
    NON_FEATURE_COLUMNS = ('Valor', 'DateTime', 'Gatilho')
    # Fixed cost subtracted from the reward of every buy or sell
    TRADE_COST = 0.25

    def __init__(self, data):
        """Build the environment.

        Args:
            data: DataFrame containing ``Valor``, ``DateTime`` and ``Gatilho``
                plus the feature columns; every feature column must be
                convertible to float32.

        Raises:
            KeyError: if one of the non-feature columns is missing.
        """
        super(TradingEnv, self).__init__()
        self.data = data.reset_index(drop=True)
        self.current_step = 0

        # Hoisted out of the per-step path: one conversion for the whole run
        self._features = self.data.drop(
            columns=list(self.NON_FEATURE_COLUMNS)
        ).to_numpy(dtype=np.float32)
        self._prices = self.data['Valor'].to_numpy()

        self.action_space = spaces.Discrete(3)  # 0 = hold, 1 = buy, 2 = sell
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(self._features.shape[1],), dtype=np.float32
        )

    def reset(self):
        """Rewind to the first row and return its observation."""
        self.current_step = 0
        return self._next_observation()

    def _next_observation(self):
        """Return a fresh float32 copy of the feature vector at the current step.

        Raises:
            IndexError: if ``current_step`` is past the last row.
        """
        # Copy so callers cannot modify the cached feature matrix in place
        return self._features[self.current_step].copy()

    def step(self, action):
        """Advance one row and score ``action`` on the resulting price move.

        Args:
            action: 0 (hold), 1 (buy) or 2 (sell).

        Returns:
            Tuple ``(obs, reward, done, info)``: ``obs`` is the observation of
            the new row, ``done`` becomes True once the new row is the last
            one, and ``info`` is always an empty dict.

        Raises:
            IndexError: if called again after the last row has been reached.
        """
        self.current_step += 1

        done = self.current_step >= len(self.data) - 1
        reward = 0

        current_price = self._prices[self.current_step]
        previous_price = self._prices[self.current_step - 1]
        if action == 1:  # buy: profit when the price rose
            reward = (current_price - previous_price) - self.TRADE_COST
        elif action == 2:  # sell: profit when the price fell
            reward = (previous_price - current_price) - self.TRADE_COST

        obs = self._next_observation()
        return obs, reward, done, {}

# Bloco 3: Criar o Agente usando PyTorch

import torch
import torch.nn as nn
import torch.optim as optim

# Device selection: use CUDA when it is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the environment from the prepared dataset
env = TradingEnv(data)

# Network input/output sizes are read from the environment's spaces
n_actions = env.action_space.n
obs_size = env.observation_space.shape[0]

# Policy network definition
class PolicyNetwork(nn.Module):
    """Fully connected network producing one output per action.

    Two hidden layers of 128 units with ReLU activations are followed by a
    linear layer of width ``n_actions``; no softmax is applied here.
    """

    def __init__(self, obs_size, n_actions):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(obs_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw outputs."""
        outputs = self.model(x)
        return outputs

# Value network definition
class ValueNetwork(nn.Module):
    """Fully connected network producing a single scalar per observation.

    Two hidden layers of 128 units with ReLU activations are followed by a
    linear layer of width 1.
    """

    def __init__(self, obs_size):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(obs_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        estimate = self.model(x)
        return estimate

# Build both networks, then move each one to the selected device
policy_net = PolicyNetwork(obs_size, n_actions)
policy_net.to(device)
value_net = ValueNetwork(obs_size)
value_net.to(device)

# One Adam optimiser per network; the value network uses the larger step size
policy_optimizer = optim.Adam(params=policy_net.parameters(), lr=0.0001)
value_optimizer = optim.Adam(params=value_net.parameters(), lr=0.001)

# Bloco 4: Treinamento com Logs usando PyTorch

from torch.distributions import Categorical
import random

# Additional hyperparameters
episodes = 1000          # Total number of training episodes
timestep = 0             # Steps collected since the last policy update
update_timestep = 2000   # Number of collected steps that triggers a policy update
gamma = 0.99             # Discount factor applied to future rewards
K_epochs = 4             # Optimisation passes over each collected rollout
eps_clip = 0.2           # PPO clipping range for the probability ratio
epsilon = 0.1            # Probability of taking a uniformly random action


def _discounted_returns(rewards, dones, gamma, bootstrap_value=0.0):
    """Compute the discounted return of every step in a rollout.

    Args:
        rewards: per-step rewards as Python floats, oldest first.
        dones: per-step episode-end flags, aligned with ``rewards``.
        gamma: discount factor.
        bootstrap_value: estimate of the return that follows the last stored
            step; pass 0.0 when the rollout ends with the episode.

    Returns:
        List of floats with the same length and order as ``rewards``.
    """
    returns = []
    running_return = bootstrap_value
    for step_reward, step_done in zip(reversed(rewards), reversed(dones)):
        if step_done:
            # Nothing after an episode end contributes to this step's return
            running_return = 0.0
        running_return = step_reward + gamma * running_return
        returns.append(running_return)
    # Built back-to-front; a single reverse avoids repeated front insertions
    returns.reverse()
    return returns


def _ppo_update(states, actions, old_log_probs, returns):
    """Run ``K_epochs`` clipped-surrogate updates on one rollout.

    Uses the module-level ``policy_net``, ``value_net``, their optimisers,
    ``K_epochs``, ``eps_clip`` and ``device``.

    Args:
        states: list of observation tensors.
        actions: list of 0-dim long tensors holding the actions taken.
        old_log_probs: list of 0-dim tensors with the log-probability the
            policy assigned to each action when it was taken.
        returns: list of floats produced by ``_discounted_returns``.
    """
    states_tensor = torch.stack(states)
    actions_tensor = torch.stack(actions)
    old_log_probs_tensor = torch.stack(old_log_probs).detach()
    returns_tensor = torch.tensor(returns, dtype=torch.float32, device=device)

    # Advantages are fixed for the whole update, so no gradient is needed.
    # squeeze(-1) keeps the batch dimension even for a one-sample rollout.
    with torch.no_grad():
        advantages = returns_tensor - value_net(states_tensor).squeeze(-1)
    if advantages.numel() > 1:
        # std() of a single sample is NaN, so only normalise real batches
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    mse = nn.MSELoss()
    for _ in range(K_epochs):
        # Each epoch re-evaluates the networks and takes its own optimiser
        # step; otherwise the K passes would all produce the same gradient.
        dist = Categorical(logits=policy_net(states_tensor))
        new_log_probs = dist.log_prob(actions_tensor)

        ratio = (new_log_probs - old_log_probs_tensor).exp()
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()

        policy_optimizer.zero_grad()
        policy_loss.backward()
        policy_optimizer.step()

        value_loss = mse(value_net(states_tensor).squeeze(-1), returns_tensor)

        value_optimizer.zero_grad()
        value_loss.backward()
        value_optimizer.step()


for episode in range(episodes):
    start_time = time.time()
    obs = env.reset()
    obs = torch.FloatTensor(obs).to(device)
    done = False
    total_reward = 0
    actions_count = {0: 0, 1: 0, 2: 0}
    wins = 0
    losses = 0
    win_total = 0
    lose_total = 0

    # Rollout memory, cleared after every policy update
    states = []
    actions = []
    rewards = []
    dones = []
    log_probs = []

    while not done:
        timestep += 1

        # Acting needs no gradients; the update re-evaluates the stored states
        with torch.no_grad():
            dist = Categorical(logits=policy_net(obs))
            if random.uniform(0, 1) < epsilon:
                # Exploration: uniformly random action
                action = torch.tensor(random.choice([0, 1, 2]), dtype=torch.long, device=device)
            else:
                action = dist.sample()
            # Always record the policy's own log-probability of the action
            # taken, so the PPO ratio is meaningful for random actions too
            log_prob = dist.log_prob(action)

        action_int = action.item()
        obs_next, reward, done, _ = env.step(action_int)
        obs_next = torch.FloatTensor(obs_next).to(device)

        # Store the transition
        states.append(obs)
        actions.append(action)
        rewards.append(float(reward))
        dones.append(bool(done))
        log_probs.append(log_prob)

        # Per-episode statistics
        actions_count[action_int] += 1
        total_reward += reward
        if reward > 0:
            wins += 1
            win_total += reward
        elif reward < 0:
            losses += 1
            lose_total += reward

        # Print a step log only on rows where the "Gatilho" column equals 1
        gatilho = int(data['Gatilho'].iloc[env.current_step])
        if gatilho == 1:
            print(f"Episode: {episode + 1}, Step: {env.current_step}, Action: {action_int}, Reward: {reward}")

        # Move on to the next state
        obs = obs_next

        # Update when enough steps were collected or the episode ended
        if timestep % update_timestep == 0 or done:
            if done:
                bootstrap_value = 0.0
            else:
                # Rollout cut mid-episode: estimate the remaining return with
                # the value network instead of assuming it is zero
                with torch.no_grad():
                    bootstrap_value = value_net(obs).item()

            returns = _discounted_returns(rewards, dones, gamma, bootstrap_value)
            _ppo_update(states, actions, log_probs, returns)

            # Clear the rollout memory
            states = []
            actions = []
            rewards = []
            dones = []
            log_probs = []
            timestep = 0

    win_rate = wins / (wins + losses) if (wins + losses) > 0 else 0
    end_time = time.time()
    training_time = end_time - start_time
    print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Win Rate: {win_rate:.2f}, Win Total: {win_total}, Lose Total: {lose_total}, Ação 0: {actions_count[0]}, Ação 1: {actions_count[1]}, Ação 2: {actions_count[2]}, Wins: {wins}, Losses: {losses}")
    print(f"Tempo de Treinamento: {training_time:.2f} segundos")

print("Treinamento finalizado.")




Episode: 1, Step: 630, Action: 2, Reward: 0.5
Episode: 1, Step: 631, Action: 2, Reward: 2.5
Episode: 1, Step: 632, Action: 2, Reward: -1.0
Episode: 1, Step: 633, Action: 2, Reward: -4.75
Episode: 1, Step: 634, Action: 2, Reward: 1.0
Episode: 1, Step: 635, Action: 2, Reward: 2.25
Episode: 1, Step: 636, Action: 2, Reward: 0.5
Episode: 1, Step: 637, Action: 2, Reward: 1.0
Episode: 1, Step: 638, Action: 2, Reward: 0.0
Episode: 1, Step: 639, Action: 2, Reward: -1.0
Episode: 1, Step: 640, Action: 2, Reward: -1.75
Episode: 1, Step: 641, Action: 0, Reward: 0
Episode: 1, Step: 642, Action: 2, Reward: -3.0
Episode: 1, Step: 643, Action: 2, Reward: -5.0
Episode: 1, Step: 644, Action: 2, Reward: 1.5
Episode: 1, Step: 645, Action: 2, Reward: -0.25
Episode: 1, Step: 646, Action: 2, Reward: -0.75
Episode: 1, Step: 647, Action: 2, Reward: -0.5
Episode: 1, Step: 648, Action: 2, Reward: -2.75
Episode: 1, Step: 649, Action: 2, Reward: 4.0
Episode: 1, Step: 650, Action: 2, Reward: -2.5
Episode: 1, Step: 6

KeyboardInterrupt: 