In [None]:
# Load the dataset.
# pandas is imported here because this cell runs before the cell that holds
# the notebook's other imports; without it a fresh-kernel "Run All" fails
# with NameError on `pd`.
import pandas as pd

data = pd.read_csv('D:\\dados\\bar_M1_data_07-08-2024.csv')

In [1]:
# Bloco 1: Preparar os Dados

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read the bar file and parse the timestamp column into real datetimes
data = pd.read_csv('D:\\dados\\bar_M1_data_07-08-2024.csv')
data['DateTime'] = pd.to_datetime(data['DateTime'])

# Keep an untouched copy of "Close" under "Valor"; it is deliberately left
# out of the scaling below so raw prices stay available
data['Valor'] = data['Close']

# Columns rescaled with MinMaxScaler ("Valor" and "Gatilho" are not among them)
scaler = MinMaxScaler()
cols_to_normalize = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'PavioSuperior', 'PavioInferior', 'Corpo', 'Range',
    'SMA50', 'SMA100', 'SMA200',
    'StochasticoK', 'StochasticoD', 'RSI',
    'MACD', 'MACDSignal', 'MACDHistogram',
]
data[cols_to_normalize] = scaler.fit_transform(data[cols_to_normalize])

# Cast the scaled columns plus "Valor" to float32 to avoid dtype mismatches
data = data.astype(dict.fromkeys(cols_to_normalize + ['Valor'], 'float32'))

# Uncomment to inspect the first rows of the prepared frame
#print(data.head())

# Bloco 2: Criar o Ambiente

import gym
from gym import spaces

class TradingEnv(gym.Env):
    """Gym environment that walks a bar DataFrame one row per step.

    Actions are ``0`` (hold), ``1`` (buy) and ``2`` (sell). The observation is
    every column of ``data`` except the ones in ``NON_FEATURE_COLUMNS``, as a
    float32 vector. The reward is the change in the ``Valor`` column between
    the previous row and the row just reached: positive for a buy when the
    value rose, positive for a sell when it fell, and ``0.0`` for a hold.

    The feature matrix and the ``Valor`` series are copied into NumPy arrays
    when the environment is constructed, so ``step`` does not rebuild a pandas
    row on every call. Changes made to ``data`` after construction are
    therefore not reflected in observations or rewards.
    """

    # Columns that are not part of the observation vector.
    NON_FEATURE_COLUMNS = ('Valor', 'DateTime', 'Gatilho')

    def __init__(self, data):
        """Store ``data`` and pre-compute the observation and price arrays.

        Args:
            data: DataFrame holding the ``NON_FEATURE_COLUMNS`` plus the
                feature columns. The declared observation bounds are [0, 1];
                this constructor does not verify that the features respect
                them.

        Raises:
            KeyError: if any of ``NON_FEATURE_COLUMNS`` is missing from ``data``.
        """
        super(TradingEnv, self).__init__()
        self.data = data
        self.current_step = 0
        self.action_space = spaces.Discrete(3)  # 0 = hold, 1 = buy, 2 = sell

        # Hoisted out of the per-step path: one float32 matrix of features
        # (rows = bars) and one array of unscaled values used for rewards.
        self._features = data.drop(columns=list(self.NON_FEATURE_COLUMNS)).to_numpy(dtype=np.float32)
        self._prices = data['Valor'].to_numpy()

        # Derive the shape from the actual feature matrix instead of a
        # hard-coded "number of columns minus 3".
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(self._features.shape[1],), dtype=np.float32
        )

    def reset(self):
        """Rewind to the first row and return its observation."""
        self.current_step = 0
        return self._next_observation()

    def _next_observation(self):
        """Return the float32 feature vector of the current row."""
        # Copy so callers cannot mutate the cached feature matrix in place.
        return self._features[self.current_step].copy()

    def step(self, action):
        """Advance one row and score ``action`` against the value change.

        Args:
            action: ``0`` hold, ``1`` buy, ``2`` sell.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``done`` becomes true
            once the last row has been reached and ``info`` is an empty dict.

        Raises:
            IndexError: if called again after the last row was reached.
        """
        self.current_step += 1

        done = self.current_step >= len(self._prices) - 1

        current_price = self._prices[self.current_step]
        previous_price = self._prices[self.current_step - 1]

        # Always a Python float, so hold/buy/sell rewards share one type.
        reward = 0.0
        if action == 1:  # buy: gains when the value rises
            reward = float(current_price - previous_price)
        elif action == 2:  # sell: gains when the value falls
            reward = float(previous_price - current_price)

        obs = self._next_observation()
        return obs, reward, done, {}

# Bloco 3: Criar o Agente

from stable_baselines3 import PPO

# Instantiate the trading environment over the prepared bars
env = TradingEnv(data)

# Build the PPO agent with an MLP policy, requesting the GPU
agent = PPO(
    policy='MlpPolicy',
    env=env,
    device='cuda',
    verbose=0,
)

# Bloco 4: Treinamento com Logs

episodes = 10

# Training budget per episode: one pass over the bars.
steps_per_episode = len(data) - 1

# Extract the trigger column once instead of indexing the DataFrame per step.
gatilho_flags = data['Gatilho'].to_numpy()

# Logged rollouts run on their own environment instance. Stepping the agent's
# training env by hand between learn() calls would leave it in a state that no
# longer matches the observation the agent cached from its last rollout.
eval_env = TradingEnv(data)

for episode in range(episodes):
    # Actually update the policy. predict() alone, as the loop previously did,
    # only samples actions and never trains the agent.
    # reset_num_timesteps=False makes each call continue the same training run.
    agent.learn(total_timesteps=steps_per_episode, reset_num_timesteps=False)

    # Roll the current policy over the data and log the triggered bars.
    obs = eval_env.reset()
    done = False
    while not done:
        action, _states = agent.predict(obs)
        obs, reward, done, _ = eval_env.step(action)

        # Print logs only for rows whose trigger flag equals 1
        gatilho = int(gatilho_flags[eval_env.current_step])
        if gatilho == 1:
            print(f"Episode: {episode + 1}, Step: {eval_env.current_step}, Action: {action}, Reward: {reward}")

print("Treinamento finalizado.")




Episode: 1, Step: 630, Action: 0, Reward: 0
Episode: 1, Step: 631, Action: 1, Reward: -2.75
Episode: 1, Step: 632, Action: 1, Reward: 0.75
Episode: 1, Step: 633, Action: 1, Reward: 4.5
Episode: 1, Step: 634, Action: 0, Reward: 0
Episode: 1, Step: 635, Action: 1, Reward: -2.5
Episode: 1, Step: 636, Action: 2, Reward: 0.75
Episode: 1, Step: 637, Action: 1, Reward: -1.25
Episode: 1, Step: 638, Action: 1, Reward: -0.25
Episode: 1, Step: 639, Action: 2, Reward: -0.75
Episode: 1, Step: 640, Action: 2, Reward: -1.5
Episode: 1, Step: 641, Action: 1, Reward: 2.25
Episode: 1, Step: 642, Action: 2, Reward: -2.75
Episode: 1, Step: 643, Action: 2, Reward: -4.75
Episode: 1, Step: 644, Action: 0, Reward: 0
Episode: 1, Step: 645, Action: 1, Reward: 0.0
Episode: 1, Step: 646, Action: 0, Reward: 0
Episode: 1, Step: 647, Action: 1, Reward: 0.25
Episode: 1, Step: 648, Action: 2, Reward: -2.5
Episode: 1, Step: 649, Action: 2, Reward: 4.25
Episode: 1, Step: 650, Action: 2, Reward: -2.25
Episode: 1, Step: 65

KeyboardInterrupt: 