In [None]:
from gymnasium.envs.registration import register
import gymnasium as gym
from stable_baselines3 import *

In [None]:
# Inspect the training curves with: tensorboard --logdir ./ppo_custom_env_tensorboard
register(id="my_envs/CustomGridWorld-v0", entry_point="my_envs:CustomGridWorldEnv", max_episode_steps=100)

# Instantiate the registered grid world with on-screen rendering enabled.
env = gym.make("my_envs/CustomGridWorld-v0", render_mode="human")

# Train an A2C agent with the multi-input policy, logging to TensorBoard,
# then write the checkpoint to "ppo_custom_env".
model = A2C(
    policy='MultiInputPolicy',
    env=env,
    verbose=1,
    tensorboard_log="./ppo_custom_env_tensorboard/",
)
model.learn(total_timesteps=100_000, tb_log_name="first_run", log_interval=10)
model.save("ppo_custom_env")

In [None]:
# Display the trained model's policy object as the cell output.
model.policy

In [None]:
# # save the model
# model.save("ppo_custom_env")

# # load the model
# model = PPO.load("ppo_custom_env")

In [None]:
# The "ppo_custom_env" checkpoint is written by the A2C training cell, so it is
# restored with the same algorithm class (the file name is historical).
model = A2C.load("ppo_custom_env")

# Roll the restored policy out in the rendered environment, starting a new
# episode whenever the current one terminates or is truncated.
observation, info = env.reset(seed=42)
for _ in range(3000*5):
    action_arr, _states = model.predict(observation)
    # predict() returns an array; the environment is stepped with a plain scalar.
    action = action_arr.item()
    observation, reward, terminated, truncated, info = env.step(action)
    env.render()

    if terminated or truncated:
        observation, info = env.reset()

env.close()

In [None]:
import gymnasium as gym
from gymnasium.envs.registration import register
from stable_baselines3 import *
from copy import deepcopy
from gymnasium.envs.registration import register
import pandas as pd
import talib

# TODO: define `path` (location of the price CSV) before running this cell.
index_name = "Time"
df = pd.read_csv(path, index_col=index_name, parse_dates=True)

# 200-period simple moving average of the close, stored as the "SMA" column;
# rows that contain NaN in any column are dropped afterwards.
prices = df["Close"].values
sma = talib.SMA(prices, timeperiod=200)
df = df.assign(SMA=sma).dropna()


In [None]:
# Display the prepared price DataFrame as the cell output.
df

In [None]:


# Register the external trading environment together with default constructor
# arguments covering the whole prepared frame.
register(
    id='trade_env/CustomForex-v0',
    entry_point='trade_env:CustomForexEnv',
    kwargs=dict(df=deepcopy(df), window_size=24, frame_bound=(24, len(df))),
)

# Build one instance; the keyword arguments given here are passed to gym.make
# explicitly (a 10-step window over rows up to 3000).
env = gym.make('trade_env/CustomForex-v0', df=deepcopy(df), window_size=10, frame_bound=(10, 3000))

In [None]:

# from stable_baselines3 import A2C
from sb3_contrib import RecurrentPPO
# NOTE(review): policy_kwargs is defined here but is not passed to the model
# below, so training uses the default network layout — confirm whether that is intended.
policy_kwargs = {
    "net_arch": [64, 'lstm', {"vf": [128, 128, 128], "pi": [64, 64]}],
}

# Train a recurrent (LSTM) PPO agent on the trading environment with TensorBoard logging.
model = RecurrentPPO(
    'MlpLstmPolicy',
    env,
    verbose=1,
    tensorboard_log="./trade_env_tensorboard/",
)
model.learn(total_timesteps=30_000, tb_log_name="trade_name", log_interval=10)

# model = PPO('MlpPolicy', env, verbose=1)
# model.learn(total_timesteps=1_000_000)

In [None]:

import matplotlib.pyplot as plt

import numpy as np

observation, info = env.reset(seed = 2)

# A recurrent policy keeps its memory outside predict(): the LSTM state returned
# by one call must be fed into the next one, and episode_start tells the policy
# when that state has to be reset (only on the first step of the episode here).
lstm_states = None
episode_start = np.ones((1,), dtype=bool)

while True:
    action, lstm_states = model.predict(
        observation,
        state=lstm_states,
        episode_start=episode_start,
    )
    observation, reward, terminated, truncated, info = env.step(action)
    episode_start = np.zeros((1,), dtype=bool)

    # Stop on either kind of episode end; breaking on `terminated` alone would
    # keep stepping an environment whose episode was truncated.
    if terminated or truncated:
        print("info:", info)
        break

plt.cla()
# render_all is defined on the underlying environment, so call it on the
# unwrapped instance rather than relying on wrapper attribute forwarding.
env.unwrapped.render_all()
plt.show()

In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
from copy import deepcopy
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
from enum import Enum
from sklearn.preprocessing import StandardScaler, PowerTransformer
import talib


    
class MyForexEnv(gym.Env):
    """Forex environment with a MultiDiscrete action and at most one open trade.

    An action is ``[actual_state, trade_position, take_profit, stop_loss]``:

    * ``actual_state``: 1 asks to open a trade, 0 stays idle.
    * ``trade_position``: 1 opens a buy, any other value opens a sell.
    * ``take_profit`` / ``stop_loss``: level indices that are converted to price
      offsets of ``(index + 1) * 0.01`` around the entry price.

    Observations are the last ``window_size`` rows of the (SMA, EMA) feature
    matrix. The SMA series is also the price series used for entries, exits
    and rewards.

    NOTE(review): the DataFrame must provide both an 'SMA' and an 'EMA' column;
    confirm the data-preparation cell adds 'EMA' before this class is used.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(self,  df, window_size, frame_bound):
        """Slice the data and create the spaces.

        Args:
            df: DataFrame with 'SMA' and 'EMA' columns.
            window_size: number of rows in one observation.
            frame_bound: ``(start, end)`` row bounds; rows
                ``start - window_size`` up to ``end`` (exclusive) are used.
        """
        assert len(frame_bound) == 2
        super().__init__()

        self.frame_bound = frame_bound
        self.df = df
        self.window_size = window_size
        self.prices, self.signal_features = self._process_data()
        self.shape = (window_size, self.signal_features.shape[1])
        # Finite stand-in for infinity used as the observation bounds.
        INF = 1e9

        # spaces
        self.observation_space = spaces.Box(low=-INF,
                                            high=INF,
                                            shape=self.shape,
                                            dtype=np.float64)
        self.action_space = spaces.MultiDiscrete([2, 2, 4, 2])

        # episode
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1
        self._current_tick = None
        self._last_trade_tick = None
        self._done = None

        self._position = None
        self._position_history = None
        self._total_reward = None
        self._total_profit = None

        self._first_rendering = None
        self._first_time = None

        # Initialised here as well as in reset() so the attribute always exists.
        self.existing_trade = None

        self.history = None
        self.trade_fee = 0.0003
        self._seed()

    def _process_data(self):
        """Return ``(prices, signal_features)`` for the configured frame.

        ``prices`` is the sliced SMA column and ``signal_features`` stacks the
        sliced SMA and EMA columns side by side.
        """
        start, end = self.frame_bound
        frame = slice(start - self.window_size, end)

        sma = self.df.loc[:, 'SMA'].to_numpy()[frame]
        ema = self.df.loc[:, 'EMA'].to_numpy()[frame]

        signal_features = np.column_stack((sma, ema))
        return sma, signal_features

    def _seed(self, seed=None):
        """Create the environment's random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``."""
        print("reset\n")
        super().reset(seed=seed)
        self._done = False
        self._current_tick = self._start_tick
        self._last_trade_tick = self._current_tick - 1

        self._total_reward = 0.
        self._total_profit = 1.  # unit

        self._first_rendering = True
        self._first_time = True

        self.history = {}
        self.existing_trade = None

        observation = self._get_observation()
        info = dict(
            total_reward = self._total_reward,
            total_profit = self._total_profit,
        )
        return observation, info

    def step(self, action):
        """Advance one tick.

        The reward is computed against the trade that was open before this
        step; only afterwards is the trade closed and/or a new one opened.

        Returns:
            ``(observation, step_reward, terminated, truncated, info)`` where
            ``terminated`` becomes True on the last tick and ``truncated`` is
            always False.
        """
        self._current_tick += 1
        self._done = self._current_tick == self._end_tick

        step_reward = self._calculate_reward(action)
        self._total_reward += step_reward
        self._update_profit(action)

        observation = self._get_observation()
        info = dict(
            total_reward = self._total_reward,
            total_profit = self._total_profit,
        )

        return observation, step_reward, self._done, False, info

    def _calculate_reward(self, action):
        """Return the shaped reward for ``action`` at the current tick."""
        actual_state, _trade_position, _take_profit, _stop_loss = action
        current_price = self.prices[self._current_tick]
        step_reward = 0

        if self.existing_trade:
            # A trade is open: reward depends on how the price moved against it.
            existing_trade_position = self.existing_trade['trade_position']
            existing_tp = self.existing_trade['take_profit']
            existing_sl = self.existing_trade['stop_loss']
            entry_price = self.existing_trade['entry_price']

            price_diff = current_price - entry_price
            if existing_trade_position == 1:
                # Buy: take-profit sits above the entry, stop-loss below it.
                if current_price >= existing_tp:
                    step_reward += 1
                elif current_price <= existing_sl:
                    step_reward += 0.5
                elif price_diff >= 0:
                    step_reward += 1
                else:
                    step_reward += 0.5
            else:
                # Sell: take-profit sits below the entry, stop-loss above it.
                if current_price <= existing_tp:
                    step_reward += 1
                elif current_price >= existing_sl:
                    step_reward += 0.5
                elif price_diff <= 0:
                    step_reward += 1
                else:
                    step_reward += 0.5

            # Deductions
            # Asking for a new trade while one is still open costs 0.2.
            if actual_state == 1:
                step_reward -= 0.2
            # The stop-loss-to-take-profit span exceeding twice the take-profit
            # distance means the stop-loss is further from the entry than the
            # take-profit; that costs 0.3.
            if abs(existing_sl - existing_tp) > 2 * abs(existing_tp - entry_price):
                step_reward -= 0.3

        else:
            # No trade is open.
            if actual_state == 0:
                # Staying idle is rewarded less until the first trade of the episode.
                if self._first_time:
                    step_reward += 0.2
                else:
                    step_reward += 0.5
            else:
                step_reward += 0.6

        print("\nstep_reward:", step_reward)
        print("action:", action)
        return step_reward

    def _update_profit(self, action):
        """Close the open trade if a level was hit, then open a new one if requested.

        The realised return of a closed trade is added to ``_total_profit``.
        For a sell the return is the negated percent price change, so a price
        drop counts as a gain and a price rise as a loss.
        """
        actual_state, trade_position, take_profit, stop_loss = action

        if self.existing_trade:
            existing_trade_position = self.existing_trade['trade_position']
            existing_tp = self.existing_trade['take_profit']
            existing_sl = self.existing_trade['stop_loss']
            entry_price = self.existing_trade['entry_price']
            current_price = self.prices[self._current_tick]

            percent_price_diff = (current_price - entry_price) / entry_price
            if existing_trade_position == 1:
                level_hit = current_price >= existing_tp or current_price <= existing_sl
                trade_return = percent_price_diff
            else:
                level_hit = current_price <= existing_tp or current_price >= existing_sl
                trade_return = -percent_price_diff

            if level_hit:
                self._total_profit += trade_return
                # close the trade
                self.existing_trade = None

        if actual_state == 1 and self.existing_trade is None:
            self._first_time = False
            current_price = self.prices[self._current_tick]
            # Price distance represented by one take-profit / stop-loss level.
            delta = 0.01
            tp_offset = (take_profit + 1) * delta
            sl_offset = (stop_loss + 1) * delta
            if trade_position == 1:
                # buy trade
                take_profit_price = current_price + tp_offset
                stop_loss_price = current_price - sl_offset
            else:
                # sell trade
                take_profit_price = current_price - tp_offset
                stop_loss_price = current_price + sl_offset

            self.existing_trade = {
                'trade_position': trade_position,
                'take_profit': take_profit_price,
                'stop_loss': stop_loss_price,
                'entry_price': current_price,
                'trade_tick': self._current_tick,
            }
            print("action:", action)
            print("total_profit:", self._total_profit)
            print("existing_trade:", self.existing_trade)

    def _get_observation(self):
        """Return the ``window_size`` feature rows ending at the current tick."""
        return self.signal_features[(self._current_tick-self.window_size+1):self._current_tick+1]


    
# Environment over frame (500, 1_000) with a 500-row observation window.
# Other frames tried: (1_000, 2_000) and (7_000, 15_000).
env = MyForexEnv(df=deepcopy(df), window_size=500, frame_bound=(500, 1_000))


In [None]:
import numpy as np
import matplotlib.pyplot as plt

import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
from enum import Enum
from sklearn.preprocessing import StandardScaler, PowerTransformer
import talib


class Actions(Enum):
    """Actions available to the agent; the values are the integers of the action space."""
    Sell = 0
    Buy = 1


class Positions(Enum):
    """Market side currently held by the agent."""
    Short = 0
    Long = 1

    def switch(self):
        """Return the opposite position (Long for Short, Short for Long)."""
        if self == Positions.Long:
            return Positions.Short
        return Positions.Long
  
    
class MyForexEnv(gym.Env):
    """Two-action (sell / buy) forex environment that flips between short and long.

    A trade happens when the action is the opposite of the held position (Buy
    while Short, Sell while Long). Rewards and profit are settled only on
    trades, using the price change since the previous trade.
    Observations are the last ``window_size`` rows of the feature matrix.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(self,  df, window_size, frame_bound):
        """Slice the data for ``frame_bound`` and create the spaces."""
        assert len(frame_bound) == 2
        super().__init__()

        # data
        self.frame_bound = frame_bound
        self.df = df
        self.window_size = window_size
        self.prices, self.signal_features = self._process_data()
        feature_count = self.signal_features.shape[1]
        self.shape = (window_size, feature_count)

        # spaces (a large finite bound stands in for infinity)
        bound = 1e9
        self.observation_space = spaces.Box(
            low=-bound,
            high=bound,
            shape=self.shape,
            dtype=np.float64,
        )
        self.action_space = spaces.Discrete(2)

        # episode bookkeeping, filled in by reset()
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1
        self._current_tick = self._last_trade_tick = self._done = None
        self._position = self._position_history = None
        self._total_reward = self._total_profit = None
        self._first_rendering = None
        self.history = None
        self.trade_fee = 0.0003
        self._seed()

    def _seed(self, seed=None):
        """Create the environment's random generator and return ``[seed]``."""
        generator, seed = seeding.np_random(seed)
        self.np_random = generator
        return [seed]

    def reset(self, seed=None, options=None):
        """Start a new episode in the Long position; return ``(observation, info)``."""
        super().reset(seed=seed)

        self._done = False
        self._current_tick = self._start_tick
        self._last_trade_tick = self._current_tick - 1

        # The history is padded with None for the ticks before the episode start.
        self._position = Positions.Long
        self._position_history = [None] * self.window_size + [self._position]

        self._total_reward = 0.
        self._total_profit = 1.  # unit
        self._first_rendering = True
        self.history = {}

        info = {
            "total_reward": self._total_reward,
            "total_profit": self._total_profit,
            "position": self._position.value,
        }
        return self._get_observation(), info

    def _process_data(self):
        """Return ``(prices, signal_features)`` for the configured frame.

        ``prices`` is the sliced Close column; the features are the sliced SMA
        column stacked twice.
        """
        first = self.frame_bound[0] - self.window_size
        last = self.frame_bound[1]

        prices = self.df.loc[:, 'Close'].to_numpy()[first:last]
        sma = self.df.loc[:, 'SMA'].to_numpy()[first:last]
        # Price changes are computed but not part of the returned features.
        price_changes = np.insert(np.diff(prices), 0, 0)

        signal_features = np.column_stack((sma, sma))
        return prices, signal_features

    def step(self, action):
        """Advance one tick and return ``(observation, reward, terminated, truncated, info)``.

        ``truncated`` is always False; ``terminated`` becomes True on the last tick.
        """
        self._current_tick += 1
        self._done = self._current_tick == self._end_tick

        # Reward and profit are settled against the position held before the flip.
        step_reward = self._calculate_reward(action)
        self._total_reward += step_reward
        self._update_profit(action)

        if self._has_traded(action):
            self._position = self._position.switch()
            self._last_trade_tick = self._current_tick
        self._position_history.append(self._position)

        observation = self._get_observation()
        info = {
            "total_reward": self._total_reward,
            "total_profit": self._total_profit,
            "position": self._position.value,
        }

        # Record every info entry per key, creating the lists on the first step.
        if not self.history:
            self.history = {key: [] for key in info}
        for key in info:
            self.history[key].append(info[key])

        return observation, step_reward, self._done, False, info

    def _calculate_reward(self, action):
        """Return the price move since the last trade, scaled by 10000, when a trade happens.

        The move is negated for a Short position; without a trade the reward is 0.
        """
        step_reward = 0
        if self._has_traded(action):
            prev_trade_price = self.prices[self._last_trade_tick]
            current_price = self.prices[self._current_tick]
            price_diff = current_price - prev_trade_price

            # Gains and losses use the same scale for both positions.
            if self._position == Positions.Short:
                step_reward += -price_diff * 10000
            elif self._position == Positions.Long:
                step_reward += price_diff * 10000
        return step_reward

    def _has_traded(self, action):
        """Tell whether ``action`` is the opposite of the currently held position."""
        wants_buy = action == Actions.Buy.value
        wants_sell = action == Actions.Sell.value
        holds_short = self._position == Positions.Short
        holds_long = self._position == Positions.Long
        return (wants_buy and holds_short) or (wants_sell and holds_long)

    def _get_observation(self):
        """Return the ``window_size`` feature rows ending at the current tick."""
        stop = self._current_tick + 1
        return self.signal_features[stop - self.window_size:stop]

    def _update_profit(self, action):
        """Compound ``_total_profit`` by the price move on a trade or at episode end."""
        if not (self._has_traded(action) or self._done):
            return

        current_price = self.prices[self._current_tick]
        prev_trade_price = self.prices[self._last_trade_tick]
        price_diff = current_price - prev_trade_price

        if self._position == Positions.Short:
            self._total_profit = self._total_profit * (1 - price_diff)
        elif self._position == Positions.Long:
            self._total_profit = self._total_profit * (1 + price_diff)
    
# Environment over frame (100, 3000) with a 100-row observation window.
# Another frame tried: (2000, 5000).
env = MyForexEnv(df=deepcopy(df), window_size=100, frame_bound=(100, 3000))


In [None]:
import gym
import pandas as pd
import numpy as np
from gym.utils import seeding
from gym import spaces
from enum import Enum
from typing import List, Dict
from collections import Counter
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and later
# releases dropped the old names, so use the new name whenever it is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

class TradingEnvAction(Enum):
    """Actions of TradingEnv; the integer values are also stored as positions."""
    STAY = 0
    BUY = 1
    SELL = 2
    CLOSE = 3

class TradingEnvTicket:
    """Plain record describing one order: type, open price, exit levels and size."""

    def __init__(self, order_type, open_price, take_profit, stop_loss, lots):
        self.order_type, self.open_price = order_type, open_price
        self.take_profit, self.stop_loss = take_profit, stop_loss
        self.lots = lots
        # Fixed fee attached to every ticket.
        self.trade_fee = 0.0003  # unit

class TradingEnvAccountInformation:
    """Account state: current balance, initial (fixed) balance and pip counters."""

    _FIELDS = ('balance', 'fixed_balance', 'total_pips_buy', 'total_pips_sell')

    def __init__(self, initial_balance):
        # Both balances start at the initial balance; both pip counters start at 10.
        self.balance = self.fixed_balance = initial_balance
        self.total_pips_buy = self.total_pips_sell = 10

    def items(self):
        """Return the account fields as a list of ``(name, value)`` pairs."""
        return [(field, getattr(self, field)) for field in self._FIELDS]


class TradingEnv(gym.Env):
    """Base trading environment written against the legacy ``gym`` API.

    ``reset()`` returns only the observation and ``step()`` returns a 4-tuple.
    ``_process_data`` is not implemented here and must be provided by a
    subclass; it is expected to return ``(prices, signal_features)``.
    Positions are stored as the integer values of ``TradingEnvAction``.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self, df, window_size, frame_bound):
        """Store the data, build the spaces and declare the episode state.

        Args:
            df: two-dimensional data object (asserted via ``df.ndim == 2``).
            window_size: number of rows in one observation.
            frame_bound: pair of bounds; only its length is checked here.
        """
        assert len(frame_bound) == 2

        self.frame_bound = frame_bound

        self.trade_fee_bid_percent = 0.01  # unit
        self.trade_fee_ask_percent = 0.005  # unit

        assert df.ndim == 2

        self.seed()
        self.df = df
        self.window_size = window_size
        self.prices, self.signal_features = self._process_data()
        self.shape = (window_size, self.signal_features.shape[1])

        # spaces
        self.action_space = spaces.Discrete(len(TradingEnvAction))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)

        # episode (all of these are populated by reset())
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1
        self._done = None
        self._current_tick = None
        self._last_trade_tick = None
        self._position = None
        self._position_history = None
        self._total_reward = None
        self._total_profit = None
        self._first_rendering = None
        self.history = None


    def seed(self, seed=None):
        """Create the environment's random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]


    def reset(self):
        """Start a new episode in the STAY position and return the first observation."""
        self._done = False
        self._current_tick = self._start_tick
        self._last_trade_tick = self._current_tick - 1
        self._position = TradingEnvAction.STAY.value
        # Pad the history with None for the ticks before the episode start.
        self._position_history = (self.window_size * [None]) + [self._position]
        self._total_reward = 0.
        self._total_profit = 1.  # unit
        self._first_rendering = True
        self.history = {}
        return self._get_observation()


    def step(self, action):
        """Advance one tick and return ``(observation, step_reward, done, info)``.

        Reward and profit are computed before the position is updated.
        """
        self._done = False
        self._current_tick += 1

        if self._current_tick == self._end_tick:
            self._done = True

        step_reward = self._calculate_reward(action)
        self._total_reward += step_reward

        self._update_profit(action)

        # NOTE(review): any action other than None counts as a trade, so the
        # position and the last trade tick are overwritten on every such step.
        trade = False
        if (action != None):
            trade = True

        if trade:
            self._position = action
            self._last_trade_tick = self._current_tick

        self._position_history.append(self._position)
        observation = self._get_observation()
        # position_predictions holds a reference to the live history list, not a copy.
        info = dict(
            total_reward = self._total_reward,
            total_profit = self._total_profit,           
            position = Counter(self._position_history),
            last_15_position_predictions = self._position_history[-15:],
            position_predictions = self._position_history
        )
        self._update_history(info)

        return observation, step_reward, self._done, info


    def _get_observation(self):
        """Return the ``window_size`` feature rows before the current tick (current tick excluded)."""
        return self.signal_features[(self._current_tick-self.window_size):self._current_tick]


    def _update_history(self, info):
        """Append each ``info`` value to the per-key lists in ``self.history``."""
        if not self.history:
            self.history = {key: [] for key in info.keys()}

        for key, value in info.items():
            self.history[key].append(value)


    def render(self, mode='human'):
        """Plot the price series once, then mark the current position on it."""

        def _plot_position(position, tick):
            """Scatter one marker at ``tick`` whose colour and shape depend on ``position``."""
            color = None
            marker = 'o'

            if position == TradingEnvAction.SELL.value:
                color = 'red'
                marker = 'v'
            elif position == TradingEnvAction.BUY.value:
                color = 'green'
                marker = '^'
            # A second, separate if-chain starts here; it leaves the SELL / BUY
            # choices above untouched because the values are distinct.
            if position == TradingEnvAction.STAY.value:
                color = 'yellow'
                marker = 'o'
            elif position == TradingEnvAction.CLOSE.value:
                color = 'blue'
                marker = 'o'
            elif position == None:
                color = 'purple'                
            if color:
                plt.scatter(tick, self.prices[tick], marker=marker,color=color)

        if self._first_rendering:
            self._first_rendering = False
            plt.cla()
            plt.plot(self.prices)
            start_position = self._position_history[self._start_tick]
            _plot_position(start_position, self._start_tick)

        _plot_position(self._position, self._current_tick)

        plt.suptitle(
            "Total Reward: %.6f" % self._total_reward + ' ~ ' +
            "Total Profit: %.6f" % self._total_profit
        )

        # Short pause so the figure is redrawn between steps.
        plt.pause(0.01)


    def render_all(self, mode='human'):
        """Plot the price series with one marker per recorded position."""
        window_ticks = np.arange(len(self._position_history))
        plt.plot(self.prices)

        # Group the ticks by the position recorded for them; None entries are skipped.
        short_ticks = []
        long_ticks = []
        close_ticks = []
        stay_ticks = []        
        for i, tick in enumerate(window_ticks):
            if self._position_history[i] == TradingEnvAction.SELL.value:
                short_ticks.append(tick)
            elif self._position_history[i] == TradingEnvAction.BUY.value:
                long_ticks.append(tick)
            elif self._position_history[i] == TradingEnvAction.CLOSE.value:
                close_ticks.append(tick)
            elif self._position_history[i] == TradingEnvAction.STAY.value:
                stay_ticks.append(tick)

        
        plt.plot(short_ticks, self.prices[short_ticks], 'rv')
        plt.plot(long_ticks, self.prices[long_ticks], 'g^')
        plt.plot(close_ticks, self.prices[close_ticks], 'bo')
        plt.plot(stay_ticks, self.prices[stay_ticks], 'yo')        

        plt.suptitle(
            "Total Reward: %.6f" % self._total_reward + ' ~ ' +
            "Total Profit: %.6f" % self._total_profit
        )
        
        
    def close(self):
        """Close the current matplotlib figure."""
        plt.close()


    def save_rendering(self, filepath):
        """Save the current matplotlib figure to ``filepath``."""
        plt.savefig(filepath)


    def pause_rendering(self):
        """Show the current matplotlib figure."""
        plt.show()


    def _process_data(self):
        """Subclass hook; ``__init__`` unpacks its result into ``(prices, signal_features)``."""
        raise NotImplementedError


    def _calculate_reward(self, action):
        """Return the reward for ``action``, built from the price move since the last trade tick.

        The independent ``if`` statements below are evaluated in order: the
        first two assign the reward, the later ones add to it, so their order
        affects the result.
        """
        step_reward = 0
        # print("p:",TradingEnvAction(self._position),"a:",TradingEnvAction(action),"-1:",TradingEnvAction(self._position_history[-1]),"c:",list(filter(None,self._position_history)))
        trade = False
        if (self._position != None and action != None):
            trade = True

        if trade:

            current_price = self.prices[self._current_tick]
            last_trade_price = self.prices[self._last_trade_tick]
            price_diff = current_price - last_trade_price
            # print(action,self._position_history[-1:],price_diff)
            # NOTE(review): filter(None, ...) drops every falsy entry, which
            # includes STAY (value 0) as well as None — confirm this is intended.
            if list(filter(None,self._position_history)) == []:
                step_reward = -abs(price_diff)

            # BUY and SELL overwrite whatever was assigned above.
            if (action == TradingEnvAction.BUY.value) or (action == TradingEnvAction.SELL.value):
                step_reward = abs(price_diff)

            if (action == TradingEnvAction.STAY.value) and (self._position_history[-1] == TradingEnvAction.BUY.value) and (current_price > last_trade_price ):
                step_reward += abs(price_diff)/15

            if (action == TradingEnvAction.STAY.value) and (self._position_history[-1] == TradingEnvAction.SELL.value) and (current_price < last_trade_price ):
                step_reward += abs(price_diff)/15

            if (action == TradingEnvAction.CLOSE.value) and (self._position_history[-1] == TradingEnvAction.SELL.value) and (current_price > last_trade_price ):
                step_reward += abs(price_diff)/15

            if (action == TradingEnvAction.CLOSE.value) and (self._position_history[-1] == TradingEnvAction.BUY.value) and (current_price < last_trade_price ):
                step_reward += abs(price_diff)/15                                            

            if (action == TradingEnvAction.STAY.value) and (self._position_history[-1] == TradingEnvAction.CLOSE.value):
                step_reward += -abs(price_diff) 
            
            # NOTE(review): self.prices[-2] is the second-to-last price of the
            # whole series, not the previous tick, and the condition holds
            # whenever the two prices differ — confirm the intent.
            if (action == TradingEnvAction.STAY.value) and (self._position_history[-1] == TradingEnvAction.STAY.value) and ((current_price < self.prices[-2]) or (current_price > self.prices[-2])):
                step_reward += abs(price_diff) 

            if (action == TradingEnvAction.STAY.value) and (self._position_history[-1] == TradingEnvAction.STAY.value):
                step_reward += abs(price_diff) 

    
        return step_reward


    def _update_profit(self, action):
        """Update ``_total_profit`` on a trade or at episode end.

        Only a BUY position changes the profit: the position is valued from the
        last trade price to the current price, with the ask fee applied on
        entry and the bid fee on exit.
        """
        trade = False
        if (self._position != None and action != None):
            trade = True

        if trade or self._done:
            current_price = self.prices[self._current_tick]
            last_trade_price = self.prices[self._last_trade_tick]

            if self._position == TradingEnvAction.BUY.value:
                shares = (self._total_profit * (1 - self.trade_fee_ask_percent)) / last_trade_price
                self._total_profit = (shares * (1 - self.trade_fee_bid_percent)) * current_price



    def max_possible_profit(self):
        """Walk the price series in monotonic runs and accumulate a profit figure.

        Reads ``self._position_history[-1]``, so ``reset()`` must have been
        called first. Also sets ``self.trade_fee`` as a side effect.
        """
        self.trade_fee = 0.0003  # unit
        current_tick = self._start_tick
        last_trade_tick = current_tick - 1
        profit = 1.

        while current_tick <= self._end_tick:
            position = None
            # Advance to the end of the current falling (SELL) or non-falling (BUY) run.
            if self.prices[current_tick] < self.prices[current_tick - 1]:
                while (current_tick <= self._end_tick and
                       self.prices[current_tick] < self.prices[current_tick - 1]):
                    current_tick += 1
                position = TradingEnvAction.SELL.value
            else:
                while (current_tick <= self._end_tick and
                       self.prices[current_tick] >= self.prices[current_tick - 1]):
                    current_tick += 1
                position = TradingEnvAction.BUY.value

            current_price = self.prices[current_tick - 1]
            last_trade_price = self.prices[last_trade_tick]

            # NOTE(review): the branch taken depends on the last recorded
            # position, which does not change inside this loop — confirm the intent.
            if self._position_history[-1] == TradingEnvAction.CLOSE.value:
                if position == TradingEnvAction.SELL.value:
                    quantity = profit * (last_trade_price - self.trade_fee)
                    profit = quantity / current_price

            elif self._position_history[-1] == TradingEnvAction.STAY.value:
                if position == TradingEnvAction.BUY.value:
                    quantity = profit / last_trade_price
                    profit = quantity * (current_price - self.trade_fee)

            last_trade_tick = current_tick - 1

        return profit

# My Work Environment MultiDiscrete Action Space

In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
from copy import deepcopy
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
from enum import Enum
from sklearn.preprocessing import StandardScaler, PowerTransformer
import talib


    
class MyForexEnv(gym.Env):
    """Forex environment in which every episode is one take-profit/stop-loss trade.

    Action (``MultiDiscrete([2, 2, 4, 2])``):
        ``[open_trade, direction, take_profit_level, stop_loss_level]``

        * ``open_trade == 1`` opens a trade when none is open.
        * ``direction == 1`` is a buy; any other value is a sell.
        * The take-profit / stop-loss distance from the entry price is
          ``(level + 1) * LEVEL_STEP``.

    Observation: the last ``window_size`` rows of ``signal_features``
    (columns: SMA value, SMA first difference), ending at the current tick.

    Episode flow: ``step`` returns ``terminated=True`` when the open trade
    hits its take-profit or stop-loss, and ``truncated=True`` when the tick
    reaches the end of the data, at which point the tick is rewound to
    ``_start_tick``. ``reset`` does not rewind the tick.
    NOTE(review): presumably intentional, so consecutive trades walk through
    the data -- confirm.

    NOTE(review): ``_total_profit`` is initialised to 1 and never updated, and
    a stop-loss hit yields a reward of 0 (its penalty was commented out in the
    original).
    """

    metadata = {"render_modes": ["human"]}

    # Price distance between two consecutive take-profit / stop-loss levels.
    LEVEL_STEP = 0.05
    # Multiplier applied to the favourable price move when a take-profit is hit.
    REWARD_SCALE = 10000

    def __init__(self, df, window_size, frame_bound, verbose=False):
        """Build the environment.

        Args:
            df: DataFrame with an ``'SMA'`` column.
            window_size: number of rows in one observation.
            frame_bound: ``(start, end)`` row range of ``df`` to trade on; the
                ``window_size`` rows before ``start`` are kept as history.
            verbose: when True, print reset / trade / end-of-data messages.
        """
        assert len(frame_bound) == 2
        super().__init__()

        self.frame_bound = frame_bound
        self.df = df
        self.window_size = window_size
        self.verbose = verbose
        self.prices, self.signal_features = self._process_data()
        self.shape = (window_size, self.signal_features.shape[1])

        # spaces -- INF is just a large finite bound for the Box
        INF = 1e9
        self.observation_space = spaces.Box(low=-INF,
                                            high=INF,
                                            shape=self.shape,
                                            dtype=np.float64)
        self.action_space = spaces.MultiDiscrete([2, 2, 4, 2])

        # episode bookkeeping
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1

        self._current_tick = self._start_tick
        self._last_trade_tick = None
        self._done = None

        self._position = None
        self._position_history = None
        self._total_reward = None
        self._total_profit = None

        self._first_rendering = None
        self._first_time = None
        self.history = None
        # Defined here so the attribute exists before the first reset().
        self.existing_trade = None

        self.trade_history = []
        self.trade_fee = 0.0003
        self._seed()

    def _process_data(self):
        """Slice the SMA series and build the ``[sma, diff]`` feature matrix.

        Returns:
            tuple: ``(prices, signal_features)``. ``prices`` is the sliced SMA
            series itself (the Close column is not used).
        """
        sma = self.df.loc[:, 'SMA'].to_numpy()
        start, end = self.frame_bound
        # NOTE: start < window_size makes the lower bound negative, which
        # numpy interprets as counting from the end of the array.
        sma = sma[start - self.window_size:end]
        # Prepending the first value makes diff[0] == 0. Prepending 0 would
        # put the raw SMA level into the first row of the diff feature.
        diff = np.diff(sma, prepend=sma[:1])
        signal_features = np.column_stack((sma, diff))
        return sma, signal_features

    def _seed(self, seed=None):
        """Create the environment's random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, seed=None, options=None):
        """Start a new episode at the current tick and return ``(observation, info)``."""
        super().reset(seed=seed)
        if self.verbose:
            print("\nreset")
        self._done = False
        self._total_reward = 0
        self._total_profit = 1

        self._first_rendering = True
        self._first_time = True

        self.history = {}
        self.existing_trade = None
        observation = self._get_observation()
        info = dict(
            total_reward=self._total_reward,
            total_profit=self._total_profit,
        )
        return observation, info

    def step(self, action):
        """Apply ``action`` and advance the tick.

        Opening a trade fast-forwards the tick to the first price that hits
        the trade's take-profit or stop-loss (or to the last tick); otherwise
        the tick advances by one.

        Returns:
            tuple: ``(observation, reward, terminated, truncated, info)``.
        """
        self._done = False
        has_traded = self._update_profit(action)
        if has_traded:
            self._current_tick = self._get_updated_current_tick()
        else:
            self._current_tick += 1

        step_reward, episode_ended = self._calculate_reward(action)
        self._total_reward += step_reward
        info = dict(
            total_reward=self._total_reward,
            total_profit=self._total_profit,
        )

        observation = self._get_observation()
        # End of data: flag truncation and rewind for the next pass.
        if self._current_tick >= self._end_tick:
            self._done = True
            self._current_tick = self._start_tick
            self.trade_history = []
            # A trade entered near the end of the data must not be evaluated
            # against prices from the start of the next pass.
            self.existing_trade = None
            if self.verbose:
                print("done")

        return observation, step_reward, episode_ended, self._done, info

    def _trade_outcome(self, price):
        """Return ``'tp'``, ``'sl'`` or ``None`` for the open trade at ``price``.

        Take-profit is checked before stop-loss.
        """
        trade = self.existing_trade
        if trade['trade_position'] == 1:
            if price >= trade['take_profit']:
                return 'tp'
            if price <= trade['stop_loss']:
                return 'sl'
        else:
            if price <= trade['take_profit']:
                return 'tp'
            if price >= trade['stop_loss']:
                return 'sl'
        return None

    def _get_updated_current_tick(self):
        """Return the first tick at which the open trade hits TP or SL.

        The scan starts at the current tick and stops at ``_end_tick`` if
        neither level is reached. Without an open trade the current tick is
        returned unchanged.
        """
        tick = self._current_tick
        if not self.existing_trade:
            return tick
        while tick < self._end_tick:
            if self._trade_outcome(self.prices[tick]) is not None:
                break
            tick += 1
        return tick

    def _calculate_reward(self, action):
        """Close the open trade if the current price hit TP or SL.

        A take-profit hit is rewarded with the favourable price move times
        ``REWARD_SCALE``: ``current - entry`` for a buy, ``entry - current``
        for a sell. A stop-loss hit is rewarded with 0.

        Returns:
            tuple: ``(step_reward, episode_ended)``.
        """
        step_reward = 0
        episode_ended = False

        if self.existing_trade:
            current_price = self.prices[self._current_tick]
            outcome = self._trade_outcome(current_price)
            if outcome is not None:
                if outcome == 'tp':
                    entry_price = self.existing_trade['entry_price']
                    if self.existing_trade['trade_position'] == 1:
                        gain = current_price - entry_price
                    else:
                        # A short profits when the price falls below the entry.
                        gain = entry_price - current_price
                    step_reward += gain * self.REWARD_SCALE
                episode_ended = True
                self.existing_trade = None

        return step_reward, episode_ended

    def _update_profit(self, action):
        """Open a trade if the action asks for one and none is open.

        Returns:
            bool: True when a trade was opened on this call.
        """
        actual_state, trade_position, take_profit, stop_loss = action
        if actual_state != 1 or self.existing_trade is not None:
            return False

        current_price = self.prices[self._current_tick]
        tp_offset = (take_profit + 1) * self.LEVEL_STEP
        sl_offset = (stop_loss + 1) * self.LEVEL_STEP
        if trade_position == 1:
            # buy: profit above the entry, loss below it
            take_profit_price = current_price + tp_offset
            stop_loss_price = current_price - sl_offset
        else:
            # sell: profit below the entry, loss above it
            take_profit_price = current_price - tp_offset
            stop_loss_price = current_price + sl_offset

        self.existing_trade = {
            'trade_position': trade_position,
            'take_profit': take_profit_price,
            'stop_loss': stop_loss_price,
            'entry_price': current_price,
            'trade_tick': self._current_tick,
        }
        if self.verbose:
            print("action:", action)
            print("existing_trade:", self.existing_trade)
        self.trade_history.append(self.existing_trade)
        return True

    def _get_observation(self):
        """Return the ``window_size`` feature rows ending at the current tick (inclusive)."""
        observation = self.signal_features[(self._current_tick-self.window_size+1):self._current_tick+1]
        return observation


    
# 500 rows of history per observation; trading runs over rows 500..699 of `df`.
# Other ranges tried earlier: (700, 1_000) and (7_000, 15_000).
env = MyForexEnv(df=deepcopy(df), window_size=500, frame_bound=(500, 700))


# My Work Environment Discrete Action Space

In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
from copy import deepcopy
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
from enum import Enum
from sklearn.preprocessing import StandardScaler, PowerTransformer
import talib

class TradingEnvAction(Enum):
    """Actions of the discrete-action environment defined below.

    The number of members sizes the ``Discrete`` action space, and the integer
    values are what the reward code compares actions against.
    """
    # Hold no position / close the current one.
    NEUTRAL = 0
    # Open or keep a long position.
    BUY = 1
    # Open or keep a short position.
    SELL = 2

class TradingPositions(Enum):
    """Position states (flat / long / short).

    NOTE(review): not referenced by the environment defined in this cell --
    confirm whether it is still needed.
    """
    NEUTRAL = 0
    LONG = 1
    SHORT = 2
    

class MyForexEnv(gym.Env):
    """Forex environment with a three-way discrete action (neutral / buy / sell).

    Observation: the ``window_size`` rows of ``signal_features`` (columns: SMA
    value, SMA first difference) that precede the current tick. The reward of
    an action is computed from the price at the current tick, which is not
    part of the observation the action was chosen from.

    Episode flow: ``step`` returns ``terminated=True`` when a position is
    closed or held for more than 10 consecutive steps, and ``truncated=True``
    when the tick reaches the end of the data, at which point the tick is
    rewound to ``_start_tick``. ``reset`` does not rewind the tick.
    NOTE(review): presumably intentional, so consecutive trades walk through
    the data -- confirm.

    NOTE(review) on the reward shaping, which is kept as written:
      * Every in-trade step earns ``log(ratio + 0.01)`` (long) or
        ``log(1 / ratio + 0.01)`` (short). For ratios close to 1 both are
        about ``log(1.01)``, so the sign of the step reward barely depends on
        the direction of the price move.
      * On a losing close the code subtracts ``log(x + 0.01)``. When
        ``x < 0.99`` that logarithm is negative, so the subtraction adds
        reward.
      * ``_total_profit`` is initialised to 1.0 and never updated.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(self, df, window_size, frame_bound, verbose=False):
        """Build the environment.

        Args:
            df: DataFrame with an ``'SMA'`` column.
            window_size: number of rows in one observation.
            frame_bound: ``(start, end)`` row range of ``df`` to trade on; the
                ``window_size`` rows before ``start`` are kept as history.
            verbose: when True, print every action received by ``step``.
        """
        assert len(frame_bound) == 2
        super().__init__()

        self.frame_bound = frame_bound
        self.df = df
        self.window_size = window_size
        self.verbose = verbose
        self.prices, self.signal_features = self._process_data()
        self.shape = (window_size, self.signal_features.shape[1])

        # spaces -- INF is just a large finite bound for the Box
        INF = 1e9
        self.action_space = spaces.Discrete(len(TradingEnvAction))
        self.observation_space = spaces.Box(low=-INF,
                                            high=INF,
                                            shape=self.shape,
                                            dtype=np.float64)

        # episode bookkeeping
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1
        self._neutral_counter = 0
        # Defined here so the attributes exist before the first trade.
        self._trade_counter = 0
        self._entry_price = None

        self._current_tick = self._start_tick
        self._last_trade_tick = None
        self._done = None

        self._position = None
        self._position_history = None
        self._total_reward = None
        self._total_profit = None

        self._first_rendering = None
        self.history = None

        self.trade_history = []
        self.trade_fee = 0.0003
        self._seed()

    def _process_data(self):
        """Slice the SMA series and build the ``[sma, diff]`` feature matrix.

        Returns:
            tuple: ``(prices, signal_features)``. ``prices`` is the sliced SMA
            series itself (the Close column is not used).
        """
        sma = self.df.loc[:, 'SMA'].to_numpy()
        start, end = self.frame_bound
        # NOTE: start < window_size makes the lower bound negative, which
        # numpy interprets as counting from the end of the array.
        sma = sma[start - self.window_size:end]
        # Prepending the first value makes diff[0] == 0. Prepending 0 would
        # put the raw SMA level into the first row of the diff feature.
        diff = np.diff(sma, prepend=sma[:1])
        signal_features = np.column_stack((sma, diff))
        return sma, signal_features

    def _get_observation(self):
        """Return the ``window_size`` feature rows before the current tick (exclusive)."""
        return self.signal_features[(self._current_tick-self.window_size):self._current_tick]

    def _seed(self, seed=None):
        """Create the environment's random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, seed=None, options=None):
        """Start a new episode at the current tick and return ``(observation, info)``."""
        super().reset(seed=seed)
        self._done = False
        self._total_reward = 0.0
        self._total_profit = 1.0
        self._first_rendering = True
        self._position = TradingEnvAction.NEUTRAL.value
        self._position_history = []
        # The position is flat after a reset, so per-episode counters restart.
        self._neutral_counter = 0
        self._trade_counter = 0
        self.history = {}
        observation = self._get_observation()
        info = dict(
            total_reward=self._total_reward,
            total_profit=self._total_profit,
        )
        return observation, info

    def step(self, action):
        """Score ``action`` at the current tick, then advance by one tick.

        Returns:
            tuple: ``(observation, reward, terminated, truncated, info)``.
        """
        if self.verbose:
            print("action:", action)
        self._done = False

        step_reward, episode_ended = self._calculate_reward(action)
        self._total_reward += step_reward

        info = dict(
            total_reward=self._total_reward,
            total_profit=self._total_profit,
        )
        # Advance first, then observe. Building the observation before the
        # increment would return the window the agent has already seen.
        self._current_tick += 1
        observation = self._get_observation()
        if self._current_tick >= self._end_tick:
            # End of data: flag truncation and rewind for the next pass.
            self._done = True
            self._current_tick = self._start_tick
            self._neutral_counter = 0

        return observation, step_reward, episode_ended, self._done, info

    def _calculate_reward1(self, action):
        """Earlier reward variant without idle / holding / switching penalties.

        Not called by ``step``; kept for comparison.

        Returns:
            tuple: ``(reward, episode_ended)``.
        """
        return self._shaped_reward(action, with_penalties=False)

    def _calculate_reward(self, action):
        """Reward used by ``step``: directional log returns plus penalties.

        Penalties: a growing charge for consecutive neutral steps, a growing
        charge for consecutive steps in the same trade (the episode ends with
        an extra -0.5 after more than 10), and -0.5 for flipping directly
        between long and short.

        Returns:
            tuple: ``(reward, episode_ended)``.
        """
        return self._shaped_reward(action, with_penalties=True)

    def _shaped_reward(self, action, with_penalties):
        """Shared implementation of both reward variants.

        Appends ``action`` to ``_position_history`` and dispatches on the pair
        ``(previous action, action)``:

        * neutral -> neutral: +0.01
        * neutral -> side: open; step return; record the entry price
        * side -> same side: hold; step return
        * side -> neutral: close; step return plus/minus a closing term; the
          episode ends
        * side -> other side (only with penalties): -0.5; re-enter at the
          current price

        Returns:
            tuple: ``(float reward, bool episode_ended)``.
        """
        history = self._position_history
        history.append(action)

        NEUTRAL = 0  # action and state of having no position
        BUY = 1      # open / keep a long position
        SELL = 2     # open / keep a short position
        ALPHA = 0.01             # constant added inside every logarithm
        MAX_TRADE_STEPS = 10     # longest allowed run of same-side actions

        tick = self._current_tick
        current_price = self.prices[tick]
        # Tick 0 has no predecessor; 1 is used as a stand-in previous price.
        previous_price = 1 if tick == 0 else self.prices[tick - 1]
        ratio = current_price / previous_price

        previous_action = history[-2] if len(history) > 1 else TradingEnvAction.NEUTRAL.value

        def step_return(side):
            # A long earns on rising prices; a short earns on the inverse ratio.
            if side == BUY:
                return np.log(ratio + ALPHA)
            return np.log(1 / ratio + ALPHA)

        reward = 0.
        episode_ended = False

        idle = action == NEUTRAL and previous_action == NEUTRAL
        if idle:
            reward += 0.01
        if with_penalties:
            if idle:
                # The charge grows with the length of the neutral streak.
                self._neutral_counter += 1
                reward -= 0.001 * self._neutral_counter
            else:
                self._neutral_counter = 0

        # The branches only depend on (action, previous_action), so at most
        # one of them fires across both iterations.
        for side, opposite in ((BUY, SELL), (SELL, BUY)):
            if action == side and previous_action == NEUTRAL:
                # open a position
                reward += step_return(side)
                self._position = side
                self._entry_price = current_price
                if with_penalties:
                    self._trade_counter = 1
            elif action == side and previous_action == side:
                # keep the position
                reward += step_return(side)
                self._position = side
                if with_penalties:
                    self._trade_counter += 1
                    reward -= 0.001 * self._trade_counter
                    if self._trade_counter > MAX_TRADE_STEPS:
                        reward -= 0.5
                        episode_ended = True
            elif action == NEUTRAL and previous_action == side:
                # close the position
                reward += step_return(side)
                self._position = NEUTRAL
                entry_ratio = self._entry_price / previous_price
                if side == BUY:
                    profitable = current_price > self._entry_price
                    closing_term = np.log(ratio / entry_ratio + ALPHA)
                else:
                    profitable = current_price < self._entry_price
                    closing_term = np.log(entry_ratio / ratio + ALPHA)
                if profitable:
                    reward += closing_term
                else:
                    # NOTE(review): closing_term is negative when its argument
                    # is below 1, in which case this subtraction adds reward.
                    reward -= closing_term
                episode_ended = True
                if with_penalties:
                    self._trade_counter = 0
            elif with_penalties and action == side and previous_action == opposite:
                # direct flip between long and short
                reward -= 0.5
                self._position = side
                self._entry_price = current_price
                self._trade_counter = 1

        return float(reward), episode_ended


    
# 500 rows of history per observation; trading runs over rows 500..699 of `df`.
# Other ranges tried earlier: (700, 1_000) and (7_000, 15_000).
env = MyForexEnv(df=deepcopy(df), window_size=500, frame_bound=(500, 700))


In [None]:
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement
import torch as th


# Save a checkpoint every 10_000 steps under ./logs/ as
# trade_rl_model_<num_timesteps>_steps.zip.
checkpoint_callback = CheckpointCallback(
  save_freq=10_000,
  save_path="./logs/",
  name_prefix="trade_rl_model",
  save_replay_buffer=True,
  save_vecnormalize=True,
)

# Separate 128x128 ReLU heads for the policy (pi) and the value function (vf).
policy_kwargs = dict(activation_fn=th.nn.ReLU,
                     net_arch=dict(pi=[128, 128], vf=[128, 128]))

# Alternatives tried earlier:
# policy_kwargs = dict(net_arch=[ dict(vf=[128, 128], pi=[64, 64])])
# policy_kwargs = dict( net_arch=dict(pi=[32, 32], vf=[32, 32]))

# policy_kwargs was previously built but never handed to the model, so the
# architecture above was silently ignored.
model = RecurrentPPO('MlpLstmPolicy', env,
                    #  verbose=1,
                    #  n_steps=256,
                    #  gamma=0.95,
                    #  n_epochs=500,
                    #  target_kl=0.001,
                    #  learning_rate=0.01,
                     policy_kwargs=policy_kwargs,
                     tensorboard_log="./trade_env_tensorboard/")
model.learn(total_timesteps=1_000_000,
            log_interval=10,
            tb_log_name="trade_name",
            callback=checkpoint_callback,
            )

# model = PPO('MlpPolicy', env, verbose=1)
# model.learn(total_timesteps=1_000_000)

In [None]:
from stable_baselines3 import PPO

# NOTE: rebinds `model`, replacing any model created by an earlier cell
# (e.g. the RecurrentPPO one). `model` is bound before learn() so it remains
# available if training is interrupted.
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=1_000_000)

In [None]:
from sb3_contrib import RecurrentPPO

# Resume training from a checkpoint written by `checkpoint_callback`.
# NOTE(review): absolute, machine-specific path -- consider deriving it from
# the callback's save_path ("./logs/") so the notebook runs elsewhere.
model = RecurrentPPO.load("/Users/newuser/Projects/robust-algo-trader/envs/logs/trade_rl_model_1000000_steps.zip", env=env)

# reset_num_timesteps=False keeps counting from the loaded model's timestep.
# With the default (True) the counter restarts at 0, so the checkpoint
# callback would reuse and overwrite the earlier trade_rl_model_<N>_steps
# files, including the one loaded above.
model.learn(total_timesteps=10_000_000,
            log_interval=10,
            tb_log_name="trade_name",
            callback=checkpoint_callback,
            reset_num_timesteps=False,
            )

In [None]:

import matplotlib.pyplot as plt
import numpy as np

# Roll the trained model through one full pass over the data and count how
# many episodes (trades) terminate along the way.
observation, info = env.reset(seed=0)
episode_count = 0
total_reward = 0.0
# Recurrent policies need their hidden state fed back on every call;
# non-recurrent models accept and ignore these arguments.
lstm_state = None
episode_start = np.ones((1,), dtype=bool)

while True:
    action, lstm_state = model.predict(observation,
                                       state=lstm_state,
                                       episode_start=episode_start)
    observation, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    episode_start = np.array([terminated or truncated], dtype=bool)
    if terminated:
        episode_count += 1
    if truncated:
        # End of the data: report and stop.
        print("info:", info)
        break
    if terminated:
        # Stepping a terminated env without reset() is outside the Gymnasium
        # contract (and differs from what the model saw during training).
        # `info` totals restart here, hence the separate running total.
        observation, info = env.reset()
print("episode_count:", episode_count)
print("total_reward:", total_reward)
# plt.cla()
# env.render_all()
# plt.show()

# EUR/USD Dataset Viz

In [None]:
import matplotlib.pyplot as plt
df['SMA'].plot()

In [None]:
df

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import talib

# NOTE(review): absolute, machine-specific path.
real_df = pd.read_table('/Users/newuser/Projects/robust-algo-trader/data/EURUSD/EURUSD_H1_200702210000_202304242100.tsv')

# Work on a copy restricted to the FIRST 1,000 rows, drop the columns that are
# not used, and strip the <...> decoration from the remaining column names.
df = (
    real_df.copy()
    .iloc[:1_000]
    .drop(['<TICKVOL>', '<VOL>', '<SPREAD>'], axis=1)
    .rename(columns={'<DATE>': 'Date',
                     '<TIME>': 'Time',
                     '<OPEN>': 'Open',
                     '<HIGH>': 'High',
                     '<LOW>': 'Low',
                     '<CLOSE>': 'Close'})
)

# Merge the date and time strings into a single timestamp and use it as the
# index (named "Time"); the two source columns are dropped.
timestamps = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%Y%m%d %H:%M:%S.%f')
df = df.drop(['Date', 'Time'], axis=1)
df.index = timestamps.rename('Time')

# 200-period indicators on the close; the "ATR" column holds talib's NATR.
prices = df["Close"].values
df = df.assign(
    SMA=talib.SMA(prices, timeperiod=200),
    EMA=talib.EMA(prices, timeperiod=200),
    ATR=talib.NATR(df['High'], df['Low'], df['Close'], timeperiod=200),
)
# Drop the rows for which the indicators produced NaN.
df = df.dropna()

In [None]:
df

In [None]:
# Keep only the last 70 rows (the original literal -7_0 is -70).
# NOTE: rebinding `df` means re-running this cell operates on the truncated frame.
df = df.tail(70)

In [None]:
df['Close'].plot()
df['SMA'].plot()

# df['EMA'].plot()


In [None]:
df['SMA'].plot()

# add a new column namely smoothed SMA data
# df['smoothed_sma'] = df['SMA'].rolling(window=300).mean()
# df['smoothed_sma'].plot()


In [None]:
df['Close'].plot()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sample y = sin(x) on [0, 2*pi]
x = np.linspace(0, 2*np.pi, 100)
y = np.sin(x)

# Numerical derivative dy/dx on the same grid
grad = np.gradient(y, x)

# Near a stationary point |y'| changes by about one grid step per sample, so
# the sample closest to each zero of y' has |y'| below half a step. A fixed
# tolerance of 0.01 is smaller than that on this grid and selects no points.
tol = 0.5 * (x[1] - x[0])
zero_grad_indices = np.where(np.abs(grad) < tol)

# Plot the curve and its gradient
plt.plot(x, y, label='y = sin(x)')
plt.plot(x, grad, label="y' = cos(x)")

# Mark the samples where the gradient is (numerically) zero
plt.plot(x[zero_grad_indices], y[zero_grad_indices], 'ro', label='zero gradient points')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler, PowerTransformer

scaler = StandardScaler()
copy_df = df.copy()

# Standardise the SMA column only; the scaler expects a 2-D (n_samples, 1) array.
sma_column = copy_df['SMA'].to_numpy().reshape(-1, 1)
copy_df['SMA'] = scaler.fit_transform(sma_column)




In [None]:
copy_df

In [None]:
copy_df['SMA'].plot()


In [None]:
import numpy as np
import pandas as pd

# Parameters of the synthetic sine wave that stands in for the SMA series
frequency = 0.2 # cycles per day
amplitude = 0.3 # peak deviation from the offset
phase = 0 # initial angle (radians)
offset = 1 # vertical shift

# Time range and sampling interval
start_time = "2022-01-01 00:00:00" # first timestamp
end_time = "2023-01-01 00:00:00" # last timestamp
# end_time = "2023-01-31 23:00:00" # alternative end timestamp
time_interval = "1H" # hourly interval

# Hourly timestamps from start_time to end_time
time_index = pd.date_range(start_time, end_time, freq=time_interval)

# Elapsed time in days, converted to an angle: 2*pi*frequency*days + phase
time_radians = 2 * np.pi * frequency * (time_index - time_index[0]) / pd.Timedelta(days=1) + phase

# Sine values oscillating around `offset`
sine_values = amplitude * np.sin(time_radians) + offset

# Frame with the timestamps and the sine values stored under the "SMA" name
df = pd.DataFrame({"Time": time_index, "SMA": sine_values})

# Use the Time column as the index
# NOTE: this rebinds `df`, replacing any frame loaded by earlier cells.
df = df.set_index("Time")

df['SMA'].plot()


In [None]:
df['SMA'].plot()

In [None]:
sma = df.loc[:, 'SMA'].to_numpy()
sma

In [None]:
# Rebuild the environment's feature matrix by hand: SMA values plus their
# first difference, with 0 as the difference of the first row.
frame_bound = (10, 500)
window_size = 10
sma = df.loc[:, 'SMA'].to_numpy()[frame_bound[0] - window_size:frame_bound[1]]
diff = np.concatenate(([0], np.diff(sma)))
signal_features = np.column_stack((sma, diff))

In [None]:
len(signal_features)

In [None]:
# SMA column as a numpy array
sma = df.loc[:, 'SMA'].to_numpy()
# Frame bounds and observation window
start, end = (10, 500)
window_size = 10
# Keep `window_size` rows of history before `start`
sma = sma[start - window_size:end]
# First difference with a leading 0: prepending the first value gives
# diff[0] == 0, matching np.insert(np.diff(sma), 0, 0). Prepending 0 would
# instead put the raw SMA level into the first row.
diff = np.diff(sma, prepend=sma[:1])
# Two feature columns: SMA value and its first difference
signal_features = np.column_stack((sma, diff))

In [None]:
signal_features_df = pd.DataFrame(signal_features, columns=["SMA", "diff"])

In [None]:
index = 0
window_size = 10
signal_features_0_to_100 = signal_features[(index):(index + window_size)]
signal_features_0_to_100

In [None]:
# Scratch check of a reversed window slice.
# NOTE: only the last expression of a cell is displayed, so this bare name
# produces no output.
signal_features_df

window = 10
index = window_size + 0

# With window_size == 10 (set in the preceding cell) this is iloc[0:9:-1]:
# a negative step starting at row 0 can never reach row 9, so the result is
# an empty frame.
filtered_df = signal_features_df.iloc[index-window:window-1:-1]

# filtered_df = signal_features_df.iloc[index:window_size:-1]

filtered_df

In [None]:
# Scratch check of the inclusive observation slice at tick 0.
# With window_size == 10 the slice is signal_features[-9:1]. The negative
# start counts from the end, so for an array with at least 10 rows the start
# lies after the stop and the observation is empty.
_current_tick = 0
observation = signal_features[(_current_tick-window_size+1):_current_tick+1]
observation

In [None]:
# Same experiment with the start written out: a negative start counts from
# the end of the array, so for an array with more than window_size rows
# [-window_size:1] is empty.
observation = signal_features[-window_size:1]
observation

In [None]:
import torch
import treetensor.torch as ttorch

In [None]:
from cmath import inf
from typing import Any, List
from easydict import EasyDict
from abc import abstractmethod
from collections import namedtuple
from enum import Enum
from pprint import pprint
from copy import deepcopy
import os
import copy
import pandas as pd
import numpy as np

import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
import torch
import numbers




def to_ndarray(item: Any, dtype: np.dtype = None):
    """Recursively convert ``item`` into NumPy arrays.

    Args:
        item: A dict, list/tuple (namedtuples included), ``torch.Tensor``,
            ``np.ndarray``, bool, str, other scalar, or ``None``.
        dtype: Optional dtype applied to the arrays that are produced. When
            ``None``, existing arrays/tensors keep their dtype and sequences use
            NumPy's inferred dtype.

    Returns:
        - dict: a new dict with every value converted.
        - list/tuple whose first element is a number: a single array.
        - namedtuple (first element not a number): the same namedtuple type with
          every field converted.
        - any other non-empty list/tuple: a list of converted elements.
        - empty list/tuple, or ``None``: ``None``.
        - tensor / ndarray: an ndarray (cast to ``dtype`` if given).
        - bool / str: returned unchanged.
        - other scalars: a 0-d array (``dtype`` is not applied here).

    Raises:
        TypeError: If ``item`` is of an unsupported type.
    """

    def _as_array(values):
        return np.array(values) if dtype is None else np.array(values, dtype=dtype)

    if isinstance(item, dict):
        return {key: to_ndarray(value, dtype) for key, value in item.items()}

    if isinstance(item, (list, tuple)):
        if len(item) == 0:
            return None
        # The numeric check comes first, so a namedtuple of plain numbers is
        # turned into one array rather than rebuilt field by field.
        if isinstance(item[0], (numbers.Integral, numbers.Real)):
            return _as_array(item)
        if hasattr(item, '_fields'):  # namedtuple
            return type(item)(*[to_ndarray(t, dtype) for t in item])
        return [to_ndarray(t, dtype) for t in item]

    if isinstance(item, torch.Tensor):
        # Tensor.numpy() raises for tensors that require grad or live on a
        # non-CPU device, so detach and move to host memory first.
        array = item.detach().cpu().numpy()
        return array if dtype is None else array.astype(dtype)

    if isinstance(item, np.ndarray):
        return item if dtype is None else item.astype(dtype)

    if isinstance(item, (bool, str)):
        return item

    if np.isscalar(item):
        return np.array(item)

    if item is None:
        return None

    raise TypeError("not support item type: {}".format(type(item)))


def load_dataset(name, index_name):
    """Read the CSV ``name`` into a DataFrame indexed by column ``index_name``.

    The index column is parsed as dates (``parse_dates=True``).
    """
    return pd.read_csv(name, index_col=index_name, parse_dates=True)

class Actions(int, Enum):
    """Discrete trading actions; the integer values are the action indices.

    ``TradingEnv`` builds its action space as ``spaces.Discrete(len(Actions))``,
    and ``transform`` defines how each action changes the held position.
    """
    DOUBLE_SELL = 0
    SELL = 1
    HOLD = 2
    BUY = 3
    DOUBLE_BUY = 4

class Positions(int, Enum):
    """Position held by the agent: short, flat (no position) or long."""
    # Written as ints: with the ``int`` mixin the member values are built via
    # ``int(...)``, so float literals such as ``-1.`` were already stored as -1.
    SHORT = -1
    FLAT = 0
    LONG = 1

def transform(position, action):
    """Apply ``action`` to the currently held ``position``.

    Returns:
        A ``(new_position, opened)`` pair. ``opened`` is True only when the
        action moves the agent into a LONG or SHORT position; closing a
        position to FLAT, or any combination not listed below, reports False
        (the latter with ``position`` returned unchanged).
    """
    # (action, held position) -> (resulting position, opened-a-position flag)
    rules = (
        (Actions.SELL, Positions.LONG, Positions.FLAT, False),
        (Actions.SELL, Positions.FLAT, Positions.SHORT, True),
        (Actions.BUY, Positions.SHORT, Positions.FLAT, False),
        (Actions.BUY, Positions.FLAT, Positions.LONG, True),
        (Actions.DOUBLE_SELL, Positions.LONG, Positions.SHORT, True),
        (Actions.DOUBLE_SELL, Positions.FLAT, Positions.SHORT, True),
        (Actions.DOUBLE_BUY, Positions.SHORT, Positions.LONG, True),
        (Actions.DOUBLE_BUY, Positions.FLAT, Positions.LONG, True),
    )
    for rule_action, held, new_position, opened in rules:
        # Plain equality (not hashing), so raw ints compare against the enums.
        if action == rule_action and position == held:
            return new_position, opened

    return position, False


class TradingEnv(gym.Env):
    """Gymnasium trading environment over a single price series loaded from CSV.

    Observations are the last ``window_size`` rows of the standardised features
    ``Close``, ``Diff`` and ``Volume``; actions are the members of ``Actions``.
    Rewards are computed from the raw (un-normalised) ``Close`` prices. An
    episode starts at tick ``window_size`` and ends at the last row of the data;
    the end is reported through ``truncated`` (``terminated`` is always False).
    """

    def __init__(self, cfg):
        """Load the price data and set up spaces and episode bounds.

        Args:
            cfg: Attribute-style mapping (for example an ``EasyDict``). Reads
                ``env_id``, ``train_range``, ``test_range``, ``window_size``,
                ``stocks_data_filename`` and ``eps_length``; ``plot_freq``
                (default 10) and ``save_path`` (default ``'./'``) are optional.
        """
        self._cfg = cfg
        self._env_id = cfg.env_id
        #======== param to plot =========
        self.cnt = 0

        if 'plot_freq' not in self._cfg:
            self.plot_freq = 10
        else:
            self.plot_freq = self._cfg.plot_freq
        if 'save_path' not in self._cfg:
            self.save_path = './'
        else:
            self.save_path = self._cfg.save_path

        #================================
        self.train_range = cfg.train_range
        self.test_range = cfg.test_range
        self.window_size = cfg.window_size
        self.prices = None
        self.signal_features = None
        self.feature_dim_len = None
        self.shape = (cfg.window_size, 3)

        #======== param about episode =========
        self._current_tick = None
        self._done = None
        self._last_trade_tick = None
        self._position = None
        self._position_history = None
        self._profit_history = None
        self._total_reward = None
        #======================================
        self._init_flag = True

        # ====== load stocks data =======
        raw_data = load_dataset(self._cfg.stocks_data_filename, 'Date')
        # Raw closes are kept aside: rewards use real prices, features use
        # the standardised copy in self.df.
        self.raw_prices = raw_data.loc[:, 'Close'].to_numpy()
        self.df = deepcopy(raw_data)

        EPS = 1e-10

        def _standardize(frame):
            # Column-wise z-score; EPS guards against a zero standard deviation.
            return frame.apply(lambda x: (x - x.mean()) / (x.std() + EPS), axis=0)

        if self.train_range is None or self.test_range is None:
            self.df = _standardize(self.df)
        else:
            # Standardise the train and test slices independently of each other.
            boundary = int(len(self.df) * self.train_range)
            train_data = raw_data[:boundary].copy()
            boundary = int(len(raw_data) * (1 + self.test_range))
            test_data = raw_data[boundary:].copy()

            train_data = _standardize(train_data)
            test_data = _standardize(test_data)
            self.df.loc[train_data.index, train_data.columns] = train_data
            self.df.loc[test_data.index, test_data.columns] = test_data
        # ======================================
        # set cost
        self.trade_fee_bid_percent = 0.01  # unit
        self.trade_fee_ask_percent = 0.005  # unit

        self.prices, self.signal_features, self.feature_dim_len = self._process_data()
        # _process_data picks an eps_length-long window; the bounds are then
        # overridden here so an episode spans all data after the first window.
        self._start_tick = cfg.window_size
        self._end_tick = len(self.prices) - 1
        self.shape = (self.window_size, self.feature_dim_len)
        self._action_space = spaces.Discrete(len(Actions))
        self._observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float64)
        self._reward_space = spaces.Box(-inf, inf, shape=(1, ), dtype=np.float32)
        self._done = False
        self._seed()

    def _seed(self, seed=None):
        """Create ``self.np_random`` from ``seed`` and return the seed used, in a list."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, seed=None, options=None):
        """Start a new episode at ``_start_tick`` with a FLAT position.

        Returns:
            ``(observation, info)`` where ``info`` holds ``total_reward`` and
            ``total_profit`` (the profit history list).
        """
        super().reset(seed=seed)
        print("\nreset")
        self.cnt += 1
        self._current_tick = self._start_tick
        self._last_trade_tick = self._current_tick - 1
        self._position = Positions.FLAT
        self._position_history = [self._position]
        self._profit_history = [1.]
        self._total_reward = 0.

        info = dict(
            total_reward = self._total_reward,
            total_profit = self._profit_history,
        )
        obs = self._get_observation()

        return obs, info

    def random_action(self):
        """Sample one action from the action space, wrapped in a 1-element array."""
        return np.array([self.action_space.sample()])

    def step(self, action):
        """Advance one tick, score ``action`` and update the held position.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` is always False; ``truncated`` becomes True once the
            current tick reaches ``_end_tick``. On that last step ``info`` also
            carries ``max_possible_profit`` (log scale) and
            ``eval_episode_return``.
        """
        print('step')
        print('action', action)

        self._done = False
        self._current_tick += 1

        if self._current_tick >= self._end_tick:
            self._done = True

        # The reward must be computed before the position is updated: it
        # depends on the position held when the action was chosen.
        step_reward = self._calculate_reward(action)
        self._total_reward += step_reward
        self._position, trade = transform(self._position, action)

        if trade:
            self._last_trade_tick = self._current_tick
            print('trade')

        self._position_history.append(self._position)
        self._profit_history.append(float(np.exp(self._total_reward)))
        observation = self._get_observation()
        info = dict(
            total_reward=self._total_reward,
            position=self._position.value,
        )

        if self._done:
            # NOTE(review): this class does not define render(), so this
            # branch falls back to the gym.Env base implementation - confirm
            # before using env ids that end in 'e'.
            if self._env_id[-1] == 'e' and self.cnt % self.plot_freq == 0:
                self.render()
            info['max_possible_profit'] = np.log(self.max_possible_profit())
            info['eval_episode_return'] = self._total_reward

        print('done', self._done)
        return observation, step_reward,  False, self._done, info

    def _get_observation(self):
        """Return the ``window_size`` feature rows ending at the current tick (inclusive)."""
        sig_features = self.signal_features[(self._current_tick - self.window_size+1):self._current_tick + 1]
        return sig_features

    def _process_data(self, start_idx = None):
        """Build the feature matrix and choose an episode window.

        Args:
            start_idx: First tick of the episode. When ``None`` a start is
                drawn with ``np.random.randint`` from the range allowed by
                ``train_range`` / ``test_range`` and ``eps_length``.

        Returns:
            ``(prices, selected_feature, feature_dim_len)``: the ``Close``
            column of ``self.df``, the stacked selected features (one column
            per feature) and the number of selected features.

        Side effects:
            Sets ``self.start_idx``, ``self._start_tick`` and ``self._end_tick``.
        """
        all_feature_name = ['Close', 'Open', 'High', 'Low', 'Adj Close', 'Volume']
        all_feature = {k: self.df.loc[:, k].to_numpy() for k in all_feature_name}
        # add feature "Diff": first difference of Close, padded with a leading 0
        prices = self.df.loc[:, 'Close'].to_numpy()
        diff = np.insert(np.diff(prices), 0, 0)
        all_feature_name.append('Diff')
        all_feature['Diff'] = diff
        # =================================

        # you can select features you want
        selected_feature_name = ['Close', 'Diff', 'Volume']
        selected_feature = np.column_stack([all_feature[k] for k in selected_feature_name])
        feature_dim_len = len(selected_feature_name)

        # validate index
        if start_idx is None:
            if self.train_range is None or self.test_range is None:
                self.start_idx = np.random.randint(self.window_size, len(self.df) - self._cfg.eps_length)
            elif self._env_id[-1] == 'e':
                boundary = int(len(self.df) * (1 + self.test_range))
                assert len(self.df) - self._cfg.eps_length > boundary + self.window_size,\
                 "parameter test_range is too large!"
                self.start_idx = np.random.randint(boundary + self.window_size, len(self.df) - self._cfg.eps_length)
            else:
                boundary = int(len(self.df) * self.train_range)
                # This bound comes from train_range, so the message names it.
                assert boundary - self._cfg.eps_length > self.window_size,\
                 "parameter train_range is too small!"
                self.start_idx = np.random.randint(self.window_size, boundary - self._cfg.eps_length)
        else:
            self.start_idx = start_idx

        self._start_tick = self.start_idx
        self._end_tick = self._start_tick + self._cfg.eps_length - 1

        return prices, selected_feature, feature_dim_len

    def _calculate_reward(self, action):
        """Log-return realised when ``action`` closes the open position, else 0.

        Uses raw prices at the current tick and at the last trade tick. A
        long closed by SELL / DOUBLE_SELL earns ``log(ratio)``; a short closed
        by BUY / DOUBLE_BUY earns ``log(2 - ratio)``; both include the
        combined bid/ask fee term. Expects ``self._position`` to still be the
        position held before ``action`` is applied.
        """
        step_reward = 0.
        current_price = (self.raw_prices[self._current_tick])
        last_trade_price = (self.raw_prices[self._last_trade_tick])
        ratio = current_price / last_trade_price
        cost = np.log((1 - self.trade_fee_ask_percent) * (1 - self.trade_fee_bid_percent))

        if action == Actions.BUY and self._position == Positions.SHORT:
            step_reward = np.log(2 - ratio) + cost

        if action == Actions.SELL and self._position == Positions.LONG:
            step_reward = np.log(ratio) + cost

        if action == Actions.DOUBLE_SELL and self._position == Positions.LONG:
            step_reward = np.log(ratio) + cost

        if action == Actions.DOUBLE_BUY and self._position == Positions.SHORT:
            step_reward = np.log(2 - ratio) + cost

        step_reward = float(step_reward)
        return step_reward

    def max_possible_profit(self):
        """Profit multiplier of a strategy that trades every monotone price run.

        Walks the raw prices from ``_start_tick`` to ``_end_tick``, going short
        over each falling run and long over each non-falling run, paying both
        fees per run, and keeps a run's result only when it raises the profit.
        ``step`` reports the log of this value at the end of an episode.
        """
        current_tick = self._start_tick
        last_trade_tick = current_tick - 1
        profit = 1.
        fee_factor = (1 - self.trade_fee_ask_percent) * (1 - self.trade_fee_bid_percent)

        while current_tick <= self._end_tick:
            if self.raw_prices[current_tick] < self.raw_prices[current_tick - 1]:
                # Falling run: profit from a short held until the run ends.
                while (current_tick <= self._end_tick
                       and self.raw_prices[current_tick] < self.raw_prices[current_tick - 1]):
                    current_tick += 1

                current_price = self.raw_prices[current_tick - 1]
                last_trade_price = self.raw_prices[last_trade_tick]
                tmp_profit = profit * (2 - (current_price / last_trade_price)) * fee_factor
                profit = max(profit, tmp_profit)
            else:
                # Non-falling run: profit from a long held until the run ends.
                while (current_tick <= self._end_tick
                       and self.raw_prices[current_tick] >= self.raw_prices[current_tick - 1]):
                    current_tick += 1

                current_price = self.raw_prices[current_tick - 1]
                last_trade_price = self.raw_prices[last_trade_tick]
                tmp_profit = profit * (current_price / last_trade_price) * fee_factor
                profit = max(profit, tmp_profit)
            last_trade_tick = current_tick - 1
        return profit

    @property
    def observation_space(self):
        """Box of shape ``(window_size, feature_dim_len)`` built in ``__init__``."""
        return self._observation_space

    @property
    def action_space(self):
        """``Discrete(len(Actions))`` space built in ``__init__``."""
        return self._action_space
    


from easydict import EasyDict

# Experiment configuration. Only the `env` sub-dict is handed to TradingEnv (last
# line of this block); the `policy` sub-dict is not read by any code in this cell.
stocks_dqn_config = dict(
    exp_name='stocks_dqn_seed0',
    env=dict(
        # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess'
        # Env number respectively for collector and evaluator.
        # NOTE(review): TradingEnv as defined above does not read collector_env_num,
        # evaluator_env_num, n_evaluator_episode or stop_value.
        collector_env_num=8,
        evaluator_env_num=8,
        # TradingEnv checks the last character of env_id ('e') for its render / test-split branches.
        env_id='stocks-v0',
        n_evaluator_episode=8,
        stop_value=2,
        # one trading year.
        eps_length=253,
        # associated with the feature length: number of rows in each observation.
        window_size=20,
        # the path to save result image.
        save_path='./fig/',
        # the raw data file name
        # stocks_data_filename='STOCKS_GOOGL',
        # NOTE(review): machine-specific absolute path; the notebook will not run elsewhere
        # without editing it.
        stocks_data_filename='/Users/newuser/Projects/robust-algo-trader/data/STOCKS_GOOGL.csv',
        # the stocks range percentage used by train/test.
        # if one of them is None, train & test set will use all data by default.
        train_range=None,
        test_range=None,
    ),
    policy=dict(
        # Whether to use cuda for network.
        cuda=True,
        model=dict(
            obs_shape=62,
            action_shape=5,
            encoder_hidden_size_list=[128],
            head_layer_num=1,
            # Whether to use dueling head.
            dueling=True,
        ),
        # Reward's future discount factor, aka. gamma.
        discount_factor=0.99,
        # How many steps in td error.
        nstep=5,
        # learn_mode config
        learn=dict(
            update_per_collect=10,
            batch_size=64,
            learning_rate=0.001,
            # Frequency of target network update.
            target_update_freq=100,
            ignore_done=True,
        ),
        # collect_mode config
        collect=dict(
            # You can use either "n_sample" or "n_episode" in collector.collect.
            # Get "n_sample" samples per collect.
            n_sample=64,
            # Cut trajectories into pieces with length "unroll_len".
            unroll_len=1,
        ),
        # command_mode config
        other=dict(
            # Epsilon greedy with decay.
            eps=dict(
                # Decay type. Support ['exp', 'linear'].
                type='exp',
                start=0.95,
                end=0.1,
                decay=50000,
            ),
            replay_buffer=dict(replay_buffer_size=100000, )
        ),
    ),
)
# EasyDict gives attribute access (cfg.window_size), which TradingEnv relies on.
main_config = EasyDict(stocks_dqn_config)
# Build the environment from the env section only.
env = TradingEnv(main_config.env)

# from stable_baselines3 import PPO

# model = PPO("MlpPolicy", env, verbose=1)
# model.learn(total_timesteps=1_000_000)
# from sb3_contrib import RecurrentPPO

# model = RecurrentPPO('MlpLstmPolicy', env, 
#                      tensorboard_log="./di_engine_trade_env_tensorboard/")

# model.learn(total_timesteps=1_000_000,  
#             log_interval=10, 
#             tb_log_name="di_engine_trade",
#             )

In [None]:
from sb3_contrib import RecurrentPPO
from stable_baselines3 import PPO


# Build a RecurrentPPO agent with the 'MlpLstmPolicy' policy on `env` (created in an
# earlier cell); TensorBoard logs go under ./di_engine_trade_env_tensorboard/.
model = RecurrentPPO('MlpLstmPolicy', env, 
                     tensorboard_log="./di_engine_trade_env_tensorboard/")

# model = PPO("MlpPolicy", env, verbose=1)
# Train for one million environment steps.
# NOTE(review): TradingEnv.step prints several lines per step, so this run will produce
# a very large amount of cell output.
model.learn(total_timesteps=1_000_000,  
            log_interval=10, 
            tb_log_name="di_engine_trade",
            )