## Custom Gymnasium Environment

Import what's needed

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Export this Jupyter notebook as a plain Python script (output name "rl_environment"),
# because a script is easier to handle than the .ipynb file.
!jupyter nbconvert --to script --output rl_environment rl_environment.ipynb

Trading Environment Class

In [None]:
import gymnasium as gym         # Gymnasium is a library for building RL environments
                                # It provides a standard interface so RL algorithms (like PPO from Stable Baselines3) can interact with your environment.
from gymnasium import spaces    # spaces defines the action space and observation space for your environment

class TradingEnv(gym.Env):
    """Gymnasium environment that replays a candle DataFrame one row per step.

    Observation:
        The ``window_size`` rows immediately before ``current_step`` (all
        columns of ``data``) as a float32 array of shape
        ``(window_size, number_of_columns)``.
    Actions:
        0 = Hold, 1 = Buy, 2 = Sell.
    Reward:
        Change of the ``'Close'`` column between ``current_step`` and
        ``current_step + 1``: returned as-is for Buy, negated for Sell and
        0.0 for every other action.
    """

    def __init__(self, data: pd.DataFrame, window_size=50):
        """Store the data and declare the observation and action spaces.

        Args:
            data: Feature table; must contain a ``'Close'`` column and all
                columns must be convertible to float32.
            window_size: Number of previous rows included in each observation.

        Raises:
            ValueError: If ``window_size`` is smaller than 1, ``'Close'`` is
                missing, or ``data`` has fewer than ``window_size + 2`` rows
                (one full window plus the two rows needed for one reward).
        """
        super().__init__()                              # initialize the base class

        # Fail early with a clear message instead of producing wrong-shaped
        # observations or an index error in the middle of training.
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        if 'Close' not in data.columns:
            raise ValueError("data must contain a 'Close' column")
        if len(data) < window_size + 2:
            raise ValueError(
                f"data needs at least window_size + 2 = {window_size + 2} rows, "
                f"got {len(data)}"
            )

        self.data = data.reset_index(drop=True)         # positional index makes slicing by step trivial
        self.window_size = window_size                  # number of previous candles in the observation
        self.current_step = window_size                 # start after the initial window

        # Snapshot the data as NumPy arrays once, so that every step is a cheap
        # array slice instead of a pandas slice plus dtype conversion.
        self._features = self.data.to_numpy(dtype=np.float32)
        self._close = self.data['Close'].to_numpy()

        # Observation: last N rows, all columns, unbounded values
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(window_size, self.data.shape[1]),
            dtype=np.float32
        )
        # Actions: 0 = Hold, 1 = Buy, 2 = Sell
        self.action_space = spaces.Discrete(3)

    def reset(self, seed=None, options=None):
        """Rewind to the first step after the initial window.

        Returns:
            Tuple ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)                        # let the base class handle the seed
        self.current_step = self.window_size
        return self._get_observation(), {}

    def step(self, action):
        """Apply ``action``, advance one row and return the Gymnasium 5-tuple.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` becomes True once ``current_step`` reaches the last
            row of the data; ``truncated`` is always False and ``info`` is an
            empty dict.
        """
        reward = self._calculate_reward(action)         # must be computed before the step counter moves
        self.current_step += 1
        done = self.current_step >= len(self.data) - 1  # no next row left to compute another reward
        return self._get_observation(), reward, done, False, {}

    def _get_observation(self):
        """Return the ``window_size`` rows before ``current_step`` as float32.

        NOTE(review): the window ends at ``current_step - 1``, so the row at
        ``current_step`` (whose Close is the base of the reward) is not part of
        the observation - confirm this is intended.
        """
        start = self.current_step - self.window_size
        # Copy so that callers can never modify the cached feature matrix.
        return self._features[start:self.current_step].copy()

    def _calculate_reward(self, action):
        """Return the reward for ``action`` at the current step as a float.

        The reward is the Close difference between the next row and the
        current row: positive sign for Buy (1), negative sign for Sell (2)
        and 0.0 for anything else (Hold).
        """
        if self.current_step + 1 >= len(self.data):     # only reachable when stepping past the end of an episode
            print("End of data reached - we should never get here")
            return 0.0
        price_diff = self._close[self.current_step + 1] - self._close[self.current_step]
        if action == 1:   # BUY
            return float(price_diff)
        elif action == 2: # SELL
            return float(-price_diff)
        else:             # HOLD
            return 0.0

    def render(self, action=None, reward=None):
        """Print the current step and Close price, plus action/reward if given."""
        msg = f"Step: {self.current_step}, Close Price: {self.data['Close'].iloc[self.current_step]}"
        if action is not None:  # if an action was taken, include it in the message
            msg += f", Action: {action}"
        if reward is not None:  # if a reward was given, include it in the message
            msg += f", Reward: {reward:.4f}"
        print(msg)



Get the data and normalize it with Z-score

In [None]:

from sklearn.preprocessing import StandardScaler
import joblib
import pandas as pd

# Read the raw candlestick CSV; the 'date' column is parsed and used as index
data = pd.read_csv(
    '../data/Candlestick_01jan2000_31dec2024.csv',
    index_col='date',
    parse_dates=['date'],
)

# First 80 % of the rows are used for training, the remaining rows for testing
split_idx = int(len(data) * 0.8)
train_data, test_data = data.iloc[:split_idx], data.iloc[split_idx:]

# The Z-score statistics are estimated from the training rows only and are
# then applied unchanged to both splits
scaler = StandardScaler()
scaler.fit(train_data)
scaled_train = scaler.transform(train_data)
scaled_test = scaler.transform(test_data)

# Keep the fitted scaler on disk so the same scaling can be reused later
joblib.dump(scaler, '../models/scaler.pkl')

# Wrap the scaled arrays in DataFrames again (original columns and date index)
scaled_train_df = pd.DataFrame(scaled_train, index=train_data.index, columns=data.columns)
scaled_test_df = pd.DataFrame(scaled_test, index=test_data.index, columns=data.columns)


PPO training and evaluation code

In [None]:
from stable_baselines3 import PPO

# Training environment on the scaled training split, 50-row observation window
env = TradingEnv(data=scaled_train_df, window_size=50)

# PPO (Proximal Policy Optimization) agent with an MLP policy, verbose logging on
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=100_000)   # train for 100,000 timesteps

# Write the trained agent to disk
model.save("../models/ppo_trading")

Now let's test the model

In [None]:

# Create the test environment on the scaled test split
test_env = TradingEnv(scaled_test_df, window_size=50)

# Evaluation: play one full episode and add up the rewards
obs, info = test_env.reset()
total_reward = 0.0
steps = 0
done = False

while not done:
    # deterministic=True picks the policy's preferred action instead of
    # sampling one, so repeated evaluations give the same total reward.
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = test_env.step(action)
    total_reward += reward
    steps += 1
    # Stop on either end-of-episode flag; ignoring `truncated` could loop
    # forever with an environment that only ever truncates.
    done = terminated or truncated

print(f"Total Reward on Test Data: {total_reward:.4f}")
print(f"Steps: {steps}")