## Custom Gymnasium Environment

Import what's needed

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Convert the Jupyter notebook rl_environment.ipynb to a script (output base name
# "rl_environment"), since a script is easier to handle than a notebook.
!jupyter nbconvert --to script --output rl_environment rl_environment.ipynb

In [None]:
from supporting_classes import TradingEnv, OHLCScaler
import joblib

# Read the raw candlestick CSV; the 'date' column is parsed and becomes the index.
data = pd.read_csv(
    '../data/Candlestick_01nov1999_28oct2025.csv',
    index_col='date',
    parse_dates=['date'],
)

# Positional split without shuffling: the leading 80 % of rows are used for
# training, the remaining rows for testing.
split_idx = int(len(data) * 0.8)
train_data, test_data = data.iloc[:split_idx], data.iloc[split_idx:]

# The scaler is built from the training rows only ...
scaler = OHLCScaler(train_data)

# ... and then applied to both partitions (training first, then test).
scaled_train_df, scaled_test_df = [
    scaler.transform(partition) for partition in (train_data, test_data)
]

# Persist the scaler so it can be reloaded later.
joblib.dump(scaler, '../models/scaler.pkl')

Understand the Data better

In [None]:
# Peek at the original data: the first rows, a "..." separator, then the last rows.
print(data.head(), "...", data.tail(), sep="\n")

In [None]:
# Peek at the scaled data: the start of the scaled training frame, a "..."
# separator, then the end of the scaled test frame.
for preview in (scaled_train_df.head(), "...", scaled_test_df.tail()):
    print(preview)

In [None]:
# View both datasets as candlestick charts, one month each
import matplotlib.pyplot as plt
import mplfinance as mpf

FIG_SCALE = 0.5  # handed to mplfinance as `figscale` for both charts

# Both charts cover the same window: 1 March 2000 through 31 March 2000.
march_2000 = slice('2000-03-01', '2000-03-31')
plot_data_org = data.loc[march_2000]
plot_data_nor = scaled_train_df.loc[march_2000]

# Settings shared by the two candlestick charts (volume panel disabled).
candle_opts = dict(type='candle', volume=False, style='yahoo', figscale=FIG_SCALE)

# NOTE(review): the first title says "Scaled" although `data` is the frame read
# straight from the CSV — confirm the intended wording.
for frame, chart_title in (
    (plot_data_org, 'Scaled Original Data'),
    (plot_data_nor, 'Scaled Normalized Data'),
):
    mpf.plot(frame, title=chart_title, **candle_opts)

# PPO training and evaluation code
Now it's time to get the data and train the model

In [None]:
from supporting_classes import TradingEnv
from stable_baselines3 import PPO

# Create the training environment on the scaled training data.
env = TradingEnv(scaled_train_df, window_size=50)

# Fixed seed so that re-running the notebook from a fresh kernel repeats the same
# training run (as far as the compute backend allows). PPO forwards it to its
# random number generators.
SEED = 42

# Define the PPO model and its hyperparameters.
model = PPO(
    "MlpPolicy",                # Multi-layer perceptron policy
    env,                        # The trading environment created above
    learning_rate=0.0003,       # Optimizer step size
    n_steps=8192,               # Environment steps collected per rollout
    batch_size=256,             # Minibatch size for gradient updates
    n_epochs=10,                # Optimization epochs per rollout
    gamma=0.99,                 # Discount factor
    gae_lambda=0.95,            # GAE parameter for advantage estimation
    clip_range=0.2,             # PPO clipping range
    ent_coef=0.05,              # Entropy bonus weight; encourages exploration
    vf_coef=0.5,                # Weight of the value-function loss
    max_grad_norm=0.5,          # Gradient clipping threshold
    normalize_advantage=True,   # Normalize advantages before the update
    policy_kwargs=dict(net_arch=[256, 256]),  # Two hidden layers of 256 units each
    target_kl=0.03,             # Limit on the KL divergence between updates
    seed=SEED,                  # Reproducible initialization and sampling
    verbose=1                   # Print training logs
)

# Train with a budget of 100,000 timesteps.
model.learn(total_timesteps=100_000)

# Save the model

In [None]:
# Build the file name from the date of the last row of the scaled training data.
# The last index entry is taken with [-1] instead of [split_idx - 1], so this cell
# does not depend on `split_idx` and cannot raise an IndexError when the scaled
# frame has fewer rows than `split_idx`.
filenPath = f"../models/ppo_trading_{scaled_train_df.index[-1].strftime('%Y_%m_%d')}"

# Show the last training rows so the date in the file name can be checked by eye.
print(scaled_train_df.tail())

# Save the trained model.
print(f"Saving model as {filenPath}")
model.save(filenPath)

# Initial Testing
Now let's test the model

In [None]:
# Create the test environment on the scaled test data.
test_env = TradingEnv(scaled_test_df, window_size=50)

# Evaluation: run a single episode over the test data.
obs, info = test_env.reset()                    # reset the environment to start a new episode
total_reward = 0                                # accumulated reward over the episode
steps = 0                                       # number of environment steps taken

while True:
    # Deterministic actions so the reported reward is repeatable across runs.
    action, _ = model.predict(obs, deterministic=True)
    # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
    obs, reward, terminated, truncated, info = test_env.step(action)
    total_reward += reward
    steps += 1
    # The episode is over when it is terminated OR truncated; checking only the
    # first flag would loop forever on an episode that ends by truncation.
    done = terminated or truncated
    if done:
        break

print(f"Total Reward on Test Data: {total_reward:.4f}")
print(f"Steps: {steps}")
