In [3]:
import gymnasium as gym
import gym_trading_env
import wandb
import pandas as pd
import numpy as np
from wandb.integration.sb3 import WandbCallback
from sb3_contrib import RecurrentPPO
from utils import reward_function_updated

# --- 2. TRAITEMENT (Inchangé) ---
def calculate_indicators(df):
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['feature_rsi'] = 100 - (100 / (1 + rs))
    df['feature_rsi'] = df['feature_rsi'] / 100.0

    exp1 = df['close'].ewm(span=12, adjust=False).mean()
    exp2 = df['close'].ewm(span=26, adjust=False).mean()
    slow_macd = exp1 - exp2
    fast_macd = slow_macd.ewm(span=9, adjust=False).mean()
    df['feature_macd_histogram'] = slow_macd - fast_macd
    df['feature_macd_fast'] = fast_macd
    df['feature_macd_slow'] = slow_macd

    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - df['close'].shift())
    low_close = np.abs(df['low'] - df['close'].shift())
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['feature_atr'] = true_range.rolling(14).mean() / df['close']

    df['feature_return'] = df['close'].pct_change()


    return df.dropna()

def preprocess(df):
    df = df.sort_index().dropna().drop_duplicates()
    return calculate_indicators(df)

In [None]:
# Liste des différents hyperparamètres à tester
learning_rates = [1e-4, 1e-3] # Taux d'apprentissage (1e-4 et 1e-3)
ent_coefs = [0.01, 0.05] # Coefficients d'entropie pour encourager l'exploration (0.01 et 0.05)
position_ranges = [(-1,1), (-0.5, 0.5), (0,0.8)] # Différentes plages de positions [-1,1], [-0.5,0.5], [0,0.8]
gammas = [0.9, 0.95, 0.99] # Facteurs d'actualisation (0.9, 0.95, 0.99)
gae_lambdas = [0.01, 0.05, 0.1] # Paramètres GAE (0.01, 0.05, 0.1)
batch_sizes = [128, 256] # Tailles de batch (128 et 256)

# Boucle sur toutes les combinaisons d'hyperparamètres
for lr in learning_rates:
    for ent in ent_coefs:
        for pos_range in position_ranges:
            for gamma in gammas:
                for gae_lambda in gae_lambdas:
                    for batch_size in batch_sizes:
                        config = {
                            "policy_type": "MlpLstmPolicy",
                            "total_timesteps": 200_000,    # On allonge encore, l'Alpha est dur à trouver
                            "learning_rate": lr,
                            "ent_coef": ent,
                            "batch_size": batch_size, # Batch plus gros pour lisser le bruit des returns
                            "n_steps": 2048,
                            "position_range": pos_range,
                            "gae_lambda": gae_lambda,
                            "gamma": gamma,
                            "project_name": "RL-Trading-Project",
                            "run_name": f"{lr}_{ent}_{pos_range}_{gamma}_{gae_lambda}_{batch_size}",
                        }
                        run = wandb.init(
                            project=config["project_name"],
                            name=config["run_name"],
                            config=config,
                            sync_tensorboard=True,
                            monitor_gym=True,
                            save_code=True,
                        )

                        env = gym.make(
                            "MultiDatasetTradingEnv",
                            dataset_dir="./data/*.pkl",
                            preprocess=preprocess,
                            portfolio_initial_value=1000,
                            trading_fees=0.1/100,
                            borrow_interest_rate=0.02/100/24,
                            reward_function=reward_function_updated,
                            position_range=config["position_range"]
                        )

                        # Création de la callback Wandb
                        wandb_callback = WandbCallback(
                            gradient_save_freq=100,
                            model_save_path=f"models/{run.id}",
                            verbose=2,
                        )

                        model = RecurrentPPO(
                            config["policy_type"],
                            env,
                            verbose=1,
                            learning_rate=config["learning_rate"],
                            ent_coef=config["ent_coef"],
                            batch_size=config["batch_size"],
                            n_steps=config["n_steps"],
                            tensorboard_log=f"runs/{run.id}",
                            gamma=config["gamma"],
                            gae_lambda=config["gae_lambda"]
                        )

                        print(f"--- Démarrage Alpha Hunter ---")
                        print(f"Objectif : Battre le Buy & Hold (Reward = Return - Market)")
                        print(f"Exploration forcée (Ent_coef={config['ent_coef']})")

                        model.learn(
                            total_timesteps=config["total_timesteps"],
                            callback=wandb_callback
                        )

                        print("Fin de l'apprentissage")

                        model.save("recurrent_ppo_alpha_hunter")

                        obs, info = env.reset()
                        done, truncated = False, False

                        while not (done or truncated):
                            action, _states = model.predict(obs, deterministic=True)
                            obs, reward, done, truncated, info = env.step(action)

                        metrics = env.unwrapped.get_metrics()
                        print("Métriques finales :", metrics)

                        market_return = float(metrics.get("Market Return", "0%").replace('%',''))
                        portfolio_return = float(metrics.get("Portfolio Return", "0%").replace('%', ''))
                        metrics_float = {'Market Return (%): ': market_return, 'Portfolio Return (%): ': portfolio_return}
                        wandb.log(metrics_float)

                        wandb.finish()




Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
--- Démarrage Alpha Hunter ---
Objectif : Battre le Buy & Hold (Reward = Return - Market)
Exploration forcée (Ent_coef=0.01)




Logging to runs/i9bwm7ab\RecurrentPPO_1
-----------------------------
| time/              |      |
|    fps             | 1032 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
Market Return : 39.25%   |   Portfolio Return : -94.83%   |   
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 3.48e+03     |
|    ep_rew_mean          | -245         |
| time/                   |              |
|    fps                  | 599          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0012395335 |
|    clip_fraction        | 0.000635     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 0.023        |
|    learning_rate        | 0.0001       |



Fin de l'apprentissage
Market Return : 914.64%   |   Portfolio Return : 1418.69%   |   
Métriques finales : {'Market Return': '914.64%', 'Portfolio Return': '1418.69%'}


0,1
Market Return (%):,▁
Portfolio Return (%):,▁
global_step,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇█████
rollout/ep_len_mean,▁▁▁▂▂▂▂▂▂▂▆▆▆▆▆▆▆▆▆▆▆▆████████████▇▇▆▆▆▆
rollout/ep_rew_mean,▇▆▆▆▆▃▃▃▃▃▃▃▃▃▃▁▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▆▇▇███
time/fps,▄▄▁▂▂▂▂▂▂▃▃▃▃▄▅▅▆▆▆▆▆▆▆▆▆▆▅▅▆▆▆▇▇▇▇▇▇███
train/approx_kl,▁▂▃▄▄▂▃▃▂▂▂▃▂▃▂▁▂▄▆▆▅▄▄▅▅▅▅▅▁▁▁██▄▂▂▁▁▁▁
train/clip_fraction,▁▅█▆▇▄▄▅▃▄▄▅▃▃▃▃▁█▇▇▆▅▇▆█▆▇▇▁▂▂▁▂▂▂▁▃▂▁▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▂▃▃▃▄▄▄▄▄▅▅▆▆▆▆▆▆▆▆▆▆▇█▇▇▇▇▇███████▇▇█

0,1
Market Return (%):,914.64
Portfolio Return (%):,1418.69
global_step,200704
rollout/ep_len_mean,16644
rollout/ep_rew_mean,-198.68948
time/fps,565
train/approx_kl,0.00143
train/clip_fraction,0.00752
train/clip_range,0.2
train/entropy_loss,-1.22397


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
--- Démarrage Alpha Hunter ---
Objectif : Battre le Buy & Hold (Reward = Return - Market)
Exploration forcée (Ent_coef=0.01)




Logging to runs/wddxy6rk\RecurrentPPO_1
-----------------------------
| time/              |      |
|    fps             | 1269 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 689          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0010075937 |
|    clip_fraction        | 0.000342     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 0.801        |
|    learning_rate        | 0.0001       |
|    loss                 | -0.00115     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00106     |
|    std                  | 0.997        |
|    value_loss    



Fin de l'apprentissage
Market Return :  9.60%   |   Portfolio Return :  2.52%   |   
Métriques finales : {'Market Return': ' 9.60%', 'Portfolio Return': ' 2.52%'}


0,1
Market Return (%):,▁
Portfolio Return (%):,▁
global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁█▆▆▆▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆
rollout/ep_rew_mean,███████▁▄▅▅▅▅▅▅▅▅▅▅▅▅▄▅▅▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇
time/fps,█▂▂▁▁▂▂▂▂▂▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▁▁▁▂▃▄▅▄▃▄▃▃▅▅▅▅▅▄▆▅▅▆▆▇█▂▃▄▅▂▆▄▅▆▂▆▆▄▄▁
train/clip_fraction,▁▁▂▂▂▅▆▄▃▆▅█▆▄▅▅▅▄▄▅▅▅▅▅▅▅▅▄▅▃▃▄▄▃▃▄▃▃▂▃
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▁▁▁▁▁▁▁▂▂▂▂▃▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇████▇▇███

0,1
Market Return (%):,9.6
Portfolio Return (%):,2.52
global_step,200704
rollout/ep_len_mean,17780.4
rollout/ep_rew_mean,-341.2409
time/fps,664
train/approx_kl,0.00173
train/clip_fraction,0.00806
train/clip_range,0.2
train/entropy_loss,-1.27248


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
--- Démarrage Alpha Hunter ---
Objectif : Battre le Buy & Hold (Reward = Return - Market)
Exploration forcée (Ent_coef=0.01)




Logging to runs/wtj5grnr\RecurrentPPO_1
-----------------------------
| time/              |      |
|    fps             | 1170 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
Market Return : 39.25%   |   Portfolio Return : -94.37%   |   
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 3.48e+03     |
|    ep_rew_mean          | -258         |
| time/                   |              |
|    fps                  | 683          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0005994792 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 0.0131       |
|    learning_rate        | 0.0001       |



Fin de l'apprentissage
Market Return : 39.25%   |   Portfolio Return : 39.25%   |   
Métriques finales : {'Market Return': '39.25%', 'Portfolio Return': '39.25%'}


0,1
Market Return (%):,▁
Portfolio Return (%):,▁
global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇█████
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇▇███▇▇▇▇▇
rollout/ep_rew_mean,██████████▁▄▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇█████
time/fps,▇▇██████████████████████████▇▇▇▇▇▇▇▇▇▁▁▁
train/approx_kl,▁▂▁▂▂▂▂▃▂▃▂▃▃▃▃▂▂▂▂▂▂▃▃▂▂▂▁▁▁▂▁█▂▂▂▂▃▃▄▄
train/clip_fraction,▁▂▃▄█▇█▇▃▃▂▅▄▄▄▅▃▄▄▅▄▄▁▃▂▂▂▂▁▂▂▆▄▅▄▄▅█▇▇
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
Market Return (%):,39.25
Portfolio Return (%):,39.25
global_step,200704
rollout/ep_len_mean,18396.1
rollout/ep_rew_mean,-295.63608
time/fps,318
train/approx_kl,0.01894
train/clip_fraction,0.13569
train/clip_range,0.2
train/entropy_loss,-1.15999


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
--- Démarrage Alpha Hunter ---
Objectif : Battre le Buy & Hold (Reward = Return - Market)
Exploration forcée (Ent_coef=0.01)




Logging to runs/huxukg7b\RecurrentPPO_1
-----------------------------
| time/              |      |
|    fps             | 654  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
