In [1]:
import numpy as np
import pandas as pd
import gymnasium as gym
import gym_trading_env

In [2]:
def preprocess(df):
    """Return a cleaned version of ``df``.

    The frame is sorted by its index, rows containing missing values are
    removed, and duplicated rows are dropped. The input frame itself is not
    modified; every step returns a new object.
    """
    cleaned = df.sort_index().dropna().drop_duplicates()
    return cleaned

# Load the pickled candle data (the file name suggests hourly Binance ETH/USD
# data) and clean it with `preprocess`.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = preprocess(pd.read_pickle('./data/binance-ETHUSD-1h.pkl'))
# Show the first five rows as the cell output.
df.head(5)

                       open    high     low   close       volume  \
date_open                                                          
2020-08-18 07:00:00  430.00  435.00  410.00  430.30   487.154463   
2020-08-18 08:00:00  430.27  431.79  430.27  430.80   454.176153   
2020-08-18 09:00:00  430.86  431.13  428.71  429.35  1183.710884   
2020-08-18 10:00:00  429.75  432.69  428.59  431.90  1686.183227   
2020-08-18 11:00:00  432.09  432.89  426.99  427.45  1980.692724   

                             date_close  
date_open                                
2020-08-18 07:00:00 2020-08-18 08:00:00  
2020-08-18 08:00:00 2020-08-18 09:00:00  
2020-08-18 09:00:00 2020-08-18 10:00:00  
2020-08-18 10:00:00 2020-08-18 11:00:00  
2020-08-18 11:00:00 2020-08-18 12:00:00  

In [3]:
def preprocess(df):
    """Clean ``df`` and add a z-scored ``feature_close`` column.

    Steps:
      1. sort by index, drop rows with missing values, drop duplicated rows;
      2. add ``feature_close`` = (close - mean(close)) / std(close).

    When the standard deviation of ``close`` is zero or undefined (constant
    prices, a single row, an empty frame) the feature is set to ``0.0``
    instead of dividing by zero, mirroring the guard used in
    ``preprocess_v2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``close`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched.
    """
    cleaned = df.sort_index().dropna().drop_duplicates()

    close = cleaned['close']
    spread = close.std()
    if spread > 0:
        feature_close = (close - close.mean()) / spread
    else:
        # NaN std (fewer than two rows) also lands here because NaN > 0 is False.
        feature_close = 0.0

    # `.assign` builds a new frame, so no column is written into a filtered
    # slice (which is what triggers pandas' SettingWithCopyWarning).
    return cleaned.assign(feature_close=feature_close)

# Reload the pickled candle data and run it through the `preprocess`
# defined in this cell, which adds the `feature_close` column.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = preprocess(pd.read_pickle('./data/binance-ETHUSD-1h.pkl'))
# Show the first five rows as the cell output.
df.head(5)

                       open    high     low   close       volume  \
date_open                                                          
2020-08-18 07:00:00  430.00  435.00  410.00  430.30   487.154463   
2020-08-18 08:00:00  430.27  431.79  430.27  430.80   454.176153   
2020-08-18 09:00:00  430.86  431.13  428.71  429.35  1183.710884   
2020-08-18 10:00:00  429.75  432.69  428.59  431.90  1686.183227   
2020-08-18 11:00:00  432.09  432.89  426.99  427.45  1980.692724   

                             date_close  feature_close  
date_open                                               
2020-08-18 07:00:00 2020-08-18 08:00:00      -1.891634  
2020-08-18 08:00:00 2020-08-18 09:00:00      -1.891128  
2020-08-18 09:00:00 2020-08-18 10:00:00      -1.892594  
2020-08-18 10:00:00 2020-08-18 11:00:00      -1.890016  
2020-08-18 11:00:00 2020-08-18 12:00:00      -1.894514  

In [4]:
# Create the trading environment from every pickle matching `data/*.pkl`,
# applying `preprocess` to each dataset.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1_000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
)

obs, _ = env.reset()
# We want a position of 88% ETH / 12% USD
obs, reward, terminated, truncated, info = env.step(0.88)
print(obs)
print(info)

In [5]:
# Same environment as before, now with an explicit range of allowed positions.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    position_range=(0, 1),  # HERE: (lower bound, upper bound)
    portfolio_initial_value=1_000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
)

In [6]:
from gym_trading_env.wrapper import DiscreteActionsWrapper

# You could also name the wrapper `env` to keep things simpler.
# Here the distinction between `wrapper` and `env` is made explicit.
# NOTE(review): the positions -1 and 2 lie outside the `position_range=(0, 1)`
# of the `env` built in the previous cell — confirm how the wrapper handles them.
wrapper = DiscreteActionsWrapper(env, positions=[-1, 0, 0.25, 0.5, 0.75, 1, 2])
obs, _ = wrapper.reset()
# We want a position of 25% ETH / 75% USD; this corresponds to the position
# at index 2 in the list above
obs, reward, terminated, truncated, info = wrapper.step(2)
print(obs)
print(info)

In [7]:
def reward_function(history):
    """Use the most recent portfolio valuation as the reward.

    ``history`` is indexed with a ``(column, position)`` pair; ``-1`` selects
    the latest entry.
    """
    latest_valuation = history['portfolio_valuation', -1]
    return latest_valuation

# Create the environment with a custom reward function.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1_000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    # Specify the reward function
    reward_function=reward_function,
)

In [8]:
# Run a couple of episodes with a random agent and report how each one ended.
nb_episodes = 2
for episode in range(1, nb_episodes + 1):
    obs, _ = env.reset()
    # `gym.make` returns the environment wrapped in Gymnasium wrappers; custom
    # attributes such as `name` live on the base environment, and attribute
    # forwarding through wrappers was removed in Gymnasium 1.0.
    print(f'Episode n˚{episode} -- Jeu de donnée {env.unwrapped.name}')
    done = False

    while not done:
        action = env.action_space.sample()
        obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

    # `terminated` takes precedence over `truncated` when both are set.
    if terminated:
        print('Argent perdu')
    elif truncated:
        print('Épisode terminé')

In [9]:
def metric_portfolio_valuation(history):
    """Return the latest portfolio valuation rounded to two decimals."""
    final_valuation = history['portfolio_valuation', -1]
    return round(final_valuation, 2)

# Register the custom metric on the base environment: `add_metric` is not part
# of the Gymnasium wrapper API, and attribute forwarding through the wrappers
# added by `gym.make` was removed in Gymnasium 1.0.
env.unwrapped.add_metric('Portfolio Valuation', metric_portfolio_valuation)

# Play one full episode with random actions.
done = False
obs, _ = env.reset()

while not done:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

In [10]:
# Read the last recorded portfolio valuation from the base environment's
# history (`.unwrapped` bypasses the wrappers added by `gym.make`, which no
# longer forward custom attributes in Gymnasium 1.0).
portfolio_valuation = env.unwrapped.historical_info['portfolio_valuation', -1]
# If we had WandB:
# run.summary['portfolio_valuation'] = portfolio_valuation
# We simulate that with a plain print...
print(portfolio_valuation)

In [11]:
# Fetch the end-of-episode metrics from the base environment (`.unwrapped`
# bypasses the wrappers added by `gym.make`, which no longer forward custom
# methods in Gymnasium 1.0).
metrics = env.unwrapped.get_metrics()
print(metrics)
# The key matches the name passed to `add_metric`.
portfolio_valuation = metrics['Portfolio Valuation']
print(portfolio_valuation)

In [12]:
import numpy as np
import pandas as pd
import gymnasium as gym
import gym_trading_env
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback, CallbackList
import wandb
from wandb.integration.sb3 import WandbCallback
import matplotlib.pyplot as plt
import os
from stable_baselines3.common.utils import get_latest_run_id


## ----------------------------------------------------------------------
## A. FONCTIONS DE PRÉTRAITEMENT ET DE RÉCOMPENSE
## ----------------------------------------------------------------------

def preprocess_v2(df):
    """Clean an OHLC frame and add technical-indicator feature columns.

    Added columns:
      * ``feature_log_returns``  - first difference of log(close), z-scored
      * ``feature_atr``          - 14-period mean true range divided by close, z-scored
      * ``feature_macd``         - EMA(12) - EMA(26) of close, z-scored
      * ``feature_macd_signal``  - EMA(9) of the MACD, z-scored
      * ``feature_rsi``          - 14-period RSI (simple moving averages) scaled to [0, 1]
      * ``tr1``, ``tr2``, ``tr3``, ``tr`` - intermediate true-range columns
        (kept for backward compatibility)

    Rows for which any column is NaN (indicator warm-up, or windows without
    any price change where the RSI is undefined) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``high``, ``low`` and ``close`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched.
    """
    # Explicit copy so the column assignments below never write into a
    # filtered slice (avoids pandas' SettingWithCopyWarning).
    df = df.sort_index().dropna().drop_duplicates().copy()
    close = df["close"]
    prev_close = close.shift(1)

    # Log returns
    df["feature_log_returns"] = np.log(close).diff()

    # Volatility indicator (simplified ATR)
    df['tr1'] = df['high'] - df['low']
    df['tr2'] = np.abs(df['high'] - prev_close)
    df['tr3'] = np.abs(df['low'] - prev_close)
    df['tr'] = df[['tr1', 'tr2', 'tr3']].max(axis=1)
    df['feature_atr'] = df['tr'].rolling(window=14).mean() / close

    # Trend indicators (MACD)
    ema_fast = close.ewm(span=12, adjust=False).mean()
    ema_slow = close.ewm(span=26, adjust=False).mean()
    df['feature_macd'] = ema_fast - ema_slow
    df['feature_macd_signal'] = df['feature_macd'].ewm(span=9, adjust=False).mean()

    # Momentum indicator (RSI)
    delta = close.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss
    # Parenthesised so the whole RSI (0..100) is divided by 100. The previous
    # expression divided only the inner term, yielding values between 99 and 100.
    df['feature_rsi'] = (100 - 100 / (1 + rs)) / 100

    # Final normalisation (z-score); constant columns become 0.0.
    df = df.dropna().copy()
    cols_to_normalize = ['feature_log_returns', 'feature_macd', 'feature_macd_signal', 'feature_atr']
    for col in cols_to_normalize:
        col_std = df[col].std()
        if col_std > 0:
            df[col] = (df[col] - df[col].mean()) / col_std
        else:
            df[col] = 0.0
    return df

def reward_function_v2(history):
    """Reward = log-return of the portfolio valuation over the last step.

    Reads the two most recent ``portfolio_valuation`` entries of ``history``
    and returns ``log(current / previous)``. Returns ``0`` when the previous
    valuation is exactly zero, so no division by zero occurs.
    """
    previous, current = (history['portfolio_valuation', offset] for offset in (-2, -1))
    if previous == 0:
        return 0
    return np.log(current / previous)

## ----------------------------------------------------------------------
## B. CUSTOM CALLBACK WANDB (Métriques financières)
## ----------------------------------------------------------------------

class CustomTradingCallback(BaseCallback):
    """Log end-of-episode trading metrics through the SB3 logger.

    At the end of every episode of the first sub-environment
    (``training_env.envs[0]``) the callback reads the environment's metrics
    and records:

      * ``episode/final_portfolio_valuation`` (only when the environment
        exposes a ``'Portfolio Valuation'`` metric),
      * ``episode/return_vs_market_pct``,
      * ``episode/total_portfolio_return_pct``,
      * ``episode/market_return_pct``,
      * ``episode/steps``.

    The episode length is counted by the callback itself: ``raw_env.step`` is
    the environment's bound ``step`` method, not a number. SB3 vectorized
    environments also reset a finished sub-environment automatically, so
    per-episode state on the raw environment is not a reliable source once
    ``dones`` is set.
    """

    def __init__(self, verbose: int = 0):
        super().__init__(verbose)
        self.episode_num = 0
        # Steps taken by sub-environment 0 since its last episode boundary.
        self._episode_steps = 0

    @staticmethod
    def _percent_to_fraction(text) -> float:
        """Convert a string such as ``' 12.34%'`` into the fraction ``0.1234``."""
        return float(str(text).strip().strip('%')) / 100

    def _on_step(self) -> bool:
        """Count the step and, on episode end, push the metrics to the logger.

        Always returns ``True`` so that training is never interrupted.
        """
        self._episode_steps += 1

        if not self.locals['dones'][0]:
            return True

        self.episode_num += 1
        episode_steps = self._episode_steps
        self._episode_steps = 0

        raw_env = self.training_env.envs[0].unwrapped
        # NOTE(review): this assumes `get_metrics()` still returns the metrics
        # of the episode that just finished even though the vectorized env has
        # already reset the sub-environment — confirm against the env code.
        metrics = raw_env.get_metrics()

        # Returns are read as percentage strings and converted to fractions.
        market_return = self._percent_to_fraction(metrics.get('Market Return', '0.00%'))
        portfolio_return = self._percent_to_fraction(metrics.get('Portfolio Return', '0.00%'))

        if self.logger is not None:
            final_valuation = metrics.get('Portfolio Valuation')
            if final_valuation is not None:
                # Only present when the metric was registered on the environment.
                self.logger.record("episode/final_portfolio_valuation", final_valuation)
            self.logger.record("episode/return_vs_market_pct", (portfolio_return - market_return) * 100)
            self.logger.record("episode/total_portfolio_return_pct", portfolio_return * 100)
            self.logger.record("episode/market_return_pct", market_return * 100)
            self.logger.record("episode/steps", episode_steps)

            self.logger.dump(step=self.num_timesteps)

        return True

In [13]:
# --- 1. HYPERPARAMETERS AND GLOBAL CONFIGURATION ---
config = {
    "policy_type": "MlpLstmPolicy",
    "total_timesteps": 100_000,
    "env_id": "MultiDatasetTradingEnv",
    "learning_rate": 3e-4,
    "n_steps": 2048,
    "batch_size": 128,
    "ent_coef": 0.01,
    "portfolio_initial_value": 1_000,
    "trading_fees": 0.1/100,
    "borrow_interest_rate": 0.02/100/24,
    "positions_range": (-1, 1),
    "model_name": "mon_agent_trading" # Base name used when saving the model
}

# --- 2. WANDB INITIALISATION AND SAVE PATH ---
run = wandb.init(
    project="RL-Trading-Project",
    entity="arthur-collignon-cpe-lyon",
    config=config,
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True,
)

# Path where the final model is saved; kept in a global variable so a later
# backtesting step can load it.
MODEL_SAVE_PATH = f"models/{run.id}/{config['model_name']}.zip"
WANDB_RUN_ID = run.id # ID of the current run

# --- 3. ENVIRONMENT AND MODEL CREATION ---
base_env = gym.make(
    config["env_id"],
    dataset_dir="data/*.pkl",
    preprocess=preprocess_v2,
    reward_function=reward_function_v2,
    position_range=config["positions_range"],
    portfolio_initial_value=config["portfolio_initial_value"],
    trading_fees=config["trading_fees"],
    borrow_interest_rate=config["borrow_interest_rate"],
)
# The factory closes over `base_env` rather than `env`, so it does not depend
# on a name that this very statement rebinds to the vectorized environment.
env = DummyVecEnv([lambda: base_env])

model = RecurrentPPO(
    config["policy_type"],
    env,
    verbose=0,
    # Without a TensorBoard log directory (and with verbose=0) the SB3 logger
    # has no output, so `sync_tensorboard=True` has nothing to sync and the
    # values recorded by the callbacks never reach WandB.
    tensorboard_log=f"runs/{run.id}",
    **{k: config[k] for k in ['learning_rate', 'n_steps', 'batch_size', 'ent_coef']}
)

# --- 4. CALLBACK LIST ---
callback = CallbackList([
    WandbCallback(
        model_save_path=f"models/{run.id}",
        verbose=0,
        model_save_freq=10000,
        # The periodic checkpoints use WandbCallback's own file name; the
        # final save under a fixed, easily loadable name is done manually below.
    ),
    CustomTradingCallback(verbose=0),
])

# --- 5. TRAINING AND FINAL SAVE ---
try:
    print("Début de l'entraînement avec WandB...")
    model.learn(
        total_timesteps=config["total_timesteps"],
        callback=callback,
    )
    # Manual final save to the exact path defined above
    model.save(MODEL_SAVE_PATH)
    print(f"Modèle sauvegardé dans : {MODEL_SAVE_PATH}")
finally:
    # Always close the WandB run, even if training fails.
    run.finish()