In [1]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from stable_baselines3 import PPO
from gym_trading_env.renderer import Renderer
import numpy as np
import pandas as pd
import wandb
from wandb.integration.sb3 import WandbCallback

# --- 1. CONFIGURATION ET INDICATEURS ---

# On définit les hyperparamètres ici pour que WandB puisse les enregistrer
# All hyperparameters are gathered in one mapping so WandB can record them.
config = dict(
    policy_type="MlpPolicy",
    total_timesteps=200_000,
    learning_rate=0.0003,
    ent_coef=0.02,  # entropy (exploration) coefficient
    batch_size=128,
    # Hybrid set of target positions offered to the agent.
    positions=[-0.5, 0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5],
    project_name="RL-Trading-Project",
)

def calculate_rsi(series, window=14):
    """Compute a Relative Strength Index from simple rolling means.

    Args:
        series: pandas Series of prices.
        window: Number of periods in each rolling mean.

    Returns:
        A Series on the 0-100 scale. The first ``window - 1`` entries are NaN,
        as is any entry whose window contains no price change at all.
    """
    change = series.diff()
    # `where` also turns the leading NaN of `diff` into 0.
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)
    avg_up = ups.rolling(window=window).mean()
    avg_down = downs.rolling(window=window).mean()
    strength = avg_up / avg_down
    return 100 - (100 / (1 + strength))

def calculate_macd(series, slow=26, fast=12, signal=9):
    """Return the MACD line: fast EMA minus slow EMA of ``series``.

    Args:
        series: pandas Series of prices.
        slow: Span of the slow exponential moving average.
        fast: Span of the fast exponential moving average.
        signal: Accepted for signature compatibility; not used here (no
            signal line is computed).

    Returns:
        A Series in the same units as ``series``.
    """
    fast_ema, slow_ema = (
        series.ewm(span=span, adjust=False).mean() for span in (fast, slow)
    )
    return fast_ema - slow_ema

def preprocess(df):
    """Clean a raw OHLCV frame and attach the observation features.

    The frame is sorted by index, rows with missing values and duplicated
    rows are dropped, then three ``feature_*`` columns are added.

    Args:
        df: DataFrame with at least a ``close`` column.

    Returns:
        A new DataFrame with ``feature_close`` (one-period return),
        ``feature_rsi`` (RSI rescaled to [0, 1]) and ``feature_macd`` (MACD
        divided by price), with the NaN warm-up rows removed.
    """
    df = df.sort_index().dropna().drop_duplicates()
    df['feature_close'] = df['close'].pct_change()
    df['feature_rsi'] = calculate_rsi(df['close']) / 100
    # Divide by price so the MACD is scale-free: in raw price units it would
    # dwarf the other features and vary with the price level of each dataset
    # loaded by the multi-dataset environment.
    df['feature_macd'] = calculate_macd(df['close']) / df['close']
    return df.dropna()

def reward_function(history):
    """Log-return of the portfolio, with a small fixed charge on shorts.

    Args:
        history: Environment history, indexed as ``history[column, step]``.

    Returns:
        The log of the ratio between the last two portfolio valuations,
        minus 0.0002 when the latest position is negative.
    """
    latest_value = history['portfolio_valuation', -1]
    previous_value = history['portfolio_valuation', -2]
    log_return = np.log(latest_value / previous_value)

    # Fixed penalty on short positions, meant to avoid a negative bias.
    is_short = history['position', -1] < 0
    return log_return - 0.0002 if is_short else log_return

# --- 2. INITIALISATION DE WANDB ---
run = wandb.init(
    project=config["project_name"],
    config=config,
    sync_tensorboard=True,  # mirror the SB3 TensorBoard logs into WandB
    monitor_gym=True,       # try to record environment videos when available
    save_code=True,         # store this script with the run
)

# --- 3. ENVIRONMENT CREATION ---
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
)

# Hybrid wrapper: maps an integer action to one of the configured positions.
env = DiscreteActionsWrapper(env, positions=config["positions"])

# --- 4. TRAINING WITH THE WANDB CALLBACK ---
model = PPO(
    config["policy_type"],
    env,
    verbose=1,
    learning_rate=config["learning_rate"],
    ent_coef=config["ent_coef"],
    batch_size=config["batch_size"],
    tensorboard_log=f"runs/{run.id}",  # one TensorBoard directory per run
)

print(f"Lancement de l'entraînement WandB : {run.name}")
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
)
model.save("ppo_trading_wandb_final")

# --- 5. EVALUATION AND FINAL LOGGING ---
print("Évaluation finale...")
obs, info = env.reset()
done, truncated = False, False

while not (done or truncated):
    # Act greedily: sampling from the policy would make the reported metrics
    # change from one execution to the next.
    action, _ = model.predict(obs, deterministic=True)
    action = int(action)  # array -> int, as expected by the wrapper
    obs, reward, done, truncated, info = env.step(action)

# Metrics computed by the base environment for the evaluation episode.
final_metrics = env.unwrapped.get_metrics()
print("Métriques finales :", final_metrics)

# Send the headline figures to WandB for the dashboard.
wandb.log({
    "final_portfolio_valuation": info['portfolio_valuation'],
    "market_return": final_metrics.get("Market Return", 0),
    "portfolio_return": final_metrics.get("Portfolio Return", 0)
})

# --- 6. VISUALISATION ---
env.unwrapped.save_for_render(dir="render_logs")

# Close the WandB run cleanly.
wandb.finish()

In [2]:
# Start the local renderer on the directory written by `save_for_render`.
renderer = Renderer(
    render_logs_dir="render_logs",
)
renderer.run()

In [3]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from gym_trading_env.renderer import Renderer
import numpy as np
import pandas as pd
import wandb
from wandb.integration.sb3 import WandbCallback

# Vous aurez besoin de cette librairie pour le LSTM
# !pip install sb3-contrib

from sb3_contrib import RecurrentPPO

# --- 1. CONFIGURATION (WandB) ---
config = dict(
    policy_type="MlpLstmPolicy",       # main change: recurrent (LSTM) policy
    total_timesteps=500_000,           # somewhat longer budget for the LSTM
    learning_rate=3e-4,
    ent_coef=0.01,
    batch_size=128,
    n_steps=2048,
    window_size=20,                    # observation window
    positions=[0, 0.5, 1.0],           # cash, half, fully invested (no leverage, no short)
    project_name="RL-Trading-Project",
    run_name="RecurrentPPO_Optimized",
)

# --- 2. FONCTIONS DE TRAITEMENT ---

def calculate_indicators(df):
    """Add the normalised technical-indicator features used as observations.

    The computation runs on a copy, so the frame passed in is left untouched.

    Args:
        df: DataFrame with at least the ``close``, ``high`` and ``low``
            columns.

    Returns:
        A new DataFrame holding the input columns plus ``feature_rsi``,
        ``feature_macd``, ``feature_atr`` and ``feature_return``, with every
        row containing a NaN removed.
    """
    # Copy first: assigning columns on the argument would silently modify
    # the caller's DataFrame.
    df = df.copy()

    # RSI over 14 periods, rescaled from the 0-100 scale to 0-1.
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['feature_rsi'] = (100 - (100 / (1 + rs))) / 100.0

    # MACD line (EMA 12 minus EMA 26), divided by the price.
    exp1 = df['close'].ewm(span=12, adjust=False).mean()
    exp2 = df['close'].ewm(span=26, adjust=False).mean()
    df['feature_macd'] = (exp1 - exp2) / df['close']

    # ATR (volatility): 14-period mean of the true range, divided by the price.
    previous_close = df['close'].shift()
    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - previous_close)
    low_close = np.abs(df['low'] - previous_close)
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['feature_atr'] = true_range.rolling(14).mean() / df['close']

    # One-period simple return.
    df['feature_return'] = df['close'].pct_change()

    return df.dropna()

def preprocess(df):
    """Sort, de-NaN and de-duplicate ``df``, then add the indicator features."""
    cleaned = df.sort_index()
    cleaned = cleaned.dropna()
    cleaned = cleaned.drop_duplicates()
    return calculate_indicators(cleaned)

def reward_function(history):
    """Score a step as its log-return minus a quadratic risk penalty.

    Args:
        history: Environment history, indexed as ``history[column, step]``.

    Returns:
        ``r - 0.1 * r**2`` where ``r`` is the log of the ratio between the
        last two portfolio valuations.
    """
    latest_value = history['portfolio_valuation', -1]
    previous_value = history['portfolio_valuation', -2]
    log_return = np.log(latest_value / previous_value)

    # The squared term penalises large moves in either direction.
    return log_return - 0.1 * (log_return ** 2)

# --- 3. INITIALISATION WANDB ---
run = wandb.init(
    project=config["project_name"],
    name=config["run_name"],
    config=config,
    sync_tensorboard=True,  # mirror the SB3 TensorBoard logs into WandB
    monitor_gym=True,
    save_code=True,
)

# --- 4. ENVIRONMENT CREATION ---
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
    # NOTE(review): a later cell of this notebook removes this argument as a
    # correction; confirm that the environment accepts `window_size`.
    window_size=config["window_size"],
)

# Map integer actions to the configured positions.
env = DiscreteActionsWrapper(env, positions=config["positions"])

# --- 5. MODEL CREATION AND TRAINING ---
model = RecurrentPPO(
    config["policy_type"],
    env,
    verbose=1,
    learning_rate=config["learning_rate"],
    ent_coef=config["ent_coef"],
    batch_size=config["batch_size"],
    n_steps=config["n_steps"],
    # Per-run TensorBoard directory, synced to WandB.
    tensorboard_log=f"runs/{run.id}",
)

print(f"Lancement du run WandB : {run.name}")
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
)

model.save("recurrent_ppo_final")

# --- 6. EVALUATION AND FINAL LOGGING ---
print("Évaluation...")
obs, info = env.reset()
done, truncated = False, False

# A recurrent policy only keeps its memory if the LSTM state returned by
# `predict` is passed back on the next call; otherwise every step starts
# from a zero state. `episode_start` flags the first step of the episode.
lstm_states = None
episode_start = np.ones((1,), dtype=bool)

while not (done or truncated):
    action, lstm_states = model.predict(
        obs,
        state=lstm_states,
        episode_start=episode_start,
        deterministic=True,
    )
    action = int(action)
    obs, reward, done, truncated, info = env.step(action)
    episode_start = np.zeros((1,), dtype=bool)

# Send the final metrics manually.
final_metrics = env.unwrapped.get_metrics()
wandb.log({
    "final_portfolio_valuation": info['portfolio_valuation'],
    "market_return": final_metrics.get("Market Return", 0),
    "portfolio_return": final_metrics.get("Portfolio Return", 0)
})

# End of the run.
wandb.finish()

In [4]:
import numpy as np
import pandas as pd
import gymnasium as gym
import gym_trading_env

In [5]:
def preprocess(df):
    """Return ``df`` sorted by index, without NaN rows or duplicated rows."""
    return df.sort_index().dropna().drop_duplicates()

# Load the hourly ETH/USD dataset, clean it and preview the first rows.
df = preprocess(
    pd.read_pickle('./data/binance-ETHUSD-1h.pkl')
)
df.head(5)

                       open    high     low   close       volume  \
date_open                                                          
2020-08-18 07:00:00  430.00  435.00  410.00  430.30   487.154463   
2020-08-18 08:00:00  430.27  431.79  430.27  430.80   454.176153   
2020-08-18 09:00:00  430.86  431.13  428.71  429.35  1183.710884   
2020-08-18 10:00:00  429.75  432.69  428.59  431.90  1686.183227   
2020-08-18 11:00:00  432.09  432.89  426.99  427.45  1980.692724   

                             date_close  
date_open                                
2020-08-18 07:00:00 2020-08-18 08:00:00  
2020-08-18 08:00:00 2020-08-18 09:00:00  
2020-08-18 09:00:00 2020-08-18 10:00:00  
2020-08-18 10:00:00 2020-08-18 11:00:00  
2020-08-18 11:00:00 2020-08-18 12:00:00  

In [6]:
def preprocess(df):
    """Clean ``df`` and add ``feature_close``, the z-scored closing price.

    The mean and standard deviation are taken over the whole cleaned frame.
    """
    cleaned = df.sort_index().dropna().drop_duplicates()

    close = cleaned['close']
    cleaned['feature_close'] = (close - close.mean()) / close.std()

    return cleaned

# Same dataset, now with the `feature_close` column added by `preprocess`.
df = preprocess(
    pd.read_pickle('./data/binance-ETHUSD-1h.pkl')
)
df.head(5)

                       open    high     low   close       volume  \
date_open                                                          
2020-08-18 07:00:00  430.00  435.00  410.00  430.30   487.154463   
2020-08-18 08:00:00  430.27  431.79  430.27  430.80   454.176153   
2020-08-18 09:00:00  430.86  431.13  428.71  429.35  1183.710884   
2020-08-18 10:00:00  429.75  432.69  428.59  431.90  1686.183227   
2020-08-18 11:00:00  432.09  432.89  426.99  427.45  1980.692724   

                             date_close  feature_close  
date_open                                               
2020-08-18 07:00:00 2020-08-18 08:00:00      -1.891634  
2020-08-18 08:00:00 2020-08-18 09:00:00      -1.891128  
2020-08-18 09:00:00 2020-08-18 10:00:00      -1.892594  
2020-08-18 10:00:00 2020-08-18 11:00:00      -1.890016  
2020-08-18 11:00:00 2020-08-18 12:00:00      -1.894514  

In [7]:
# Build the multi-dataset environment: data settings first, then money settings.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1_000,
    trading_fees=0.1 / 100,
    borrow_interest_rate=0.02 / 100 / 24,
)

obs, _ = env.reset()
# Request a position of 88% ETH / 12% USD.
obs, reward, terminated, truncated, info = env.step(0.88)
print(obs, info, sep="\n")

In [8]:
# Same environment, with the allowed positions bounded explicitly.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1_000,
    trading_fees=0.1 / 100,
    borrow_interest_rate=0.02 / 100 / 24,
    position_range=(0, 1),  # (lower bound, upper bound)
)

In [9]:
from gym_trading_env.wrapper import DiscreteActionsWrapper

# The wrapper could simply be bound to `env`; a separate `wrapper` name is
# used here to keep the distinction between the two objects explicit.
# NOTE(review): positions -1 and 2 fall outside the `position_range=(0, 1)`
# given to the environment built in the previous cell; confirm how the
# wrapper and environment handle that.
wrapper = DiscreteActionsWrapper(
    env,
    positions=[-1, 0, 0.25, 0.5, 0.75, 1, 2],
)
obs, _ = wrapper.reset()
# Target 25% ETH / 75% USD, i.e. index 2 of the positions list above.
obs, reward, terminated, truncated, info = wrapper.step(2)
print(obs, info, sep="\n")

In [10]:
def reward_function(history):
    """Use the latest portfolio valuation itself as the reward."""
    latest_valuation = history['portfolio_valuation', -1]
    return latest_valuation

# Rebuild the environment, this time plugging in the custom reward.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    reward_function=reward_function,  # custom reward defined above
    portfolio_initial_value=1_000,
    trading_fees=0.1 / 100,
    borrow_interest_rate=0.02 / 100 / 24,
)

In [11]:
# Play `nb_episodes` episodes with random actions and report how each ended.
nb_episodes = 2
for episode in range(1, nb_episodes + 1):
    obs, _ = env.reset()
    # Read the dataset name on the base environment: attribute lookup through
    # gymnasium wrappers is deprecated and removed in gymnasium 1.0.
    print(f'Episode n˚{episode} -- Jeu de donnée {env.unwrapped.name}')
    done = False

    while not done:
        action = env.action_space.sample()
        obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

    # `terminated` is reported as a loss of the money, `truncated` as the
    # normal end of the episode.
    if terminated:
        print('Argent perdu')
    elif truncated:
        print('Épisode terminé')

In [12]:
def metric_portfolio_valuation(history):
    """Return the latest portfolio valuation rounded to two decimals."""
    latest_valuation = history['portfolio_valuation', -1]
    return round(latest_valuation, 2)

# Register the metric on the base environment: attribute lookup through
# gymnasium wrappers is deprecated and removed in gymnasium 1.0.
env.unwrapped.add_metric('Portfolio Valuation', metric_portfolio_valuation)

# Play one episode with random actions so that there is something to measure.
done = False
obs, _ = env.reset()

while not done:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

In [13]:
# Read the history on the base environment: attribute lookup through
# gymnasium wrappers is deprecated and removed in gymnasium 1.0.
portfolio_valuation = env.unwrapped.historical_info['portfolio_valuation', -1]
# With WandB this value would go into the run summary:
# run.summary['portfolio_valuation'] = portfolio_valuation
# A plain print stands in for it here.
print(portfolio_valuation)

In [14]:
# Query the metrics on the base environment: attribute lookup through
# gymnasium wrappers is deprecated and removed in gymnasium 1.0.
metrics = env.unwrapped.get_metrics()
print(metrics)
# 'Portfolio Valuation' is the custom metric registered with `add_metric`.
portfolio_valuation = metrics['Portfolio Valuation']
print(portfolio_valuation)

In [15]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from gym_trading_env.renderer import Renderer
import numpy as np
import pandas as pd
import wandb
from wandb.integration.sb3 import WandbCallback

# Vous aurez besoin de cette librairie pour le LSTM
# !pip install sb3-contrib

from sb3_contrib import RecurrentPPO

# --- 1. CONFIGURATION (WandB) ---
config = dict(
    policy_type="MlpLstmPolicy",       # main change: recurrent (LSTM) policy
    total_timesteps=500_000,           # somewhat longer budget for the LSTM
    learning_rate=3e-4,
    ent_coef=0.01,
    batch_size=128,
    n_steps=2048,
    window_size=20,                    # observation window
    positions=[0, 0.5, 1.0],           # cash, half, fully invested (no leverage, no short)
    project_name="RL-Trading-Project",
    run_name="RecurrentPPO_Optimized",
)

# --- 2. FONCTIONS DE TRAITEMENT ---

def calculate_indicators(df):
    """Add the normalised technical-indicator features used as observations.

    The computation runs on a copy, so the frame passed in is left untouched.

    Args:
        df: DataFrame with at least the ``close``, ``high`` and ``low``
            columns.

    Returns:
        A new DataFrame holding the input columns plus ``feature_rsi``,
        ``feature_macd``, ``feature_atr`` and ``feature_return``, with every
        row containing a NaN removed.
    """
    # Copy first: assigning columns on the argument would silently modify
    # the caller's DataFrame.
    df = df.copy()

    # RSI over 14 periods, rescaled from the 0-100 scale to 0-1.
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['feature_rsi'] = (100 - (100 / (1 + rs))) / 100.0

    # MACD line (EMA 12 minus EMA 26), divided by the price.
    exp1 = df['close'].ewm(span=12, adjust=False).mean()
    exp2 = df['close'].ewm(span=26, adjust=False).mean()
    df['feature_macd'] = (exp1 - exp2) / df['close']

    # ATR (volatility): 14-period mean of the true range, divided by the price.
    previous_close = df['close'].shift()
    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - previous_close)
    low_close = np.abs(df['low'] - previous_close)
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['feature_atr'] = true_range.rolling(14).mean() / df['close']

    # One-period simple return.
    df['feature_return'] = df['close'].pct_change()

    return df.dropna()

def preprocess(df):
    """Sort, de-NaN and de-duplicate ``df``, then add the indicator features."""
    cleaned = df.sort_index()
    cleaned = cleaned.dropna()
    cleaned = cleaned.drop_duplicates()
    return calculate_indicators(cleaned)

def reward_function(history):
    """Score a step as its log-return minus a quadratic risk penalty.

    Args:
        history: Environment history, indexed as ``history[column, step]``.

    Returns:
        ``r - 0.1 * r**2`` where ``r`` is the log of the ratio between the
        last two portfolio valuations.
    """
    latest_value = history['portfolio_valuation', -1]
    previous_value = history['portfolio_valuation', -2]
    log_return = np.log(latest_value / previous_value)

    # The squared term penalises large moves in either direction.
    return log_return - 0.1 * (log_return ** 2)

# --- 3. INITIALISATION WANDB ---
run = wandb.init(
    project=config["project_name"],
    name=config["run_name"],
    config=config,
    sync_tensorboard=True,  # mirror the SB3 TensorBoard logs into WandB
    monitor_gym=True,
    save_code=True,
)

# --- 4. ENVIRONMENT CREATION ---
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
    # NOTE(review): a later cell of this notebook removes this argument as a
    # correction; confirm that the environment accepts `window_size`.
    window_size=config["window_size"],
)

# Map integer actions to the configured positions.
env = DiscreteActionsWrapper(env, positions=config["positions"])

# --- 5. MODEL CREATION AND TRAINING ---
model = RecurrentPPO(
    config["policy_type"],
    env,
    verbose=1,
    learning_rate=config["learning_rate"],
    ent_coef=config["ent_coef"],
    batch_size=config["batch_size"],
    n_steps=config["n_steps"],
    # Per-run TensorBoard directory, synced to WandB.
    tensorboard_log=f"runs/{run.id}",
)

print(f"Lancement du run WandB : {run.name}")
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
)

model.save("recurrent_ppo_final")

# --- 6. EVALUATION AND FINAL LOGGING ---
print("Évaluation...")
obs, info = env.reset()
done, truncated = False, False

# A recurrent policy only keeps its memory if the LSTM state returned by
# `predict` is passed back on the next call; otherwise every step starts
# from a zero state. `episode_start` flags the first step of the episode.
lstm_states = None
episode_start = np.ones((1,), dtype=bool)

while not (done or truncated):
    action, lstm_states = model.predict(
        obs,
        state=lstm_states,
        episode_start=episode_start,
        deterministic=True,
    )
    action = int(action)
    obs, reward, done, truncated, info = env.step(action)
    episode_start = np.zeros((1,), dtype=bool)

# Send the final metrics manually.
final_metrics = env.unwrapped.get_metrics()
wandb.log({
    "final_portfolio_valuation": info['portfolio_valuation'],
    "market_return": final_metrics.get("Market Return", 0),
    "portfolio_return": final_metrics.get("Portfolio Return", 0)
})

# End of the run.
wandb.finish()

In [16]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from gym_trading_env.renderer import Renderer
import numpy as np
import pandas as pd
import wandb
from wandb.integration.sb3 import WandbCallback

# Vous aurez besoin de cette librairie pour le LSTM
# !pip install sb3-contrib

from sb3_contrib import RecurrentPPO

# --- 1. CONFIGURATION (WandB) ---
config = dict(
    policy_type="MlpLstmPolicy",       # main change: recurrent (LSTM) policy
    total_timesteps=500_000,           # somewhat longer budget for the LSTM
    learning_rate=3e-4,
    ent_coef=0.01,
    batch_size=128,
    n_steps=2048,
    positions=[0, 0.5, 1.0],           # cash, half, fully invested (no leverage, no short)
    project_name="RL-Trading-Project",
    run_name="RecurrentPPO_Optimized",
)

# --- 2. FONCTIONS DE TRAITEMENT ---

def calculate_indicators(df):
    """Add the normalised technical-indicator features used as observations.

    The computation runs on a copy, so the frame passed in is left untouched.

    Args:
        df: DataFrame with at least the ``close``, ``high`` and ``low``
            columns.

    Returns:
        A new DataFrame holding the input columns plus ``feature_rsi``,
        ``feature_macd``, ``feature_atr`` and ``feature_return``, with every
        row containing a NaN removed.
    """
    # Copy first: assigning columns on the argument would silently modify
    # the caller's DataFrame.
    df = df.copy()

    # RSI over 14 periods, rescaled from the 0-100 scale to 0-1.
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['feature_rsi'] = (100 - (100 / (1 + rs))) / 100.0

    # MACD line (EMA 12 minus EMA 26), divided by the price.
    exp1 = df['close'].ewm(span=12, adjust=False).mean()
    exp2 = df['close'].ewm(span=26, adjust=False).mean()
    df['feature_macd'] = (exp1 - exp2) / df['close']

    # ATR (volatility): 14-period mean of the true range, divided by the price.
    previous_close = df['close'].shift()
    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - previous_close)
    low_close = np.abs(df['low'] - previous_close)
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['feature_atr'] = true_range.rolling(14).mean() / df['close']

    # One-period simple return.
    df['feature_return'] = df['close'].pct_change()

    return df.dropna()

def preprocess(df):
    """Sort, de-NaN and de-duplicate ``df``, then add the indicator features."""
    cleaned = df.sort_index()
    cleaned = cleaned.dropna()
    cleaned = cleaned.drop_duplicates()
    return calculate_indicators(cleaned)

def reward_function(history):
    """Score a step as its log-return minus a quadratic risk penalty.

    Args:
        history: Environment history, indexed as ``history[column, step]``.

    Returns:
        ``r - 0.1 * r**2`` where ``r`` is the log of the ratio between the
        last two portfolio valuations.
    """
    latest_value = history['portfolio_valuation', -1]
    previous_value = history['portfolio_valuation', -2]
    log_return = np.log(latest_value / previous_value)

    # The squared term penalises large moves in either direction.
    return log_return - 0.1 * (log_return ** 2)

# --- 3. INITIALISATION WANDB ---
run = wandb.init(
    project=config["project_name"],
    name=config["run_name"],
    config=config,
    sync_tensorboard=True,  # mirror the SB3 TensorBoard logs into WandB
    monitor_gym=True,
    save_code=True,
)

# --- 4. ENVIRONMENT CREATION ---
# No `window_size` argument: this cell's `config` has no "window_size" key,
# so `config["window_size"]` raised a KeyError before the environment could
# be built.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
)

# Map integer actions to the configured positions.
env = DiscreteActionsWrapper(env, positions=config["positions"])

# --- 5. MODEL CREATION AND TRAINING ---
model = RecurrentPPO(
    config["policy_type"],
    env,
    verbose=1,
    learning_rate=config["learning_rate"],
    ent_coef=config["ent_coef"],
    batch_size=config["batch_size"],
    n_steps=config["n_steps"],
    # Per-run TensorBoard directory, synced to WandB.
    tensorboard_log=f"runs/{run.id}",
)

print(f"Lancement du run WandB : {run.name}")
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
)

model.save("recurrent_ppo_final")

# --- 6. EVALUATION AND FINAL LOGGING ---
print("Évaluation...")
obs, info = env.reset()
done, truncated = False, False

# A recurrent policy only keeps its memory if the LSTM state returned by
# `predict` is passed back on the next call; otherwise every step starts
# from a zero state. `episode_start` flags the first step of the episode.
lstm_states = None
episode_start = np.ones((1,), dtype=bool)

while not (done or truncated):
    action, lstm_states = model.predict(
        obs,
        state=lstm_states,
        episode_start=episode_start,
        deterministic=True,
    )
    action = int(action)
    obs, reward, done, truncated, info = env.step(action)
    episode_start = np.zeros((1,), dtype=bool)

# Send the final metrics manually.
final_metrics = env.unwrapped.get_metrics()
wandb.log({
    "final_portfolio_valuation": info['portfolio_valuation'],
    "market_return": final_metrics.get("Market Return", 0),
    "portfolio_return": final_metrics.get("Portfolio Return", 0)
})

# End of the run.
wandb.finish()

In [17]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from gym_trading_env.renderer import Renderer
import numpy as np
import pandas as pd
import wandb
from wandb.integration.sb3 import WandbCallback
from sb3_contrib import RecurrentPPO

# --- 1. CONFIGURATION ---
config = dict(
    policy_type="MlpLstmPolicy",
    total_timesteps=500_000,
    learning_rate=3e-4,
    ent_coef=0.01,
    batch_size=128,
    n_steps=2048,
    # No "window_size" entry: it was removed, the history being left to the
    # policy's internal LSTM.
    positions=[0, 0.5, 1.0],
    project_name="RL-Trading-Project",
    run_name="RecurrentPPO_Fix",
)

# --- 2. FONCTIONS DE TRAITEMENT ---
def calculate_indicators(df):
    """Add the normalised technical-indicator features used as observations.

    The computation runs on a copy, so the frame passed in is left untouched.

    Args:
        df: DataFrame with at least the ``close``, ``high`` and ``low``
            columns.

    Returns:
        A new DataFrame holding the input columns plus ``feature_rsi``,
        ``feature_macd``, ``feature_atr`` and ``feature_return``, with every
        row containing a NaN removed.
    """
    # Copy first: assigning columns on the argument would silently modify
    # the caller's DataFrame.
    df = df.copy()

    # RSI over 14 periods, rescaled from the 0-100 scale to 0-1.
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['feature_rsi'] = (100 - (100 / (1 + rs))) / 100.0

    # MACD line (EMA 12 minus EMA 26), divided by the price.
    exp1 = df['close'].ewm(span=12, adjust=False).mean()
    exp2 = df['close'].ewm(span=26, adjust=False).mean()
    df['feature_macd'] = (exp1 - exp2) / df['close']

    # ATR (volatility): 14-period mean of the true range, divided by the price.
    previous_close = df['close'].shift()
    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - previous_close)
    low_close = np.abs(df['low'] - previous_close)
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['feature_atr'] = true_range.rolling(14).mean() / df['close']

    # One-period simple return.
    df['feature_return'] = df['close'].pct_change()

    return df.dropna()

def preprocess(df):
    """Sort, de-NaN and de-duplicate ``df``, then add the indicator features."""
    cleaned = df.sort_index()
    cleaned = cleaned.dropna()
    cleaned = cleaned.drop_duplicates()
    return calculate_indicators(cleaned)

def reward_function(history):
    """Score a step as its log-return minus a quadratic risk penalty.

    Args:
        history: Environment history, indexed as ``history[column, step]``.

    Returns:
        ``r - 0.1 * r**2`` where ``r`` is the log of the ratio between the
        last two portfolio valuations.
    """
    latest_value = history['portfolio_valuation', -1]
    previous_value = history['portfolio_valuation', -2]
    log_return = np.log(latest_value / previous_value)
    # Volatility penalty: the squared term grows with the size of the move.
    return log_return - 0.1 * (log_return ** 2)

# --- 3. INITIALISATION WANDB ---
run = wandb.init(
    project=config["project_name"],
    name=config["run_name"],
    config=config,
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True,
)

# --- 4. ENVIRONMENT CREATION ---
# Correction applied here: no `window_size` argument is passed.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
)

# Map integer actions to the configured positions.
env = DiscreteActionsWrapper(env, positions=config["positions"])

# --- 5. MODEL CREATION AND TRAINING ---
model = RecurrentPPO(
    config["policy_type"],
    env,
    verbose=1,
    learning_rate=config["learning_rate"],
    ent_coef=config["ent_coef"],
    batch_size=config["batch_size"],
    n_steps=config["n_steps"],
    tensorboard_log=f"runs/{run.id}",
)

print(f"Lancement du run WandB : {run.name}")
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
)

model.save("recurrent_ppo_final")

# --- 6. EVALUATION ---
print("Évaluation...")
obs, info = env.reset()
done, truncated = False, False

# A recurrent policy only keeps its memory if the LSTM state returned by
# `predict` is passed back on the next call; otherwise every step starts
# from a zero state. `episode_start` flags the first step of the episode.
lstm_states = None
episode_start = np.ones((1,), dtype=bool)

while not (done or truncated):
    action, lstm_states = model.predict(
        obs,
        state=lstm_states,
        episode_start=episode_start,
        deterministic=True,
    )
    action = int(action)
    obs, reward, done, truncated, info = env.step(action)
    episode_start = np.zeros((1,), dtype=bool)

final_metrics = env.unwrapped.get_metrics()
wandb.log({
    "final_portfolio_valuation": info['portfolio_valuation'],
    "market_return": final_metrics.get("Market Return", 0),
    "portfolio_return": final_metrics.get("Portfolio Return", 0)
})

wandb.finish()

In [18]:
# Local visualisation: save the episode for rendering, then launch the
# renderer on that directory.
env.unwrapped.save_for_render(
    dir="render_logs",
)
renderer = Renderer(
    render_logs_dir="render_logs",
)
renderer.run()

In [19]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from gym_trading_env.renderer import Renderer
import numpy as np
import pandas as pd
import wandb
from wandb.integration.sb3 import WandbCallback
from sb3_contrib import RecurrentPPO

# --- 1. CONFIGURATION "PHASE 2" ---
config = dict(
    policy_type="MlpLstmPolicy",
    total_timesteps=1_000_000,  # doubled: the LSTM needs more time
    learning_rate=3e-4,
    ent_coef=0.01,
    batch_size=128,
    n_steps=2048,
    # Main change: shorting is enabled.
    # Positions: -1 = short, 0 = cash, 1 = long. Still no leverage (1.5) for
    # now, the agent should first master the direction.
    positions=[-1, 0, 1],
    project_name="RL-Trading-Project",
    run_name="RecurrentPPO_Phase2_ShortEnabled",
)

# --- 2. INDICATEURS (Inchangés car robustes) ---
def calculate_indicators(df):
    """Add the normalised technical-indicator features used as observations.

    The computation runs on a copy, so the frame passed in is left untouched.

    Args:
        df: DataFrame with at least the ``close``, ``high`` and ``low``
            columns.

    Returns:
        A new DataFrame holding the input columns plus ``feature_rsi``,
        ``feature_macd``, ``feature_atr`` and ``feature_return``, with every
        row containing a NaN removed.
    """
    # Copy first: assigning columns on the argument would silently modify
    # the caller's DataFrame.
    df = df.copy()

    # RSI over 14 periods, rescaled from the 0-100 scale to 0-1.
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['feature_rsi'] = (100 - (100 / (1 + rs))) / 100.0

    # MACD line (EMA 12 minus EMA 26), divided by the price.
    exp1 = df['close'].ewm(span=12, adjust=False).mean()
    exp2 = df['close'].ewm(span=26, adjust=False).mean()
    df['feature_macd'] = (exp1 - exp2) / df['close']

    # ATR (volatility): 14-period mean of the true range, divided by the price.
    previous_close = df['close'].shift()
    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - previous_close)
    low_close = np.abs(df['low'] - previous_close)
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['feature_atr'] = true_range.rolling(14).mean() / df['close']

    # One-period simple return.
    df['feature_return'] = df['close'].pct_change()

    return df.dropna()

def preprocess(df):
    """Sort, de-NaN and de-duplicate ``df``, then add the indicator features."""
    cleaned = df.sort_index()
    cleaned = cleaned.dropna()
    cleaned = cleaned.drop_duplicates()
    return calculate_indicators(cleaned)

# --- 3. RÉCOMPENSE AJUSTÉE ---
def reward_function(history):
    """Score a step as its log-return minus a reduced quadratic penalty.

    Args:
        history: Environment history, indexed as ``history[column, step]``.

    Returns:
        ``r - 0.05 * r**2`` where ``r`` is the log of the ratio between the
        last two portfolio valuations.
    """
    latest_value = history['portfolio_valuation', -1]
    previous_value = history['portfolio_valuation', -2]
    log_return = np.log(latest_value / previous_value)

    # Penalty weight lowered to 0.05 (from 0.1) so the agent may take a
    # little more risk when looking for profit.
    return log_return - 0.05 * (log_return ** 2)

# --- 4. INIT WANDB ---
run = wandb.init(
    project=config["project_name"],
    name=config["run_name"],
    config=config,
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True,
)

# --- 5. ENVIRONMENT ---
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
    # Reminder: no window_size here, the LSTM handles its own memory.
)

# Map integer actions to the configured positions.
env = DiscreteActionsWrapper(env, positions=config["positions"])

# --- 6. MODEL AND TRAINING ---
model = RecurrentPPO(
    config["policy_type"],
    env,
    verbose=1,
    learning_rate=config["learning_rate"],
    ent_coef=config["ent_coef"],
    batch_size=config["batch_size"],
    n_steps=config["n_steps"],
    tensorboard_log=f"runs/{run.id}",
)

print(f"Lancement du run WandB : {run.name} (Short activé)")

model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
)

model.save("recurrent_ppo_short_enabled")

# --- 7. FINAL EVALUATION ---
print("Évaluation finale...")
obs, info = env.reset()
done, truncated = False, False

# A recurrent policy only keeps its memory if the LSTM state returned by
# `predict` is passed back on the next call; otherwise every step starts
# from a zero state. `episode_start` flags the first step of the episode.
lstm_states = None
episode_start = np.ones((1,), dtype=bool)

while not (done or truncated):
    action, lstm_states = model.predict(
        obs,
        state=lstm_states,
        episode_start=episode_start,
        deterministic=True,
    )
    action = int(action)
    obs, reward, done, truncated, info = env.step(action)
    episode_start = np.zeros((1,), dtype=bool)

final_metrics = env.unwrapped.get_metrics()
wandb.log({
    "final_portfolio_valuation": info['portfolio_valuation'],
    "market_return": final_metrics.get("Market Return", 0),
    "portfolio_return": final_metrics.get("Portfolio Return", 0)
})

wandb.finish()

In [20]:
# Visual rendering: save the episode for rendering, then launch the renderer
# on that directory.
env.unwrapped.save_for_render(
    dir="render_logs",
)
renderer = Renderer(
    render_logs_dir="render_logs",
)
renderer.run()

In [21]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from gym_trading_env.renderer import Renderer
import numpy as np
import pandas as pd
import wandb
from wandb.integration.sb3 import WandbCallback
from sb3_contrib import RecurrentPPO

# --- 1. CONFIGURATION "ALPHA HUNTER" ---
config = dict(
    policy_type="MlpLstmPolicy",
    total_timesteps=1_500_000,  # longer again: alpha is hard to find
    learning_rate=3e-4,
    # Key change: entropy coefficient multiplied by 5, to push the agent to
    # try unusual actions (such as shorting in a bull run) instead of
    # settling on a long position.
    ent_coef=0.05,
    batch_size=256,  # larger batch to smooth the noise of the returns
    n_steps=2048,
    positions=[-1, 0, 1],
    project_name="RL-Trading-Project",
    run_name="RecurrentPPO_AlphaHunter",
)

# --- 2. TRAITEMENT (Inchangé) ---
def calculate_indicators(df):
    """Add the normalised technical-indicator features used as observations.

    The computation runs on a copy, so the frame passed in is left untouched.

    Args:
        df: DataFrame with at least the ``close``, ``high`` and ``low``
            columns.

    Returns:
        A new DataFrame holding the input columns plus ``feature_rsi``,
        ``feature_macd``, ``feature_atr`` and ``feature_return``, with every
        row containing a NaN removed.
    """
    # Copy first: assigning columns on the argument would silently modify
    # the caller's DataFrame.
    df = df.copy()

    # RSI over 14 periods, rescaled from the 0-100 scale to 0-1.
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['feature_rsi'] = (100 - (100 / (1 + rs))) / 100.0

    # MACD line (EMA 12 minus EMA 26), divided by the price.
    exp1 = df['close'].ewm(span=12, adjust=False).mean()
    exp2 = df['close'].ewm(span=26, adjust=False).mean()
    df['feature_macd'] = (exp1 - exp2) / df['close']

    # ATR (volatility): 14-period mean of the true range, divided by the price.
    previous_close = df['close'].shift()
    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - previous_close)
    low_close = np.abs(df['low'] - previous_close)
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['feature_atr'] = true_range.rolling(14).mean() / df['close']

    # One-period simple return.
    df['feature_return'] = df['close'].pct_change()

    return df.dropna()

def preprocess(df):
    """Sort, de-NaN and de-duplicate ``df``, then add the indicator features."""
    cleaned = df.sort_index()
    cleaned = cleaned.dropna()
    cleaned = cleaned.drop_duplicates()
    return calculate_indicators(cleaned)

# --- 3. RÉCOMPENSE DIFFERENCIELLE (ALPHA) ---
def reward_function(history):
    """Reward the step's alpha: portfolio log-return minus market log-return.

    The reward is 0 when the portfolio moves exactly like the price, and
    positive when it does better (for instance by holding cash while the
    price falls).

    Args:
        history: Environment history, indexed as ``history[column, step]``;
            the ``portfolio_valuation`` and ``data_close`` columns are read.

    Returns:
        The difference between the two one-step log-returns.
    """
    def log_growth(column):
        # One-step log-return of a history column.
        return np.log(history[column, -1] / history[column, -2])

    agent_return = log_growth('portfolio_valuation')
    market_return = log_growth('data_close')
    return agent_return - market_return

# --- 4. RUN WANDB ---
run = wandb.init(
    project=config["project_name"],
    name=config["run_name"],
    config=config,
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True,
)

# --- 5. ENVIRONMENT ---
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
)

# Map integer actions to the configured positions.
env = DiscreteActionsWrapper(env, positions=config["positions"])

# --- 6. MODEL ---
model = RecurrentPPO(
    config["policy_type"],
    env,
    verbose=1,
    learning_rate=config["learning_rate"],
    ent_coef=config["ent_coef"],  # the setting this experiment relies on
    batch_size=config["batch_size"],
    n_steps=config["n_steps"],
    tensorboard_log=f"runs/{run.id}",
)

print("--- Démarrage Alpha Hunter ---")
print("Objectif : Battre le Buy & Hold (Reward = Return - Market)")
print(f"Exploration forcée (Ent_coef={config['ent_coef']})")

model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
)

model.save("recurrent_ppo_alpha_hunter")

# --- 7. EVALUATION ---
obs, info = env.reset()
done, truncated = False, False

# A recurrent policy only keeps its memory if the LSTM state returned by
# `predict` is passed back on the next call; otherwise every step starts
# from a zero state. `episode_start` flags the first step of the episode.
lstm_states = None
episode_start = np.ones((1,), dtype=bool)

while not (done or truncated):
    action, lstm_states = model.predict(
        obs,
        state=lstm_states,
        episode_start=episode_start,
        deterministic=True,
    )
    action = int(action)
    obs, reward, done, truncated, info = env.step(action)
    episode_start = np.zeros((1,), dtype=bool)

metrics = env.unwrapped.get_metrics()
print("Métriques finales :", metrics)

wandb.finish()