In [1]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from stable_baselines3 import PPO
from gym_trading_env.renderer import Renderer
import numpy as np

# --- 1. Preprocessing (unchanged) ---
def preprocess(df):
    """Clean a price frame and attach three `feature_*` columns.

    The frame is sorted by index, stripped of NaN rows and duplicate rows,
    then extended with:
      * ``feature_close`` - percentage change of ``close``;
      * ``feature_rsi``   - ``calculate_rsi(close)`` divided by 100;
      * ``feature_macd``  - ``calculate_macd(close)``.
    Rows left with NaN by the indicators are dropped before returning.

    NOTE(review): ``calculate_rsi`` / ``calculate_macd`` must already be
    defined in the kernel when this function is called.
    """
    cleaned = df.sort_index().dropna().drop_duplicates()
    close = cleaned['close']
    cleaned['feature_close'] = close.pct_change()
    cleaned['feature_rsi'] = calculate_rsi(close) / 100
    cleaned['feature_macd'] = calculate_macd(close)
    return cleaned.dropna()

# --- 2. Reward function with a "short penalty" ---
def reward_function(history, short_penalty=0.0002):
    """Log-return of the portfolio over the last step, taxed while short.

    Args:
        history: environment history, read as ``history[column, step]`` and
            ``history[column]``.
        short_penalty: amount subtracted from the reward on every step whose
            latest position is negative. Defaults to 0.0002, the value that
            used to be hard-coded, so single-argument callers are unaffected.

    Returns:
        0 while fewer than two valuations have been recorded (same guard as
        the plain log-return reward used elsewhere in this notebook);
        otherwise ``log(current / previous)``, minus ``short_penalty`` when
        the latest position is below zero.
    """
    # Nothing to compare against until a second valuation exists.
    if len(history["portfolio_valuation"]) < 2:
        return 0

    current_val = history['portfolio_valuation', -1]
    prev_val = history['portfolio_valuation', -2]
    reward = np.log(current_val / prev_val)

    # --- Fix for the shorting problem ---
    # A small per-step charge while short, so the agent only shorts when the
    # expected gain outweighs the charge.
    if history['position', -1] < 0:
        reward -= short_penalty

    return reward

# --- 3. Environment creation ---
# NOTE(review): `preprocess` calls calculate_rsi / calculate_macd, whose first
# definition in this notebook appears in a later cell; on a fresh kernel this
# cell would hit a NameError as soon as `preprocess` is invoked.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
)

# --- 4. The fix for both problems ---
# Keep the wrapper (so the agent outputs integer indices), but supply an
# explicit list that favours buying (more positive choices than negative).
custom_positions = [
    -0.5,  # The only short choice (moderate)
    0,     # Cash (neutral)
    0.25,  # Small investment
    0.5,   # Medium investment
    0.75,  # Large investment
    1.0,   # All-in
    1.25,  # Light leverage
    1.5    # Heavy leverage
]

env = DiscreteActionsWrapper(env, positions=custom_positions)

# --- 5. PPO agent ---
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=0.0003,
    # ent_coef drives exploration. If the agent keeps repeating the same
    # action, raise this value (e.g. 0.05).
    ent_coef=0.02,
    tensorboard_log="./ppo_hybrid_tensorboard/"
)

print("Entraînement avec actions hybrides...")
model.learn(total_timesteps=150_000)
model.save("ppo_hybrid_solution")

In [2]:
import numpy as np
import pandas as pd
import gymnasium as gym
import gym_trading_env

In [3]:
def preprocess(df):
    """Return `df` sorted by index, without NaN rows and without duplicate rows.

    The input frame is left untouched; each step yields a new frame.
    """
    return df.sort_index().dropna().drop_duplicates()

# Load the hourly ETH/USD pickle, clean it, and preview the first five rows.
# NOTE(review): unpickling can execute arbitrary code; only load .pkl files
# that come from a trusted source.
df = preprocess(pd.read_pickle('./data/binance-ETHUSD-1h.pkl'))
df.head(5)

                       open    high     low   close       volume  \
date_open                                                          
2020-08-18 07:00:00  430.00  435.00  410.00  430.30   487.154463   
2020-08-18 08:00:00  430.27  431.79  430.27  430.80   454.176153   
2020-08-18 09:00:00  430.86  431.13  428.71  429.35  1183.710884   
2020-08-18 10:00:00  429.75  432.69  428.59  431.90  1686.183227   
2020-08-18 11:00:00  432.09  432.89  426.99  427.45  1980.692724   

                             date_close  
date_open                                
2020-08-18 07:00:00 2020-08-18 08:00:00  
2020-08-18 08:00:00 2020-08-18 09:00:00  
2020-08-18 09:00:00 2020-08-18 10:00:00  
2020-08-18 10:00:00 2020-08-18 11:00:00  
2020-08-18 11:00:00 2020-08-18 12:00:00  

In [4]:
def preprocess(df):
    """Clean the frame and add a z-scored close price as ``feature_close``.

    Cleaning is sort-by-index, drop NaN rows, drop duplicate rows. The feature
    is ``(close - mean(close)) / std(close)`` computed over the whole cleaned
    frame.

    NOTE(review): the mean and standard deviation are taken over the full
    series, so every row's feature depends on later prices; confirm this is
    acceptable for training.
    """
    cleaned = df.sort_index().dropna().drop_duplicates()
    close = cleaned['close']
    cleaned['feature_close'] = (close - close.mean()) / close.std()
    return cleaned

# Reload the hourly ETH/USD pickle through the feature-adding preprocess and
# preview the first five rows.
# NOTE(review): unpickling can execute arbitrary code; only load .pkl files
# that come from a trusted source.
df = preprocess(pd.read_pickle('./data/binance-ETHUSD-1h.pkl'))
df.head(5)

                       open    high     low   close       volume  \
date_open                                                          
2020-08-18 07:00:00  430.00  435.00  410.00  430.30   487.154463   
2020-08-18 08:00:00  430.27  431.79  430.27  430.80   454.176153   
2020-08-18 09:00:00  430.86  431.13  428.71  429.35  1183.710884   
2020-08-18 10:00:00  429.75  432.69  428.59  431.90  1686.183227   
2020-08-18 11:00:00  432.09  432.89  426.99  427.45  1980.692724   

                             date_close  feature_close  
date_open                                               
2020-08-18 07:00:00 2020-08-18 08:00:00      -1.891634  
2020-08-18 08:00:00 2020-08-18 09:00:00      -1.891128  
2020-08-18 09:00:00 2020-08-18 10:00:00      -1.892594  
2020-08-18 10:00:00 2020-08-18 11:00:00      -1.890016  
2020-08-18 11:00:00 2020-08-18 12:00:00      -1.894514  

In [5]:
# Build the multi-dataset trading environment over every pickle in data/.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1_000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
)

obs, _ = env.reset()
# Target a position of 88% ETH / 12% USD
obs, reward, terminated, truncated, info = env.step(0.88)
print(obs)
print(info)

In [6]:
# Same environment, with the allowed position interval passed explicitly.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    position_range=(0, 1),  # HERE: (lower bound, upper bound)
    portfolio_initial_value=1_000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
)

In [7]:
from gym_trading_env.wrapper import DiscreteActionsWrapper

# The wrapper could also simply be named `env`;
# here `wrapper` and `env` are kept as two distinct names on purpose.
# NOTE(review): if `env` is still the one built with position_range=(0, 1),
# the positions -1 and 2 lie outside that range - confirm how the
# environment treats them.
wrapper = DiscreteActionsWrapper(env, positions=[-1, 0, 0.25, 0.5, 0.75, 1, 2])
obs, _ = wrapper.reset()
# Target a position of 25% ETH / 75% USD; that is the entry at index 2
# in the list above.
obs, reward, terminated, truncated, info = wrapper.step(2)
print(obs)
print(info)

In [8]:
def reward_function(history):
    """Return the most recent portfolio valuation recorded in `history`.

    The reward is the valuation itself (not its change between steps).
    """
    latest_valuation = history['portfolio_valuation', -1]
    return latest_valuation

# Rebuild the environment, this time with a user-supplied reward.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1_000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    # Plug in the custom reward function
    reward_function=reward_function,
)

In [9]:
# Play `nb_episodes` episodes with uniformly random actions and report how
# each one ended.
nb_episodes = 2
for episode in range(1, nb_episodes + 1):
    obs, _ = env.reset()
    # `env` is the object returned by gym.make, i.e. the trading environment
    # behind Gymnasium's wrappers. Read `name` from the base environment, as
    # the later simulation cells do, instead of relying on wrappers
    # forwarding unknown attributes.
    print(f'Episode n˚{episode} -- Jeu de donnée {env.unwrapped.name}')
    done = False

    while not done:
        action = env.action_space.sample()
        obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

    # `terminated` is checked first, so it wins when both flags are set.
    if terminated:
        print('Argent perdu')
    elif truncated:
        print('Épisode terminé')

In [10]:
def metric_portfolio_valuation(history):
    """Return the latest portfolio valuation in `history`, rounded to 2 decimals."""
    final_valuation = history['portfolio_valuation', -1]
    return round(final_valuation, 2)

# Register the custom metric on the base environment. `env` comes from
# gym.make and is wrapped by Gymnasium, so go through `unwrapped` (as the
# later cells do) rather than relying on wrappers forwarding attributes.
env.unwrapped.add_metric('Portfolio Valuation', metric_portfolio_valuation)

done = False
obs, _ = env.reset()

# One full episode with uniformly random actions.
while not done:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

In [11]:
# Read the last recorded valuation from the base environment's history
# (`unwrapped` avoids depending on Gymnasium wrappers forwarding attributes).
portfolio_valuation = env.unwrapped.historical_info['portfolio_valuation', -1]
# With WandB this would be:
# run.summary['portfolio_valuation'] = portfolio_valuation
# A plain print stands in for it here.
print(portfolio_valuation)

In [12]:
# Fetch the episode metrics from the base environment, matching the
# `env.unwrapped.get_metrics()` call used in the final evaluation cell.
metrics = env.unwrapped.get_metrics()
print(metrics)
# 'Portfolio Valuation' is the name under which the custom metric was added.
portfolio_valuation = metrics['Portfolio Valuation']
print(portfolio_valuation)

In [13]:
import numpy as np
import pandas as pd
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
import os

In [14]:
def calculate_rsi(series, window=14):
    """Relative Strength Index built from simple rolling means.

    Args:
        series: pandas Series of prices.
        window: number of observations in each rolling mean.

    Returns:
        Series of ``100 - 100 / (1 + avg_gain / avg_loss)``. Entries are NaN
        until a full window is available, and wherever both rolling means
        are zero (0 / 0).
    """
    changes = series.diff()
    # Non-matching entries (including the leading NaN diff) are replaced by 0.
    ups = changes.where(changes > 0, 0)
    downs = -changes.where(changes < 0, 0)
    avg_up = ups.rolling(window=window).mean()
    avg_down = downs.rolling(window=window).mean()
    relative_strength = avg_up / avg_down
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(series, slow=26, fast=12, signal=9):
    """MACD line: fast EMA minus slow EMA of `series`.

    Args:
        series: pandas Series of prices.
        slow: span of the slow exponential moving average.
        fast: span of the fast exponential moving average.
        signal: accepted for signature compatibility but not used; no signal
            line is computed.

    Returns:
        Series with the same index as `series`.
    """
    fast_ema = series.ewm(span=fast, adjust=False).mean()
    slow_ema = series.ewm(span=slow, adjust=False).mean()
    return fast_ema - slow_ema

In [15]:
def preprocess(df):
    """Clean a price frame and attach four `feature_*` columns.

    Columns added (the names start with ``feature_`` on purpose):
      * ``feature_RSI``        - ``calculate_rsi(close) / 100``;
      * ``feature_MACD``       - ``calculate_macd(close)``;
      * ``feature_log_return`` - ``log(close / previous close)``;
      * ``feature_sma_dist``   - distance of close from its 20-period rolling
        mean, in units of the 20-period rolling standard deviation.
    Rows left with NaN by the indicators are dropped before returning.
    """
    # Sort and clean
    cleaned = df.sort_index().dropna().drop_duplicates()
    close = cleaned['close']

    # 1. RSI rescaled from 0-100 to 0-1
    cleaned['feature_RSI'] = calculate_rsi(close) / 100

    # 2. MACD
    cleaned['feature_MACD'] = calculate_macd(close)

    # 3. Log returns (more stable for RL than the raw price)
    cleaned['feature_log_return'] = np.log(close / close.shift(1))

    # 4. Where the price sits relative to its moving average
    rolling_close = close.rolling(20)
    cleaned['feature_sma_dist'] = (close - rolling_close.mean()) / rolling_close.std()

    return cleaned.dropna()

In [16]:
def reward_function(history):
    """Log-return of the portfolio valuation over the last step.

    Rewarding the logarithmic change is meant to favour steady growth over
    risky bets. Returns 0 while fewer than two valuations are recorded.
    """
    valuations = history["portfolio_valuation"]
    if len(valuations) >= 2:
        latest = history['portfolio_valuation', -1]
        previous = history['portfolio_valuation', -2]
        return np.log(latest / previous)
    return 0

In [17]:
# Build the environment with the project's constraints
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl", # Make sure the data folder contains your .pkl files
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
)

In [18]:
# Run one episode with random actions on whichever dataset the base
# environment reports as its current `name`.
print(f"Démarrage de la simulation sur le dataset : {env.unwrapped.name}")

obs, info = env.reset()
done = False
truncated = False

# `done` receives the third value returned by step (called `terminated` in
# the earlier episode loop); the loop stops on either flag.
while not (done or truncated):
    # Replace with: action, _states = model.predict(obs) when using Stable Baselines
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)

In [19]:
# Same random-action episode as the previous cell, run a second time.
print(f"Démarrage de la simulation sur le dataset : {env.unwrapped.name}")

obs, info = env.reset()
done = False
truncated = False

# `done` receives the third value returned by step; the loop stops on either
# flag.
while not (done or truncated):
    # Replace with: action, _states = model.predict(obs) when using Stable Baselines
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)

In [20]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from stable_baselines3 import PPO
from gym_trading_env.renderer import Renderer
import numpy as np

# --- 1. Preprocessing (unchanged) ---
def preprocess(df):
    """Clean a price frame and attach three `feature_*` columns.

    After sorting by index and dropping NaN / duplicate rows, adds
    ``feature_close`` (percentage change of close), ``feature_rsi``
    (``calculate_rsi(close) / 100``) and ``feature_macd``
    (``calculate_macd(close)``), then drops the rows the indicators left
    with NaN.
    """
    cleaned = df.sort_index().dropna().drop_duplicates()
    close = cleaned['close']
    cleaned['feature_close'] = close.pct_change()
    cleaned['feature_rsi'] = calculate_rsi(close) / 100
    cleaned['feature_macd'] = calculate_macd(close)
    return cleaned.dropna()

# --- 2. Reward function with a "short penalty" ---
def reward_function(history, short_penalty=0.0002):
    """Log-return of the portfolio over the last step, taxed while short.

    Args:
        history: environment history, read as ``history[column, step]`` and
            ``history[column]``.
        short_penalty: amount subtracted from the reward on every step whose
            latest position is negative. Defaults to 0.0002, the value that
            used to be hard-coded, so single-argument callers are unaffected.

    Returns:
        0 while fewer than two valuations have been recorded (same guard as
        the plain log-return reward defined earlier in this notebook);
        otherwise ``log(current / previous)``, minus ``short_penalty`` when
        the latest position is below zero.
    """
    # Nothing to compare against until a second valuation exists.
    if len(history["portfolio_valuation"]) < 2:
        return 0

    current_val = history['portfolio_valuation', -1]
    prev_val = history['portfolio_valuation', -2]
    reward = np.log(current_val / prev_val)

    # --- Fix for the shorting problem ---
    # A small per-step charge while short, so the agent only shorts when the
    # expected gain outweighs the charge.
    if history['position', -1] < 0:
        reward -= short_penalty

    return reward

# --- 3. Environment creation ---
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
)

# --- 4. The fix for both problems ---
# Keep the wrapper (so the agent outputs integer indices), but supply an
# explicit list that favours buying (more positive choices than negative).
custom_positions = [
    -0.5,  # The only short choice (moderate)
    0,     # Cash (neutral)
    0.25,  # Small investment
    0.5,   # Medium investment
    0.75,  # Large investment
    1.0,   # All-in
    1.25,  # Light leverage
    1.5    # Heavy leverage
]

env = DiscreteActionsWrapper(env, positions=custom_positions)

# --- 5. PPO agent ---
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=0.0003,
    # ent_coef drives exploration. If the agent keeps repeating the same
    # action, raise this value (e.g. 0.05).
    ent_coef=0.02,
    tensorboard_log="./ppo_hybrid_tensorboard/"
)

print("Entraînement avec actions hybrides...")
model.learn(total_timesteps=150_000)
model.save("ppo_hybrid_solution")

In [21]:
# --- 6. Simulation ---
print("Lancement de la simulation...")
obs, info = env.reset()
done, truncated = False, False

while not (done or truncated):
    action, _ = model.predict(obs)

    # `action` is an integer: the index into the custom_positions list.
    # Cast it to a plain Python int to avoid the numpy-related bug.
    action = int(action)

    obs, reward, done, truncated, info = env.step(action)

# Save the episode to render_logs, then start the renderer on that folder
env.unwrapped.save_for_render(dir="render_logs")
renderer = Renderer(render_logs_dir="render_logs")
renderer.run()

In [22]:
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
from stable_baselines3 import PPO
from gym_trading_env.renderer import Renderer
import numpy as np
import pandas as pd
import wandb
from wandb.integration.sb3 import WandbCallback

# --- 1. CONFIGURATION AND INDICATORS ---

# Hyperparameters are declared here so that WandB can record them
config = {
    "policy_type": "MlpPolicy",
    "total_timesteps": 200_000,
    "learning_rate": 0.0003,
    "ent_coef": 0.02, # Exploration coefficient
    "batch_size": 128,
    "positions": [-0.5, 0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5], # Hybrid
    "project_name": "RL-Trading-Project"
}

def calculate_rsi(series, window=14):
    """Relative Strength Index based on simple rolling averages.

    Args:
        series: pandas Series of prices.
        window: rolling window length for the average gain and average loss.

    Returns:
        Series computed as ``100 - 100 / (1 + avg_gain / avg_loss)``; NaN
        until a full window is available and wherever both averages are zero.
    """
    step_change = series.diff()
    # Entries failing the condition (including the leading NaN) become 0.
    gains = step_change.where(step_change > 0, 0)
    losses = -step_change.where(step_change < 0, 0)
    mean_gain = gains.rolling(window=window).mean()
    mean_loss = losses.rolling(window=window).mean()
    strength = mean_gain / mean_loss
    return 100 - (100 / (1 + strength))

def calculate_macd(series, slow=26, fast=12, signal=9):
    """Return the MACD line (fast EMA minus slow EMA) of `series`.

    Args:
        series: pandas Series of prices.
        slow: span of the slow exponential moving average.
        fast: span of the fast exponential moving average.
        signal: not used by this function; kept so existing calls that pass
            it keep working.
    """
    def ema(span):
        # Recursive (adjust=False) exponential moving average of the input.
        return series.ewm(span=span, adjust=False).mean()

    return ema(fast) - ema(slow)

def preprocess(df):
    """Clean a price frame and attach ``feature_close`` / ``feature_rsi`` / ``feature_macd``.

    The frame is sorted by index and stripped of NaN and duplicate rows; the
    features are the percentage change of close, ``calculate_rsi(close) / 100``
    and ``calculate_macd(close)``. Rows the indicators left with NaN are
    dropped before returning.
    """
    cleaned = df.sort_index().dropna().drop_duplicates()
    close = cleaned['close']
    cleaned['feature_close'] = close.pct_change()
    cleaned['feature_rsi'] = calculate_rsi(close) / 100
    cleaned['feature_macd'] = calculate_macd(close)
    return cleaned.dropna()

def reward_function(history, short_penalty=0.0002):
    """Log-return of the portfolio over the last step, taxed while short.

    Args:
        history: environment history, read as ``history[column, step]`` and
            ``history[column]``.
        short_penalty: amount subtracted from the reward on every step whose
            latest position is negative. Defaults to 0.0002, the value that
            used to be hard-coded, so single-argument callers are unaffected.

    Returns:
        0 while fewer than two valuations have been recorded (same guard as
        the plain log-return reward defined earlier in this notebook);
        otherwise ``log(current / previous)``, minus ``short_penalty`` when
        the latest position is below zero.
    """
    # Nothing to compare against until a second valuation exists.
    if len(history["portfolio_valuation"]) < 2:
        return 0

    current_val = history['portfolio_valuation', -1]
    prev_val = history['portfolio_valuation', -2]
    reward = np.log(current_val / prev_val)

    # Penalty on short positions (to counter the negative bias)
    if history['position', -1] < 0:
        reward -= short_penalty

    return reward

# --- 2. WANDB INITIALISATION ---
# NOTE(review): wandb.finish() at the end of this cell is only reached when
# every statement in between succeeds.
run = wandb.init(
    project=config["project_name"],
    config=config,
    sync_tensorboard=True, # Automatically syncs the SB3 logs
    monitor_gym=True,      # Tries to record videos (when available)
    save_code=True,        # Saves this script to WandB
)

# --- 3. ENVIRONMENT CREATION ---
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="./data/*.pkl",
    preprocess=preprocess,
    portfolio_initial_value=1000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    reward_function=reward_function,
)

# Hybrid wrapper (int index -> specific float position)
env = DiscreteActionsWrapper(env, positions=config["positions"])

# --- 4. TRAINING WITH THE WANDB CALLBACK ---
model = PPO(
    config["policy_type"],
    env,
    verbose=1,
    learning_rate=config["learning_rate"],
    ent_coef=config["ent_coef"],
    batch_size=config["batch_size"],
    tensorboard_log=f"runs/{run.id}" # Unique folder for Tensorboard
)

print(f"Lancement de l'entraînement WandB : {run.name}")
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
)
model.save("ppo_trading_wandb_final")

# --- 5. EVALUATION AND FINAL LOGGING ---
print("Évaluation finale...")
obs, info = env.reset()
done, truncated = False, False

while not (done or truncated):
    action, _ = model.predict(obs)
    action = int(action) # Convert array -> int for the wrapper
    obs, reward, done, truncated, info = env.step(action)

# Fetch the environment's final metrics
final_metrics = env.unwrapped.get_metrics()
print("Métriques finales :", final_metrics)

# Send the key metrics to WandB (for the dashboard)
# NOTE(review): the two .get() calls fall back to 0 when the metric name is
# absent - confirm the exact keys returned by get_metrics().
wandb.log({
    "final_portfolio_valuation": info['portfolio_valuation'],
    "market_return": final_metrics.get("Market Return", 0),
    "portfolio_return": final_metrics.get("Portfolio Return", 0)
})

# --- 6. VISUALISATION ---
env.unwrapped.save_for_render(dir="render_logs")

# Close the WandB run cleanly
wandb.finish()