In [1]:
# Deep Reinforcement Learning for Automated Stock Trading – for Crypto

### 📊 Summary of Crypto DRL Ensemble Trading

This notebook implements a deep reinforcement learning ensemble strategy adapted from the ICAIF 2020 paper, applied to cryptocurrency trading.

- **Assets Used**: BTC, ETH, BNB, XRP
- **Time Period**: 2018 to 2024, with rolling windows
- **Agents Used**: PPO, A2C, DDPG (Stable-Baselines3)
- **Strategy**:
  - Train each agent on a rolling window (e.g. 6 months)
  - Validate on 2 months of unseen data
  - Select the model with the highest Sharpe ratio
  - Trade using the best model for 2 months
- **Objective**: Maximize returns while adjusting for volatility


In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import warnings


warnings.filterwarnings("ignore")

sys.path.append("..")

# FinRL modules
# from finrl.config_tickers import DOW_30_TICKER
# from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.binancedownloader import BinanceDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.env_crypto_trading.env_cryptotrading import CryptoTradingEnv
from finrl.agents.stablebaselines3.models import DRLAgent
# from finrl.agents.stablebaselines3.models import DRLEnsembleAgent
from finrl.plot.plot import backtest_plot, get_daily_return, get_baseline
from finrl.plot.plot import backtest_stats_qs as backtest_stats  # using QuantStats


In [2]:
# Configuration and raw-data loading.
# Reads the cached Binance CSV when it exists; otherwise downloads the data
# and writes the cache so later runs can skip the download.
from finrl.config import (TRAIN_START_DATE, TRAIN_END_DATE, TEST_START_DATE, TEST_END_DATE, TRADE_START_DATE, TRADE_END_DATE)

# Overall span of the experiment: first training date through last trading date.
START_DATE = TRAIN_START_DATE
END_DATE = TRADE_END_DATE

# Rolling-window lengths, in months.
TRAIN_WINDOW_MONTHS = 6
VALIDATION_WINDOW_MONTHS = 2
TRADE_WINDOW_MONTHS = 2

# data_path = "../data/binance_raw.csv"
data_path = "../data/binance_less_raw.csv"

if os.path.exists(data_path):
    print(" Loading Binance data from local cache...")
    df_raw = pd.read_csv(data_path, parse_dates=["date"])
else:
    print(" Downloading fresh Binance data...")
    # tickers = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT", "XRPUSDT"]
    tickers = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "XRPUSDT"]
    bd = BinanceDownloader()
    df_raw = bd.download_multiple(ticker_list=tickers, start_str="1 Jan, 2012")
    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    df_raw.to_csv(data_path, index=False)
    # Report the save only on the branch that actually wrote the file;
    # previously this message was also printed after a plain cache read.
    print("Data saved to:", data_path)

print("Done...")


 Loading Binance data from local cache...
Data saved to: ../data/binance_less_raw.csv
Done...


In [3]:
# Earliest available observation date for each ticker in the raw data.
df_raw.groupby("tic")["date"].agg("min")

tic
BNB-USD   2017-11-06
BTC-USD   2017-08-17
ETH-USD   2017-08-17
XRP-USD   2018-05-04
Name: date, dtype: datetime64[ns]

In [4]:

# Alternative (Yahoo) data source, kept for reference only:
# df_raw = YahooDownloader(start_date=TRAIN_START_DATE,
#                          end_date=TRADE_END_DATE,
#                          ticker_list=ticker_list).fetch_data()

# Feature pipeline: technical indicators (MACD, 30-period RSI, 30-period CCI)
# plus the turbulence index; VIX is switched off.
# Not passed here: user_defined_feature=False
fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=["macd", "rsi_30", "cci_30"],
    use_vix=False,
    use_turbulence=True,
)

# Note carried over from the author: "D: removed processing from original".
df_processed = fe.preprocess_data(df_raw)



Successfully added technical indicators
Successfully added turbulence index


In [5]:
# Agent hyper-parameters and rolling-window generation.
from dateutil.relativedelta import relativedelta
from finrl.utils.rolling_windows import get_rolling_windows

# PPO: Good for stable exploration with large batches
# A2C: Low n_steps means quick updates (crypto works well with this)
# DDPG: Buffer and batch sizes are fine; could experiment with larger buffer_size if needed

A2C_model_kwargs = {
    'n_steps': 5,
    'ent_coef': 0.005,
    'learning_rate': 0.0007
}

PPO_model_kwargs = {
    'ent_coef': 0.01,
    'n_steps': 2048,
    'learning_rate': 0.00025,
    'batch_size': 128
}

DDPG_model_kwargs = {
    'buffer_size': 10000,
    'learning_rate': 0.0005,
    'batch_size': 64
}

# TRAIN_WINDOW_MONTHS / VALIDATION_WINDOW_MONTHS / TRADE_WINDOW_MONTHS are
# defined once in the configuration cell (together with TRAIN_START_DATE and
# TRADE_END_DATE, which this cell also needs). They are deliberately not
# redefined here so there is a single source of truth for the window sizes.
windows = get_rolling_windows(
    train_months=TRAIN_WINDOW_MONTHS,
    val_months=VALIDATION_WINDOW_MONTHS,
    trade_months=TRADE_WINDOW_MONTHS
)

print(f"Generated {len(windows)} rolling windows from {TRAIN_START_DATE} to {TRADE_END_DATE}.")



🔄 Created 16 rolling windows.
Generated 16 rolling windows from 2020-05-04 to 2024-12-31.


In [6]:
from stable_baselines3 import PPO, A2C, DDPG
from stable_baselines3.common.vec_env import DummyVecEnv
# Rolling-window backtest using three reinforcement learning agents (PPO, A2C, DDPG).
# For each window: train all three agents on the training slice, score each
# one on the validation slice, then trade the following slice with the agent
# that scored best. Per-window trade results are collected in `results`.
results = []
EPISODES = 10  # retained from the earlier setup; the budget below uses TIMESTEPS_PER_ROW
TIMESTEPS_PER_ROW = 30  # training timesteps = len(train_data) * TIMESTEPS_PER_ROW
MIN_DAYS_REQUIRED = 30  # minimum number of distinct dates in train / validation slices
RESULTS_PATH = "../results/crypto_account_values.csv"

# Create the output directory up front so saving cannot fail after training.
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)

for i, (train_start, train_end, val_end, trade_end) in enumerate(windows):
    # f-string prefix was missing, so the placeholders used to print literally.
    print(f" Rolling Window {i+1}: {train_start.date()} to {trade_end.date()}")

    train_data = data_split(df_processed, train_start, train_end)
    val_data = data_split(df_processed, train_end, val_end)
    trade_data = data_split(df_processed, val_end, trade_end)

    n_train_days = len(train_data["date"].unique())
    n_val_days = len(val_data["date"].unique())
    n_trade_days = len(trade_data["date"].unique())

    if n_train_days < MIN_DAYS_REQUIRED:
        print(f"  Skipping Window {i+1} — Train window too short: {n_train_days} days")
        continue

    if n_val_days < MIN_DAYS_REQUIRED:
        print(f"  Skipping Window {i+1} — Validation window too short: {n_val_days} days")
        continue

    # Daily change of the cross-ticker mean close; a flat series would make
    # the validation ranking meaningless, so such windows are skipped.
    # (This check used to be repeated a second time further down.)
    val_returns = val_data.groupby("date")["close"].mean().pct_change().dropna()
    if val_returns.empty or val_returns.std() == 0:
        print(f"  Skipping Window {i+1} — No price volatility in validation window.")
        continue

    print(f" Window {i+1}")
    print(f"  Train window: {train_start.date()} to {train_end.date()} — {n_train_days} days")
    print(f"  Val window  : {train_end.date()} to {val_end.date()}   — {n_val_days} days")
    print(f"  Trade window: {val_end.date()} to {trade_end.date()} — {n_trade_days} days")

    # Bind the frame as a default argument so the factory does not depend on
    # the loop variable still holding this window's data when it is called.
    env_train = DummyVecEnv([lambda data=train_data: CryptoTradingEnv(data)])
    agent = DRLAgent(env=env_train)

    print(val_returns.describe())

    # DDPG gets half the timestep budget of PPO / A2C.
    train_steps = len(train_data) * TIMESTEPS_PER_ROW
    models = {
        "ppo": agent.train_PPO(total_timesteps=train_steps, model_kwargs=PPO_model_kwargs),
        "a2c": agent.train_A2C(total_timesteps=train_steps, model_kwargs=A2C_model_kwargs),
        "ddpg": agent.train_DDPG(total_timesteps=int(train_steps * 0.5), model_kwargs=DDPG_model_kwargs)
    }

    # Sanity check: show one PPO action on the first training observation.
    obs = env_train.reset()
    sample_action, _ = models["ppo"].predict(obs)
    print(" Sample PPO action:", sample_action)

    best_model = None
    best_sharpe = -np.inf

    for name, model in models.items():
        # Fresh validation environment per agent so runs do not share state.
        env_val = DummyVecEnv([lambda data=val_data: CryptoTradingEnv(data)])

        print(f"{name} Account Memory Sample: {env_val.envs[0].asset_memory[:5]}")
        # Explicit None check instead of relying on object truthiness.
        print(f"Is Model Trained? {'Yes' if model is not None else 'No'}")

        # NOTE(review): with evaluate=True this call is treated as returning a
        # scalar Sharpe ratio — confirm against DRLAgent.DRL_prediction.
        sharpe = DRLAgent.DRL_prediction(model=model, environment=env_val, evaluate=True)

        print("{} Sharpe: {}".format(name, sharpe))
        # Debug: print account value
        account_vals = env_val.envs[0].asset_memory
        print("{} Account Value Range: {:.2f} to {:.2f}".format(name, min(account_vals), max(account_vals)))
        # NaN scores are never selected.
        if not np.isnan(sharpe) and sharpe > best_sharpe:
            best_model = model
            best_sharpe = sharpe

    if best_model is None:
        print(f"❌ No valid model selected in window {i+1} — skipping trade step.")
        continue

    # f-string prefix was missing here as well.
    print(f" Best model: {best_model.__class__.__name__} with Sharpe {best_sharpe:.2f}")

    env_trade = DummyVecEnv([lambda data=trade_data: CryptoTradingEnv(data)])
    # Previously labelled with the leftover loop variable `name`, which always
    # showed the last agent regardless of which model was selected.
    print(f"Trade Account Memory Sample: {env_trade.envs[0].asset_memory[:5]}")

    df_result = DRLAgent.DRL_prediction(model=best_model, environment=env_trade)
    account_vals = df_result["account_value"].values.tolist()

    print(f"{best_model.__class__.__name__} Account Value Range: {min(account_vals):.2f} to {max(account_vals):.2f}")

    results.append(df_result)

    # Checkpoint after every traded window so an interrupted run keeps the
    # results produced so far.
    pd.concat(results, ignore_index=True).to_csv(RESULTS_PATH, index=False)

# Final summary runs once, after all windows, so `df_final` is only built from
# the complete result list and the "no results" message is printed at most once.
if results:
    df_final = pd.concat(results, ignore_index=True)
    # The file itself was written by the last per-window checkpoint above.
    print("Results saved to results/crypto_account_values.csv")
else:
    print("No results to analyze.")



 Rolling Window {i+1}: {train_start.date()} to {trade_end.date()}
 Window 1
  Train window: 2020-05-04 to 2020-11-04 — 184 days
  Val window  : 2020-11-04 to 2021-01-04   — 61 days
  Trade window: 2021-01-04 to 2021-03-04 — 59 days
count    60.000000
mean      0.014905
std       0.037071
min      -0.083952
25%      -0.008187
50%       0.011502
75%       0.038438
max       0.100382
Name: close, dtype: float64
Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1214 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 990           |
|    iterations           | 2             |
|    time_elapsed         | 4             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 2.4306995e-05 |
|    clip

In [None]:
# Backtest report over all traded windows (QuantStats-based stats, then plot).
if not results:
    # pd.concat([]) would fail with a generic "No objects to concatenate";
    # raise the same exception type with a message that says what to do.
    raise ValueError("No trading results to backtest — run the rolling-window cell first.")

# ignore_index gives one continuous 0..n-1 index across windows, matching the
# frame that the rolling-window cell saves to disk.
df_final = pd.concat(results, ignore_index=True)
backtest_stats(df_final)
backtest_plot(df_final)