In [1]:
# Deep Reinforcement Learning for Automated Stock Trading – for Crypto

### 📊 Summary of Crypto DRL Ensemble Trading

# This notebook implements a deep reinforcement learning ensemble strategy adapted from the ICAIF 2020 paper, applied to cryptocurrency trading.

#- **Assets Used**: BTC, ETH, BNB, XRP
#- **Time Period**: 2018 to 2024, with rolling windows
#- **Agents Used**: PPO, A2C, DDPG (Stable-Baselines3)
#- **Strategy**:
#  - Train each agent on a rolling window (e.g. 6 months)
#   - Validate on 2 months of unseen data
#   - Select the model with the highest Sharpe ratio
#   - Trade using the best model for 2 months
# - **Objective**: Observe the effectiveness of the agents using the ensemble strategy.


In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import warnings


warnings.filterwarnings("ignore")

sys.path.append("..")

# FinRL modules
# from finrl.config_tickers import DOW_30_TICKER
# from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.binancedownloader import BinanceDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.env_crypto_trading.env_cryptotrading import CryptoTradingEnv
from finrl.agents.stablebaselines3.models import DRLAgent
# from finrl.agents.stablebaselines3.models import DRLEnsembleAgent
from finrl.plot.plot import backtest_plot, get_daily_return, get_baseline
from finrl.plot.plot import backtest_stats_qs as backtest_stats  # using QuantStats


In [None]:
from finrl.config import (TRAIN_START_DATE, TRAIN_END_DATE, TEST_START_DATE, TEST_END_DATE, TRADE_START_DATE, TRADE_END_DATE)

START_DATE = TRAIN_START_DATE 
END_DATE =  TRADE_END_DATE

TRAIN_WINDOW_MONTHS = 6
VALIDATION_WINDOW_MONTHS = 2
TRADE_WINDOW_MONTHS = 2

# data_path = "../data/binance_raw.csv"
data_path = "../data/binance_less_raw.csv"

# if os.path.exists(data_path):
#     print(" Loading Binance data from local cache...")
#     df_raw = pd.read_csv(data_path, parse_dates=["date"])
#     print("Data saved to:", data_path)
# else:
#     print(" Downloading fresh Binance data...")
#     # tickers = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT", "XRPUSDT"]
#     tickers = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "XRPUSDT"]
#     bd = BinanceDownloader()
df_raw = bd.download_multiple(ticker_list=tickers, start_str="1 Jan, 2012")
df_raw.to_csv(data_path, index=False)
print("Data retrieved from:", data_path)

print("Done...")


 Loading Binance data from local cache...
Data saved to: ../data/binance_less_raw.csv
Done...


In [3]:
df_raw.groupby("tic")["date"].min()

tic
BNB-USD   2017-11-06
BTC-USD   2017-08-17
ETH-USD   2017-08-17
XRP-USD   2018-05-04
Name: date, dtype: datetime64[ns]

In [4]:

# df_raw = YahooDownloader(start_date=TRAIN_START_DATE,
#                          end_date=TRADE_END_DATE,
#                          ticker_list=ticker_list).fetch_data()

fe = FeatureEngineer(use_technical_indicator=True,
                     tech_indicator_list=["macd", "rsi_30", "cci_30"], 
                     use_vix=False,
                     use_turbulence=True
                     )

# user_defined_feature=False

df_processed = fe.preprocess_data(df_raw)  # D: removed processing from original



Successfully added technical indicators
Successfully added turbulence index


In [5]:
from dateutil.relativedelta import relativedelta
from finrl.utils.rolling_windows import get_rolling_windows

# PPO: Good for stable exploration with large batches
# A2C: Low n_steps means quick updates (crypto works well with this)
# DDPG: Buffer and batch sizes are fine; could experiment with larger buffer_size if needed

A2C_model_kwargs = {
    'n_steps': 5,
    'ent_coef': 0.005,
    'learning_rate': 0.0007
}

PPO_model_kwargs = {
    'ent_coef': 0.01,
    'n_steps': 2048,
    'learning_rate': 0.00025,
    'batch_size': 128
}

DDPG_model_kwargs = {
    'buffer_size': 10000,
    'learning_rate': 0.0005,
    'batch_size': 64
}

TRAIN_WINDOW_MONTHS = 6
VALIDATION_WINDOW_MONTHS = 2
TRADE_WINDOW_MONTHS = 2

# windows = get_rolling_windows(
#     train_months=TRAIN_WINDOW_MONTHS,
#     val_months=VALIDATION_WINDOW_MONTHS,
#     trade_months=TRADE_WINDOW_MONTHS
# )

windows = get_rolling_windows(
    train_months=TRAIN_WINDOW_MONTHS,
    val_months=VALIDATION_WINDOW_MONTHS,
    trade_months=TRADE_WINDOW_MONTHS,
    start_date_str=TRAIN_START_DATE,
    end_date_str=TRADE_END_DATE
)


print(f"Generated {len(windows)} rolling windows from {TRAIN_START_DATE} to {TRADE_END_DATE}.")



 Created 17 rolling windows.
Generated 17 rolling windows from 2020-01-06 to 2024-12-26.


In [None]:
from stable_baselines3 import PPO, A2C, DDPG
from stable_baselines3.common.vec_env import DummyVecEnv
from finrl.plot.plot import get_daily_return  

#  rolling window backtest using three reinforcement learning agents (PPO, A2C, DDPG). 
account_values_dict = {} # account_values 
EPISODES = 10
performance_log = []

for i, (train_start, train_end, val_start, val_end, trade_start, trade_end) in enumerate(windows):

    window_name = f"window_{i+1}"
    print(f" Rolling {window_name}: {train_start.date()} to {trade_end.date()}")
    
    train_data = data_split(df_processed, train_start, train_end)
    val_data = data_split(df_processed, val_start, val_end)
    trade_data = data_split(df_processed, trade_start, trade_end)

    min_days_required = 30

    if len(train_data["date"].unique()) < min_days_required:
        print(f"  Skipping Window {i+1} — Train window too short: {len(train_data['date'].unique())} days")
        continue
    
    if len(val_data["date"].unique()) < min_days_required:
        print(f"  Skipping Window {i+1} — Validation window too short: {len(val_data['date'].unique())} days")
        continue
        
    val_returns = val_data.groupby("date")["close"].mean().pct_change().dropna()
    if val_returns.std() == 0 or val_returns.empty:
        print(f"  Skipping Window {i+1} — No price volatility in validation window.")
        continue


    env_train = DummyVecEnv([lambda: CryptoTradingEnv(train_data)])
    agent = DRLAgent(env=env_train)

    models = {
        "ppo": agent.train_PPO(total_timesteps=len(train_data)*30, model_kwargs=PPO_model_kwargs),
        "a2c": agent.train_A2C(total_timesteps=len(train_data)*30, model_kwargs=A2C_model_kwargs),
        "ddpg": agent.train_DDPG(total_timesteps=int(len(train_data)*30*0.5), model_kwargs=DDPG_model_kwargs)
    }

    best_model = None
    best_sharpe = -np.inf
    val_sharpes = []

    for name, model in models.items():
        env_val = DummyVecEnv([lambda: CryptoTradingEnv(val_data)])

        sharpe_metrics = DRLAgent.DRL_prediction(model=model, environment=env_val, evaluate=True)
        sharpe = sharpe_metrics["sharpe"]
        val_sharpes.append((name, sharpe))

        # Debug: print account value
        account_vals = env_val.envs[0].asset_memory
        if not np.isnan(sharpe) and sharpe > best_sharpe:
            best_model = model
            best_sharpe = sharpe

    env_trade = DummyVecEnv([lambda: CryptoTradingEnv(trade_data)])
    val_sharpe_dict = {name: sharpe for name, sharpe in val_sharpes}

    if best_model is not None:
        startdt = datetime.strptime(trade_start.date(), "%Y-%m-%d")
        df_result = DRLAgent.DRL_prediction(model=best_model, environment=env_trade, start_date=startdt)
        account_values_series = df_result["account_value"].reset_index(drop=True)
        account_values_dict[window_name] = account_values_series

        print(f"✅ Best model: {best_model.__class__.__name__} with Sharpe {best_sharpe:.4f}")
        account_values = df_result["account_value"]
      
    # get_daily_return: Creates daily returns using pct_change(1) on the account_value column
    # Using Time zone localization to UTC — though only needed for other then daily returns
    # @ Returns a pandas Series of daily returns, indexed by date for further analysis.
        daily_returns = get_daily_return(df_result)  # get_daily_return expects a full df with date
        
        sharpe = (365**0.5) * daily_returns.mean() / daily_returns.std()
        
        total_return = account_values.iloc[-1] / account_values.iloc[0] - 1
        volatility = daily_returns.std()
        max_drawdown = (account_values.cummax() - account_values).max() / account_values.cummax().max()

        performance_log.append({
                "agent": best_model.__class__.__name__,
                "window": i + 1,
                "train_start": train_start.date(),
                "train_end": train_end.date(),
                "val_start": val_start.date(),
                "val_end": val_end.date(),
                "trade_start": trade_start.date(),
                "trade_end": trade_end.date(),
                "sharpe_ratio": sharpe,
                "min_account_value": account_values.min(),
                "max_account_value": account_values.max(),
                "total_return": total_return,
                "volatility": volatility,
                "max_drawdown": max_drawdown,
                "initial_acc_val": account_values.iloc[0],
                "final_acc_val": account_values.iloc[-1],
                **val_sharpe_dict
            })
    else:
        print(f"❌ No valid model selected in window {i+1} — skipping trade step.")


if performance_log:
    df_metrics = pd.DataFrame(performance_log)
    df_metrics.to_csv("../results/crypto_metrics.csv", index=False)
    print("Metrics saved to ../results/crypto_metrics.csv")
    display(df_metrics)

    df_account_values_all = pd.DataFrame(account_values_dict)
    df_account_values_all.to_csv("../results/rolling_account_values.csv", index=False)
    print("📁 Account values saved to: ../results/rolling_account_values.csv")
    
else:
    print("No results to analyze.")
    print(val_sharpe_dict)




 Rolling window_1: 2020-01-06 to 2020-11-06
 Window 1
  Train window: 2020-01-06 to 2020-07-06 — 182 days
  Val window  : 2020-07-07 to 2020-09-06   — 61 days
  Trade window: 2020-09-07 to 2020-11-06 — 60 days
count    60.000000
mean      0.002093
std       0.028938
min      -0.110398
25%      -0.006764
50%       0.003292
75%       0.017492
max       0.108308
Name: close, dtype: float64
Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1310 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1188         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0023009672 |
|    clip_fraction        | 0.014      

Unnamed: 0,agent,window,train_start,train_end,val_start,val_end,trade_start,trade_end,sharpe_ratio,min_account_value,max_account_value,total_return,volatility,max_drawdown,initial_acc_val,final_acc_val,ppo,a2c,ddpg
0,PPO,1,2020-01-06,2020-07-06,2020-07-07,2020-09-06,2020-09-07,2020-11-06,0.292348,973895.471588,1281342.0,0.0,0.033176,0.219568,1000000.0,1000000.0,0.181776,0.061938,
1,A2C,2,2020-04-06,2020-10-06,2020-10-07,2020-12-06,2020-12-07,2021-02-06,0.996566,941121.09212,2113528.0,0.045943,0.08041,0.526857,956075.3,1000000.0,0.196316,0.595186,
2,DDPG,3,2020-07-06,2021-01-06,2021-01-07,2021-03-06,2021-03-07,2021-05-06,0.127618,974403.366587,1249676.0,-0.027702,0.040171,0.220275,1028491.0,1000000.0,0.214245,0.294001,0.334373
3,DDPG,4,2020-10-06,2021-04-06,2021-04-07,2021-06-06,2021-06-07,2021-08-06,0.443704,701737.651372,1109794.0,0.008581,0.049762,0.280717,991492.1,1000000.0,0.067216,0.459945,0.649386
4,PPO,5,2021-01-06,2021-07-06,2021-07-07,2021-09-06,2021-09-07,2021-11-06,0.297967,857807.767641,1222314.0,0.0,0.036967,0.18188,1000000.0,1000000.0,0.676045,0.58978,
5,DDPG,6,2021-04-06,2021-10-06,2021-10-07,2021-12-06,2021-12-07,2022-02-06,0.423419,557201.44998,1013008.0,-0.012841,0.070091,0.449954,1013008.0,1000000.0,0.204136,0.344792,0.367173
6,DDPG,7,2021-07-06,2022-01-06,2022-01-07,2022-03-06,2022-03-07,2022-05-06,0.176861,967539.96302,1353944.0,-0.008389,0.031567,0.261417,1008460.0,1000000.0,0.359224,0.275819,0.457442
7,PPO,8,2021-10-06,2022-04-06,2022-04-07,2022-06-06,2022-06-07,2022-08-06,0.526776,860060.895047,1000000.0,0.030458,0.023295,0.110382,970442.5,1000000.0,0.530598,0.523714,
8,A2C,9,2022-01-06,2022-07-06,2022-07-07,2022-09-06,2022-09-07,2022-11-06,0.00515,1000000.0,1003567.0,0.0,0.00066,0.003554,1000000.0,1000000.0,0.178617,2.038686,0.52861
9,DDPG,10,2022-04-06,2022-10-06,2022-10-07,2022-12-06,2022-12-07,2023-02-06,0.215692,976432.823824,1407335.0,-0.022254,0.040292,0.289437,1022761.0,1000000.0,0.245248,0.000501,0.331988


📁 Account values saved to: ../results/rolling_account_values.csv


## from finrl.plot import get_daily_return

account_values = df_result["account_value"]
daily_returns = get_daily_return(account_values)

sharpe = (252**0.5) * daily_returns.mean() / daily_returns.std()
total_return = account_values.iloc[-1] / account_values.iloc[0] - 1
volatility = daily_returns.std()
max_drawdown = (account_values.cummax() - account_values).max() / account_values.cummax().max()


df_final = pd.concat(results)
backtest_stats(df_final)
backtest_plot(df_final)

## 

## 